[Twisted-web] How can I change the HTTP request to avoid gzip

Tue Dec 16 08:57:18 EST 2008

On Tue, Dec 16, 2008 at 2:20 PM, Jean-Paul Calderone <exarkun at divmod.com> wrote:

> On Tue, 16 Dec 2008 05:31:45 +0200, Radu Dragusin <radudragusin at gmail.com>
> wrote:
>
>> I have a HTTP Proxy made with twisted.web and want to change the request
>> that the browser sends to the Proxy such that I erase the value of the
>> 'accept-encoding' key from 'gzip,deflate' to ' '.
>>
>> I use the example from the Twisted Book:
>>
>> By adding the overridden process method in WordCountProxyRequest I can get
>> the request header but have found no way to set a key, value pair.
>> I want to make the server think that the browser does not support gzip,
>> because twisted seems not to support gzip: the response from www.google.com
>> and many (but not all) sites still appears encoded. www.dpreview.com seems
>> not to gzip the response, and so the response is processed correctly.
>>
>> What can I do to either correctly decode gzip responses or modify the
>> 'accept-encoding' value to nothing so the server does not compress the
>> response?
>>
>> Thank you!
>> *Example 4-8. wordcountproxy.py*
>>
>> import sgmllib, re
>> from twisted.web import proxy, http
>> import sys
>> from twisted.python import log
>> log.startLogging(sys.stdout)
>>
>> WEB_PORT =3D 8000
>> PROXY_PORT =3D 8001
>>
>> class WordParser(sgmllib.SGMLParser):
>>   def __init__(self):
>>       sgmllib.SGMLParser.__init__(self)
>>       self.chardata =3D []
>>       self.inBody =3D False
>>
>>   def start_body(self, attrs):
>>       self.inBody =3D True
>>
>>   def end_body(self):
>>       self.inBody =3D False
>>
>>   def handle_data(self, data):
>>       if self.inBody:
>>           self.chardata.append(data)
>>
>>   def getWords(self):
>>       # extract words
>>       wordFinder =3D re.compile(r'\w*')
>>       words =3D wordFinder.findall("".join(self.chardata))
>>       words =3D filter(lambda word: word.strip( ), words)
>>       print "WORDS ARE", words
>>       return words
>>
>> class WordCounter(object):
>>   ignoredWords =3D "the a of in from to this that and or but is was be
>> can could i you they we at".split( )
>>
>>   def __init__(self):
>>       self.words =3D {}
>>
>>   def addWords(self, words):
>>       for word in words:
>>           word =3D word.lower( )
>>           if not word in self.ignoredWords:
>>               currentCount =3D self.words.get(word, 0)
>>               self.words[word] =3D currentCount + 1
>>
>> class WordCountProxyClient(proxy.ProxyClient):
>>   def handleHeader(self, key, value):
>>       proxy.ProxyClient.handleHeader(self, key, value)
>>
>
> How about skipping it here?
>

If I use the following here:

print "[", key, ":", value,"]"

I get:
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Cache-Control :
no-cache, no-store, max-age=0, must-revalidate ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Pragma : no-cache ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Expires : Fri, 01
Jan 1990 00:00:00 GMT ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec
2008 13:37:21 GMT ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Type :
text/javascript; charset=UTF-8 ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie :
GMAIL_STAT_3492=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/
dragusin.ro ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Set-Cookie :
GMAIL_IMP=EXPIRED; Expires=Mon, 15-Dec-2008 13:37:21 GMT; Path=/a/
dragusin.ro ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Encoding :
gzip ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [
X-Content-Type-Options : nosniff ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Content-Length :
14340 ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Server : GFE/1.3 ]
2008-12-16 15:37:21+0200 [WordCountProxyClient,client] [ Connection : Close
]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Cache-Control :
private, max-age=0 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Date : Tue, 16 Dec
2008 13:37:21 GMT ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Expires : -1 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Type :
text/html; charset=UTF-8 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Encoding :
gzip ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Server : gws ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Content-Length :
2597 ]
2008-12-16 15:37:22+0200 [WordCountProxyClient,client] [ Connection : Close
]

So those are the response headers.
I need to override the request headers, the ones that the browser sends to the
proxy server.
See below:

>
>
>        if key.lower( ) =3D=3D "content-type":
>>           if value.split(';')[0] =3D=3D 'text/html':
>>               self.parser =3D WordParser( )
>>
>>   def handleResponsePart(self, data):
>>       proxy.ProxyClient.handleResponsePart(self, data)
>>       if hasattr(self, 'parser'): self.parser.feed(data)
>>
>>
>>   def handleResponseEnd(self):
>>       proxy.ProxyClient.handleResponseEnd(self)
>>       if hasattr(self, 'parser'):
>>           self.parser.close( )
>>           self.father.wordCounter.addWords(self.parser.getWords( ))
>>           del(self.parser)
>>
>> class WordCountProxyClientFactory(proxy.ProxyClientFactory):
>>   def buildProtocol(self, addr):
>>       client =3D proxy.ProxyClientFactory.buildProtocol(self, addr)
>>       # upgrade proxy.proxyClient object to WordCountProxyClient
>>       client.__class__ =3D WordCountProxyClient
>>       return client
>>
>> class WordCountProxyRequest(proxy.ProxyRequest):
>>   protocols =3D {'http': WordCountProxyClientFactory}
>>
>>   def __init__(self, wordCounter, *args):
>>       self.wordCounter =3D wordCounter
>>       proxy.ProxyRequest.__init__(self, *args)
>>
>> *    def process(self):
>>       proxy.ProxyRequest.process(self)
>>       print "received_headers", proxy.ProxyRequest.getAllHeaders(self)*
>
>
the print above prints:

received_headers: {'accept-language': 'en-us,en;q=0.5', 'accept-encoding':
'gzip,deflate', 'keep-alive': '300', 'accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'user-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.4)
Gecko/2008111318 Ubuntu/8.10 (intrepid) Firefox/3.0.4', 'accept-charset':
'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'host': 'www.google.com', 'cookie':
'PREF=ID=cfb3eb179de0c1e6:LD=en:NR=100:CR=2:TM=1228315308:LM=1229032156:GM=1:S=ImAuEufbnV6S7BAz;
NID=17=lOVMiFLculcrfN-zUO7xxFTTUFzqQqaHOFHcG_BDmYFX8QKYbMoo7GrDoYH-8ASPBlVijG_Hstp7HSDQ_8WQexHPjwz6g_7ZVpBhwmh3vkKuO3jpf9dnzrnWthcW1mGh;
S=photos_html=6ScUGfd699g4Xuuh0FeizA; TZ=-120', 'cache-control':
'max-age=0', 'proxy-connection': 'keep-alive'}

These are the values I want to modify -- the 'accept-encoding' value, to be
specific. How can I do it?

Thank you!

> class WordCountProxy(proxy.Proxy):
>>   def __init__(self, wordCounter):
>>       self.wordCounter =3D wordCounter
>>       proxy.Proxy.__init__(self)
>>
>>   def requestFactory(self, *args):
>>       return WordCountProxyRequest(self.wordCounter, *args)
>>
>> class WordCountProxyFactory(http.HTTPFactory):
>>   def __init__(self, wordCounter):
>>       self.wordCounter =3D wordCounter
>>       http.HTTPFactory.__init__(self)
>>
>>   def buildProtocol(self, addr):
>>       protocol =3D WordCountProxy(self.wordCounter)
>>       return protocol
>>
>> # classes for web reporting interface
>> class WebReportRequest(http.Request):
>>   def __init__(self, wordCounter, *args):
>>       self.wordCounter =3D wordCounter
>>       http.Request.__init__(self, *args)
>>
>>   def process(self):
>>       self.setHeader("Content-Type", "text/html")
>>       words =3D self.wordCounter.words.items( )
>>       words.sort(lambda (w1, c1), (w2, c2): cmp(c2, c1))
>>       for word, count in words:
>>           self.write("<li>%s %s</li>" % (word, count))
>>       self.finish( )
>>
>> class WebReportChannel(http.HTTPChannel):
>>   def __init__(self, wordCounter):
>>       self.wordCounter =3D wordCounter
>>       http.HTTPChannel.__init__(self)
>>
>>   def requestFactory(self, *args):
>>       return WebReportRequest(self.wordCounter, *args)
>>
>> class WebReportFactory(http.HTTPFactory):
>>   def __init__(self, wordCounter):
>>       self.wordCounter =3D wordCounter
>>       http.HTTPFactory.__init__(self)
>>
>>   def buildProtocol(self, addr):
>>       return WebReportChannel(self.wordCounter)
>>
>> if __name__ =3D=3D "__main__":
>>   from twisted.internet import reactor
>>   counter =3D WordCounter( )
>>   prox =3D WordCountProxyFactory(counter)
>>   reactor.listenTCP(PROXY_PORT, prox)
>>   reactor.listenTCP(WEB_PORT, WebReportFactory(counter))
>>   reactor.run( )
>>
>>
>>
> Jean-Paul
>
> _______________________________________________
> Twisted-web mailing list
> Twisted-web at twistedmatrix.com
> http://twistedmatrix.com/cgi-bin/mailman/listinfo/twisted-web
>

-- 
Radu
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://twistedmatrix.com/pipermail/twisted-web/attachments/20081216/dab51661/attachment-0001.htm