Ticket #1109: proxymap.py

File proxymap.py, 14.5 KB (added by kragen, 9 years ago)
Line 
1#!/usr/bin/python
2import twisted.python.urlpath, urlparse, twisted.web.proxy
3"""proxymap.py: More powerful reverse-proxy setup for twisted.web.
4
5As documented in http://twistedmatrix.com/bugs/issue1109
6"twisted.web.proxy doesn't reverse-map redirects like
7ProxyPassReverse", reverse proxies need to rewrite HTTP redirect URLs,
8and rewriting redirects in a reverse proxy requires knowledge of more
9than just a single ReverseProxyResource.
10
11This code makes such a setup easy to configure.  You create a ProxyMap
12object and hand your root Resource to its createMappings method:
13
14    p = ProxyMap({'/foo': 'http://localhost:8000/f00',
15                  '/slash': 'http://localhost:8100/slash/',
16                  '/slush/': 'http://localhost:8100/slush',
17                  '/bar': 'http://localhost/b4r',
18                  '/baz/quux': 'http://localhost:8000/quux'})
19
20    r.putChild('slush', twisted.web.resource.Resource())
21    r.putChild('baz', twisted.web.resource.Resource())
22    p.createMappings(r)
23
24If there are intermediate resources that must be created, you must
25create them yourself.  They probably won't work if they're anything
26other than simply resource.Resource, because ProxyMap uses
27.getStaticEntity to traverse the resource tree.
28
29This also includes the fix for http://twistedmatrix.com/bugs/issue1117
30"twisted.web.proxy.ReverseProxyResource incorrectly sends host header
31with no port," but in a slightly different form.
32
33Deficiencies:
34
35It does not support anything other than plain HTTP for either side of
36the reverse proxy.
37
38At foom's request (if I understood it correctly), this is written as a
39bunch of subclasses rather than simply a patch.  Consequently a lot of
40it consists of calls to super.
41
42There are relatively thorough tests for ProxyMap, hostport, and
43URLPathWithRelpath, but not with_explicit_port, ReverseProxyResource,
44ProxyClientFactory, or ProxyClient.  None of these tests yet use trial
45or even unittest; I have not yet learned how to write tests with
46trial.
47
48It will raise an exception trying to rewrite redirect URLs when
49talking to a client that doesn't send a Host: header.
50
51It does not support credentials in the redirect URLs.
52
53Copyright 2005, CommerceNet.  By Kragen Sitaker.
54
55"""
56#'#"#'#"# appease Emacs's stupid quote matching
57
58### to actually rewrite the location header, we need to intercept it;
59# here we have subclasses of the relevant classes.
60
class ProxyClient(twisted.web.proxy.ProxyClient):
    """ProxyClient that rewrites redirect URLs sent by the backend.

    Every response header is relayed through the stock
    twisted.web.proxy.ProxyClient.handleHeader, except that the value
    of a "Location" header is first passed through
    proxymap.absoluteURLOf so that it can be remapped into the URL
    space of the reverse proxy.

    """
    def __init__(self, command, rest, version, headers, data, father, proxymap):
        """Initialize the stock ProxyClient and remember 'proxymap'.

        All arguments other than 'proxymap' are handed unchanged to
        twisted.web.proxy.ProxyClient.__init__.  'proxymap' must
        provide absoluteURLOf(url, host_header).

        """
        twisted.web.proxy.ProxyClient.__init__(self,
                                               command=command,
                                               rest=rest,
                                               version=version,
                                               headers=headers,
                                               data=data,
                                               father=father)
        self.proxymap = proxymap
    def handleHeader(self, key, value):
        """Relay one response header, remapping "Location" values.

        The header name is matched case-insensitively.  HTTP header
        field names are case-insensitive, and this method does not
        assume the name was lower-cased before it got here; an exact
        comparison against 'location' would let a "Location:" header
        through without rewriting it.

        """
        if key.lower() == 'location':
            value = self.proxymap.absoluteURLOf(value, self.host_header())
        return twisted.web.proxy.ProxyClient.handleHeader(self, key, value)
    def host_header(self):
        """Return the "host" header of the request being proxied.

        The header is read from self.father (presumably the original
        client request --- confirm against twisted.web.proxy).  Fails
        the assertion below if the client sent no Host: header.

        """
        host = self.father.getHeader("host")
        assert host is not None  # XXX there are other alternatives...
        return host
79
class ProxyClientFactory(twisted.web.proxy.ProxyClientFactory):
    """Client factory whose protocols are redirect-rewriting ProxyClients.

    Identical to the stock factory except that it also carries a
    'proxymap' and hands it to every ProxyClient it builds.

    """
    def __init__(self, command, rest, version, headers, data, father, proxymap):
        """Initialize the stock factory, then remember 'proxymap'."""
        base = twisted.web.proxy.ProxyClientFactory
        base.__init__(self, command=command, rest=rest, version=version,
                      headers=headers, data=data, father=father)
        self.proxymap = proxymap
    def buildProtocol(self, addr):
        """Build a ProxyClient from this factory's stored request data.

        'addr' is accepted for interface compatibility but not used.

        """
        settings = dict(command=self.command,
                        rest=self.rest,
                        version=self.version,
                        headers=self.headers,
                        data=self.data,
                        father=self.father,
                        proxymap=self.proxymap)
        return ProxyClient(**settings)
98
def hostport(host, port, defaultport=80):
    """Format 'host' and 'port' as for an HTTP Host: header.

    The port is omitted when it equals 'defaultport'; otherwise the
    result is 'host:port'.

    """
    if port != defaultport:
        return '%s:%d' % (host, port)
    return host
102
class ReverseProxyResource(twisted.web.proxy.ReverseProxyResource):
    """ReverseProxyResource that carries a proxymap for redirect rewriting.

    Requests are forwarded through 'clientFactory' (ProxyClientFactory
    by default), which receives the proxymap along with the request
    data.

    """
    clientFactory = ProxyClientFactory
    def __init__(self, host, port, path, proxymap):
        """Set up the stock resource, then remember 'proxymap'."""
        twisted.web.proxy.ReverseProxyResource.__init__(self, host, port, path)
        self.proxymap = proxymap
    def getChild(self, path, request):
        """Return a resource proxying to 'path' below this one's path."""
        # XXX have to make sure we get this one instead of the other one
        child_path = '/'.join((self.path, path))
        return ReverseProxyResource(self.host, self.port, child_path,
                                    proxymap=self.proxymap)
    def render(self, request):
        """Forward 'request' to the backend; the response arrives later.

        Returns twisted.web.server.NOT_DONE_YET after starting a TCP
        connection to (self.host, self.port).

        """
        # XXX too bad we had to copy and paste all this code due to
        # its poor factoring!

        # Work on a copy of the headers rather than the request's own
        # mapping --- the original 'Host:' value may be needed later
        # on to rewrite redirects correctly.
        upstream_headers = request.getAllHeaders().copy()
        upstream_headers['host'] = hostport(self.host, self.port)

        request.content.seek(0, 0)
        query = urlparse.urlparse(request.uri)[4]
        # Only append '?query' when the incoming URI actually had one.
        rest = self.path
        if query:
            rest = self.path + '?' + query
        factory = self.clientFactory(command=request.method,
                                     rest=rest,
                                     version=request.clientproto,
                                     headers=upstream_headers,
                                     data=request.content.read(),
                                     father=request,
                                     proxymap=self.proxymap)
        twisted.internet.reactor.connectTCP(self.host, self.port, factory)
        return twisted.web.server.NOT_DONE_YET
137
138### Proxy map objects.
139
def with_explicit_port(orig, default_port=80):
    """Return network location 'orig' with a port always present.

    If 'orig' already contains a ':' it is returned unchanged;
    otherwise ':default_port' is appended.

    """
    assert '@' not in orig   # XXX we don't support credentials yet
    if ':' not in orig:
        return '%s:%d' % (orig, default_port)
    return orig
144
class URLPathWithRelpath(twisted.python.urlpath.URLPath):
    """URLPath that can compute the relative path to a descendant URL."""
    def relativePathTo(self, other_absolute_url):
        """Path from me to absolute URL string 'other_absolute_url'.

        If 'other_absolute_url' is me or a descendant of mine, returns
        the intermediate path segments to get there (joined with '/',
        plus the other URL's query and fragment, if any); otherwise
        returns None.  A trailing slash on my own path is ignored.

        """
        other = twisted.python.urlpath.URLPath.fromString(other_absolute_url)
        assert self.scheme == 'http'
        # XXX the default port of 80 is too specific, but non-http
        # schemes aren't supported yet (see assert above)
        if (self.scheme != other.scheme or
            with_explicit_port(self.netloc) != with_explicit_port(other.netloc)):
            return None
        base_segments = self.pathList()
        if base_segments[-1] == '':
            # trailing slash
            base_segments = base_segments[:-1]
        depth = len(base_segments)
        other_segments = other.pathList()
        if other_segments[:depth] != base_segments:
            return None
        remainder = '/'.join(other_segments[depth:])
        return urlparse.urlunsplit((None, None, remainder,
                                    other.query, other.fragment))
167
def urlpath(urlstring):
    """Parse 'urlstring' into a URLPathWithRelpath."""
    parts = urlparse.urlsplit(urlstring)
    return URLPathWithRelpath(*parts)
170
class ProxyMap:
    """Mapping from local URL paths to the backend URLs they proxy.

    'urlmap' is a dictionary whose keys are local paths (starting
    with '/') and whose values are absolute http URLs of backends.

    """
    # Class used by createMappings for each mapped path; the stock
    # twisted.web.proxy.ReverseProxyResource does not accept the
    # 'proxymap' argument passed below.
    resourceType = ReverseProxyResource
    def __init__(self, urlmap):
        self.urlmap = urlmap
    def createMappings(self, root):
        """Create ReverseProxyResources to establish this mapping.

        For each entry, walks the resource tree from 'root' (using
        getStaticEntity) to the parent of the mapped path, then
        putChild()s a new resource there.  Intermediate resources
        must already exist.

        """
        for local_path, backend_url in self.urlmap.items():
            assert local_path.startswith('/')
            segments = local_path.split('/')[1:]
            parent = root
            for name in segments[:-1]:
                parent = parent.getStaticEntity(name)
                assert parent is not None
            scheme, netloc, lpath, query, frag = urlparse.urlsplit(backend_url)
            assert scheme == 'http'  # ReverseProxyResource only does http
            assert query == ''    # how would you handle a query?
            assert frag == ''     # and a fragment would obviously be nonsense
            host, port_text = with_explicit_port(netloc).split(':')
            resource = self.resourceType(host, int(port_text), lpath,
                                         proxymap=self)
            parent.putChild(segments[-1], resource)
    def reverseMap(self, url):
        """Finds the local URL path at which some absolute URL is mapped.

        For rewriting HTTP "Location:" headers in redirects.  Returns
        None when 'url' does not fall under any mapped backend URL.

        """
        for local_path, backend_url in self.urlmap.items():
            relative = urlpath(backend_url).relativePathTo(url)
            if relative is None:
                continue
            if relative == '':
                return local_path
            # avoid producing a double slash at the join
            if local_path.endswith('/'):
                return local_path + relative
            return local_path + '/' + relative
        return None  # normally we return a path, not an absolute URL

    def absoluteURLOf(self, mappable_url, host_header):
        """Remaps an URL into my URL space if possible.

        host_header is the hostname and possibly port by which this
        server is known.  URLs that are not under any mapped backend
        are returned unchanged.

        Note that this doesn't have https support, even on the front
        end, yet.  That would probably involve making host_header
        become a URL.
        """
        local_path = self.reverseMap(mappable_url)
        if local_path is not None:
            return 'http://%s%s' % (host_header, local_path)
        return mappable_url
223
def ok(a, b):
    """Assert that 'a' equals 'b', reporting both values on failure."""
    both = (a, b)
    assert a == b, both
def test_hostport():
    """Check hostport with the default port and a non-default port."""
    for port, expected in ((80, 'wibble'), (8080, 'wibble:8080')):
        ok(hostport('wibble', port), expected)
228
def test_relative_paths():
    """Exercise URLPathWithRelpath.relativePathTo via urlpath()."""
    from_foo_bar = urlpath('http://localhost/foo/bar').relativePathTo
    for target, expected in [
        ('http://localhost/foo/bar/baz', 'baz'),
        ('http://localhost/foo/bar/buz', 'buz'),
        ('http://localhost/foo/bar/baz/buz', 'baz/buz'),
        ('http://localhost/foo/bar', ''),
        ('http://localhost/foo/bar/', ''),
        ('http://somewhereelse/foo/bar/baz', None),
        ('http://localhost:8080/foo/bar/baz', None),
        ('http://localhost/foo/barbaz', None),
    ]:
        ok(from_foo_bar(target), expected)

    # base URL with trailing slash
    from_bar_slash = urlpath('http://localhost/bar/').relativePathTo
    for target, expected in [
        ('http://localhost/foo/bar', None),
        ('http://localhost/bar/', ''),
        ('http://localhost/bar', ''),  # not sure about this one
        ('http://localhost/bar/bligz', 'bligz'),
        ('http://localhost/barbligz', None),
    ]:
        ok(from_bar_slash(target), expected)

    # The reverse proxy constructs its 'host' header from the host
    # and port, so we can't ensure that the default port will be
    # present or omitted exactly as in the mapped URL.  Since that
    # means we have to actually parse URLs, also make sure we're
    # paying attention to the scheme.
    for base, target, expected in [
        # default port
        ('http://www:80/x', 'http://www/x', ''),
        ('http://www:80/x', 'http://www:80/x', ''),
        ('http://www/x', 'http://www:80/x', ''),
        # nondefault port
        ('http://www:8080/x', 'http://www/x', None),
        ('http://www:8080/x', 'http://www:8080/x', ''),
        ('http://www/x', 'http://www:8080/x', None),
        # differing scheme
        ('http://www/x', 'ftp://www/x', None),
        ('http://www/x', 'https://www/x', None),
    ]:
        ok(urlpath(base).relativePathTo(target), expected)

    # and we must not discard the query or the fragment:
    gbase = urlpath('http://google.com/x')
    for target, expected in [
        ('http://google.com/x/y?z', 'y?z'),
        ('http://google.com/x/y#2', 'y#2'),
    ]:
        ok(gbase.relativePathTo(target), expected)
273
def test_proxymap():
    """Exercise ProxyMap.reverseMap, absoluteURLOf and createMappings."""
    mapping = ProxyMap({'/foo': 'http://localhost:8000/f00',
                        '/slash': 'http://localhost:8100/slash/',
                        '/slush/': 'http://localhost:8100/slush',
                        '/bar': 'http://localhost/b4r',
                        '/baz/quux': 'http://localhost:8000/quux'})

    reverse = mapping.reverseMap
    for backend_url, expected in [
        ('http://localhost:8000/f00', '/foo'),
        ('http://localhost:8000/f00/bar', '/foo/bar'),
        ('http://localhost:8000/quux/snorf', '/baz/quux/snorf'),
        ('http://localhost/b4r/bie', '/bar/bie'),
        # on non-match, we return None (not the original url)
        ('http://www.google.com/', None),
        ('http://localhost:8000/uhoh', None),
        # starts with "/bar"...
        ('http://localhost/barbarian', None),
    ]:
        ok(reverse(backend_url), expected)

    # What to do about trailing slashes?  The right answer isn't
    # clear yet, so for now these cases are left unchecked:

    #ok(p.reverseMap('http://localhost:8100/slush/'), '/slush/')
    #ok(p.reverseMap('http://localhost:8100/slush'),  '/slush/')
    #ok(p.reverseMap('http://localhost:8100/slash/'), '/slash/')
    #ok(p.reverseMap('http://localhost:8100/slash'),  '/slash')

    # but there must be no double slash:
    for backend_url, expected in [
        ('http://localhost:8100/slash/mush', '/slash/mush'),
        ('http://localhost:8100/slush/mush', '/slush/mush'),
    ]:
        ok(reverse(backend_url), expected)

    # absolute URL
    for backend_url, host_header, expected in [
        ('http://localhost:8000/f00', 'wurble:9019',
         'http://wurble:9019/foo'),
        ('http://localhost/b4r/bie', 'ken.example.org',
         'http://ken.example.org/bar/bie'),
        ('http://google.com/search?q=guacamole', 'irrelevant',
         'http://google.com/search?q=guacamole'),
    ]:
        ok(mapping.absoluteURLOf(backend_url, host_header), expected)

    ## creating mappings to set up a reverse-proxy site
    import twisted.web.resource
    Resource = twisted.web.resource.Resource
    root = Resource()
    # haven't yet figured out what it should do about creating
    # intermediate nodes, so for now we require that they already exist
    root.putChild('slush', Resource())
    root.putChild('baz', Resource())
    mapping.createMappings(root)

    def target_of(res):
        # contents of a ReverseProxyResource
        return (res.host, res.port, res.path)

    ok(target_of(root.getStaticEntity("foo")), ('localhost', 8000, '/f00'))
    ok(target_of(root.getStaticEntity("slash")),
       ('localhost', 8100, '/slash/'))
    ok(target_of(root.getStaticEntity("slush").getStaticEntity("")),
       ('localhost', 8100, '/slush'))
    ok(target_of(root.getStaticEntity("bar")), ('localhost', 80, '/b4r'))
    ok(target_of(root.getStaticEntity('baz').getStaticEntity('quux')),
       ('localhost', 8000, '/quux'))
334
def test():
    """Run every self-test in this module, in order."""
    for check in (test_hostport, test_relative_paths, test_proxymap):
        check()
339
# When my machine is at 600MHz, this module takes only 6-10ms to
# reload with the self-test call below in place, so I feel justified
# in running it on every reload, even though, without it, reloading
# would take less than 1ms.
test()