[Twisted-Python] Way to fix memory leaks of external c module

MārisR maris at chown.lv
Sat Nov 28 08:05:10 EST 2009


Hello!
Currently I'm trying to write a small XML-RPC server for HTML data processing. The processing is done by the HTML Tidy library, but the problem is that it has a massive memory leak.
As the processing is a blocking operation, I'm running it in a thread, but after some time, and after processing huge HTML documents, the daemon eats all the memory.
I'm wondering if it's possible to load utidylib in a thread, do the processing, and afterwards kill the thread and release the memory? Or maybe there is something like deferToProcess?
Thanks in advance!



#!/usr/bin/env python
# -*- coding: utf-8 -*-

# NOTE(review): tidyParse() below calls ``tidy.parseString``; this import
# binds ``utidylib``, not ``tidy``. uTidylib is normally imported as
# ``tidy`` -- confirm which module name is installed.
import utidylib

# The epoll reactor has to be installed before ``twisted.internet.reactor``
# is imported; importing ``reactor`` first would install the default one.
from twisted.internet import epollreactor
epollreactor.install()

from twisted.internet import protocol, defer, threads, reactor
from twisted.web import xmlrpc, server
from twisted.python import log, threadpool

# Python 2 only: ``reload(sys)`` brings back ``sys.setdefaultencoding`` so
# the process-wide default codec can be switched to UTF-8. Implicit
# str/unicode conversions (e.g. a bare ``.encode()``) then use UTF-8.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Send Twisted's log output to standard output.
log.startLogging(sys.stdout)

import codecs

# Garbage-collector diagnostics used while hunting the leak.
# NOTE(review): per the Python 2 ``gc`` documentation DEBUG_LEAK includes
# DEBUG_SAVEALL, which keeps every unreachable object in ``gc.garbage``
# instead of freeing it -- this setting may itself make memory grow;
# confirm before leaving it enabled in a long-running daemon.
import gc
gc.enable()
gc.set_debug(gc.DEBUG_LEAK)
# A threshold of 1 makes the collector run very frequently.
gc.set_threshold(1)

class TidyProtocol(xmlrpc.XMLRPC):
    """XML-RPC resource that cleans up HTML through HTML Tidy.

    The single remote method, ``tidify``, hands the blocking Tidy call to
    the reactor's thread pool and returns a Deferred for the result.
    """

    # Options passed to ``tidy.parseString`` on every request. Built once
    # at class level instead of on each call; it is only ever read.
    TIDY_OPTIONS = {
        'drop-proprietary-attributes': '1',
        'output-xhtml': '1',
        'wrap': '0',
        'bare': '0',
        'clean': '1',
        'doctype': 'omit',
        'show-body-only': '1',
        'word-2000': '0',
        'escape-cdata': '0',
        'hide-comments': '1',
        'force-output': '1',
        'alt-text': '',
        'show-errors': '0',
        'show-warnings': '0',
        'tidy-mark': '0',
        'char-encoding': 'utf8',
    }

    def xmlrpc_tidify(self, data):
        """Remote method: tidy ``data['html']`` without blocking the reactor.

        :param data: mapping sent by the client; must contain an ``'html'``
            key holding the markup (or ``None``).
        :return: Deferred firing with whatever :meth:`tidyParse` returns.
        """
        deferred = threads.deferToThread(self.tidyParse, data)
        deferred.addCallback(self.returnToClient)
        return deferred

    def tidyParse(self, data):
        """Run HTML Tidy over ``data['html']``; called in a worker thread.

        :param data: mapping with an ``'html'`` entry.
        :return: the tidied markup as a string, or ``None`` when the
            ``'html'`` entry is ``None``.
        :raises KeyError: if ``data`` has no ``'html'`` entry.
        """
        # The module-level ``import utidylib`` does not bind the name
        # ``tidy`` used here; uTidylib's importable module is ``tidy``.
        import tidy

        html = data['html']
        if html is None:
            return None

        # Encode explicitly rather than leaning on the process-wide
        # default-encoding override; Tidy is told 'char-encoding': 'utf8'.
        htmldata = html.encode('utf-8')
        print("Tidy start")
        document = tidy.parseString(htmldata, **self.TIDY_OPTIONS)
        # Hand back the serialised markup instead of the Tidy document
        # object: the XML-RPC layer marshals plain strings, and no
        # reference to the native document outlives this call.
        return str(document)

    def returnToClient(self, data):
        """Deferred callback: force a GC pass, log, and pass ``data`` on.

        :param data: result produced by :meth:`tidyParse`.
        :return: ``data`` unchanged.
        """
        gc.collect()
        print("Tidy end, retunring result")
        return data
        
if __name__ == '__main__':
    # Publish the Tidy XML-RPC resource over HTTP on TCP port 1100 and
    # then hand control to the reactor's event loop.
    site = server.Site(TidyProtocol())
    reactor.listenTCP(1100, site)
    reactor.run()





More information about the Twisted-Python mailing list