from twisted.internet import reactor, protocol, defer
from twisted.web import client
import feedparser, time, out

# The site list comes from the local "out" module
rss_feeds = out.rss_feed

# This is the default site list
#rss_feeds = [('http://www.nongnu.org/straw/news.rss','straw'),
#             ('http://googlenews.74d.com/rss/google_it.rss','google'),
#             ('http://www.pythonware.com/daily/rss.xml','pythonware'),
#             ('http://www.theinquirer.net/inquirer.rss','inq'),
#             ('http://www.groklaw.net/backend/GrokLaw.rdf','grok'),
#             ('http://www.livejournal.com/users/moshez/data/rss','zadka'),
#             ('http://www.pythonware.com/news.rdf','pwn')]

# michele@berthold.com
INTER_QUERY_TIME = 300


class FeederProtocol(object):
    def __init__(self):
        self.parsed = 0
        # This dict structure will be the following:
        # { 'URL': (TIMESTAMP, value) }
        self.cache = {}

    def gotError(self, data=None, extra_args=None):
        # An error has occurred: print the failure and go on
        print data
        self.parsed += 1
        print "="*20
        print "Trying to go on..."

    def getFeeds(self, where=None):
        #print "getting feeds"
        # This is to get the feeds we want
        if not where:
            # We don't have a database, so we use the local
            # variable rss_feeds
            return rss_feeds
        else:
            return None

    def memoize(self, feed, site=None, extra=None):
        # site is the address of the feed, i.e. the first element of each
        # tuple of rss_feeds; extra is the second element of each tuple
        print "Memoizing", site, "..."
        # Store (or refresh) the parsed feed together with the current
        # timestamp, so a re-downloaded feed replaces the stale cached copy
        self.cache[site] = (time.time(), feed)
        return feed

    def stopWorking(self, data=None):
        print "Closing connection number %d..." % (self.parsed,)
        print "-"*20
        # This is here only for testing. When a protocol/interface is
        # created to communicate with this rss-aggregator server, we won't
        # need to die after parsing the feeds just one time.
        self.parsed += 1
        if self.parsed >= len(rss_feeds):
            print "Closing all..."
            #for i in self.cache:
            #    print i
            print time.time() - tp
            #reactor.stop()

    def getPageFromMemory(self, key=None):
        #print "getting from memory"
        # Get the second element of the tuple, which is the parsed
        # structure of the feed at address key; the first element of the
        # tuple is the timestamp
        d = defer.succeed(self.cache.get(key, key)[1])
        return d

    def parseFeed(self, feed):
        # This is self-explanatory :)
        return feedparser.parse(feed)

    def startDownloading(self, site):
        #print "Looking if",site[0],"cached...",
        # Try to get the tuple (TIMESTAMP, FEED_STRUCT) from the dict if it
        # has already been downloaded, otherwise already_got is None
        already_got = self.cache.get(site[0], None)

        # Ok, we have it cached, let's see what we will do with it
        if already_got:
            # Well, it's cached, but is it recent enough?
            #print "It is\n Looking if timestamp for",site[0],"is recent enough...",
            elapsed_time = time.time() - already_got[0]

            # elapsed_time is less than INTER_QUERY_TIME, so the cached
            # copy is recent enough and we can get the page from memory
            if elapsed_time < INTER_QUERY_TIME:
                #print "It is"
                return self.getPageFromMemory(site[0])
            else:
                # It's a bit old, so get it from the Net again, parse it
                # and then memoize it again
                #print "Getting",site[0],"from the Net because old"
                return self.downloadPage(site)
        else:
            # We don't have it cached, so we need to get it from the Net;
            # it's useless to check whether it's recent enough, it's simply
            # not there
            #print "Getting",site[0],"from the Net"
            return self.downloadPage(site)

    def downloadPage(self, site):
        #print "Now downloading..."
        # Download the page at site[0] (the feed url)
        d = client.getPage(site[0])

        # Uncomment the following if you want to make everything crash :).
        # It would save the feed to a file, but with the memoize feature it
        # breaks the get-->parse-->memoize chain
        #d = client.downloadPage(site[0],site[1])

        # Parse the feed and, if there are errors, call self.gotError
        d.addCallbacks(self.parseFeed, self.gotError)

        # Now memoize it and, if there are errors, call self.gotError
        d.addCallbacks(self.memoize, self.gotError, site)
        return d

    def workOnPage(self, parsed_feed=None, site=None, extra_args=None,
                   extra_key=None):
        print "-"*20
        #print "finished retrieving"
        print "Feed Version:", parsed_feed.get('version', 'Unknown')

        #
        # Uncomment the following if you want to print the feeds
        #
        chan = parsed_feed.get('channel', None)
        if chan:
            print chan.get('title', '')
            #print chan.get('link', '')
            #print chan.get('tagline', '')
            #print chan.get('description','')
        print "-"*20

        #items = parsed_feed.get('items', None)
        #if items:
        #    for item in items:
        #        print '\tTitle: ', item.get('title','')
        #        print '\tDate: ', item.get('date', '')
        #        print '\tLink: ', item.get('link', '')
        #        print '\tDescription: ', item.get('description', '')
        #        print '\tSummary: ', item.get('summary','')
        #        print "-"*20

        #print "got",site
        #print "="*40

    def start(self, data=None):
        # Here we gather all the urls of the feeds
        #self.factory.tries += 1
        for feed in self.getFeeds():
            # Now we start telling the reactor that it has
            # to get all the feeds one by one...
            d = self.startDownloading(feed)

            # Then it will pass the result of startDownloading to
            # workOnPage (the chaining is handled by Twisted), together
            # with the feed url and name (just to have some extra info
            # available in the workOnPage method)
            d.addCallbacks(self.workOnPage, self.gotError, feed)

            # At the end of each callback chain we put stopWorking, which
            # counts the handled feeds and tells us when the last one is
            # done
            d.addCallbacks(self.stopWorking, self.gotError)

            # This is to try the memoize feature
            #if self.factory.tries < 3:
            #    d.addCallback(self.start)


class FeederFactory(protocol.ClientFactory):
    protocol = FeederProtocol()

    def __init__(self):
        # tries is used to make more passes over the feeds in order to
        # exercise the memoizing feature
        #self.tries = 0

        # Give the FeederProtocol instance a reference to the
        # FeederFactory under the name self.factory (as seen from the
        # protocol), then kick off the downloads
        self.protocol.factory = self
        self.protocol.start()


f = FeederFactory()
tp = time.time()
reactor.run()
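
# A sketch of how the memoize feature could be exercised (not part of the
# original program; the 10-second delay is an arbitrary assumption).
# Scheduling a second start() pass with reactor.callLater() before calling
# reactor.run() makes that pass happen well within INTER_QUERY_TIME, so
# every feed should then be served by getPageFromMemory() from self.cache
# instead of being downloaded again:
#
#   f = FeederFactory()
#   tp = time.time()
#   reactor.callLater(10, f.protocol.start)
#   reactor.run()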
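
# A minimal sketch of the "out" module this script imports (an assumption,
# since out.py is not shown here): judging from the commented-out default
# list at the top and from "rss_feeds = out.rss_feed", it only needs to
# expose an attribute named rss_feed holding a list of (url, name) tuples.
# A hypothetical out.py could be as small as:
#
#   rss_feed = [('http://www.pythonware.com/daily/rss.xml', 'pythonware'),
#               ('http://www.theinquirer.net/inquirer.rss', 'inq')]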