Skip to content

Instantly share code, notes, and snippets.

@cdunklau
Last active September 12, 2015 16:00
Show Gist options
  • Save cdunklau/70bc823c5f60cdf046ec to your computer and use it in GitHub Desktop.
Save cdunklau/70bc823c5f60cdf046ec to your computer and use it in GitHub Desktop.
Basic URL processor
import sys
import itertools
import datetime
from twisted.internet import task, defer
from twisted.python import log
import treq
def main(reactor, urls, chunksize):
    """Entry point handed to ``task.react``.

    Builds a ``URLValidator`` over *urls* and returns the Deferred produced
    by its ``run()`` method. *reactor* is accepted but not used here.
    """
    return URLValidator(urls, chunksize).run()
class FailedResponse(Exception):
    """Raised when a response comes back with a status code other than 200."""
def urlgen(fname):
    """Yield a favicon URL for every ``rank,domain`` line in the file *fname*.

    Each line is split on its first comma; the text after it is taken as the
    domain and turned into ``http://<domain>/favicon.ico``.

    Lines that carry no domain (blank lines, lines without a comma, or an
    empty field after the comma) are skipped rather than producing the
    unusable URL ``http:///favicon.ico``. Whitespace around the domain is
    removed so it cannot end up inside the URL.

    The file is read lazily and stays open until the generator is exhausted
    or closed.
    """
    with open(fname) as f:
        for line in f:
            # partition() never raises: with no comma the third item is ''.
            domain = line.partition(',')[2].strip()
            if not domain:
                continue
            yield 'http://{0}/favicon.ico'.format(domain)
class URLValidator(object):
    """Request URLs chunk by chunk and log which ones answer with HTTP 200.

    URLs are pulled lazily from the supplied iterable, ``chunksize`` at a
    time. All requests of one chunk are gathered in a ``DeferredList``; the
    next chunk is started from that list's callback. When the iterable is
    exhausted the Deferred returned by ``run()`` fires with ``None``.
    """

    def __init__(self, urls, chunksize):
        """Prepare a validator.

        :param urls: iterable of URL strings; consumed one chunk at a time.
        :param chunksize: maximum number of URLs requested per chunk.
        """
        self.urls = iter(urls)
        self.chunksize = chunksize
        # Completion signal handed back to the caller of run().
        self.d = defer.Deferred()
        # Start time used for the throughput figures in log_progress().
        self.created = datetime.datetime.now()
        # Running count of URLs whose request has completed.
        self.processed = 0

    def run(self):
        """Start the first chunk and return the completion Deferred."""
        self.process_chunk()
        return self.d

    def finished(self):
        """Fire the completion Deferred with ``None``."""
        self.d.callback(None)

    def process_chunk(self, ignored=None):
        """Issue requests for the next chunk, or finish when none are left.

        :param ignored: unused; present so the method can serve as a
            Deferred callback.
        """
        chunk = list(itertools.islice(self.urls, self.chunksize))
        log.msg('Processing next chunk')
        if chunk:
            d = defer.DeferredList([self.request(url) for url in chunk])
            d.addCallback(self.log_progress)
            d.addCallback(self.process_chunk)
            # Without this, an exception raised while logging progress or
            # while starting the next chunk would be stranded on this
            # internal Deferred and self.d would never fire.
            d.addErrback(self.d.errback)
        else:
            # out of urls
            log.msg('No more URLs available')
            self.finished()

    def log_progress(self, ignored):
        """Update the processed counter and log throughput so far.

        :param ignored: despite its name, the result list of the chunk's
            ``DeferredList`` when used as its callback; only its length is
            used, so a short final chunk is counted correctly. ``None``
            falls back to counting a full ``chunksize``.
        """
        completed = self.chunksize if ignored is None else len(ignored)
        processed, seconds, rate = self._update_progress(completed)
        log.msg('Processed {0} urls in {1} seconds, {2} urls/s'.format(
            processed, seconds, rate))

    def _update_progress(self, completed):
        """Add *completed* to the counter; return (processed, seconds, rate).

        ``rate`` is reported as ``0.0`` when no time has elapsed, instead of
        dividing by zero.
        """
        self.processed += completed
        seconds = (datetime.datetime.now() - self.created).total_seconds()
        rate = self.processed / seconds if seconds > 0 else 0.0
        return self.processed, seconds, rate

    def request(self, url):
        """GET *url* and log the outcome; return the request's Deferred.

        A failure from the request itself or from ``validate_response`` is
        routed to ``log_failed``; a validated response goes to ``log_url``.
        """
        # NOTE(review): the response body is never read here - confirm
        # whether the underlying connection is released without doing so.
        d = treq.get(url)
        d.addCallback(self.validate_response, url)
        d.addCallbacks(self.log_url, self.log_failed,
                       callbackArgs=(url,), errbackArgs=(url,))
        return d

    def validate_response(self, response, url):
        """Return *response* if its status code is 200.

        :raises FailedResponse: for any other status code.
        """
        if response.code != 200:
            raise FailedResponse
        return response

    def log_url(self, response, url):
        """Log *url* as good."""
        log.msg('Good URL: {0}'.format(url))

    def log_failed(self, failure, url):
        """Log *url* as failed; the failure is not propagated further."""
        log.msg('Failed on URL: {0}'.format(url))
# Script entry: <domainrankcsv> is passed to urlgen(), <chunksize> is the
# number of URLs requested per chunk.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print('Usage: {0} <domainrankcsv> <chunksize>'.format(sys.argv[0]))
        sys.exit(1)
    try:
        chunksize = int(sys.argv[2])
    except ValueError:
        chunksize = None
    # A chunk size below 1 cannot make progress: 0 yields an empty first
    # chunk (the run ends at once) and a negative value is rejected by
    # itertools.islice.
    if chunksize is None or chunksize < 1:
        print('chunksize must be a positive integer, got {0!r}'.format(
            sys.argv[2]))
        sys.exit(1)
    log.startLogging(sys.stderr)
    args = (urlgen(sys.argv[1]), chunksize)
    task.react(main, args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment