Generate urllib2 vs. urllib3 benchmark results, as described here: http://attilaolah.eu/2010/08/08/urllib-benchmark/
import csv
import random
import re
import time
import urllib2
import urllib3
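
# Regular expressions used below for URL canonization and link extraction.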
any_protocol = re.compile('^[^#^\\?^/]+?:')
http_link = re.compile(r'^https?://', re.I)
http_naked = re.compile(r'^https?://[^/]+$', re.I)
http_parts = re.compile(r'(^https?://[^/]+)(.*)$', re.I)
http_bad_url = re.compile('https?://[^/]+[#\\?]', re.I)
http_bad_url_split = re.compile('([#\\?])')
http_dots = re.compile(r'/[^/]+/\.\./')
http_dots_start = re.compile(r'^/\.\./')
http_dots_end = re.compile(r'(/\.\./?)$')
http_dot = re.compile(r'/\./')
http_dot_end = re.compile(r'(/\./?)$')
http_slashes = re.compile(r'/+')
html_ignore = re.compile(
    r'(<script.*?>.+?</script>)|(<!--.+?-->)',
    re.I | re.S | re.M,
)
rx_href = re.compile("""
    href = (
        ("(.*?)")    # double quotes
        |
        ('(.*?)')    # single quotes
        |
        ([^ >]*)     # no quotes
    )
""", re.VERBOSE | re.IGNORECASE)
rx_js = re.compile(r"""
    (
        location (      # [window.]location
            |
            \.href      # [.href]
            |
            \.pathname  # [.pathname]
        )
    )
    \W*
    =
    \W*
    (
        ("(.*?)")       # double quotes
        |
        ('(.*?)')       # single quotes
    )
""", re.VERBOSE | re.MULTILINE | re.DOTALL)
rx_frame = re.compile(r'(<frame .*?src=.*?>)', re.I | re.S | re.M)
rx_frame_url = re.compile("""
    ^
    <frame
    .*?
    src = (
        ("(.*?)")    # double quotes
        |
        ('(.*?)')    # single quotes
        |
        ([^ >]*)     # no quotes
    )
    .*?
    >
    $
""", re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
rx_meta = re.compile(
    r'(<meta [^>]*http-equiv=[\'"]refresh[\'"].*?>)',
    re.I | re.S | re.M,
)
rx_meta_url = re.compile(r"""
    ^
    <meta
    .*?
    content
    =
    ['"]
    ([\d]+;\s*url=)?(.*?)
    ['"]
    .*?
    >
    $
""", re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
class URL(object):
    """Handle various URL-based tasks."""

    pool = None

    @staticmethod
    def canonize_url(crawled_url, extracted_link, strip_hash=True, **kw):
"""Canonize URLs for """ | |
        ## Part 1: filter out unsupported links
        http_matched = False  # cache the match result for later
        if any_protocol.match(extracted_link):
            if not http_link.match(extracted_link):
                return None
            http_matched = True  # change the cached result
        ## Part 2: construct an *absolute* URL
        # Check if the url is absolute:
        if http_matched:  # use the cached result instead of matching again
            link = extracted_link
        # Check for in-host absolute paths:
        elif extracted_link.startswith('/'):
            # assume crawled_url starts with "http[s]://"
            start = http_parts.match(crawled_url).group(1)
            link = start + extracted_link
        # Handle in-page (hash-only) URLs
        elif extracted_link.startswith('#'):
            link = crawled_url + extracted_link
        # For relative urls, just append to the current path
        else:
            # assume crawled_url starts with "http[s]://"
            if http_naked.match(crawled_url):
                start = crawled_url
            else:
                start = crawled_url.rsplit('/', 1)[0]
            link = start + '/' + extracted_link
        ## Part 3: try to fix some bad URLs
        if http_bad_url.match(link):
            # add a slash after the domain
            link = re.sub(http_bad_url_split, '/\\1', link, 1)
        ## Part 4: more canonization
        # http://foo.br/bar/baz/.[/] -> http://foo.br/bar/baz/
        while http_dot_end.search(link):
            link = re.sub(http_dot_end, '/', link)
        # http://foo.br/bar/baz/..[/] -> http://foo.br/bar/
        while http_dots_end.search(link):
            link = re.sub(http_dots_end, '/', link)
        # http://foo.br/bar/../baz -> http://foo.br/baz
        host, path = http_parts.match(link).groups()
        # Lower-case the host
        host = host.lower()
        # check the domain
        if kw.get('domain_ends_with'):
            if not host.split('//')[1].endswith(kw['domain_ends_with'].lower()):
                # Oops, the url goes outside the allowed domain!
                return None
        if kw.get('domain'):
            if host.split('//')[1] != kw['domain'].lower():
                # Oops, the url goes outside the domain!
                return None
        while http_dots.search(path):
            path = re.sub(http_dots, '/', path)
        # http://foo.br/../bar -> http://foo.br/bar
        while http_dots_start.search(path):
            path = re.sub(http_dots_start, '/', path)
        # Strip multiple slashes and dot-slashes
        path = re.sub(http_dot, '/', path)
        path = re.sub(http_slashes, '/', path)
        # http://foo.br -> http://foo.br/
        path = path or '/'
        # Combine the host and path again
        link = host + path
        ## Part 5: stripping the hash
        # Strip the hash?
        return not strip_hash and link or link.split('#', 1)[0]

    @staticmethod
    def extract_links(html):
        """Extract various links."""
        # JavaScript redirects: location[.href|.pathname] = "..."
        for link in rx_js.findall(html):
            yield link[-3] or link[-1] or ''
        # Ignore scripts and HTML comments from here on
        html = html_ignore.sub('', html)
        # Frame sources
        for frame in rx_frame.findall(html):
            groups = rx_frame_url.match(frame).groups()
            yield groups[2] or groups[4] or groups[5] or ''
        # Meta refresh redirects
        for meta in rx_meta.findall(html):
            yield rx_meta_url.match(meta).groups()[1]
        # Plain href="..." attributes
        for link in rx_href.findall(html):
            yield link[2] or link[4] or link[5] or ''

    def download_url(self, url, urllib):
        """Download the URL."""
        try:
            if urllib == urllib2:
                request = urllib2.Request(url)
                response = urllib2.urlopen(request)
                return response.read()
        except urllib2.HTTPError:
            return ''
        # Fall through to urllib3, reusing the connection pool across calls
        self.pool = self.pool or urllib3.connection_from_url(url)
        return self.pool.get_url(url).data

if __name__ == '__main__':
    # Do some testing
    crawled_url, domain = 'http://theoatmeal.com/', 'theoatmeal.com'
    queue, visited = set(), set()
    url = URL()
    print 'Starting urllib2/urllib3 benchmark...'
    urllibs = [urllib2, urllib3]
    benchmark = []
    csv_file = open('results.csv', 'w')
    writer = csv.writer(csv_file)
    try:
        while True:
            print ' * crawling:',
            print crawled_url
            random.shuffle(urllibs)
            times = [0, 0]
            for lib in urllibs:
                start = time.time()
                html = url.download_url(crawled_url, urllib=lib)
                duration = time.time() - start
                times[lib is urllib3 and 1 or 0] = duration
            benchmark.append(times)
            writer.writerow(times)
            csv_file.flush()
            visited.add(crawled_url)
            extracted_links = [x for x in url.extract_links(html)]
            valid_links = [url.canonize_url(
                crawled_url,
                link,
                domain=domain,
            ) for link in extracted_links]
            valid_links = set(filter(None, valid_links))
            queue.update(valid_links)
            empty = False
            try:
                next_link = queue.pop()
            except KeyError:
                break
            while next_link in visited:
                try:
                    next_link = queue.pop()
                except KeyError:
                    empty = True
                    break
            if empty:
                break
            crawled_url = next_link
    except KeyboardInterrupt:
        pass
    finally:
        csv_file.close()
    print 'Finishing benchmark, writing results to file `results.csv`'
    print 'Total times:'
    print ' * urllib2:',
    print sum(x[0] for x in benchmark)
    print ' * urllib3:',
    print sum(x[1] for x in benchmark)