kwellman/gist:632478

## gistfile1.py
"""Quick and dirty benchmarking for readability functions.
"""
import re, time, os, json
from urllib import urlopen
from hn import grabContent
from lxml_readability import extract

import socket
socket.setdefaulttimeout(30)

FILES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'files')

if not os.path.exists(FILES_DIR):
    os.mkdir(FILES_DIR)

def get_test_pages(num_hn_pages=2, max_links=99, sleep_time=3):
    """Fetch pages from Hacker News.
    """
    url = 'http://news.ycombinator.com/'
    num_links = 0

    for i in range(num_hn_pages):
        print 'Fetching %s...' % url
        hn_html = urlopen(url).read()

        for match in re.findall(r'<a href="([^"]+?)">[^<]+?</a><span class="comhead">', hn_html):
            print 'Link page: %s' % match

            # fetch link page
            try:
                link_html = urlopen(match).read()
            except IOError:
                continue
            open(os.path.join(FILES_DIR, '%s.html' % num_links), 'w').write(link_html)

            num_links += 1
            if num_links >= max_links:
                return

        next_page = re.findall(r'<a href="([^"]+?)" rel="nofollow">More</a>', hn_html)[0]
        url = 'http://news.ycombinator.com' + next_page

        # be nice
        time.sleep(sleep_time)

def run_benchmarks(funcs=[grabContent, extract]):
    contents = []
    all_results = []

    for filename in os.listdir(FILES_DIR):
        contents.append(open(os.path.join(FILES_DIR, filename)).read())

    for func in funcs:
        results = []
        for content in contents:
            print '.'
            t1 = time.time()
            # use a dummy link (example.com) because it doesn't matter
            excerpt = func('http://example.com/', content)
            t2 = time.time()
            results.append(t2-t1)

        all_results.append(results)

    open('speeds.json', 'w').write(json.dumps(all_results))

if __name__ == '__main__':
    get_test_pages(3)
    run_benchmarks()
	"""Quick and dirty benchmarking for readability functions.
	"""
	import re, time, os, json
	from urllib import urlopen
	from hn import grabContent
	from lxml_readability import extract

	import socket
	socket.setdefaulttimeout(30)

	FILES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'files')

	if not os.path.exists(FILES_DIR):
	os.mkdir(FILES_DIR)

	def get_test_pages(num_hn_pages=2, max_links=99, sleep_time=3):
	"""Fetch pages from Hacker News.
	"""
	url = 'http://news.ycombinator.com/'
	num_links = 0

	for i in range(num_hn_pages):
	print 'Fetching %s...' % url
	hn_html = urlopen(url).read()

	for match in re.findall(r'<a href="([^"]+?)">[^<]+?</a><span class="comhead">', hn_html):
	print 'Link page: %s' % match

	# fetch link page
	try:
	link_html = urlopen(match).read()
	except IOError:
	continue
	open(os.path.join(FILES_DIR, '%s.html' % num_links), 'w').write(link_html)

	num_links += 1
	if num_links >= max_links:
	return

	next_page = re.findall(r'<a href="([^"]+?)" rel="nofollow">More</a>', hn_html)[0]
	url = 'http://news.ycombinator.com' + next_page

	# be nice
	time.sleep(sleep_time)

	def run_benchmarks(funcs=[grabContent, extract]):
	contents = []
	all_results = []

	for filename in os.listdir(FILES_DIR):
	contents.append(open(os.path.join(FILES_DIR, filename)).read())

	for func in funcs:
	results = []
	for content in contents:
	print '.'
	t1 = time.time()
	# use a dummy link (example.com) because it doesn't matter
	excerpt = func('http://example.com/', content)
	t2 = time.time()
	results.append(t2-t1)

	all_results.append(results)

	open('speeds.json', 'w').write(json.dumps(all_results))

	if __name__ == '__main__':
	get_test_pages(3)
	run_benchmarks()