# jeffh/hw2_test.py

## hw2_test.py
import urllib.request
import sys
from io import BytesIO, StringIO

try:
    import huij as hw2
except ImportError:
    print("Could not import homework. Change the import statement on line 7 from 'huij' to your python homework file.")
    sys.exit(1)

def mock_urlopen(**kwargs):
    """Build a ``Mockify`` patcher that replaces ``urllib.request.urlopen``.

    Keyword arguments are forwarded to ``Mockify`` unchanged, so callers
    supply ``return_value=...`` or ``side_effect=...``.  The replacement is
    only installed while the returned object is used in a ``with`` block.
    """
    # Local stand-in for mock.patch.object(urllib.request, 'urlopen').
    patcher = Mockify(urllib.request, 'urlopen', **kwargs)
    return patcher

def test_crawlURL_single_link():
    # crawlURL on a page with a single anchor.  The three result slots are
    # compared with: the length of the page text, the (href, link text)
    # pairs, and the word -> count mapping.
    links_wanted = [('http://google.com/index.php', "A bad page")]
    counts_wanted = dict(a=1, bad=1, page=1)
    fake_response = BytesIO(SAMPLE_PAGE2.encode('utf-8'))
    with mock_urlopen(return_value=fake_response):
        outcome = hw2.crawlURL('http://cs.strose.edu/goldschd2/')
        assert outcome[0] == len(SAMPLE_PAGE2)
        assert outcome[1] == links_wanted
        assert outcome[2] == counts_wanted

def test_crawlURL_with_multiple_links():
    # Page with two anchors.  Two tokenisations of "Here's" are accepted:
    # split at the apostrophe ("here" + "s") or kept whole ("here's").
    shared_counts = {'a': 2, 'link': 1, 'to': 1, 'google': 1, 'location': 1}
    counts_split = dict(shared_counts, here=1, s=1)
    counts_whole = dict(shared_counts)
    counts_whole["here's"] = 1
    links_wanted = [
        ('http://google.com/', "Here's a link to google"),
        ('/absolute/relative/', 'a Location'),
    ]
    with mock_urlopen(return_value=BytesIO(SAMPLE_PAGE.encode('utf-8'))):
        outcome = hw2.crawlURL('http://cs.strose.edu/goldschd/')
        assert outcome[0] == len(SAMPLE_PAGE)
        assert outcome[1] == links_wanted
        assert outcome[2] == counts_split or outcome[2] == counts_whole

def test_crawlURL_various_anchors():
    # Anchors in awkward shapes: an image-only body, nested tags inside the
    # link text, extra attributes before and after href, whitespace inside
    # the tag and the text, and a <b> that is never closed.
    # The literal is not raw, so \n and \t below are real newlines / tabs.
    markup = """
<a href="/foo/bar/"><img src="Foobar.jpeg" /></a>
<a href="/foo/bar/">Dup<span>licate</span></a>
<a href="foo.php"><i>LOL</i></a><a href="page1.html">Click here</a>
<a href="page2.html" otherstuff="..." color="green">Also here</a>
<a color="blue" otherstuff="..." href="http://cnn.com/another-page.html">Click here</a>
<a\n \tfoo="bar" href="page9001.html">It's over 9000!</a>
<a href="page9999.html">\nParty like \nIt's\n 9999!</a>
<a href="p.php"><b>click here</a>
"""
    # Expected (href, link text) pairs in document order; hrefs are compared
    # exactly as written in the markup.
    links_wanted = [
        ('/foo/bar/', ''),
        ('/foo/bar/', 'Duplicate'),
        ('foo.php', 'LOL'),
        ('page1.html', 'Click here'),
        ('page2.html', 'Also here'),
        ('http://cnn.com/another-page.html', 'Click here'),
        ('page9001.html', "It's over 9000!"),
        ('page9999.html', "\nParty like \nIt's\n 9999!"),
        ('p.php', 'click here'),
    ]
    fake_response = BytesIO(markup.encode('utf-8'))
    with mock_urlopen(return_value=fake_response):
        outcome = hw2.crawlURL('http://google.com')
        assert outcome[0] == len(markup)
        assert outcome[1] == links_wanted

def test_crawlSite_with_link_cycles():
    # The fake site contains a cycle (page 2 <-> page 3) and a self-link on
    # page 1.  Pages are handed out strictly in list order, one per urlopen
    # call, whatever URL was requested.
    pending_pages = [
        '''<a href="page1.html">The first page</a>
<a href="mailto:lol@fake.com">FOO</a>
<a href="git://github.com/jeffh/YACS.git">BAR</a>
        <a href="page2.html">The second page</a>''',  # root
        '<a href="page1.html">The first page</a>',  # page 1
        '<a href="page3.html">The third page</a>',  # page 2
        '<a href="page2.html">The third page</a>',  # page 3
    ]
    # Measured up front: serving a page removes it from the list.
    size_wanted = sum(map(len, pending_pages))
    links_wanted = [
        ('http://google.com/page1.html', 'The first page'),
        ('http://google.com/page2.html', 'The second page'),
        ('http://google.com/page3.html', 'The third page'),
    ]
    counts_wanted = dict(
        the=5, first=2, second=1, third=2, page=5, foo=1, bar=1,
    )

    def serve_next(*_args, **_kwargs):
        raw = pending_pages.pop(0)
        return BytesIO(raw.encode('utf-8'))

    with mock_urlopen(side_effect=serve_next):
        outcome = hw2.crawlSite('http://google.com/', politeness=0)
        assert outcome[0] == size_wanted
        assert outcome[1] == links_wanted
        assert outcome[2] == counts_wanted

def test_analyzeStats():
    # analyzeStats prints its report; the text is captured by swapping
    # sys.stdout and compared line by line with the expected layout.
    total_bytes = 1073
    urls = [
        ('http://cs.strose.edu/page1.html', 'Click here'),
        ('http://cs.strose.edu/page2.html', 'Also here'),
        ('http://cs.strose.edu/page2.html', 'Good stuff here'),
        ('http://cs.strose.edu/longresourcename.html', 'Check this out'),
    ]
    wordcounts = {
        'and': 277,
        'of': 286,
        'the': 251,
        'longestwordfound': 186,
        'goldschmidt': 247,
    }
    # Columns are padded to the longest URL / word; word counts are listed
    # from highest to lowest.
    expected = """total pages crawled successfully: 3
total words: 1247

URLs and Link-Text:
-------------------
http://cs.strose.edu/page1.html             ==>  'Click here'
http://cs.strose.edu/page2.html             ==>  'Also here'
http://cs.strose.edu/page2.html             ==>  'Good stuff here'
http://cs.strose.edu/longresourcename.html  ==>  'Check this out'

Word Counts (total of 1247 words):
----------------------------------
of                ==>  286
and               ==>  277
the               ==>  251
goldschmidt       ==>  247
longestwordfound  ==>  186
"""
    captured = StringIO()
    saved_stdout, sys.stdout = sys.stdout, captured
    try:
        hw2.analyzeStats(total_bytes, urls, wordcounts)
    finally:
        # Put the previous stream back even when analyzeStats raises, so
        # later output is not swallowed by this test's private buffer.
        sys.stdout = saved_stdout
    assert captured.getvalue().split('\n') == expected.split('\n')


# HTML fixture served by the mocked urlopen in
# test_crawlURL_with_multiple_links.  It contains two anchors
# (http://google.com/ and /absolute/relative/); that test also compares the
# first result element with len(SAMPLE_PAGE).
SAMPLE_PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

  <head>
    <title> David E. Goldschmidt, Ph.D. </title>
    <meta http-equiv="content-type" content="text/html; charset=utf-8" />
    <meta name="Author" content="David E. Goldschmidt" />
    <meta name="Keywords" content="" />
    <meta http-equiv="Pragma" content="no-cache" />
    <meta http-equiv="Expires" content="-1" />

    <link rel="stylesheet" type="text/css" href="cssjs/goldschd.css" />
    <link rel="stylesheet" type="text/css" href="cssjs/goldschd-print.css" media="print" />

    <script type="text/javascript" src="cssjs/goldschd.js"></script>

  </head>

  <body>
   <div id="all">

   <h1> there is no spoon </h1>
   <a href="http://google.com/">Here's a link to google</a>
   <p>
       Blah Blah<a href="/absolute/relative/">a Location</a>LOL
    </p>
   </div><!-- end all -->
  </body>

</html>"""


# HTML fixture served by the mocked urlopen in test_crawlURL_single_link.
# It contains exactly one anchor (http://google.com/index.php); that test
# also compares the first result element with len(SAMPLE_PAGE2).
SAMPLE_PAGE2 = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

  <body>
   <div id="all">

   <h1> there is no spoon </h1>
   <a href="http://google.com/index.php">A bad page</a>
   </div><!-- end all -->
  </body>

</html>"""


#### BEGIN THE MICRO TESTING FRAMEWORK ####
import re
import sys
import traceback
from itertools import zip_longest
from pprint import pformat

class Mockify(object):
    """Minimal replacement for ``mock.patch.object``, used as a context manager.

    Inside the ``with`` block, attribute ``name`` of ``module`` is replaced by
    this instance.  Calling the instance delegates to ``side_effect`` when one
    was supplied, otherwise returns ``return_value``; with neither configured
    the call raises ``TypeError``.  The original attribute is put back when
    the block exits.
    """

    # Sentinel for "argument not supplied", so that None stays a usable
    # return_value / side_effect.
    NONE = object()

    def __init__(self, module, name, return_value=NONE, side_effect=NONE):
        self._mod, self._name = module, name
        self._return_value = None
        self._return_value_was_set = False
        self._side_effect = None
        self._side_effect_was_set = False
        # Identity check on purpose: `!=` would invoke the argument's own
        # __ne__/__eq__, which may claim equality with the sentinel or
        # return something that is not a plain bool.
        if return_value is not self.NONE:
            self.return_value = return_value
        if side_effect is not self.NONE:
            self.side_effect = side_effect

    @property
    def return_value(self):
        """Value handed back by a call when no side effect is configured."""
        return self._return_value

    @return_value.setter
    def return_value(self, value):
        self._return_value = value
        # Tracked separately so an explicit None counts as "configured".
        self._return_value_was_set = True

    @property
    def side_effect(self):
        """Callable that handles a call; takes precedence over return_value."""
        return self._side_effect

    @side_effect.setter
    def side_effect(self, value):
        self._side_effect = value
        self._side_effect_was_set = True

    def __call__(self, *args, **kwargs):
        if self._side_effect_was_set:
            return self._side_effect(*args, **kwargs)
        if self._return_value_was_set:
            return self.return_value
        raise TypeError("Mockify requires side_effect or return_value to be set")

    def __enter__(self):
        # Remember the real attribute so __exit__ can reinstall it.
        self._old = getattr(self._mod, self._name)
        setattr(self._mod, self._name, self)
        return self

    def __exit__(self, type, value, traceback):
        # Returns None, so an exception raised inside the block propagates.
        setattr(self._mod, self._name, self._old)

def most_recent_tb(tb):
    """Follow ``tb_next`` links from ``tb`` and return the last traceback entry.

    The last entry is the one whose ``tb_next`` is falsy; if ``tb`` has no
    successor it is returned itself.
    """
    innermost = tb
    while True:
        following = innermost.tb_next
        if not following:
            return innermost
        innermost = following

# Splits an assert expression around comparison / boolean operators.  An
# operator is only recognised with a non-word character (normally a space)
# on each side; it is captured so that re.split() keeps it in the result.
PARTS = re.compile(r'\W([!=]=|not\W+in|in|[><]=?|or|and)\W')
def extract_parts(assert_line):
    """Split the source text of an ``assert`` statement into operands and operators.

    Returns ``(operands, operators)``, two lists of source fragments.  The
    operands are the pieces between the operators matched by ``PARTS``, in
    order; an assert with no recognised operator gives ``([expression], [])``.

    A line that is not an ``assert`` statement gives ``([], [])``.  The old
    code dropped the first six characters of any line, so ``return x`` or
    ``raise X`` lines were turned into expressions for the caller to evaluate.
    """
    stripped = assert_line.strip()
    # \b keeps identifiers such as `assert_equal` or `assertion` out.
    if re.match(r'assert\b', stripped) is None:
        return [], []
    code = stripped[len('assert'):].strip()
    pieces = PARTS.split(code)
    if len(pieces) > 1:
        # re.split with one capture group alternates operand, operator, ...
        return pieces[0::2], pieces[1::2]
    return [code], []

def re_eval(frame):
    """Re-evaluate the operands of the source line ``frame`` is stopped on.

    The line text is taken from ``traceback.extract_stack(frame)``, split by
    ``extract_parts`` and each operand is passed to ``eval`` with the frame's
    builtins and globals merged as globals and the frame's locals as locals.

    Returns ``(parts, evaled, ops)``: operand source strings, one entry per
    operand holding its value, and the operators found between them.  When
    an operand cannot be evaluated, the exception it raised is stored in its
    place so the lists stay aligned and the failure report is still produced.
    When no source text is available the result is ``([], [], [])``.

    NOTE: operands are executed a second time here; an operand with side
    effects will repeat them.
    """
    code = traceback.extract_stack(frame)[-1][-1]
    if not code:
        # No source text for this frame: nothing to split or evaluate.
        return [], [], []
    parts, ops = extract_parts(code)
    all_globals = {}
    all_globals.update(frame.f_builtins)
    all_globals.update(frame.f_globals)
    evaled = []
    for p in parts:
        try:
            evaled.append(eval(p, all_globals, frame.f_locals))
        except Exception as exc:
            # This runs while a test failure is being reported; a second
            # exception here would abort the whole test run.
            evaled.append(exc)
    return parts, evaled, ops

def format_testname(func):
    """Return a display name for a test function.

    A non-empty docstring is used verbatim; otherwise the function name with
    its first ``len('test_')`` characters removed and underscores turned into
    spaces.
    """
    doc = func.__doc__
    if doc:
        return doc
    prefix_len = len('test_')
    bare_name = func.__name__[prefix_len:]
    return bare_name.replace('_', ' ')

def is_test(name, value):
    """Tell whether ``name`` begins with "test", ignoring case.

    ``value`` is accepted but not inspected.
    """
    lowered = name.lower()
    return lowered.startswith('test')

def tests_from_dict(vars):
    "Returns all test from a given dictionary."
    tests = []
    for name, value in tuple(vars.items()):
        if name.startswith('test_'):
            tests.append(value)
    return tests

def run_tests(tests, fail_fast=False):
    """Run every callable in ``tests``, print a report and exit the process.

    Each test runs with ``sys.stdout`` pointed at a private buffer; one
    progress character per test ('.' passed, 'F' failed) goes to the real
    stdout.  For a failing test the formatted traceback, the captured output
    and the re-evaluated operands of the failing line are kept for the final
    report.

    tests     -- sequence of zero-argument callables.
    fail_fast -- stop after the first failing test when true.

    Always ends with ``sys.exit``: status 0 when nothing failed, 1 otherwise.
    """
    print("Running {0} Tests:".format(len(tests)))
    errors = {}
    true_stdout = sys.stdout
    try:
        for test in tests:
            stdout = StringIO()
            sys.stdout = stdout
            try:
                test()
                true_stdout.write('.')
                true_stdout.flush()
            except Exception:
                # Format first, while the test's exception is the active one.
                report = traceback.format_exc()
                tb = most_recent_tb(sys.exc_info()[2])
                try:
                    _, values, ops = re_eval(tb.tb_frame)
                except Exception:
                    # The operand breakdown is a nicety; never let it hide
                    # the real failure or stop the remaining tests.
                    values, ops = [], []
                true_stdout.write('F')
                true_stdout.flush()
                errors[test] = (report, stdout.getvalue(), zip_longest(ops, values, fillvalue=''))
                if fail_fast:
                    break
    finally:
        # Restore the real stream however the loop ends (including
        # KeyboardInterrupt / SystemExit raised by a test).
        sys.stdout = true_stdout

    if not errors:
        print("\n\nNo Errors ^_^")
        sys.exit(0)
    print("\n")
    for test_name, (exc, stdout, tree) in errors.items():
        print("----- {0} - FAILED -----\n\n{1}\n{2}\n".format(
            format_testname(test_name),
            exc,
            '\n'.join(['%s\n%s' % (pformat(val, indent=4), op.strip()) for op, val in tree]),
        ))
        if stdout:
            print(":::STDOUT:::\n{0}".format(stdout))
    print("==== End Errors ====")
    sys.exit(1)

if __name__ == '__main__':
    # Script entry point: collect every test_* global and run it.  Passing
    # -f or --failfast on the command line stops at the first failing test.
    run_tests(
        tests_from_dict(globals()),
        fail_fast=any(flag in sys.argv for flag in ('-f', '--failfast')),
    )
	import urllib.request
	import sys
	from io import BytesIO, StringIO

	try:
	import huij as hw2
	except ImportError:
	print("Could not import homework. Change the import statement on line 7 from 'huij' to your python homework file.")
	sys.exit(1)

	def mock_urlopen(**kwargs):
		"""Build a ``Mockify`` patcher that replaces ``urllib.request.urlopen``.

		Keyword arguments (``return_value=...`` / ``side_effect=...``) are
		forwarded to ``Mockify`` unchanged.
		"""
		# Body indentation restored; it had been flattened to the def's level.
		# Local stand-in for mock.patch.object(urllib.request, 'urlopen').
		return Mockify(urllib.request, 'urlopen', **kwargs)

	def test_crawlURL_single_link():
		# Block structure restored (indentation had been stripped).
		# crawlURL on a page with a single anchor: the three result slots are
		# compared with the page length, the (href, link text) pairs and the
		# word -> count mapping.
		with mock_urlopen(return_value=BytesIO(SAMPLE_PAGE2.encode('utf-8'))):
			results = hw2.crawlURL('http://cs.strose.edu/goldschd2/')
			expected_urls = [
				('http://google.com/index.php', "A bad page"),
			]
			expected_words = {
				'a': 1,
				'bad': 1,
				'page': 1,
			}
			assert results[0] == len(SAMPLE_PAGE2)
			assert results[1] == expected_urls
			assert results[2] == expected_words

	def test_crawlURL_with_multiple_links():
		# Block structure restored (indentation had been stripped).
		# Page with two anchors; "Here's" may be counted split at the
		# apostrophe ("here" + "s") or as one word ("here's").
		with mock_urlopen(return_value=BytesIO(SAMPLE_PAGE.encode('utf-8'))):
			results = hw2.crawlURL('http://cs.strose.edu/goldschd/')
			expected_urls = [
				('http://google.com/', "Here's a link to google"),
				('/absolute/relative/', 'a Location'),
			]
			expected_words = {
				"here": 1,
				's': 1,
				'a': 2,
				'link': 1,
				'to': 1,
				'google': 1,
				'location': 1,
			}
			expected_words_alternative = {
				"here's": 1,
				'a': 2,
				'link': 1,
				'to': 1,
				'google': 1,
				'location': 1,
			}
			assert results[0] == len(SAMPLE_PAGE)
			assert results[1] == expected_urls
			assert results[2] == expected_words or results[2] == expected_words_alternative

	def test_crawlURL_various_anchors():
		# Block structure restored (indentation had been stripped).
		# Anchors in awkward shapes: image-only body, nested tags, extra
		# attributes around href, whitespace inside the tag and the text, and
		# an unclosed <b>.  The literal is not raw, so \n and \t are real
		# newlines / tabs.  Each line of the literal starts with a tab; that
		# sits outside every anchor and does not affect the expected pairs.
		page = """
	<a href="/foo/bar/"><img src="Foobar.jpeg" /></a>
	<a href="/foo/bar/">Dup<span>licate</span></a>
	<a href="foo.php"><i>LOL</i></a><a href="page1.html">Click here</a>
	<a href="page2.html" otherstuff="..." color="green">Also here</a>
	<a color="blue" otherstuff="..." href="http://cnn.com/another-page.html">Click here</a>
	<a\n \tfoo="bar" href="page9001.html">It's over 9000!</a>
	<a href="page9999.html">\nParty like \nIt's\n 9999!</a>
	<a href="p.php"><b>click here</a>
	"""
		with mock_urlopen(return_value=BytesIO(page.encode('utf-8'))):
			results = hw2.crawlURL('http://google.com')
			expected_urls = [
				('/foo/bar/', ''),
				('/foo/bar/', 'Duplicate'),
				('foo.php', 'LOL'),
				('page1.html', 'Click here'),
				('page2.html', 'Also here'),
				('http://cnn.com/another-page.html', 'Click here'),
				('page9001.html', "It's over 9000!"),
				('page9999.html', "\nParty like \nIt's\n 9999!"),
				('p.php', 'click here'),
			]
			assert results[0] == len(page)
			assert results[1] == expected_urls

	def test_crawlSite_with_link_cycles():
		# Block structure restored (indentation had been stripped).
		# The fake site has a cycle (page 2 <-> page 3) and a self-link on
		# page 1; pages are handed out in list order, one per urlopen call.
		site = [
			'''<a href="page1.html">The first page</a>
	<a href="mailto:lol@fake.com">FOO</a>
	<a href="git://github.com/jeffh/YACS.git">BAR</a>
	<a href="page2.html">The second page</a>''', # root
			'<a href="page1.html">The first page</a>', # page 1
			'<a href="page3.html">The third page</a>', # page 2
			'<a href="page2.html">The third page</a>', # page 3
		]
		# Measured up front: serving a page removes it from the list.
		total_bytes = sum(len(x) for x in site)

		# Signature repaired: the asterisks had been dropped, leaving
		# `(args, *kwargs)`, which rejects keyword arguments and requires at
		# least one positional argument.
		def next_page(*args, **kwargs):
			return BytesIO(site.pop(0).encode('utf-8'))

		with mock_urlopen(side_effect=next_page):
			results = hw2.crawlSite('http://google.com/', politeness=0)
			expected_urls = [
				('http://google.com/page1.html', 'The first page'),
				('http://google.com/page2.html', 'The second page'),
				('http://google.com/page3.html', 'The third page'),
			]
			expected_words = {
				'the': 5,
				'first': 2,
				'second': 1,
				'third': 2,
				'page': 5,
				'foo': 1,
				'bar': 1,
			}
			assert results[0] == total_bytes
			assert results[1] == expected_urls
			assert results[2] == expected_words

	def test_analyzeStats():
		# Block structure restored (indentation had been stripped).
		# analyzeStats prints its report; the text is captured by swapping
		# sys.stdout and compared line by line with the expected layout.
		total_bytes = 1073
		urls = [
			('http://cs.strose.edu/page1.html', 'Click here'),
			('http://cs.strose.edu/page2.html', 'Also here'),
			('http://cs.strose.edu/page2.html', 'Good stuff here'),
			('http://cs.strose.edu/longresourcename.html', 'Check this out'),
		]
		wordcounts = {
			'and': 277,
			'of': 286,
			'the': 251,
			'longestwordfound': 186,
			'goldschmidt': 247,
		}
		# Expected text rebuilt line by line: the column padding (runs of
		# spaces) had been collapsed and a tab prepended to every line.
		# Columns are padded to the longest URL / word.
		expected = (
			"total pages crawled successfully: 3\n"
			"total words: 1247\n"
			"\n"
			"URLs and Link-Text:\n"
			"-------------------\n"
			"http://cs.strose.edu/page1.html             ==>  'Click here'\n"
			"http://cs.strose.edu/page2.html             ==>  'Also here'\n"
			"http://cs.strose.edu/page2.html             ==>  'Good stuff here'\n"
			"http://cs.strose.edu/longresourcename.html  ==>  'Check this out'\n"
			"\n"
			"Word Counts (total of 1247 words):\n"
			"----------------------------------\n"
			"of                ==>  286\n"
			"and               ==>  277\n"
			"the               ==>  251\n"
			"goldschmidt       ==>  247\n"
			"longestwordfound  ==>  186\n"
		)
		stdout = StringIO()
		tmp, sys.stdout = sys.stdout, stdout
		try:
			hw2.analyzeStats(total_bytes, urls, wordcounts)
		finally:
			# Put the previous stream back even when analyzeStats raises.
			sys.stdout = tmp
		assert stdout.getvalue().split('\n') == expected.split('\n')


	# HTML fixture for test_crawlURL_with_multiple_links; contains two anchors
	# (http://google.com/ and /absolute/relative/).
	# NOTE(review): every line of this literal starts with a tab and the
	# markup has no nesting indentation -- looks like whitespace was altered
	# in transit; confirm against the intended fixture.
	SAMPLE_PAGE = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

	<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

	<head>
	<title> David E. Goldschmidt, Ph.D. </title>
	<meta http-equiv="content-type" content="text/html; charset=utf-8" />
	<meta name="Author" content="David E. Goldschmidt" />
	<meta name="Keywords" content="" />
	<meta http-equiv="Pragma" content="no-cache" />
	<meta http-equiv="Expires" content="-1" />

	<link rel="stylesheet" type="text/css" href="cssjs/goldschd.css" />
	<link rel="stylesheet" type="text/css" href="cssjs/goldschd-print.css" media="print" />

	<script type="text/javascript" src="cssjs/goldschd.js"></script>

	</head>

	<body>
	<div id="all">

	<h1> there is no spoon </h1>
	<a href="http://google.com/">Here's a link to google</a>
	<p>
	Blah Blah<a href="/absolute/relative/">a Location</a>LOL
	</p>
	</div><!-- end all -->
	</body>

	</html>"""


	# HTML fixture for test_crawlURL_single_link; contains exactly one anchor
	# (http://google.com/index.php).
	# NOTE(review): every line of this literal starts with a tab -- looks like
	# whitespace was altered in transit; confirm against the intended fixture.
	SAMPLE_PAGE2 = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

	<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

	<body>
	<div id="all">

	<h1> there is no spoon </h1>
	<a href="http://google.com/index.php">A bad page</a>
	</div><!-- end all -->
	</body>

	</html>"""


	#### BEGIN THE MICRO TESTING FRAMEWORK ####
	import re
	import sys
	import traceback
	from itertools import zip_longest
	from pprint import pformat

	class Mockify(object):
		"""Minimal replacement for ``mock.patch.object``, used as a context manager.

		Inside the ``with`` block, attribute ``name`` of ``module`` is replaced
		by this instance.  Calling it delegates to ``side_effect`` when one was
		supplied, otherwise returns ``return_value``; with neither configured
		the call raises ``TypeError``.  The original attribute is put back on
		exit.
		"""

		# Sentinel for "argument not supplied", so None stays a usable value.
		NONE = object()

		def __init__(self, module, name, return_value=NONE, side_effect=NONE):
			self._mod, self._name = module, name
			self._return_value = None
			self._return_value_was_set = False
			self._side_effect = None
			self._side_effect_was_set = False
			# Identity check on purpose: `!=` would invoke the argument's own
			# __ne__/__eq__, which may claim equality with the sentinel.
			if return_value is not self.NONE:
				self.return_value = return_value
			if side_effect is not self.NONE:
				self.side_effect = side_effect

		@property
		def return_value(self):
			"""Value handed back by a call when no side effect is configured."""
			return self._return_value

		@return_value.setter
		def return_value(self, value):
			self._return_value = value
			# Tracked separately so an explicit None counts as "configured".
			self._return_value_was_set = True

		@property
		def side_effect(self):
			"""Callable that handles a call; takes precedence over return_value."""
			return self._side_effect

		@side_effect.setter
		def side_effect(self, value):
			self._side_effect = value
			self._side_effect_was_set = True

		# Signature repaired: the asterisks had been dropped, leaving
		# `(self, args, *kwargs)`, which rejects keyword arguments.
		def __call__(self, *args, **kwargs):
			if self._side_effect_was_set:
				return self._side_effect(*args, **kwargs)
			if self._return_value_was_set:
				return self.return_value
			raise TypeError("Mockify requires side_effect or return_value to be set")

		def __enter__(self):
			# Remember the real attribute so __exit__ can reinstall it.
			self._old = getattr(self._mod, self._name)
			setattr(self._mod, self._name, self)
			return self

		def __exit__(self, type, value, traceback):
			# Returns None, so an exception raised in the block propagates.
			setattr(self._mod, self._name, self._old)

	def most_recent_tb(tb):
		"""Follow ``tb_next`` links from ``tb`` and return the last traceback entry.

		The last entry is the one whose ``tb_next`` is falsy.
		"""
		# Loop body restored: with the indentation stripped the `while` had
		# no body and the function could not be parsed.
		prev_tb = tb
		while prev_tb.tb_next:
			prev_tb = prev_tb.tb_next
		return prev_tb

	# Splits an assert expression around comparison / boolean operators.  An
	# operator is only recognised with a non-word character on each side; it
	# is captured so that re.split() keeps it in the result.
	# Pattern repaired: the alternation bars had been escaped (`\|`), which
	# makes them literal pipe characters, so no operator ever matched.
	PARTS = re.compile(r'\W([!=]=|not\W+in|in|[><]=?|or|and)\W')
	def extract_parts(assert_line):
		"""Split the source text of an ``assert`` statement into operands and operators.

		Returns ``(operands, operators)``, two lists of source fragments; an
		assert with no recognised operator gives ``([expression], [])``.  A
		line that is not an ``assert`` statement gives ``([], [])`` instead of
		having its first six characters cut off.
		"""
		stripped = assert_line.strip()
		# \b keeps identifiers such as `assert_equal` out.
		if re.match(r'assert\b', stripped) is None:
			return [], []
		code = stripped[len('assert'):].strip()
		parts = PARTS.split(code)
		if len(parts) > 1:
			# re.split with one capture group alternates operand, operator, ...
			return parts[0::2], parts[1::2]
		return [code], []

	def re_eval(frame):
		"""Re-evaluate the operands of the source line ``frame`` is stopped on.

		The line text comes from ``traceback.extract_stack(frame)``, is split
		by ``extract_parts`` and each operand is passed to ``eval`` with the
		frame's builtins and globals merged as globals and its locals as locals.

		Returns ``(parts, evaled, ops)``.  An operand that cannot be evaluated
		contributes the exception it raised, keeping the lists aligned; with no
		source text available the result is ``([], [], [])``.

		NOTE: operands are executed a second time here; side effects repeat.
		"""
		# Block structure restored (indentation had been stripped).
		code = traceback.extract_stack(frame)[-1][-1]
		if not code:
			# No source text for this frame: nothing to split or evaluate.
			return [], [], []
		parts, ops = extract_parts(code)
		evaled = []
		all_globals = {}
		all_globals.update(frame.f_builtins)
		all_globals.update(frame.f_globals)
		for p in parts:
			try:
				evaled.append(eval(p, all_globals, frame.f_locals))
			except Exception as exc:
				# Runs while a failure is being reported; a second exception
				# here would abort the whole test run.
				evaled.append(exc)
		return parts, evaled, ops

	def format_testname(func):
		"""Return a display name for a test function.

		A non-empty docstring is used verbatim; otherwise the function name
		with its first ``len('test_')`` characters removed and underscores
		replaced by spaces.
		"""
		# Body indentation restored; it had been flattened to the def's level.
		return func.__doc__ or func.__name__[len('test_'):].replace('_', ' ')

	def is_test(name, value):
		"""Tell whether ``name`` begins with "test", ignoring case.

		``value`` is accepted but not inspected.
		"""
		# Body indentation restored; it had been flattened to the def's level.
		return name.lower().startswith('test')

	def tests_from_dict(vars):
	"Returns all test from a given dictionary."
	tests = []
	for name, value in tuple(vars.items()):
	if name.startswith('test_'):
	tests.append(value)
	return tests

	def run_tests(tests, fail_fast=False):
		"""Run every callable in ``tests``, print a report and exit the process.

		Each test runs with ``sys.stdout`` pointed at a private buffer; one
		progress character per test ('.' passed, 'F' failed) goes to the real
		stdout.  Failures are collected and printed at the end.

		tests     -- sequence of zero-argument callables.
		fail_fast -- stop after the first failing test when true.

		Always ends with ``sys.exit``: status 0 when nothing failed, else 1.
		"""
		# Block structure restored (indentation had been stripped).
		print("Running {0} Tests:".format(len(tests)))
		errors = {}
		true_stdout = sys.stdout
		try:
			for test in tests:
				stdout = StringIO()
				sys.stdout = stdout
				try:
					test()
					true_stdout.write('.')
					true_stdout.flush()
				except Exception:
					# Format first, while the test's exception is active.
					report = traceback.format_exc()
					tb = most_recent_tb(sys.exc_info()[2])
					try:
						_, values, ops = re_eval(tb.tb_frame)
					except Exception:
						# The operand breakdown is a nicety; never let it
						# hide the real failure or stop the remaining tests.
						values, ops = [], []
					true_stdout.write('F')
					true_stdout.flush()
					errors[test] = (report, stdout.getvalue(), zip_longest(ops, values, fillvalue=''))
					if fail_fast:
						break
		finally:
			# Restore the real stream however the loop ends.
			sys.stdout = true_stdout

		if not errors:
			print("\n\nNo Errors ^_^")
			sys.exit(0)
		print("\n")
		for test_name, (exc, stdout, tree) in errors.items():
			print("----- {0} - FAILED -----\n\n{1}\n{2}\n".format(
				format_testname(test_name),
				exc,
				'\n'.join(['%s\n%s' % (pformat(val, indent=4), op.strip()) for op, val in tree]),
			))
			if stdout:
				print(":::STDOUT:::\n{0}".format(stdout))
		print("==== End Errors ====")
		sys.exit(1)

	if __name__ == '__main__':
		# Script entry point (body indentation restored): run every test_*
		# global; -f / --failfast stops at the first failing test.
		run_tests(tests_from_dict(globals()), fail_fast=('-f' in sys.argv or '--failfast' in sys.argv))