Skip to content

Instantly share code, notes, and snippets.

@8enmann
Created June 22, 2019 20:18
Show Gist options
  • Save 8enmann/68941a6c35bee10fe4326e0b2db65791 to your computer and use it in GitHub Desktop.
Save 8enmann/68941a6c35bee10fe4326e0b2db65791 to your computer and use it in GitHub Desktop.
Experimenting with pytest, cosine similarity, ngram counting
import requests
import re
from typing import List, Sequence
from collections import Counter
import numpy as np
# Page that main() downloads and analyses.
TEST_URL = 'http://titan.dcs.bbk.ac.uk/~kikpef01/testpage.html'
# Captures the value of any double-quoted href attribute that starts with "http".
URL_RE: re.Pattern = re.compile(r'href="(http.+?)"')
# A token is a maximal run of word characters.
TOKENIZER_RE: re.Pattern = re.compile(r'\w+')
# Non-greedy match of anything between '<' and '>'; re.S lets '.' cross
# newlines so a tag split over several lines is still matched.
TAG_RE: re.Pattern = re.compile(r'<.+?>', flags=re.S)
def pull_page(url: str, timeout: float = 10.0):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled server.

    Returns:
        The response text, or ``None`` when the status code is 300 or higher.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (for example on timeout or connection failure).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code >= 300:
        return None
    return response.text
def get_urls(page: str) -> List[str]:
    """Return, in document order, every link target URL_RE captures in *page*."""
    return [match.group(1) for match in URL_RE.finditer(page)]
def window(s: Sequence, window_size: int) -> List[tuple]:
    """Return every contiguous run of *window_size* items from *s* as a tuple.

    Works on any sliceable sequence (a string yields tuples of characters, a
    list of tokens yields tuples of tokens).

    Args:
        s: Sequence to slide the window over.
        window_size: Number of items per window; must be at least 1.

    Returns:
        A list of ``len(s) - window_size + 1`` tuples in left-to-right order,
        or an empty list when *s* is shorter than *window_size*.

    Raises:
        ValueError: If *window_size* is smaller than 1.
    """
    if window_size < 1:
        # A zero or negative size would otherwise yield meaningless empty tuples.
        raise ValueError(f'window_size must be >= 1, got {window_size}')
    return [
        tuple(s[start:start + window_size])
        for start in range(len(s) - window_size + 1)
    ]
def count_ngrams(page, n=2) -> Counter:
    """Count all ngrams of length 1 through *n* found in the text of *page*.

    Tags are stripped and the text is lower-cased before it is tokenized.
    Every ngram, unigrams included, is keyed by a tuple of tokens.
    """
    text = strip_tags(page).lower()
    words = TOKENIZER_RE.findall(text)
    return Counter(
        gram
        for size in range(1, n + 1)
        for gram in window(words, size)
    )
def strip_tags(page):
    """Return *page* with each TAG_RE match replaced by a single space."""
    without_markup = re.sub(TAG_RE, ' ', page)
    return without_markup
def word2vec(counter: Counter, vocab: Sequence[str]):
    """Project *counter* onto *vocab* as a list of counts.

    The result has one entry per vocabulary item, in vocabulary order; items
    missing from *counter* contribute 0.
    """
    vector = []
    for term in vocab:
        vector.append(counter.get(term, 0))
    return vector
def cosine(a, b):
    """Return the cosine similarity of two count mappings (pure Python).

    Args:
        a: Counter mapping terms to counts.
        b: Counter mapping terms to counts.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` computed over the union of both
        vocabularies, or ``0.0`` when either vector has zero length (an
        empty or all-zero counter), for which the cosine is undefined.

    Raises:
        TypeError: If the keys of *a* and *b* cannot be ordered against each
            other (the shared vocabulary is sorted).
    """
    # Sorting fixes one dimension order so both vectors line up.
    vocab = sorted(a | b)
    vec_a = [a.get(term, 0) for term in vocab]
    vec_b = [b.get(term, 0) for term in vocab]
    norm_a, norm_b = (
        sum(x ** 2 for x in vec) ** .5 for vec in (vec_a, vec_b)
    )
    if not norm_a or not norm_b:
        # Guard the division: a zero vector would raise ZeroDivisionError.
        return 0.0
    dot = sum(x * y for x, y in zip(vec_a, vec_b))
    return dot / (norm_a * norm_b)
def np_cosine(a, b):
    """Return the cosine similarity of two count mappings using NumPy.

    Args:
        a: Counter mapping terms to counts.
        b: Counter mapping terms to counts.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` computed over the union of both
        vocabularies, or ``0.0`` when either vector has zero length (an
        empty or all-zero counter), for which the cosine is undefined.

    Raises:
        TypeError: If the keys of *a* and *b* cannot be ordered against each
            other (the shared vocabulary is sorted).
    """
    # Sorting fixes one dimension order so both vectors line up.
    vocab = sorted(a | b)
    vec_a = np.array([a.get(term, 0) for term in vocab])
    vec_b = np.array([b.get(term, 0) for term in vocab])
    # Row-wise Euclidean norms of the two stacked vectors.
    norms = np.sqrt(np.sum(np.square(np.stack([vec_a, vec_b])), axis=1))
    scale = np.prod(norms)
    if scale == 0:
        # Avoid 0/0, which NumPy turns into nan plus a RuntimeWarning.
        return 0.0
    return np.dot(vec_a, vec_b) / scale
import time
class Timer:
    """Context manager that measures the time spent inside its ``with`` block.

    After the block exits, ``interval`` holds the elapsed seconds. ``start``
    and ``end`` are ``time.perf_counter()`` readings, so they are only
    meaningful relative to each other.
    """

    def __enter__(self):
        # perf_counter is monotonic, unlike time.time(), which can jump when
        # the system clock is adjusted and so produce wrong intervals.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start
def main():
    """Fetch TEST_URL, report its links and top ngrams, and time both cosines.

    Raises:
        SystemExit: If the page could not be fetched.
    """
    # Pull urls from TEST_URL
    page = pull_page(TEST_URL)
    if page is None:
        # pull_page returns None for status codes >= 300; passing that on
        # would crash get_urls with a TypeError.
        raise SystemExit(f'Could not fetch {TEST_URL}')
    urls = get_urls(page)
    print('Found', len(urls), 'urls:', urls)
    counts = count_ngrams(page)
    print(counts.most_common(10))
    # Compare the pure-Python and NumPy implementations on the same input.
    with Timer() as t:
        print(cosine(counts, counts))
    print(t.interval)
    with Timer() as t:
        print(np_cosine(counts, counts))
    print(t.interval)


if __name__ == '__main__':
    main()
import requests
import pytest
import mock
from unittest import mock
from collections import Counter
from counter import pull_page, get_urls, count_ngrams, window, strip_tags, cosine
# Minimal document containing two identical links, shared by several tests.
FAKE_HTML = '<html><a href="http://google.com">link</a><a href="http://google.com">link</a></html>'
# Stand-in for a successful HTTP response whose body is FAKE_HTML.
dummy_response = mock.MagicMock(requests.Response)
dummy_response.status_code = 200
dummy_response.text = FAKE_HTML
# Replacement for requests.get that returns dummy_response for any arguments.
mockGet = mock.MagicMock(return_value=dummy_response)
def test_pull_page():
    """pull_page returns the body served by the patched requests.get."""
    with mock.patch('requests.get', mockGet):
        body = pull_page("http://ignored.com")
    assert 'google' in body
def test_regex():
    """Both links in FAKE_HTML are extracted, starting with the google URL."""
    links = get_urls(FAKE_HTML)
    assert len(links) == 2
    assert links[0] == 'http://google.com'
def test_regex_empty():
    """An empty page yields no URLs."""
    links = get_urls('')
    assert len(links) == 0
def test_ngrams():
    """Unigram counting yields one entry per distinct token."""
    counts = count_ngrams('a a aa b bbc', n=1)
    assert len(counts) == 4
    # count_ngrams keys every ngram by a tuple, so a unigram is a 1-tuple;
    # looking up the bare string 'a' would always give 0.
    assert counts[('a',)] == 2
def test_ngrams_2():
    """With n=2 the counter holds both unigrams and bigrams."""
    tally = count_ngrams('a a a b b', n=2)
    expected = {('a', 'a'): 2, ('a', 'b'): 1}
    assert len(tally) == 5
    for gram, count in expected.items():
        assert tally[gram] == count
def test_window():
    """window returns len - size + 1 windows, each of the requested size."""
    text = 'asdfasdfasdf'
    for size in (2, 3):
        grams = window(text, size)
        assert len(grams) == len(text) + 1 - size
        assert len(grams[0]) == size
def test_strip():
    """Only the two link texts remain once the tags are stripped."""
    words = strip_tags(FAKE_HTML).split()
    assert words == ['link', 'link']
def test_cosine():
    """cosine is 1 for identical inputs, symmetric, and numerically correct."""
    a = Counter('aaaabbb')
    b = Counter('bbbccc')
    assert cosine(a, a) == 1.0
    # Only 'b' is shared: dot = 3 * 3 = 9, |a| = sqrt(16 + 9) = 5,
    # |b| = sqrt(9 + 9). Pin the exact value rather than "0.4 within 10%".
    assert cosine(a, b) == pytest.approx(9 / (5 * 18 ** .5))
    assert cosine(a, b) == cosine(b, a)
# Allow running this test module directly: invokes pytest in-process with
# no arguments.
if __name__ == '__main__':
    pytest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment