Created
February 24, 2015 17:41
-
-
Save seanupton/dae01e1b34ae06ce22d5 to your computer and use it in GitHub Desktop.
Detect possible typosquatting package names in PyPI -- warning: slow, noisy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# compare package names in PyPI for possible typosquatting | |
# WARNING: slow O(n^2), even if optimized, may yield 1000+ results | |
import time | |
from lxml import html | |
import urllib2 | |
LISTURL = 'https://pypi.python.org/simple/' | |
def levenshtein(s1, s2): | |
""" | |
via http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python | |
""" | |
if len(s1) < len(s2): | |
return levenshtein(s2, s1) | |
# len(s1) >= len(s2) | |
if len(s2) == 0: | |
return len(s1) | |
previous_row = range(len(s2) + 1) | |
for i, c1 in enumerate(s1): | |
current_row = [i + 1] | |
for j, c2 in enumerate(s2): | |
insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer | |
deletions = current_row[j] + 1 # than s2 | |
substitutions = previous_row[j] + (c1 != c2) | |
current_row.append(min(insertions, deletions, substitutions)) | |
previous_row = current_row | |
return previous_row[-1] | |
simple = urllib2.urlopen(LISTURL).read() | |
names = [a.get('href') for a in html.fromstring(simple).findall('.//a')] | |
considered = [name for name in names if len(name) > 2] | |
seen = [] | |
i = 0 | |
start = time.time() | |
for name1 in considered: | |
l = len(name1) | |
if l < 4: | |
continue # assume at least one must be > 4 characters long | |
for name2 in (n for n in considered if abs(len(n)-l) <= 1): | |
if abs(len(name1) - len(name2)) > 2: | |
continue | |
if (name1, name2) in seen or (name2, name1) in seen: | |
continue | |
if name1 != name2: | |
i += 1 | |
dist = levenshtein(name1, name2) | |
if dist < 2: | |
seen.append((name1, name2)) | |
print (name1, name2), i, time.time() - start | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment