Skip to content

Instantly share code, notes, and snippets.

@seanupton
Created February 24, 2015 17:41
Show Gist options
  • Save seanupton/dae01e1b34ae06ce22d5 to your computer and use it in GitHub Desktop.
Save seanupton/dae01e1b34ae06ce22d5 to your computer and use it in GitHub Desktop.
Detect possible typosquatting package names in PyPI -- warning: slow, noisy
# compare package names in PyPI for possible typosquatting
# WARNING: slow O(n^2), even if optimized, may yield 1000+ results
import time
from lxml import html
import urllib2
LISTURL = 'https://pypi.python.org/simple/'
def levenshtein(s1, s2):
"""
via http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
"""
if len(s1) < len(s2):
return levenshtein(s2, s1)
# len(s1) >= len(s2)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
deletions = current_row[j] + 1 # than s2
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
simple = urllib2.urlopen(LISTURL).read()
names = [a.get('href') for a in html.fromstring(simple).findall('.//a')]
considered = [name for name in names if len(name) > 2]
seen = []
i = 0
start = time.time()
for name1 in considered:
l = len(name1)
if l < 4:
continue # assume at least one must be > 4 characters long
for name2 in (n for n in considered if abs(len(n)-l) <= 1):
if abs(len(name1) - len(name2)) > 2:
continue
if (name1, name2) in seen or (name2, name1) in seen:
continue
if name1 != name2:
i += 1
dist = levenshtein(name1, name2)
if dist < 2:
seen.append((name1, name2))
print (name1, name2), i, time.time() - start
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment