seanupton/pkgcomp.py

## pkgcomp.py
# compare package names in PyPI for possible typosquatting
# WARNING: slow O(n^2), even if optimized, may yield 1000+ results

import time
from lxml import html
import urllib2

LISTURL = 'https://pypi.python.org/simple/'


def levenshtein(s1, s2):
    """
    via http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
    """
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


simple = urllib2.urlopen(LISTURL).read()
names = [a.get('href') for a in html.fromstring(simple).findall('.//a')]
considered = [name for name in names if len(name) > 2]

seen = []
i = 0
start = time.time()


for name1 in considered:
    l = len(name1)
    if l < 4:
        continue  # assume at least one must be > 4 characters long
    for name2 in (n for n in considered if abs(len(n)-l) <= 1):
        if abs(len(name1) - len(name2)) > 2:
            continue
        if (name1, name2) in seen or (name2, name1) in seen:
            continue
        if name1 != name2:
            i += 1
            dist = levenshtein(name1, name2)
            if dist < 2:
                seen.append((name1, name2))
                print (name1, name2), i, time.time() - start
	# compare package names in PyPI for possible typosquatting
	# WARNING: slow O(n^2), even if optimized, may yield 1000+ results

	import time
	from lxml import html
	import urllib2

	LISTURL = 'https://pypi.python.org/simple/'


	def levenshtein(s1, s2):
	"""
	via http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
	"""
	if len(s1) < len(s2):
	return levenshtein(s2, s1)

	# len(s1) >= len(s2)
	if len(s2) == 0:
	return len(s1)

	previous_row = range(len(s2) + 1)
	for i, c1 in enumerate(s1):
	current_row = [i + 1]
	for j, c2 in enumerate(s2):
	insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
	deletions = current_row[j] + 1 # than s2
	substitutions = previous_row[j] + (c1 != c2)
	current_row.append(min(insertions, deletions, substitutions))
	previous_row = current_row

	return previous_row[-1]


	simple = urllib2.urlopen(LISTURL).read()
	names = [a.get('href') for a in html.fromstring(simple).findall('.//a')]
	considered = [name for name in names if len(name) > 2]

	seen = []
	i = 0
	start = time.time()


	for name1 in considered:
	l = len(name1)
	if l < 4:
	continue # assume at least one must be > 4 characters long
	for name2 in (n for n in considered if abs(len(n)-l) <= 1):
	if abs(len(name1) - len(name2)) > 2:
	continue
	if (name1, name2) in seen or (name2, name1) in seen:
	continue
	if name1 != name2:
	i += 1
	dist = levenshtein(name1, name2)
	if dist < 2:
	seen.append((name1, name2))
	print (name1, name2), i, time.time() - start