Skip to content

Instantly share code, notes, and snippets.

@eddieantonio
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eddieantonio/9118454 to your computer and use it in GitHub Desktop.
Save eddieantonio/9118454 to your computer and use it in GitHub Desktop.
Character-wise w-shingling, or at least as I understand it. http://en.wikipedia.org/wiki/W-shingling
#!/usr/bin/env python
from itertools import islice, izip, chain
# Returns a set of shingles for the given entity.
def shingles(entity, w=3):
"""
Performs charachter-wise shingling on an entity. Natural language
processing! Returns the (frozen) set of all shingles for a given entity.
>>> shingles('herp') == {'her', 'erp'}
True
>>> shingles('derp') == {'der', 'erp'}
True
>>> shingles('AM')
frozenset(['am'])
>>> shingles('Herp derp') == {'her', 'der', 'erp'}
True
"""
normalized = entity.strip().lower()
if len(normalized) <= w:
return frozenset((normalized,))
# An entity may be made of several components.
components = normalized.split()
def shinglingle(component):
slices = (islice(component, i, None) for i in xrange(w))
return (''.join(chars) for chars in izip(*slices))
comp_shingles = (shinglingle(comp) for comp in components)
# Flatten the iterable of all shingles.
result = frozenset(chain.from_iterable(comp_shingles))
return result
def jaccard_index(a, b):
"""
Finds the similarty of two sets as defined by the Jaccard index.
Intended for use with :py:func:`shingles`.
>>> jaccard_index({'cats'}, {'dogs'})
0.0
>>> jaccard_index({'foo', 'bar', 'baz'}, {'foo', 'bar', 'baz'})
1.0
>>> jaccard_index(shingles('herp'), shingles('derp')) == 1.0 / 3.0
True
>>> 0 < jaccard_index(shingles('Benedict Cumberbatch'),
... shingles('Benadryl Cumbersnatch')) < 1.0 / 3.0
True
"""
if 0 == len(a) == len(b):
return 1
return len(a & b) / float(len(a | b))
if __name__ == '__main__':
import doctest
import sys
doctest.testmod(verbose='-v' in sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment