Skip to content

Instantly share code, notes, and snippets.

@GaelVaroquaux
Created October 31, 2017 19:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GaelVaroquaux/a8f5d6ada74037adc4df58f446a8352d to your computer and use it in GitHub Desktop.
Save GaelVaroquaux/a8f5d6ada74037adc4df58f446a8352d to your computer and use it in GitHub Desktop.
Fast 3-gram counting on small strings
"""
Fast counting of 3-grams for short strings.
Quick benchmarking seems to show that pure Python code is faster when
for strings less that 1000 characters, and numpy versions is faster for
longer strings.
Very long strings would benefit from probabilistic counting (bloom
filter, count min sketch) as implemented eg in the "bounter" module.
Run unit tests with "pytest count_3_grams.py"
"""
###############################################################################
# Pure Python versions
def number_of_3grams(string):
""" Return the number of different tri-grams in a string
"""
# Pure Python implementation: no numpy
return len(set(zip(string, string[1:], string[2:])))
def number_of_common_3grams(string1, string2):
""" Return the number of common tri-grams in two strings
"""
# Pure Python implementation: no numpy
tri_grams = set(zip(string1, string1[1:], string1[2:]))
tri_grams = tri_grams.intersection(zip(string2, string2[1:],
string2[2:]))
return len(tri_grams)
def test_number_of_3grams():
# Small unit tests
assert number_of_3grams('abc') == 1
for i in range(1, 7):
assert number_of_3grams(i * 'aaa') == 1
assert number_of_3grams('abcdefghifk'[:i+2]) == i
def test_number_of_common_3grams():
# Small unit tests
for i in range(1, 7):
assert number_of_common_3grams(i * 'aaa', i * 'aaa') == 1
assert number_of_common_3grams('abcdefghifk'[:i+2],
'abcdefghifk'[:i+2]) == i
###############################################################################
# Numpy versions
import numpy as np
def number_of_3grams_np(string):
""" Return the number of different tri-grams in a string
"""
arr = np.frombuffer(string, dtype='S1')
# Hard coding the 3 gram
# Appending 3 shifted versions of the strings
arr = np.concatenate((arr[:-2, None], arr[1:-1, None], arr[2:, None]),
axis=1)
# "concatenate" strings
arr = arr.view('S3')
return np.unique(arr).size
def number_of_common_3grams_np(string1, string2):
""" Return the number of common tri-grams in two strings
"""
arr1 = np.frombuffer(string1, dtype='S1')
arr2 = np.frombuffer(string2, dtype='S1')
# Hard coding the 3 gram
# Appending 3 shifted versions of the strings
arr1 = np.concatenate((arr1[:-2, None], arr1[1:-1, None], arr1[2:, None]),
axis=1)
# "concatenate" strings
arr1 = arr1.view('S3')
arr2 = np.concatenate((arr2[:-2, None], arr2[1:-1, None], arr2[2:, None]),
axis=1)
arr2 = arr2.view('S3')
return np.lib.arraysetops.intersect1d(arr1, arr2).size
def test_number_of_3grams_np():
# Small unit tests
assert number_of_3grams_np('abc') == 1
for i in range(1, 7):
assert number_of_3grams_np(i * 'aaa') == 1
assert number_of_3grams_np('abcdefghifk'[:i+2]) == i
def test_number_of_common_3grams_np():
# Small unit tests
for i in range(1, 7):
assert number_of_common_3grams_np(i * 'aaa', i * 'aaa') == 1
assert number_of_common_3grams_np('abcdefghifk'[:i+2],
'abcdefghifk'[:i+2]) == i
###############################################################################
if __name__ == '__main__':
# Our two example strings
s1 = 'patricio'
s2 = 'patricia'
print(number_of_common_3grams(s1, s2))
@GaelVaroquaux
Copy link
Author

Comment on making this code faster is more than welcomed.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment