Skip to content

Instantly share code, notes, and snippets.

@dchaplinsky
Created January 7, 2014 21:35
Show Gist options
  • Save dchaplinsky/8307327 to your computer and use it in GitHub Desktop.
Save dchaplinsky/8307327 to your computer and use it in GitHub Desktop.
memoization/optimisations applied to particular fuzzywuzzy method: 4.5x speed up
--- /Users/dchaplinsky/Downloads/fuzz.py 2014-01-07 23:32:54.000000000 +0200
+++ fuzz.py 2014-01-07 23:33:35.000000000 +0200
@@ -94,25 +94,47 @@
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
def _token_sort(s1, s2, partial=True, force_ascii=True):
+ try:
+ sorted1 = _sorted_full_process(s1, force_ascii)
+ except TypeError:
+ raise TypeError("s1 is None")
+
+ try:
+ sorted2 = _sorted_full_process(s2, force_ascii)
+ except TypeError:
+ raise TypeError("s2 is None")
- if s1 is None: raise TypeError("s1 is None")
- if s2 is None: raise TypeError("s2 is None")
+ if partial:
+ return partial_ratio(sorted1, sorted2)
+ else:
+ m = SequenceMatcher(None, sorted1, sorted2)
+ return intr(100 * m.ratio())
+
+
+def memoize(f):
+ """ Memoization decorator for a function taking one or more arguments. """
+ class memodict(dict):
+ def __getitem__(self, *key):
+ return dict.__getitem__(self, key)
+
+ def __missing__(self, key):
+ ret = self[key] = f(*key)
+ return ret
+
+ return memodict().__getitem__
+
+
+@memoize
+def _sorted_full_process(s, force_ascii):
+ if s is None:
+ raise TypeError("s is None")
# pull tokens
- tokens1 = full_process(s1, force_ascii=force_ascii).split()
- tokens2 = full_process(s2, force_ascii=force_ascii).split()
+ tokens = full_process(s, force_ascii=force_ascii).split()
# sort tokens and join
- sorted1 = " ".join(sorted(tokens1))
- sorted2 = " ".join(sorted(tokens2))
-
- sorted1 = sorted1.strip()
- sorted2 = sorted2.strip()
+ return " ".join(sorted(tokens)).strip()
- if partial:
- return partial_ratio(sorted1, sorted2)
- else:
- return ratio(sorted1, sorted2)
def token_sort_ratio(s1, s2, force_ascii=True):
return _token_sort(s1, s2, partial=False, force_ascii=force_ascii)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment