Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import bisect
class NFA(object):
EPSILON = object()
ANY = object()
def __init__(self, start_state):
self.transitions = {}
self.final_states = set()
self._start_state = start_state
@property
def start_state(self):
return frozenset(self._expand(set([self._start_state])))
def add_transition(self, src, input, dest):
self.transitions.setdefault(src, {}).setdefault(input, set()).add(dest)
def add_final_state(self, state):
self.final_states.add(state)
def is_final(self, states):
return self.final_states.intersection(states)
def _expand(self, states):
frontier = set(states)
while frontier:
state = frontier.pop()
new_states = self.transitions.get(state, {}).get(NFA.EPSILON, set()).difference(states)
frontier.update(new_states)
states.update(new_states)
return states
def next_state(self, states, input):
dest_states = set()
for state in states:
state_transitions = self.transitions.get(state, {})
dest_states.update(state_transitions.get(input, []))
dest_states.update(state_transitions.get(NFA.ANY, []))
return frozenset(self._expand(dest_states))
def get_inputs(self, states):
inputs = set()
for state in states:
inputs.update(self.transitions.get(state, {}).keys())
return inputs
def to_dfa(self):
dfa = DFA(self.start_state)
frontier = [self.start_state]
seen = set()
while frontier:
current = frontier.pop()
inputs = self.get_inputs(current)
for input in inputs:
if input == NFA.EPSILON: continue
new_state = self.next_state(current, input)
if new_state not in seen:
frontier.append(new_state)
seen.add(new_state)
if self.is_final(new_state):
dfa.add_final_state(new_state)
if input == NFA.ANY:
dfa.set_default_transition(current, new_state)
else:
dfa.add_transition(current, input, new_state)
return dfa
class DFA(object):
def __init__(self, start_state):
self.start_state = start_state
self.transitions = {}
self.defaults = {}
self.final_states = set()
def add_transition(self, src, input, dest):
self.transitions.setdefault(src, {})[input] = dest
def set_default_transition(self, src, dest):
self.defaults[src] = dest
def add_final_state(self, state):
self.final_states.add(state)
def is_final(self, state):
return state in self.final_states
def next_state(self, src, input):
state_transitions = self.transitions.get(src, {})
return state_transitions.get(input, self.defaults.get(src, None))
def next_valid_string(self, input):
state = self.start_state
stack = []
# Evaluate the DFA as far as possible
for i, x in enumerate(input):
stack.append((input[:i], state, x))
state = self.next_state(state, x)
if not state: break
else:
stack.append((input[:i+1], state, None))
if self.is_final(state):
# Input word is already valid
return input
# Perform a 'wall following' search for the lexicographically smallest
# accepting state.
while stack:
path, state, x = stack.pop()
x = self.find_next_edge(state, x)
if x:
path += x
state = self.next_state(state, x)
if self.is_final(state):
return path
stack.append((path, state, None))
return None
def find_next_edge(self, s, x):
if x is None:
x = u'\0'
else:
x = unichr(ord(x) + 1)
state_transitions = self.transitions.get(s, {})
if x in state_transitions or s in self.defaults:
return x
labels = sorted(state_transitions.keys())
pos = bisect.bisect_left(labels, x)
if pos < len(labels):
return labels[pos]
return None
def levenshtein_automata(term, k):
nfa = NFA((0, 0))
for i, c in enumerate(term):
for e in range(k + 1):
# Correct character
nfa.add_transition((i, e), c, (i + 1, e))
if e < k:
# Deletion
nfa.add_transition((i, e), NFA.ANY, (i, e + 1))
# Insertion
nfa.add_transition((i, e), NFA.EPSILON, (i + 1, e + 1))
# Substitution
nfa.add_transition((i, e), NFA.ANY, (i + 1, e + 1))
for e in range(k + 1):
if e < k:
nfa.add_transition((len(term), e), NFA.ANY, (len(term), e + 1))
nfa.add_final_state((len(term), e))
return nfa
def find_all_matches(word, k, lookup_func):
"""Uses lookup_func to find all words within levenshtein distance k of word.
Args:
word: The word to look up
k: Maximum edit distance
lookup_func: A single argument function that returns the first word in the
database that is greater than or equal to the input argument.
Yields:
Every matching word within levenshtein distance k from the database.
"""
lev = levenshtein_automata(word, k).to_dfa()
match = lev.next_valid_string(u'\0')
while match:
next = lookup_func(match)
if not next:
return
if match == next:
yield match
next = next + u'\0'
match = lev.next_valid_string(next)
import automata
import bisect
import random
class Matcher(object):
def __init__(self, l):
self.l = l
self.probes = 0
def __call__(self, w):
self.probes += 1
pos = bisect.bisect_left(self.l, w)
if pos < len(self.l):
return self.l[pos]
else:
return None
words = [x.strip().lower().decode('utf-8') for x in open('/usr/share/dict/web2')]
words.sort()
words10 = [x for x in words if random.random() <= 0.1]
words100 = [x for x in words if random.random() <= 0.01]
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 1, m))) == 18
print m.probes
m = Matcher(words)
assert len(list(automata.find_all_matches('food', 2, m))) == 283
print m.probes
def levenshtein(s1, s2):
if len(s1) < len(s2):
return levenshtein(s2, s1)
if not s1:
return len(s2)
previous_row = xrange(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
deletions = current_row[j] + 1 # than s2
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
class BKNode(object):
def __init__(self, term):
self.term = term
self.children = {}
def insert(self, other):
distance = levenshtein(self.term, other)
if distance in self.children:
self.children[distance].insert(other)
else:
self.children[distance] = BKNode(other)
def search(self, term, k, results=None):
if results is None:
results = []
distance = levenshtein(self.term, term)
counter = 1
if distance <= k:
results.append(self.term)
for i in range(max(0, distance - k), distance + k + 1):
child = self.children.get(i)
if child:
counter += child.search(term, k, results)
return counter
@arcolife

This comment has been minimized.

Copy link

@arcolife arcolife commented Jul 3, 2012

awesome! i had recently thought of making a py code on this topic. thanks :D

@yingfeng

This comment has been minimized.

Copy link

@yingfeng yingfeng commented Oct 25, 2012

how to get error(edit distance) between the matched result and raw string?

@matrush

This comment has been minimized.

Copy link

@matrush matrush commented Apr 25, 2013

Line 144 and Line 146 must be swapped.

@matrush

This comment has been minimized.

Copy link

@matrush matrush commented Apr 25, 2013

Line 158 should be """Uses find_all_matches to find all words within levenshtein distance k of word.

@AndrePeter

This comment has been minimized.

Copy link

@AndrePeter AndrePeter commented Jun 27, 2013

Sorry for the stupid question, but from where can I get the dictionary file?

@Romanzo

This comment has been minimized.

Copy link

@Romanzo Romanzo commented Jan 8, 2016

Ho fatto un test, scrivo poco in inglese, eccolo qui: https://gist.github.com/Romanzo/2cd7b5cc382f5f419670

@davefernig

This comment has been minimized.

Copy link

@davefernig davefernig commented May 14, 2019

I would like to use this code in a project. Would you mind adding adding an MIT license, or otherwise giving me permission to do so?

@Madoshakalaka

This comment has been minimized.

Copy link

@Madoshakalaka Madoshakalaka commented Aug 26, 2019

on line 163: lookup_func: A single argument function that returns the first word in the database that is greater than or equal to the input argument.

isn't it less than or equal to the input argument ??

you were using bisect_left in the Matcher, which get you something lesser

@stuartcrobinson

This comment has been minimized.

Copy link

@stuartcrobinson stuartcrobinson commented Dec 8, 2019

Sorry for the stupid question, but from where can I get the dictionary file?

this worked for me:
https://github.com/dwyl/english-words/blob/master/words_alpha.txt

you'll have to update lines 26 and 30 of automata_test.py (to 21 and 388)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment