Created
September 24, 2023 02:55
-
-
Save luthfibalaka/03754633447b93662d2a49714bcc8ed8 to your computer and use it in GitHub Desktop.
Mencoba Algoritma Levenshtein Automata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from levenshtein_automata import * | |
# Asumsikan kita punya korpus kata-kata baku sebagai berikut | |
words_corpus = ["kucing", "kemoceng", "kacang", "kucir", "kucil"] | |
# Misal user punya query typo, kita buat automata-nya | |
query = "kucin" | |
automata = levenshtein_automata(query, 1) | |
# Cek setiap kata di korpus, mana saja yang menjadi kandidat dengan edit distance 1 | |
for word in words_corpus: | |
current_states = automata.start_state | |
for char in word: | |
current_states = automata.next_state(current_states, char) | |
if automata.is_final(current_states): | |
print(f"Kata {word} adalah kandidat dari query {query}") | |
else: | |
print(f"Kata {word} bukan kandidat dari query {query}") | |
# Hasilnya ada 3 kandidat, yaitu kucing, kucir, dan kucil |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Implementasi Algoritma Levenshtein Automata (NFA) | |
Sumber: http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata | |
""" | |
class NFA: | |
EPSILON = object() | |
ANY = object() | |
def __init__(self, start_state: tuple): | |
self.transitions = {} | |
self.final_states = set() | |
self._start_state = start_state | |
@property | |
def start_state(self): | |
return frozenset(self._expand(set([self._start_state]))) | |
def add_transition(self, src, input, dest): | |
self.transitions.setdefault(src, {}).setdefault(input, set()).add(dest) | |
def add_final_state(self, state): | |
self.final_states.add(state) | |
def is_final(self, states): | |
return len(self.final_states.intersection(states)) > 0 | |
def _expand(self, states): | |
frontier = set(states) | |
while frontier: | |
state = frontier.pop() | |
new_states = ( | |
self.transitions.get(state, {}) | |
.get(NFA.EPSILON, set()) | |
.difference(states) | |
) | |
frontier.update(new_states) | |
states.update(new_states) | |
return states | |
def next_state(self, states, input): | |
dest_states = set() | |
for state in states: | |
state_transitions = self.transitions.get(state, {}) | |
dest_states.update(state_transitions.get(input, [])) | |
dest_states.update(state_transitions.get(NFA.ANY, [])) | |
return frozenset(self._expand(dest_states)) | |
def get_inputs(self, states): | |
inputs = set() | |
for state in states: | |
inputs.update(self.transitions.get(state, {}).keys()) | |
return inputs | |
def levenshtein_automata(term: str, k=1): | |
""" | |
Membangun NFA untuk suatu term dengan edit distance k (defaultnya 1) | |
""" | |
nfa = NFA((0, 0)) | |
for i, c in enumerate(term): | |
for e in range(k + 1): | |
# Correct character | |
nfa.add_transition((i, e), c, (i + 1, e)) | |
if e < k: | |
# Deletion | |
nfa.add_transition((i, e), NFA.ANY, (i, e + 1)) | |
# Insertion | |
nfa.add_transition((i, e), NFA.EPSILON, (i + 1, e + 1)) | |
# Substitution | |
nfa.add_transition((i, e), NFA.ANY, (i + 1, e + 1)) | |
for e in range(k + 1): | |
if e < k: | |
nfa.add_transition((len(term), e), NFA.ANY, (len(term), e + 1)) | |
nfa.add_final_state((len(term), e)) | |
return nfa |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment