"""
Python implementation of Viterbi algorithm for word segmentation
A clean-up of this: http://norvig.com/ngrams/ch14.pdf
-
You also need 'unigrams.txt' and 'bigrams.txt' to run the segmentation. The ngrams
used in this implementation is from the 'count_1w.txt' and 'count_2w.txt' provided
here: http://norvig.com/ngrams/
-
Usage:
>>> from segment import viterbi
>>> viterbi('thisisasentence')
(-8.89803279104842, ['this', 'is', 'a', 'sentence'])
>>> viterbi('idontthinkyoucandealwiththis')
(-15.311752931970325, ['i', 'dont', 'think', 'you', 'can', 'deal', 'with', 'this'])
Original code: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6
Author: Mark Dong <jindongd@andrew.cmu.edu>
"""
import math
import functools


class ProbDist(dict):
    """Probability distribution estimated from unigram/bigram count data."""

    def __init__(self, datafile, unigram=True, N=1024908267229):
        data = {}
        with open(datafile) as f:
            for line in f:
                k, v = line.rstrip().split('\t')
                data[k] = int(v)
        for k, c in data.items():
            self[k] = self.get(k, 0) + c
        if unigram:
            # Penalize unknown words exponentially in their length, so the
            # segmenter prefers known short words over one unknown long one.
            self.unknownprob = lambda k, N: 10 / (N * 10 ** len(k))
        else:
            self.unknownprob = lambda k, N: 1 / N
        self.N = N

    def __call__(self, key):
        # Calling the distribution yields a probability; plain dict access
        # (self[key]) yields the raw corpus count.
        if key in self:
            return self[key] / self.N
        else:
            return self.unknownprob(key, self.N)


P_unigram = ProbDist('unigrams.txt')
P_bigram = ProbDist('bigrams.txt', unigram=False)
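
# A quick sanity check of the distributions (a minimal sketch; the exact
# values depend entirely on your local data files, so no outputs are shown):
#
#   >>> P_unigram('the')     # frequent word -> comparatively large probability
#   >>> P_unigram('qwxzj')   # unseen word -> tiny, length-penalized fallback
#   >>> P_unigram['the']     # raw corpus count, via dict access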


def conditionalProb(word_curr, word_prev):
    """Conditional probability of the current word given the previous word.

    Both lookups use raw counts (dict access), so the ratio approximates
    count(prev curr) / count(prev) = P(curr | prev); unseen bigrams fall
    back to the unigram probability of the current word.
    """
    try:
        return P_bigram[word_prev + ' ' + word_curr] / P_unigram[word_prev]
    except KeyError:
        return P_unigram(word_curr)
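
# Worked example with hypothetical counts: if P_bigram['of the'] == 2.7e9
# and P_unigram['of'] == 1.3e10, then conditionalProb('the', 'of') returns
# 2.7e9 / 1.3e10 ~= 0.21, i.e. roughly a fifth of the occurrences of 'of'
# are followed by 'the'.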


@functools.lru_cache(maxsize=2 ** 10)
def viterbi(text, prev='<S>', maxlen=20):
    """Return the best (log10 probability, segmentation) pair for `text`."""
    if not text:
        return 0.0, []
    # Candidate first words are every prefix of `text` up to `maxlen` chars.
    textlen = min(len(text), maxlen)
    splits = [(text[:i + 1], text[i + 1:]) for i in range(textlen)]
    candidates = []
    for first_word, remain in splits:
        first_prob = math.log10(conditionalProb(first_word, prev))
        remain_prob, remain_words = viterbi(remain, first_word)
        candidates.append((first_prob + remain_prob, [first_word] + remain_words))
    return max(candidates)
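
# The recurrence viterbi() implements, in log space:
#
#   best(text, prev) = max over splits text = w + rest of
#       log10 P(w | prev) + best(rest, w)
#
# with best('', prev) = 0. lru_cache memoizes the (text, prev) pairs, which
# is what turns the exhaustive recursion into a Viterbi dynamic program.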


if __name__ == '__main__':
    # Run the interactive demo only when executed directly, not on import.
    while True:
        try:
            v = input('> ')
        except (EOFError, KeyboardInterrupt):
            break
        v = v.lower().replace(' ', '')
        print('Normalized to:', v)
        z = viterbi(v)
        print('Viterbi:', z)
        print('Result:', ' '.join(z[1]))