"""
Python implementation of Viterbi algorithm for word segmentation
A clean-up of this: http://norvig.com/ngrams/ch14.pdf
-
You also need 'unigrams.txt' and 'bigrams.txt' to run the segmentation. The ngrams
used in this implementation is from the 'count_1w.txt' and 'count_2w.txt' provided
here: http://norvig.com/ngrams/
-
Usage:
>>> from segment import viterbi
>>> viterbi('thisisasentence')
(-8.89803279104842, ['this', 'is', 'a', 'sentence'])
>>> viterbi('idontthinkyoucandealwiththis')
(-15.311752931970325, ['i', 'dont', 'think', 'you', 'can', 'deal', 'with', 'this'])
Original code: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6
Author: Mark Dong <jindongd@andrew.cmu.edu>
"""
import math
import functools


class ProbDist(dict):
    """Probability distribution estimated from unigram/bigram count data."""

    def __init__(self, datafile, unigram=True, N=1024908267229):
        data = {}
        with open(datafile) as f:
            for line in f:
                k, v = line.rstrip().split('\t')
                data[k] = int(v)
        for k, c in data.items():
            self[k] = self.get(k, 0) + c
        if unigram:
            # Penalize unknown words exponentially in their length, so the
            # segmenter prefers known short words over one unknown long one.
            self.unknownprob = lambda k, N: 10 / (N * 10 ** len(k))
        else:
            self.unknownprob = lambda k, N: 1 / N
        self.N = N

    def __call__(self, key):
        # Calling the distribution yields a probability; plain dict access
        # (self[key]) yields the raw corpus count.
        if key in self:
            return self[key] / self.N
        else:
            return self.unknownprob(key, self.N)


P_unigram = ProbDist('unigrams.txt')
P_bigram = ProbDist('bigrams.txt', unigram=False)
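
# A quick sanity check of the distributions (a minimal sketch; the exact
# values depend entirely on your local data files, so no outputs are shown):
#
#   >>> P_unigram('the')     # frequent word -> comparatively large probability
#   >>> P_unigram('qwxzj')   # unseen word -> tiny, length-penalized fallback
#   >>> P_unigram['the']     # raw corpus count, via dict access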


def conditionalProb(word_curr, word_prev):
    """Conditional probability of the current word given the previous word.

    Both lookups use raw counts (dict access), so the ratio approximates
    count(prev curr) / count(prev) = P(curr | prev); unseen bigrams fall
    back to the unigram probability of the current word.
    """
    try:
        return P_bigram[word_prev + ' ' + word_curr] / P_unigram[word_prev]
    except KeyError:
        return P_unigram(word_curr)
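
# Worked example with hypothetical counts: if P_bigram['of the'] == 2.7e9
# and P_unigram['of'] == 1.3e10, then conditionalProb('the', 'of') returns
# 2.7e9 / 1.3e10 ~= 0.21, i.e. roughly a fifth of the occurrences of 'of'
# are followed by 'the'.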


@functools.lru_cache(maxsize=2 ** 10)
def viterbi(text, prev='<S>', maxlen=20):
    """Return the best (log10 probability, segmentation) pair for `text`."""
    if not text:
        return 0.0, []
    # Candidate first words are every prefix of `text` up to `maxlen` chars.
    textlen = min(len(text), maxlen)
    splits = [(text[:i + 1], text[i + 1:]) for i in range(textlen)]
    candidates = []
    for first_word, remain in splits:
        first_prob = math.log10(conditionalProb(first_word, prev))
        remain_prob, remain_words = viterbi(remain, first_word)
        candidates.append((first_prob + remain_prob, [first_word] + remain_words))
    return max(candidates)
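
# The recurrence viterbi() implements, in log space:
#
#   best(text, prev) = max over splits text = w + rest of
#       log10 P(w | prev) + best(rest, w)
#
# with best('', prev) = 0. lru_cache memoizes the (text, prev) pairs, which
# is what turns the exhaustive recursion into a Viterbi dynamic program.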


if __name__ == '__main__':
    # Run the interactive demo only when executed directly, not on import.
    while True:
        try:
            v = input('> ')
        except (EOFError, KeyboardInterrupt):
            break
        v = v.lower().replace(' ', '')
        print('Normalized to:', v)
        z = viterbi(v)
        print('Viterbi:', z)
        print('Result:', ' '.join(z[1]))