Skip to content

Instantly share code, notes, and snippets.

@darius
Last active November 19, 2016 20:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save darius/5f4db28efb33a8a131a0a9aa6a320afb to your computer and use it in GitHub Desktop.
Save darius/5f4db28efb33a8a131a0a9aa6a320afb to your computer and use it in GitHub Desktop.
from __future__ import division
from collections import defaultdict
# ... rest of original code from Norvig, ngrams.py ...
class Pdist(dict):
"A probability distribution estimated from counts in datafile."
def __init__(self, data=[], N=None, missingfn=None):
for key,count in data:
self[key] = self.get(key, 0) + int(count)
self.N = float(N or sum(self.itervalues()))
self.missingfn = missingfn or (lambda k, N: 1./N)
def __call__(self, key):
if key in self: return self[key]/self.N
return 10./(self.N * 10**len(key))
def segment2(text, prev='<S>'):
L = len(text)
null_result = 0.0, ()
memos = {}
def recur(j, prev):
if j == L:
return null_result
else:
Pbest = None
best = ()
Pprev = Pw.get(prev)
if Pprev is None:
for i in xrange(j+1, min(L, j+20) + 1):
first = text[j:i]
recur_key = i, first
try:
Prem, rem = memos[recur_key]
except KeyError:
Prem, rem = result = recur(i, first)
memos[recur_key] = result
Prem += log10(Pw(first))
if Pbest < Prem:
Pbest = Prem
best = (first,)+rem
else:
for i in xrange(j+1, min(L, j+20) + 1):
first = text[j:i]
recur_key = i, first
try:
Prem, rem = memos[recur_key]
except KeyError:
Prem, rem = result = recur(i, first)
memos[recur_key] = result
p = P2w.get(prev + ' ' + first)
if p is not None:
p /= Pprev
else:
p = Pw(first)
Prem += log10(p)
if Pbest < Prem:
Pbest = Prem
best = (first,)+rem
return Pbest, best
return recur(0, prev)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment