Skip to content

Instantly share code, notes, and snippets.

@catermelon
Last active October 7, 2015 05:47
Show Gist options
  • Save catermelon/6ef80ba75cdd8a2da851 to your computer and use it in GitHub Desktop.
Save catermelon/6ef80ba75cdd8a2da851 to your computer and use it in GitHub Desktop.
'''
Allows scoring of text using n-gram probabilities
17/07/12
'''
from math import log10
class ngram_score(object):
    ''' Score text by summing the log10 probabilities of its n-grams. '''

    def __init__(self, ngramfile, sep=' '):
        ''' Load n-gram counts from a file and turn them into log10 probabilities.

        Each non-blank line of `ngramfile` must hold exactly two fields
        separated by `sep`: the n-gram itself and its integer count.

        Attributes set on the instance:
            ngrams -- dict mapping n-gram -> log10(count / N)
            L      -- length of the last n-gram read; used as the window
                      width by score() (all n-grams are presumably the same
                      length -- this is not checked)
            N      -- sum of all counts in the file
            floor  -- log10(0.01 / N), the score assigned to unseen n-grams

        Raises ValueError if a line does not split into exactly two fields,
        if a count is not an integer, if the file contains no n-grams, or
        if the counts do not add up to a positive total.
        '''
        counts = {}
        key = None
        # Context manager guarantees the file handle is released even when
        # a malformed line raises part-way through.
        with open(ngramfile, 'r') as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                key, count = line.split(sep)
                counts[key] = int(count)  # int() ignores the trailing newline

        if key is None:
            raise ValueError('no n-grams found in %r' % (ngramfile,))

        # values() / range() work on both Python 2 and Python 3, unlike
        # itervalues() / xrange().
        total = sum(counts.values())
        if total <= 0:
            raise ValueError('n-gram counts in %r must sum to a positive '
                             'number' % (ngramfile,))

        self.L = len(key)
        self.N = total
        # calculate log probabilities
        self.ngrams = dict(
            (gram, log10(float(count) / total))
            for gram, count in counts.items()
        )
        self.floor = log10(0.01 / total)

    def score(self, text):
        ''' Compute the score of text.

        Slides a window of width L over `text` and adds the stored log
        probability of each window, or `floor` when the window is not a
        known n-gram. Returns 0 when `text` is shorter than L.
        '''
        width = self.L
        lookup = self.ngrams.get
        floor = self.floor
        return sum(
            lookup(text[start:start + width], floor)
            for start in range(len(text) - width + 1)
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment