Skip to content

Instantly share code, notes, and snippets.

@catermelon
Created October 7, 2015 05:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save catermelon/eb9d6bfb6f8f655cb979 to your computer and use it in GitHub Desktop.
Save catermelon/eb9d6bfb6f8f655cb979 to your computer and use it in GitHub Desktop.
'''
Allows scoring of text using n-gram probabilities
17/07/12
'''
from math import log10
# Define the n-gram scoring class.
# NOTE: PEP 8 recommends CapWords for class names (e.g. NgramScore); the original lowercase name is kept here as-is.
class ngram_score(object):
    '''
    Score text by summing the log10 probabilities of its n-grams.

    Attributes:
        ngrams: dict mapping each n-gram read from the file to
            log10(count / N).
        L: n-gram length, taken from the last key read from the file.
        N: sum of all counts read from the file.
        floor: log10(0.01 / N); the value added for any n-gram that is
            not present in ``ngrams``.
    '''

    def __init__(self, ngramfile, sep=' '):
        '''
        Load n-grams and counts from a file and compute log probabilities.

        Args:
            ngramfile: path of a text file holding one
                "<ngram><sep><count>" record per line. Blank lines are
                skipped.
            sep: string separating the n-gram from its count on each line.

        Raises:
            ValueError: if a non-blank line does not split into exactly
                two fields, or the count is not an integer.
            ZeroDivisionError: if the counts sum to zero (for example,
                the file contains no records).
        '''
        self.ngrams = {}
        # The with-block closes the file even if a line fails to parse.
        with open(ngramfile, 'r') as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                key, count = line.split(sep)
                # int() ignores the trailing newline left on the count.
                self.ngrams[key] = int(count)
                self.L = len(key)
        # values() (not itervalues()) works on both Python 2 and Python 3.
        self.N = sum(self.ngrams.values())
        # Iterate over a snapshot so the values can be overwritten in place.
        for key, count in list(self.ngrams.items()):
            self.ngrams[key] = log10(float(count) / self.N)
        # Unseen n-grams are scored as if they had a count of 0.01.
        self.floor = log10(0.01 / self.N)

    def score(self, text):
        '''
        Compute the score of text.

        Every substring of length ``self.L`` contributes its stored log
        probability, or ``self.floor`` when it is not in the table.

        Args:
            text: string to score.

        Returns:
            The sum of the contributions; 0 when text is shorter than
            ``self.L``.
        '''
        width = self.L
        table = self.ngrams
        fallback = self.floor
        # range (not xrange) keeps this runnable on Python 2 and Python 3.
        return sum(
            table.get(text[start:start + width], fallback)
            for start in range(len(text) - width + 1)
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment