@honnibal
Created September 14, 2015 06:35
Simple but not so accurate bigram language model
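The script below packs each bigram of spaCy word IDs into a single integer key (w1 * vocab_size + w2), counts the keys with preshed's PreshCounter, smooths the counts, and then compares the estimate for a plausible phrase against Chomsky's "colorless green ideas sleep furiously". The input is a bz2-compressed file of JSON lines, each with a u'body' field (presumably a Reddit comments dump, though any file with that shape would do). A usage sketch follows the code.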
import bz2

import plac
import ujson
from preshed.counter import PreshCounter
from spacy.en import English


def encode_bigram(w1, w2, vocab_size):
    # Pack a pair of word IDs into a single integer key.
    return w1 * vocab_size + w2


def decode_bigram(bigram, vocab_size):
    # Unpack the key back into the pair of word IDs.
    w1 = bigram // vocab_size
    w2 = bigram % vocab_size
    return w1, w2
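

# A quick worked example of the packing round trip (the numbers are
# illustrative): with vocab_size=1000, the pair (3, 17) encodes to
# 3 * 1000 + 17 = 3017, and decode_bigram(3017, 1000) recovers (3, 17).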


def count_bigrams(docs, vocab_size):
    counts = PreshCounter()
    for doc in docs:
        for w1 in doc:
            # Count each adjacent pair of in-vocabulary tokens.
            if (w1.i + 1) < len(doc):
                w2 = doc[w1.i + 1]
                if not w1.is_oov and not w2.is_oov:
                    bigram = encode_bigram(w1.orth, w2.orth, vocab_size)
                    counts.inc(bigram, 1)
    # Smooth the counts, so unseen bigrams get a non-zero probability.
    counts.smooth()
    return counts


def estimate_ngram(ngram, bigrams, vocab_size):
    # Multiply together the smoothed probabilities of each adjacent pair,
    # then scale by the total count.
    prob = 1.0
    for i in range(len(ngram) - 1):
        w1 = ngram[i]
        w2 = ngram[i + 1]
        prob *= bigrams.prob(encode_bigram(w1, w2, vocab_size))
    return prob * bigrams.total


def gen_docs(tokenizer, data_loc):
    # Stream tokenized docs from a bz2 file of JSON lines, one record per line.
    with bz2.BZ2File(data_loc) as file_:
        for line in file_:
            data = ujson.loads(line)
            doc = tokenizer(data[u'body'])
            yield doc


def main(data_loc):
    print("Load spacy")
    nlp = English(parser=False, tagger=False)
    vocab_size = len(nlp.vocab.strings)
    print("Set up docs")
    docs = gen_docs(nlp.tokenizer, data_loc)
    print("Count bigrams")
    bigram_probs = count_bigrams(docs, vocab_size)
    print("Get estimates")
    likely_ngram = [t.orth for t in nlp.tokenizer(u'State of the union')]
    unlikely_ngram = [t.orth for t in nlp.tokenizer(u'colorless green ideas sleep furiously')]
    print(estimate_ngram(likely_ngram, bigram_probs, vocab_size))
    print(estimate_ngram(unlikely_ngram, bigram_probs, vocab_size))


if __name__ == '__main__':
    plac.call(main)
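
To run it, pass the path to the data file as the single positional argument that plac exposes (the script and file names here are hypothetical):

    python bigram_lm.py comments.bz2

The two printed numbers are the scaled bigram-probability products for the test phrases; the 'State of the union' n-gram should come out far higher than the Chomsky sentence.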