@honnibal
Created September 14, 2015 06:35
Simple but not so accurate bigram language model
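The script below packs each bigram of spaCy word IDs into a single integer key (w1 * vocab_size + w2), counts the keys with preshed's PreshCounter, smooths the counts, and then compares the estimate for a plausible phrase against Chomsky's "colorless green ideas sleep furiously". The input is a bz2-compressed file of JSON lines, each with a u'body' field (presumably a Reddit comments dump, though any file with that shape would do). A usage sketch follows the code.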
import bz2

import plac
import ujson
from preshed.counter import PreshCounter
from spacy.en import English


def encode_bigram(w1, w2, vocab_size):
    # Pack a pair of word IDs into a single integer key.
    return w1 * vocab_size + w2


def decode_bigram(bigram, vocab_size):
    # Unpack the key back into the pair of word IDs.
    w1 = bigram // vocab_size
    w2 = bigram % vocab_size
    return w1, w2
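

# A quick worked example of the packing round trip (the numbers are
# illustrative): with vocab_size=1000, the pair (3, 17) encodes to
# 3 * 1000 + 17 = 3017, and decode_bigram(3017, 1000) recovers (3, 17).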


def count_bigrams(docs, vocab_size):
    counts = PreshCounter()
    for doc in docs:
        for w1 in doc:
            # Count each adjacent pair of in-vocabulary tokens.
            if (w1.i + 1) < len(doc):
                w2 = doc[w1.i + 1]
                if not w1.is_oov and not w2.is_oov:
                    bigram = encode_bigram(w1.orth, w2.orth, vocab_size)
                    counts.inc(bigram, 1)
    # Smooth the counts, so unseen bigrams get a non-zero probability.
    counts.smooth()
    return counts


def estimate_ngram(ngram, bigrams, vocab_size):
    # Multiply together the smoothed probabilities of each adjacent pair,
    # then scale by the total count.
    prob = 1.0
    for i in range(len(ngram) - 1):
        w1 = ngram[i]
        w2 = ngram[i + 1]
        prob *= bigrams.prob(encode_bigram(w1, w2, vocab_size))
    return prob * bigrams.total


def gen_docs(tokenizer, data_loc):
    # Stream tokenized docs from a bz2 file of JSON lines, one record per line.
    with bz2.BZ2File(data_loc) as file_:
        for line in file_:
            data = ujson.loads(line)
            doc = tokenizer(data[u'body'])
            yield doc


def main(data_loc):
    print("Load spacy")
    nlp = English(parser=False, tagger=False)
    vocab_size = len(nlp.vocab.strings)
    print("Set up docs")
    docs = gen_docs(nlp.tokenizer, data_loc)
    print("Count bigrams")
    bigram_probs = count_bigrams(docs, vocab_size)
    print("Get estimates")
    likely_ngram = [t.orth for t in nlp.tokenizer(u'State of the union')]
    unlikely_ngram = [t.orth for t in nlp.tokenizer(u'colorless green ideas sleep furiously')]
    print(estimate_ngram(likely_ngram, bigram_probs, vocab_size))
    print(estimate_ngram(unlikely_ngram, bigram_probs, vocab_size))


if __name__ == '__main__':
    plac.call(main)
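
To run it, pass the path to the data file as the single positional argument that plac exposes (the script and file names here are hypothetical):

    python bigram_lm.py comments.bz2

The two printed numbers are the scaled bigram-probability products for the test phrases; the 'State of the union' n-gram should come out far higher than the Chomsky sentence.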