# maccam912/ngram_finder.py

## ngram_finder.py
import nltk
from nltk.collocations import *
import re, string, time, operator

from collections import Counter

# Corpus location and bigram-selection thresholds used by the script.
CORPUS_ROOT = '/data/'
CORPUS_FILE = 'biuw.txt'
MIN_BIGRAM_FREQ = 3      # bigrams occurring fewer times than this are ignored
MAX_BIGRAMS = 100000     # upper bound on bigrams requested from the finder per round


def clean_tokens(words, stopwords):
    """Normalize raw corpus tokens before collocation finding.

    Every token has its non-alphanumeric characters removed and is
    lower-cased.  Tokens that end up empty, or that appear in
    ``stopwords``, are dropped.

    Args:
        words: iterable of token strings.
        stopwords: container of lower-case words to discard.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    cleaned = []
    for word in words:
        word = "".join(ch for ch in word if ch.isalnum()).lower()
        # Punctuation-only tokens collapse to ""; keeping them would count
        # "" as a word and produce merged tokens such as "_word".
        if word and word not in stopwords:
            cleaned.append(word)
    return cleaned


def merge_bigram(tokens, first, second):
    """Return a copy of ``tokens`` with adjacent (first, second) pairs fused.

    Each fused pair becomes the single token ``"first_second"``.  The scan
    runs from the end of the list towards the start, so for an overlapping
    run such as ``['a', 'a', 'a']`` with the pair ``('a', 'a')`` the
    right-most pair is fused: ``['a', 'a_a']``.

    Args:
        tokens: list of token strings (not modified).
        first: left word of the bigram.
        second: right word of the bigram.

    Returns:
        A new list of tokens.
    """
    fused = "%s_%s" % (first, second)
    # Built back-to-front: reversed_out[-1] is always the token that
    # currently follows the one being examined.
    reversed_out = []
    for token in reversed(tokens):
        if reversed_out and token == first and reversed_out[-1] == second:
            reversed_out[-1] = fused
        else:
            reversed_out.append(token)
    reversed_out.reverse()
    return reversed_out


def collapse_collocations(tokens, stopwords):
    """Fuse frequent bigrams over and over until a round changes nothing.

    Each round ranks the bigrams of the current token list by likelihood
    ratio (after the stop-word and minimum-frequency filters) and fuses them
    in ranking order.  Because fused tokens can themselves pair up in the
    next round, longer n-grams emerge as ``w1_w2_w3`` and so on.

    Args:
        tokens: list of cleaned, lower-case tokens.
        stopwords: container of words the finder must not pair.

    Returns:
        The token list after the final round.
    """
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    while True:
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_word_filter(lambda w: w.lower() in stopwords)
        finder.apply_freq_filter(MIN_BIGRAM_FREQ)
        ranked = finder.nbest(bigram_measures.likelihood_ratio, MAX_BIGRAMS)

        size_before = len(tokens)
        for first, second in ranked:
            tokens = merge_bigram(tokens, first, second)
        # Every fusion shortens the list by one, so an unchanged length
        # means nothing was merged and another round would be identical.
        if len(tokens) == size_before:
            return tokens


def main():
    """Print token/n-gram counts for the corpus file, least frequent first.

    Reads CORPUS_FILE from CORPUS_ROOT, cleans the tokens, collapses
    collocations, prints one ``token count`` line per distinct token in
    ascending order of count, then prints the elapsed wall-clock time.
    """
    start_time = time.time()

    corpus = nltk.corpus.PlaintextCorpusReader(CORPUS_ROOT, '.*')
    # A set gives constant-time stop-word checks instead of a list scan.
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))

    tokens = clean_tokens(corpus.words(CORPUS_FILE), stopwords)
    tokens = collapse_collocations(tokens, stopwords)

    bow = Counter(tokens)
    for word in sorted(bow, key=bow.get):
        # %-formatting inside print(...) emits the same text on Python 2
        # and Python 3.
        print("%s %s" % (word, bow[word]))

    print("--- %s seconds ---" % (time.time() - start_time))


if __name__ == "__main__":
    main()