Skip to content

Instantly share code, notes, and snippets.

@maccam912
Last active November 19, 2015 20:49
Show Gist options
  • Save maccam912/aeaeab815b00e57e8cb6 to your computer and use it in GitHub Desktop.
Save maccam912/aeaeab815b00e57e8cb6 to your computer and use it in GitHub Desktop.
import nltk
from nltk.collocations import *
import re, string, time, operator
# Iterative collocation merging over a plain-text corpus.
#
# Pipeline:
#   1. Read the tokens of 'biuw.txt', strip non-alphanumeric characters and
#      drop English stop words.
#   2. Repeatedly find frequent bigrams (ranked by likelihood ratio) and fuse
#      each adjacent occurrence into a single "first_second" token, until a
#      full pass fuses nothing.  Because fused tokens take part in the next
#      pass, longer phrases ("a_b_c") can be built up.
#   3. Print every resulting token with its count (ascending), then the
#      elapsed wall-clock time.


def _clean_tokens(words, stopwords):
    """Return *words* reduced to their alphanumeric characters.

    Tokens that end up empty (i.e. contained no alphanumeric character, such
    as pure punctuation) and tokens whose lower-cased form is in *stopwords*
    are dropped.  Original casing of the kept tokens is preserved.
    """
    cleaned = []
    for word in words:
        word = "".join(ch for ch in word if ch.isalnum())
        # An empty token carries no information and would otherwise be
        # counted and paired with its neighbours as if it were a real word.
        if word and word.lower() not in stopwords:
            cleaned.append(word)
    return cleaned


def _merge_bigram(tokens, first, second):
    """Fuse each adjacent (*first*, *second*) pair in *tokens*, in place.

    The pair is replaced by the single token "first_second".  The list is
    scanned from right to left so that deleting an element never shifts the
    positions that are still to be visited.
    """
    joined = "%s_%s" % (first, second)
    for i in range(len(tokens) - 2, -1, -1):
        if tokens[i] == first and tokens[i + 1] == second:
            tokens[i] = joined
            del tokens[i + 1]


start_time = time.time()
bigram_measures = nltk.collocations.BigramAssocMeasures()
test = "word"  # NOTE(review): not referenced anywhere in the visible script.
corpus = nltk.corpus.PlaintextCorpusReader('/data/', '.*')
# A set gives constant-time membership tests; the list was scanned linearly
# once per token.
ignored_words = set(nltk.corpus.stopwords.words('english'))
rawtext = _clean_tokens(corpus.words('biuw.txt'), ignored_words)

while True:
    text = [word.lower() for word in rawtext]
    size_before = len(text)

    finder = BigramCollocationFinder.from_words(text)
    finder.apply_word_filter(lambda w: w.lower() in ignored_words)
    finder.apply_freq_filter(3)  # ignore bigrams seen fewer than 3 times
    output = finder.nbest(bigram_measures.likelihood_ratio, 100000)

    # Bigrams are applied in ranked order, so when two candidates overlap
    # the better-scoring one claims the shared token first.
    for first, second in output:
        _merge_bigram(text, first, second)

    # Every fusion shortens the list by one; an unchanged length means this
    # pass found nothing left to merge.
    if len(text) == size_before:
        break
    rawtext = text

# Bag of words: token -> number of occurrences in the final token list.
bow = {}
for word in text:
    bow[word] = bow.get(word, 0) + 1

for w in sorted(bow, key=bow.get):
    # %-formatting inside print(...) produces the same "token count" line on
    # both Python 2 and Python 3.
    print("%s %s" % (w, bow[w]))
print("--- %s seconds ---" % (time.time() - start_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment