Skip to content

Instantly share code, notes, and snippets.

@Pinak-Chakraborty
Created August 10, 2014 20:07
Show Gist options
  • Save Pinak-Chakraborty/67d91fce3b6dbbd0b5ea to your computer and use it in GitHub Desktop.
Save Pinak-Chakraborty/67d91fce3b6dbbd0b5ea to your computer and use it in GitHub Desktop.
Tokenizer and computation of unigrams and bigrams with regex
import codecs
import glob
import os
import os.path
import re
import sys
# Re-wrap standard output in a UTF-8 writer so printed text is encoded as UTF-8.
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())
# Maximum number of highest-frequency unigrams and bigrams written to each output file.
writemax = 100
# Token pattern, compiled once. Alternatives are tried left to right:
#   1. a run of two or more capitals not followed by a lowercase letter
#      (an acronym, e.g. "HTML" in "HTMLParser");
#   2. a capitalised word immediately followed by another capital
#      (the leading part of a CamelCase word, e.g. "Camel" in "CamelCase");
#   3. a run of word characters, apostrophes and hyphens ("don't", "stop-now");
#   4. a single punctuation character.
# The punctuation class is written out explicitly.  The previous spelling
# ended in the range `'-/`, which matched "+" only by accident of ASCII
# ordering; "+" is listed here so the set of tokens is unchanged.
_TOKEN_PATTERN = re.compile(
    r"[A-Z]{2,}(?![a-z])"
    r"|[A-Z][a-z]+(?=[A-Z])"
    r"|['\w\-]+"
    r"|[.,!;:()^*'+\-/]"
)


def wordTokenizier(line):
    """Split a line of text into word and punctuation tokens.

    Args:
        line: The text to tokenize.

    Returns:
        A list of token strings in order of appearance.  Whitespace and any
        character not covered by the token pattern are skipped.  An empty
        string yields an empty list.
    """
    return _TOKEN_PATTERN.findall(line)
def _count_ngrams(lines, start_token="START"):
    """Count unigram and bigram frequencies over an iterable of text lines.

    Each line is stripped of trailing whitespace, echoed to standard output
    and tokenized with ``wordTokenizier``.

    Args:
        lines: Iterable of text lines (for example an open text file).
        start_token: Pseudo-word paired with the very first token.

    Returns:
        A ``(unigrams, bigrams)`` tuple of dicts mapping each token, or each
        "previous current" token pair joined by one space, to its count.
    """
    unigrams = {}
    bigrams = {}
    # The previous word is deliberately NOT reset per line: bigrams span line
    # breaks, and the start token only precedes the first token of the input.
    prev_word = start_token
    for line in lines:
        line = line.rstrip()
        print ("input = ", line)
        for word in wordTokenizier(line):
            unigrams[word] = unigrams.get(word, 0) + 1
            bigram = prev_word + ' ' + word
            bigrams[bigram] = bigrams.get(bigram, 0) + 1
            prev_word = word
    return unigrams, bigrams


def _write_top(counts, path, limit):
    """Write the ``limit`` most frequent entries of ``counts`` to ``path``.

    One entry per line, formatted as ``<count>TAB<entry>``, most frequent
    first.  Entries with equal counts keep their first-seen order because
    the sort is stable.  The file is created (empty) even when there is
    nothing to write.

    Args:
        counts: Dict mapping an entry to its frequency.
        path: Output file path; the file is written as UTF-8.
        limit: Maximum number of entries to write; values <= 0 write none.
    """
    ranked = sorted(counts, key=counts.get, reverse=True)
    # `with` guarantees the file is closed even if a write fails.
    with open(path, "w", encoding="UTF-8") as output_file:
        for entry in ranked[:max(limit, 0)]:
            output_file.write(str(counts[entry]) + '\t' + str(entry) + '\n')


if __name__ == '__main__':
    # Input file.  A raw string keeps the backslashes literal instead of
    # relying on "\P", "\S" and "\m" being unrecognised escape sequences.
    textfile = r"C:\Python34\Scripts\mytext.txt"

    # NOTE(review): the input is read with the platform default encoding,
    # as before; pass encoding="UTF-8" here if the input is known to be UTF-8.
    with open(textfile) as source:
        Unigrams, Bigrams = _count_ngrams(source)

    # Only the "writemax" highest-frequency entries are written to each file.
    _write_top(Unigrams, "unigram-out.txt", writemax)
    _write_top(Bigrams, "bigram-out.txt", writemax)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment