@Pinak-Chakraborty · Created August 10, 2014
Tokenizer and computation of unigrams and bigrams (without regex)
import sys, os, os.path, glob, codecs
# Ensure stdout always emits UTF-8
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())
# set the delimiters
delimiterSet = ";.,!?\"()':[]\n/+-—=≤≥{}><*’”“|"
digits = "0123456789"
chars = "abcdefghijklmnopqrstuvwxyz"
chars = "".join( (chars, chars.upper()) )
spaces = " \t\n"
numberdelimiters = ",."
# Declare Unigram and Bigram dictionary
Unigrams = {}
Bigrams = {}
# Previous word, used for concatenation when building bigrams
prev_word = "START"
# Max number of unigrams and bigrams with highest frequency written
writemax = 100
# Main tokenizer starts here
def main(fname):
    print("starting tokenizer")
    global delimiterSet
    global writemax
    if not os.path.isfile(fname):
        print("Error: Not a file", fname, "\n")
        usage()
        return
    try:
        #inStream = open(fname, mode="r")
        inStream = open(fname, mode="r", encoding="UTF-8")
        token = ""
        ch = inStream.read(1)
        lookahead = inStream.read(1)
        while True:
            if not ch:
                if token:
                    print(token)
                    process(token)
                break
            if ch in delimiterSet:
                if token:
                    # keep , and . when they sit inside a number (e.g. 3,000)
                    # or between letters (e.g. U.S.A); otherwise flush the token
                    # and emit the delimiter itself as a token
                    if token[-1] in digits and lookahead in digits and ch in numberdelimiters:
                        token = "".join( (token, ch) )
                    elif token[-1] in chars and lookahead in chars and ch in numberdelimiters:
                        token = "".join( (token, ch) )
                    else:
                        print(token)
                        process(token)
                        token = ""
                        if ch not in spaces:
                            print(ch)
                            process(ch)
            elif ch in spaces:
                if token:
                    print(token)
                    process(token)
                    token = ""
            else:
                token = "".join( (token, ch) )
            ch = lookahead
            lookahead = inStream.read(1)
        inStream.close()
    except IOError:
        print("Cannot read from file:", fname, file=sys.stdout)
    # At this point, unigrams and bigrams are created - so print from these
    #--------------------------------------------------------------------
    # Write unigrams to output file - first 100 highest freq are written
    output_file = open("unigram-out.txt", "w", encoding="UTF-8")
    writecount = 0
    for uni in sorted(Unigrams, key=Unigrams.get, reverse=True):
        countU = Unigrams[uni]
        output_file.write(str(countU) + '\t' + str(uni) + '\n')
        writecount += 1
        if writecount >= writemax:
            break
        #print("output = ", str(countU), '\t', uni, '\n')
    output_file.close()
    # Write bigrams to output file:
    output_file = open("bigram-out.txt", "w", encoding="UTF-8")
    writecount = 0
    for bi in sorted(Bigrams, key=Bigrams.get, reverse=True):
        countB = Bigrams[bi]
        output_file.write(str(countB) + '\t' + str(bi) + '\n')
        writecount += 1
        if writecount >= writemax:
            break
        #print("output = ", str(countB), '\t', bi, '\n')
    output_file.close()
#-----------------------------------------------------------
def process(word):
    # This function populates the unigram and bigram counts.
    # It is called for every token found by the main function.
    global prev_word
    # count the unigram:
    if word in Unigrams:
        Unigrams[word] += 1
    else:
        Unigrams[word] = 1
    #-----------------------------------------------------------
    # concatenate the previous and current word to get the bigram:
    bigram = prev_word + ' ' + word
    if bigram in Bigrams:
        Bigrams[bigram] += 1
    else:
        Bigrams[bigram] = 1
    # remember the current word for the next bigram
    prev_word = word
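    # For illustration: feeding the tokens "the", "cat" in that order (with
    # prev_word starting as "START") leaves Unigrams == {"the": 1, "cat": 1}
    # and Bigrams == {"START the": 1, "the cat": 1}.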
#-----------------------------------------------------------
def usage():
    print("""
    tokenizer.py
    Usage:
    python tokenizer.py mytext.txt
    """)
if __name__ == '__main__':
    if len(sys.argv) > 1:
        for i in sys.argv[1:]:
            for j in glob.glob(i):
                main(os.path.expanduser(os.path.expandvars(j)))
    else:
        usage()
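# Example run (illustrative; assumes a text file named mytext.txt exists):
#   $ python tokenizer.py mytext.txt
# Each token is echoed to stdout as it is found; unigram-out.txt and
# bigram-out.txt then hold up to 100 lines of the form "<count>\t<token>"
# (or "<count>\t<word1> <word2>" for bigrams), sorted by descending frequency.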