Skip to content

Instantly share code, notes, and snippets.

@xunil154
Created November 19, 2013 19:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xunil154/7551309 to your computer and use it in GitHub Desktop.
Save xunil154/7551309 to your computer and use it in GitHub Desktop.
Basic Python text analysis: it looks through text files and counts single words, bigrams, and trigrams.
#!/usr/bin/env python2.7
import sys
import re
# Module-level occurrence tables, one per gram size. Each maps a gram's
# mangled text to the number of times it has been seen.
single_gram = dict()  # one-word grams
bigram = dict()       # two-word grams
trigram = dict()      # three-word grams
def filter_line(line):
    """Return *line* normalised for gram counting.

    Surrounding whitespace is stripped, the text is lower-cased, and every
    character that is not a word character, whitespace, an apostrophe or
    one of ``. ! ?`` is removed.
    """
    normalised = line.strip().lower()
    return re.sub(r"[^\w\s'.!?]", '', normalised)
def mangle_gram(gram):
    """Return the words of *gram* joined by hyphens into a single string."""
    separator = "-"
    return separator.join(gram)
def increment_gram(source, gram):
    """Add one to the count stored in *source* for *gram*.

    source -- dict mapping mangled gram text to its occurrence count.
    gram   -- sequence of words; it is mangled to form the dict key.

    A gram seen for the first time starts at a count of 1.
    """
    key = mangle_gram(gram)
    source[key] = source.get(key, 0) + 1
def add_gram(gram):
    """Record one occurrence of *gram* in the table matching its length.

    One-word grams go to ``single_gram``, two-word grams to ``bigram`` and
    three-word grams to ``trigram``.  Any other length is reported on
    stdout and otherwise ignored.
    """
    tables = {1: single_gram, 2: bigram, 3: trigram}
    target = tables.get(len(gram))
    # Compare against None explicitly: an empty table is falsy but valid.
    if target is None:
        print("Error, not a correct gram")
        return
    increment_gram(target, gram)
def parse_file(filename):
    """Count every 1-, 2- and 3-word gram in a text file.

    Each line is normalised with filter_line(), split on whitespace, and
    every gram is recorded through add_gram().  Grams may span line
    breaks: the trailing (up to two) words of the text read so far are
    carried over and prepended to the next line.

    filename -- path of the text file to read.

    Raises IOError if the file cannot be opened (on Python 3 this
    surfaces as FileNotFoundError, a subclass of OSError/IOError).
    """
    carry = []  # trailing words not yet counted as the start of a gram
    # The with-block closes the handle even if counting raises.
    with open(filename, 'r') as handle:
        for line in handle:
            words = filter_line(line).split()
            combined = carry + words
            # Only positions that can start a full trigram are counted
            # here; the final two words wait for more text to arrive.
            for start in range(len(combined) - 2):
                for size in range(1, 4):
                    add_gram(combined[start:start + size])
            # Take the tail of `combined`, not of `words`: a blank or
            # one-word line must not discard the pending carry-over.
            carry = combined[-2:]
    # End of input: flush the words still pending so the last unigrams
    # and the last bigram are counted too.
    for start in range(len(carry)):
        for size in range(1, len(carry) - start + 1):
            add_gram(carry[start:start + size])
def usage(message=""):
    """Print an optional message followed by the usage line, then exit.

    message -- extra text shown before the usage line; skipped when empty.

    Always terminates the process with exit status 1.
    """
    banner = "Usage: python2.7 gram-parser.py <file> [file2] ..."
    for text in (message, banner):
        if text:
            print(text)
    sys.exit(1)
# Script entry point: parse every file named on the command line.
# NOTE(review): the counts gathered in single_gram / bigram / trigram are
# not printed anywhere in this block.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        usage("Not enough arguments")
    for filename in sys.argv[1:]:
        # parse_file() is the call that opens the file, so it is the one
        # that must sit inside the try; an unreadable file is reported
        # and skipped instead of aborting the whole run.
        try:
            parse_file(filename)
        except IOError:
            print("ERR: Could not open "+filename+" - Does not exist")
            print("WARN: Skipping for now")
        else:
            print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment