Created
November 19, 2013 19:46
-
-
Save xunil154/7551309 to your computer and use it in GitHub Desktop.
Basic Python text analysis: it looks through text files and counts single words, bigrams, and trigrams.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
import sys | |
import re | |
# Module-level frequency tables, one per n-gram size. Keys are the words of
# a gram joined with "-" (see mangle_gram); values are occurrence counts.
single_gram = dict()  # one-word grams
bigram = dict()       # two-word grams
trigram = dict()      # three-word grams
def filter_line(line):
    """
    Normalize one line of text before it is split into words.

    Leading/trailing whitespace is stripped, the text is lower-cased, and
    every character that is not a word character, whitespace, or one of
    ' . ! ? is removed.
    """
    normalized = line.strip().lower()
    return re.sub(r"[^\w\s'.!?]", '', normalized)
def mangle_gram(gram):
    """
    Build the dictionary key for a gram: its words joined with "-".
    """
    separator = "-"
    return separator.join(gram)
def increment_gram(source, gram):
    """
    Add one occurrence of ``gram`` to the count table ``source``.

    ``source`` maps mangled gram keys to counts; a key seen for the first
    time starts at 1.
    """
    key = mangle_gram(gram)
    source[key] = source.get(key, 0) + 1
def add_gram(gram):
    """
    Record ``gram`` in the table that matches its length.

    Grams of one, two or three words go to single_gram, bigram and trigram
    respectively; any other length only prints an error message.
    """
    tables_by_size = {1: single_gram, 2: bigram, 3: trigram}
    target = tables_by_size.get(len(gram))
    # Compare against None explicitly: an empty table is falsy but valid.
    if target is None:
        print("Error, not a correct gram")
        return
    increment_gram(target, gram)
def parse_file(filename):
    """
    Count every unigram, bigram and trigram found in ``filename``.

    Each line is normalized with filter_line and split on whitespace.
    Grams may span line boundaries: the words that have not yet been used
    as the start of a gram are carried over to the next line.

    Raises IOError if the file cannot be opened (on Python 3 this surfaces
    as FileNotFoundError, a subclass of OSError/IOError).
    """
    pending = []  # up to two trailing words not yet used as a gram start
    # The context manager guarantees the handle is closed, even on error.
    with open(filename, 'r', 1) as handle:
        for line in handle:
            combined = pending + filter_line(line).split()
            # Every position that has two following words can start a
            # unigram, a bigram and a trigram.
            for start in range(len(combined) - 2):
                for size in range(1, 4):
                    add_gram(combined[start:start + size])
            # Carry over from `combined`, not from this line's words alone,
            # so short or blank lines do not drop the words still pending.
            pending = combined[-2:]
    # End of file: the remaining words can no longer start a trigram, but
    # their unigrams and the final bigram still have to be counted.
    for start in range(len(pending)):
        for size in range(1, len(pending) - start + 1):
            add_gram(pending[start:start + size])
def usage(message=""):
    """
    Print an optional message followed by the usage line, then exit.

    The process always terminates with exit status 1.
    """
    lines = [message] if message else []
    lines.append("Usage: python2.7 gram-parser.py <file> [file2] ...")
    for text in lines:
        print(text)
    sys.exit(1)
# Script entry point: parse every file named on the command line.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        usage("Not enough arguments")  # exits with status 1
    for filename in sys.argv[1:]:
        # The try must wrap parse_file itself; otherwise an unreadable file
        # aborts the whole run and the messages below are never printed.
        try:
            parse_file(filename)
        except IOError:
            print("ERR: Could not open "+filename+" - Does not exist")
            print("WARN: Skipping for now")
        else:
            # Blank line after each successfully parsed file.
            print("")
    # NOTE(review): nothing visible here reports the collected counts;
    # confirm whether output is expected to happen elsewhere.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment