Instantly share code, notes, and snippets.

Embed
What would you like to do?
Parse text, convert to lowercase, remove punctuation, and tally word frequency.
#!/usr/bin/env python
# Clean text of punctuation, convert to lowercase, count the number of instances of each word, then sort by frequency.
# Intended for non-English text.
# ./wordcount.py [input filename] [export filename]
import argparse
import re
import string
import sys
from collections import Counter, OrderedDict
# Matches every character that is not a "word" character (letters, digits and
# underscore).  Compiled once here rather than on every loop iteration.
_NON_WORD = re.compile(r'\W', re.UNICODE)


def count_words(text):
    """Tally how often each cleaned word occurs in *text*.

    The text is split on whitespace; each token has all non-word characters
    removed and is lowercased.  Tokens that are empty after cleaning (for
    example a lone "--") are skipped instead of being counted as a word.

    Args:
        text: The raw text to analyse.

    Returns:
        A ``Counter`` mapping each cleaned word to its number of occurrences,
        in order of first appearance.
    """
    counts = Counter()
    for token in text.split():
        word = _NON_WORD.sub('', token).lower()
        if word:
            counts[word] += 1
    return counts


def format_counts(counts):
    """Render word counts as "word, count" lines, most frequent first.

    Words with equal counts keep the order in which they were first counted.

    Args:
        counts: A ``Counter`` as returned by ``count_words``.

    Returns:
        A single string with one newline-terminated line per word, or an
        empty string when there are no words.
    """
    return ''.join(
        word + ', ' + str(count) + "\n"
        for word, count in counts.most_common()
    )


def main(argv=None):
    """Read the input file, count its words and write the report.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name, input
            filename, export filename).  Defaults to ``sys.argv``.  Any
            arguments after the export filename are ignored.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        sys.exit('usage: ' + argv[0] + ' [input filename] [export filename]')

    # NOTE(review): the files are opened with the platform's default text
    # encoding, as before; confirm this matches the non-English input.
    with open(argv[1], "r") as source:
        text = source.read()

    report = format_counts(count_words(text))

    # The context manager guarantees the report is flushed and closed.
    with open(argv[2], "w") as target:
        target.write(report)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment