Created
August 12, 2018 04:18
-
-
Save vibhurishi/3d4793e60e24af8ebb9ebd9c547b8222 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A small utility that reads a text file, prints how often each word occurs, and reports the total number of words
import sys | |
import re | |
from collections import Counter | |
# Common words that still count toward the total but are left out of the
# printed table. A frozenset gives constant-time membership tests.
_IGNORED_WORDS = frozenset([
    "the", "of", "to", "and", "in", "is", "a", "this", "that", "for",
    "with", "was", "on", "have", "it", "he", "are", "not", "but",
])

# A "word" is a run of word characters (letters, digits, underscore) and
# apostrophes, so contractions such as "cat's" stay in one piece.
_WORD_PATTERN = re.compile(r"[\w']+")


def word_count(fname):
    """Print a word-frequency table for a text file and return the word total.

    The text is lower-cased before counting, so "The" and "the" are the same
    word. One line per distinct word is printed, ordered by ascending count
    and then alphabetically, skipping the words in the ignore list. Ignored
    words are still included in the returned total.

    Args:
        fname: Path of the text file to read. It is opened with the
            platform's default text encoding.

    Returns:
        The total number of words found in the file (0 for an empty file).

    Raises:
        OSError: If the file cannot be opened.
    """
    words = Counter()
    with open(fname) as f:
        # Count line by line instead of reading the whole file at once.
        # The pattern never matches a newline, so no word can span two
        # lines and the result is the same as tokenizing the full text.
        for line in f:
            words.update(_WORD_PATTERN.findall(line.lower()))

    # Sort by count first and by the word itself second.
    for word, count in sorted(words.items(), key=lambda item: (item[1], item[0])):
        if word not in _IGNORED_WORDS:
            print('{0:15} {1:5}'.format(word, count))  # aligned two-column row

    # Every occurrence counts toward the tally, ignored words included.
    return sum(words.values())
def main():
    """Run the word counter on the file named by the first CLI argument.

    Prints the filename, then the frequency table produced by word_count,
    then the total number of words. If no filename was given, prints a short
    hint and returns without doing anything else.
    """
    # Check the argument count up front rather than catching IndexError
    # around the whole run: an IndexError raised while counting would
    # otherwise be misreported as a missing filename.
    if len(sys.argv) < 2:
        print("you need to specify the filename")
        return

    filename = sys.argv[1]
    print(filename)
    print("Number of words in the file :", word_count(filename))


# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
👍 BTW, use the most_common method of Counter instead of the sorted line. It will save a bunch of memory by using iterators instead of converting to a list, and will allow you to work on huge files.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
usage:
$ python3 word_counter.py <filename>