Skip to content

Instantly share code, notes, and snippets.

@jakelosh
Created July 2, 2017 05:27
Show Gist options
  • Save jakelosh/32f0b6bff3061e8562990bca984033b6 to your computer and use it in GitHub Desktop.
Save jakelosh/32f0b6bff3061e8562990bca984033b6 to your computer and use it in GitHub Desktop.
This is a solution to exercise 2 of chapter 13 of Think Python (http://greenteapress.com/thinkpython/html/index.html). It opens text files of several books and then gives the top 10 most used words in each book, excluding common words.
# Chapter 13 Exercise 2
"""
Go to Project Gutenberg (http://gutenberg.org) and download
your favorite out-of-copyright book in plain text format.
Modify your program from the previous exercise to read the
book you downloaded, skip over the header information at the
beginning of the file, and process the rest of the words as
before.
Then modify the program to count the total number of words
in the book, and the number of times each word is used.
Print the number of different words used in the book.
Compare different books by different authors, written in
different eras. Which author uses the most extensive
vocabulary?
"""
import string
from nltk.corpus import stopwords
# Words excluded from the frequency counts: NLTK's English stop-word list,
# held in a set so the membership test in process_line is a hash lookup.
# NOTE(review): presumably needs the NLTK "stopwords" corpus installed
# locally (e.g. via nltk.download('stopwords')) -- confirm.
COMMON_WORDS = set(stopwords.words('english'))
def process_line(line, dict):
    """
    Add the words of one line of text to a frequency dictionary.

    Dashes and "'s" are replaced with spaces, the line is split on
    whitespace, and each token is stripped of surrounding punctuation and
    whitespace and lower-cased.  Tokens found in COMMON_WORDS are skipped,
    as are tokens that are empty once stripped.

    line: the text to process.
    dict: mapping of word -> count, updated in place.  (The name shadows
          the builtin; it is kept so existing callers keep working.)

    Returns None.
    """
    # Replace dashes and "'s" with spaces so that "sea-monster" becomes two
    # words and "whale's" is counted as "whale".
    line = line.replace('-', ' ').replace("'s", ' ')
    # Loop-invariant: build the set of characters to strip only once.
    strip_chars = string.punctuation + string.whitespace
    for word in line.split():
        word = word.strip(strip_chars).lower()
        # A token made only of punctuation (e.g. "***" or "&") strips down
        # to the empty string, which is not a word and must not be counted.
        if word and word not in COMMON_WORDS:
            dict[word] = dict.get(word, 0) + 1
def process_file(file):
    """
    Build a word-frequency dictionary from a text file.

    file: path of the text file to read.

    Returns a dict mapping each word, as extracted by process_line, to the
    number of times it occurs in the file.  Any error raised by open()
    propagates to the caller.
    """
    d = {}
    # `with` guarantees the file is closed even if reading the file or
    # process_line raises part-way through.
    with open(file, 'r') as fin:
        for line in fin:
            process_line(line, d)
    return d
# Build the word histogram of every book before printing anything, so a
# missing file fails up front rather than half-way through the report.
f1 = 'moby_dick.txt'
h1 = process_file(f1)
f2 = 'tom_sawyer.txt'
h2 = process_file(f2)
f3 = 'tale_of_two_cities.txt'
h3 = process_file(f3)

# Number of words listed per book.
TOP_N = 10


def print_top_words(title, hist, n=TOP_N):
    """
    Print the n most frequent words of a histogram under a heading.

    title: book title shown in the heading.
    hist: dict mapping word -> count.
    n: how many words to print (fewer if hist has fewer entries).
    """
    print('\nTop %d Words in %s' % (n, title))
    # Slice to exactly n entries; the previous `count < 11` check printed
    # eleven words under a "Top 10" heading.
    for w in sorted(hist, key=hist.get, reverse=True)[:n]:
        # Single pre-formatted argument: prints "word count" identically
        # under both the Python 2 print statement and the Python 3 function.
        print('%s %s' % (w, hist[w]))


print_top_words('Moby Dick', h1)
print_top_words('Tom Sawyer', h2)
print_top_words('Tale of Two Cities', h3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment