Created
July 2, 2017 05:27
-
-
Save jakelosh/32f0b6bff3061e8562990bca984033b6 to your computer and use it in GitHub Desktop.
This is a solution to exercise 2 of chapter 13 of Think Python (http://greenteapress.com/thinkpython/html/index.html). It opens plain-text files of several books and prints the ten most frequently used words in each book, excluding common English stop words (obtained from NLTK's stop-word corpus).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chapter 13 Exercise 2
"""
Go to Project Gutenberg (http://gutenberg.org) and download
your favorite out-of-copyright book in plain text format.
Modify your program from the previous exercise to read the
book you downloaded, skip over the header information at the
beginning of the file, and process the rest of the words as
before.
Then modify the program to count the total number of words
in the book, and the number of times each word is used.
Print the number of different words used in the book.
Compare different books by different authors, written in
different eras. Which author uses the most extensive
vocabulary?
"""
import string

from nltk.corpus import stopwords

# English stop words ("the", "and", "of", ...) excluded from the counts.
COMMON_WORDS = set(stopwords.words('english'))
def process_line(line, dict, common_words=None):
    """
    Add the words of one line of text to a frequency dictionary.

    Dashes and possessive "'s" are treated as separators, surrounding
    punctuation/whitespace is stripped, words are lower-cased, and
    stop words (and empty tokens) are skipped.

    Parameters:
        line: one line of raw text.
        dict: mapping of word -> count, updated in place.  (The name
            shadows the builtin ``dict``; kept for backward
            compatibility with existing callers.)
        common_words: optional set of words to exclude; defaults to
            the module-level COMMON_WORDS stop-word set.
    """
    if common_words is None:
        common_words = COMMON_WORDS
    # Treat dashes and possessive "'s" as word separators.
    line = line.replace('-', ' ')
    line = line.replace("'s", ' ')
    for word in line.split():
        word = word.strip(string.punctuation + string.whitespace).lower()
        # `word and ...` skips tokens that were pure punctuation
        # (e.g. "..."), which the old code counted under the key ''.
        if word and word not in common_words:
            dict[word] = dict.get(word, 0) + 1
def process_file(file):
    """
    Build a word-frequency dictionary from a text file.

    Parameters:
        file: path to a plain-text file.  (The name shadows the
            Python 2 builtin ``file``; kept for backward
            compatibility with existing callers.)

    Returns:
        dict mapping each non-stop-word to its occurrence count.
    """
    d = {}
    # `with` guarantees the handle is closed even if processing raises;
    # the original open()/close() pair leaked the handle on error.
    with open(file, 'r') as fin:
        for line in fin:
            process_line(line, d)
    return d
def _print_top_words(title, freqs, k=10):
    """Print the k highest-frequency words in freqs under a heading."""
    print('\n' + title)
    # Sort words by descending count and keep exactly k of them.
    # (The original loop used `count < 11`, printing 11 words under a
    # "Top 10" heading.)
    for word in sorted(freqs, key=freqs.get, reverse=True)[:k]:
        print(word, freqs[word])


# Module-level names f1..f3 / h1..h3 are preserved from the original
# script for backward compatibility.
f1 = 'moby_dick.txt'
h1 = process_file(f1)
f2 = 'tom_sawyer.txt'
h2 = process_file(f2)
f3 = 'tale_of_two_cities.txt'
h3 = process_file(f3)

_print_top_words('Top 10 Words in Moby Dick', h1)
_print_top_words('Top 10 Words in Tom Sawyer', h2)
_print_top_words('Top 10 Words in Tale of Two Cities', h3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment