Created
July 2, 2017 05:27
-
-
Save jakelosh/32f0b6bff3061e8562990bca984033b6 to your computer and use it in GitHub Desktop.
This is a solution to exercise 2 of chapter 13 of Think Python (http://greenteapress.com/thinkpython/html/index.html). It opens plain-text files of several books and prints the ten most frequently used words in each book, excluding common English stop words (obtained from NLTK's stop-word corpus).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chapter 13 Exercise 2
"""
Go to Project Gutenberg (http://gutenberg.org) and download
your favorite out-of-copyright book in plain text format.
Modify your program from the previous exercise to read the
book you downloaded, skip over the header information at the
beginning of the file, and process the rest of the words as
before.
Then modify the program to count the total number of words
in the book, and the number of times each word is used.
Print the number of different words used in the book.
Compare different books by different authors, written in
different eras. Which author uses the most extensive
vocabulary?
"""
import string

from nltk.corpus import stopwords

# English stop words ("the", "and", "of", ...) excluded from the counts.
COMMON_WORDS = set(stopwords.words('english'))
def process_line(line, dict, common_words=None):
    """
    Add the words of one line of text to a frequency dictionary.

    Dashes and possessive "'s" are treated as separators, surrounding
    punctuation/whitespace is stripped, words are lower-cased, and
    stop words (and empty tokens) are skipped.

    Parameters:
        line: one line of raw text.
        dict: mapping of word -> count, updated in place.  (The name
            shadows the builtin ``dict``; kept for backward
            compatibility with existing callers.)
        common_words: optional set of words to exclude; defaults to
            the module-level COMMON_WORDS stop-word set.
    """
    if common_words is None:
        common_words = COMMON_WORDS
    # Treat dashes and possessive "'s" as word separators.
    line = line.replace('-', ' ')
    line = line.replace("'s", ' ')
    for word in line.split():
        word = word.strip(string.punctuation + string.whitespace).lower()
        # `word and ...` skips tokens that were pure punctuation
        # (e.g. "..."), which the old code counted under the key ''.
        if word and word not in common_words:
            dict[word] = dict.get(word, 0) + 1
def process_file(file):
    """
    Build a word-frequency dictionary from a text file.

    Parameters:
        file: path to a plain-text file.  (The name shadows the
            Python 2 builtin ``file``; kept for backward
            compatibility with existing callers.)

    Returns:
        dict mapping each non-stop-word to its occurrence count.
    """
    d = {}
    # `with` guarantees the handle is closed even if processing raises;
    # the original open()/close() pair leaked the handle on error.
    with open(file, 'r') as fin:
        for line in fin:
            process_line(line, d)
    return d
def _print_top_words(title, freqs, k=10):
    """Print the k highest-frequency words in freqs under a heading."""
    print('\n' + title)
    # Sort words by descending count and keep exactly k of them.
    # (The original loop used `count < 11`, printing 11 words under a
    # "Top 10" heading.)
    for word in sorted(freqs, key=freqs.get, reverse=True)[:k]:
        print(word, freqs[word])


# Module-level names f1..f3 / h1..h3 are preserved from the original
# script for backward compatibility.
f1 = 'moby_dick.txt'
h1 = process_file(f1)
f2 = 'tom_sawyer.txt'
h2 = process_file(f2)
f3 = 'tale_of_two_cities.txt'
h3 = process_file(f3)

_print_top_words('Top 10 Words in Moby Dick', h1)
_print_top_words('Top 10 Words in Tom Sawyer', h2)
_print_top_words('Top 10 Words in Tale of Two Cities', h3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment