Skip to content

Instantly share code, notes, and snippets.

@a-poor
Created July 28, 2020 18:38
Show Gist options
  • Save a-poor/bc643fe956b1c004fc89efcb3214ebf5 to your computer and use it in GitHub Desktop.
Save a-poor/bc643fe956b1c004fc89efcb3214ebf5 to your computer and use it in GitHub Desktop.
import re
from collections import Counter
import urllib.request
def get_book(url):
    """Fetch the document at *url* and return its text in lower case.

    The response body is decoded with ``bytes.decode``'s default codec
    (UTF-8); bytes that cannot be decoded are silently dropped rather
    than raising an error.
    """
    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()
    # errors='ignore' discards undecodable bytes instead of failing.
    decoded = raw_bytes.decode(errors='ignore')
    return decoded.lower()
def split_words(book):
    """Return the words in *book* as a list of strings, in order.

    A word is a maximal run of ASCII letters (``A-Z`` / ``a-z``); every
    other character acts as a separator and is discarded.

    Matching the letter runs directly (instead of splitting on the
    separators) avoids the empty strings that ``re.split`` produces when
    the text starts or ends with a non-letter, which would otherwise be
    counted as a "word". An empty or letter-free text gives ``[]``.
    """
    return re.findall(r"[A-Za-z]+", book)
def count_words(words):
    """Tally how many times each item of *words* occurs.

    Returns a ``collections.Counter`` mapping each distinct word to its
    number of occurrences.
    """
    tally = Counter()
    tally.update(words)
    return tally
def read_books(urls, top_n=10):
    """Load each book in *urls* and find its most frequent words.

    Args:
        urls: Mapping of book title -> location passed to ``get_book``.
        top_n: How many of the most frequent words to keep per book.
            Defaults to 10; ``None`` keeps every word.

    Returns:
        A dict mapping each title to a list of ``(word, count)`` pairs,
        ordered from most to least frequent.
    """
    # Create a place to store word counts
    word_counts = {}
    # For each book: load it, count words, store the counts
    for title, path in urls.items():
        book = get_book(path)
        words = split_words(book)
        counts = count_words(words)
        # most_common(n) selects the top n directly rather than sorting
        # every distinct word and slicing the result afterwards.
        word_counts[title] = counts.most_common(top_n)
    return word_counts
def save_results(results, path):
    """Write the per-book word counts to a text file at *path*.

    Args:
        results: Mapping of book title -> iterable of ``(word, count)``
            pairs, where *word* is a string and *count* an integer.
        path: Destination file; it is created or overwritten.

    Each book produces a ``BOOK: <title>`` header line followed by one
    line per word: eight spaces of indent, the word left-aligned in a
    10-character column, and the count right-aligned in 6 characters.
    """
    # Explicit encoding so the output (e.g. a non-ASCII book title) does
    # not depend on the platform's default text encoding.
    with open(path, 'w', encoding='utf-8') as f:
        for book, words in results.items():
            f.write(f"BOOK: {book}\n")
            for word, count in words:
                f.write(f"{' '*8}{word:10s}{count:6d}\n")
# Path of the text report written at the end of the run.
output_file = 'my-results.txt'

# Book title -> URL of its plain-text file on gutenberg.org.
urls = {
    "pride-and-prejudice": "https://www.gutenberg.org/files/1342/1342-0.txt",
    "alice-in-wonderland": "https://www.gutenberg.org/files/11/11-0.txt",
    "sherlock-holmes": "https://www.gutenberg.org/files/1661/1661-0.txt",
    "moby-dick": "https://www.gutenberg.org/files/2701/2701-0.txt",
    "count-of-monte-cristo": "https://www.gutenberg.org/files/1184/1184-0.txt",
}

# Fetch every book and collect its most frequent words.
results = read_books(urls)

# Persist the collected counts to the report file.
save_results(results, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment