Created
July 28, 2020 18:38
-
-
Save a-poor/bc643fe956b1c004fc89efcb3214ebf5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import Counter | |
import urllib.request | |
def get_book(url):
    """Fetch the document at ``url`` and return its text in lowercase.

    The response body is decoded with the default codec (UTF-8); any bytes
    that cannot be decoded are dropped instead of raising an error.
    """
    with urllib.request.urlopen(url) as page:
        raw_bytes = page.read()
    decoded = raw_bytes.decode(errors='ignore')
    return decoded.lower()
def split_words(book):
    """Extract the words from ``book`` with a regular expression.

    A word is a maximal run of ASCII letters (``A-Z``, ``a-z``); every other
    character (digits, punctuation, whitespace, accented letters) acts as a
    separator, so ``"don't"`` yields ``"don"`` and ``"t"``.

    Args:
        book: The text to split.

    Returns:
        A list of words in order of appearance. The list never contains
        empty strings; it is empty when ``book`` has no letters at all.
    """
    # Collect the letter runs directly. Splitting on the separators instead
    # would yield "" whenever the text starts or ends with a non-letter (and
    # [""] for empty text), and that empty string would be counted as a word.
    return re.findall(r"[A-Za-z]+", book)
def count_words(words):
    """Tally how often each word occurs and return the tally as a Counter"""
    tally = Counter()
    tally.update(words)
    return tally
def read_books(urls, top_n=10):
    """For each url in urls, load the book and count the words.

    Args:
        urls: Mapping of book title to the URL the book's text is loaded
            from.
        top_n: Number of most frequent words to keep for each book.
            Defaults to 10; ``None`` keeps every word.

    Returns:
        A dict mapping each title to a list of ``(word, count)`` pairs,
        ordered from the most common word to the least.
    """
    # Create a place to store word counts
    word_counts = {}
    # For each book: load it, count words, store the counts
    for title, path in urls.items():
        book = get_book(path)
        words = split_words(book)
        counts = count_words(words)
        # Let the Counter pick the top entries itself rather than sorting
        # every distinct word and slicing the result afterwards.
        word_counts[title] = counts.most_common(top_n)
    return word_counts
def save_results(results, path):
    """Save the results to a text file.

    Each book gets a ``BOOK: <title>`` header line followed by one line per
    word: eight spaces of indentation, the word left-aligned in a 10-character
    column, then the count right-aligned in a 6-character column.

    Args:
        results: Mapping of book title to an iterable of ``(word, count)``
            pairs, where ``word`` is a string and ``count`` is an integer.
        path: Location of the file to write; an existing file is overwritten.
    """
    # Format everything before touching the file so that a malformed entry
    # does not leave a half-written report behind.
    lines = []
    for book, words in results.items():
        lines.append(f"BOOK: {book}\n")
        lines.extend(f"{' '*8}{word:10s}{count:6d}\n" for word, count in words)
    # Fix the encoding explicitly: the platform default may be unable to
    # represent non-ASCII titles and differs from machine to machine.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)
# Plain-text book sources on Project Gutenberg, keyed by a short title
urls = {
    "pride-and-prejudice": "https://www.gutenberg.org/files/1342/1342-0.txt",
    "alice-in-wonderland": "https://www.gutenberg.org/files/11/11-0.txt",
    "sherlock-holmes": "https://www.gutenberg.org/files/1661/1661-0.txt",
    "moby-dick": "https://www.gutenberg.org/files/2701/2701-0.txt",
    "count-of-monte-cristo": "https://www.gutenberg.org/files/1184/1184-0.txt",
}
# File that receives the report
output_file = "my-results.txt"

# Fetch every book and tally its words
results = read_books(urls=urls)

# Write the tallies out as a text report
save_results(results=results, path=output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment