Created
November 17, 2017 14:20
-
-
Save jeremysinger/193eb8bc7258ce85fd545b7aa067d7ca to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Count how often each word occurs in the Sherlock Holmes canon.

Run with no arguments to download the plain-text canon, or pass the path
of an already-downloaded copy as the first argument to read it locally.
The script prints the number of occurrences of the word 'elementary'.
"""

import string
import sys
import urllib.request
from collections import Counter

# Plain-text edition of the stories that is downloaded by default.
CANON_URL = 'https://sherlock-holm.es/stories/plain-text/cano.txt'

# Seconds to wait on a blocking network operation before giving up.
DOWNLOAD_TIMEOUT = 30


def download_text(url, timeout=DOWNLOAD_TIMEOUT):
    """Fetch *url* and return its body decoded as UTF-8 text.

    The response is used as a context manager so the connection is closed
    even if reading or decoding fails.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        data = response.read()  # read the raw bytes
    return data.decode('utf-8')  # turn the raw data into a Python string


def read_text(path):
    """Return the contents of the local text file at *path* (UTF-8)."""
    with open(path, encoding='utf-8') as f:
        return f.read()


def normalize_word(word):
    """Lower-case *word* and strip punctuation from both of its ends.

    Only leading and trailing characters found in ``string.punctuation``
    are removed; punctuation inside the word (e.g. the apostrophe in
    "don't") is kept. A token made only of punctuation becomes ''.
    """
    return word.lower().strip(string.punctuation)


def count_words(words):
    """Return a Counter mapping each normalized word to its count.

    Tokens that are empty after normalization (pure punctuation such as
    '--') are skipped so they are not tallied under the '' key.
    """
    normalized = (normalize_word(word) for word in words)
    return Counter(word for word in normalized if word)


def main(argv=None):
    """Load the text, count its words and print the 'elementary' count.

    *argv* is the list of command-line arguments (defaults to
    ``sys.argv[1:]``). If it contains a path, the text is read from that
    file; otherwise it is downloaded from ``CANON_URL``.
    """
    args = sys.argv[1:] if argv is None else argv
    text = read_text(args[0]) if args else download_text(CANON_URL)
    wordcount = count_words(text.split())
    # elementary should appear 7 or 8 times, according to google
    # (a Counter returns 0 for a missing word instead of raising KeyError)
    print(wordcount['elementary'])


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment