Skip to content

Instantly share code, notes, and snippets.

@jeremysinger
Created November 17, 2017 14:20
Show Gist options
  • Save jeremysinger/193eb8bc7258ce85fd545b7aa067d7ca to your computer and use it in GitHub Desktop.
Save jeremysinger/193eb8bc7258ce85fd545b7aa067d7ca to your computer and use it in GitHub Desktop.
import string
import urllib.request
# Download the text from the internet.
url = 'https://sherlock-holm.es/stories/plain-text/cano.txt'
# The context manager closes the HTTP response (and its socket) even if
# read() raises, instead of leaving it open until garbage collection.
with urllib.request.urlopen(url) as response:
    data = response.read()  # read the raw bytes
text = data.decode('utf-8')  # turn the raw data into a Python string
words = text.split()  # split on any run of whitespace
# Alternatively, read a local copy (if you have already downloaded it):
#with open('sherlock.txt', encoding='utf-8') as f:
#    words = f.read().split()
# Dictionary where key is a word, value is the integer count.
wordcount = {}
for word in words:
    # Normalise case, then strip punctuation from both ends only, so
    # characters inside a word (apostrophes, hyphens) are preserved.
    word = word.lower().strip(string.punctuation)
    # Tokens made up solely of punctuation (e.g. "--") reduce to the empty
    # string; they are not words, so do not count them.
    if not word:
        continue
    # Add the word to the counts dictionary.
    wordcount[word] = 1 + wordcount.get(word, 0)
# elementary should appear 7 or 8 times, according to google
# .get() prints 0 instead of raising KeyError if the word never occurs.
print(wordcount.get('elementary', 0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment