Skip to content

Instantly share code, notes, and snippets.

@douglasgoodwin
Created March 28, 2023 21:56
Show Gist options
  • Save douglasgoodwin/2907c845ea0a679131e3d4ff838d8f7d to your computer and use it in GitHub Desktop.
Save douglasgoodwin/2907c845ea0a679131e3d4ff838d8f7d to your computer and use it in GitHub Desktop.
# EX09 _ textual analysis of Jane Austen's book "Emma"
# questions
# + How many words are in the novel?
# + what's the longest word? the shortest word? the average word length?
# + what are the top 100 words by count?
from collections import Counter
# get the remote file
import urllib.request
# Download the novel's plain text next to the script, then read it back in.
url = "https://gist.githubusercontent.com/douglasgoodwin/08c221e42f3b62173a476e019ac7a949/raw/8bb9852fa2ef81367486c9155d6686b184fceeb0/jane-austen_emma.txt"
urllib.request.urlretrieve(url, 'jane-austen_emma.txt')
# The context manager closes the file as soon as it has been read, even if
# reading raises.
with open('jane-austen_emma.txt', encoding="utf8") as file:
    data = file.read()

# Punctuation deleted from every token so that e.g. "emma," and "emma" count
# as the same word. Apostrophes are deliberately kept: deleting them would
# change contractions and possessives into different words.
_PUNCTUATION = str.maketrans("", "", ".,:;\"!?“”‘*")

# To eliminate duplicates, lower-case the text and strip punctuation.
# split() with no argument splits on any run of whitespace, so words on
# either side of a line break are separated and repeated spaces do not
# produce empty "words".
words = [
    token
    for token in (raw.translate(_PUNCTUATION) for raw in data.lower().split())
    if token  # skip tokens that consisted of punctuation only
]
def longest_word(l):
    """Return ``(length, word)`` for the longest item in *l*.

    Ties go to the item that appears first. An empty *l* yields ``(0, "")``.
    """
    # max() returns the first maximal item; default="" covers empty input.
    wd = max(l, key=len, default="")
    return len(wd), wd
def setup():
    """Print word statistics for the novel and draw the top-100 bar chart.

    Reads the module-level ``words`` list, prints the total word count, the
    unique word count and the longest word, draws one labelled bar for each
    of the 100 most frequent words, adds a rotated caption and saves the
    result to ``analysis.png``.

    NOTE(review): presumably invoked once by the py5 runtime at sketch
    start -- confirm against how the sketch is launched.
    """
    size(700,1300)
    background(255)
    fill(0)
    stroke(255)
    text_align(RIGHT,CENTER)

    number_of_words = len(words)
    print(f"Emma contains {number_of_words} words")
    unique = set(words)
    print(f"Emma has {len(unique)} unique words.")

    # default="" avoids a ValueError from max() when no words were loaded.
    longest = max(words, key=len, default="")
    print(f"The longest word is: {longest} at length {len(longest)}")

    word_counter = Counter(words)
    most_frequent = word_counter.most_common(100)
    # One 12-unit row per word: the label is right-aligned at x=70 and the
    # bar starts at x=80 with a width of count / 10.
    for row, (word, count) in enumerate(most_frequent):
        top = row * 12
        text(word.capitalize(), 70, top + 23)
        rect(80, top + 20, count / 10, 10)

    # Draw the caption rotated by 90 degrees around (500, 300). The matrix
    # is restored afterwards so the transform does not leak out of this
    # section.
    push_matrix()
    translate(500,300)
    rotate(radians(90))
    text_size(21)
    text_align(LEFT,CENTER)
    text(f"Word frequencies for Jane Austen's novel \"Emma\"\nEmma contains {number_of_words} words\nand {len(unique)} unique words\n",0,0)
    pop_matrix()

    save("analysis.png")
def draw():
    """Turn off looping; all drawing for this sketch happens in setup()."""
    no_loop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment