Created
March 28, 2023 21:56
-
-
Save douglasgoodwin/2907c845ea0a679131e3d4ff838d8f7d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# EX09 _ textual analysis of Jane Austen's book "Emma"
# questions
# + How many words are in the novel?
# + What's the longest word? The shortest word? The average word length?
# + What are the top 100 words by count?
from collections import Counter

# get the remote file
import urllib.request

url = "https://gist.githubusercontent.com/douglasgoodwin/08c221e42f3b62173a476e019ac7a949/raw/8bb9852fa2ef81367486c9155d6686b184fceeb0/jane-austen_emma.txt"
urllib.request.urlretrieve(url, 'jane-austen_emma.txt')

# Read the whole novel into memory; the context manager closes the file even
# if reading raises.
with open('jane-austen_emma.txt', encoding="utf8") as file:
    data = file.read()

# Punctuation deleted from every token so that e.g. "emma," and "emma" are
# counted as the same word. The apostrophe is deliberately left alone so that
# possessives and contractions stay intact.
_PUNCTUATION = str.maketrans("", "", ".,:;\"!?“”‘*()")

# To eliminate duplicates, compare case-insensitively (lowercase everything)
# and strip punctuation delimiters.
# split() with no argument splits on ANY run of whitespace; split(' ') would
# leave words on either side of a line break glued together as one token.
words = []
for word in data.lower().split():
    word = word.translate(_PUNCTUATION)
    # A token made only of punctuation is empty after stripping; skip it so
    # the empty string is never counted as a word.
    if word:
        words.append(word)
def longest_word(l):
    """Return the length of the longest string in *l* together with that string.

    Args:
        l: Iterable of strings to search.

    Returns:
        A ``(length, word)`` tuple. When several strings share the maximum
        length, the first one encountered wins. An empty iterable yields
        ``(0, "")``.
    """
    # max() keeps the first maximal element, which matches a strict ">" scan;
    # default="" covers the empty-input case instead of raising ValueError.
    wd = max(l, key=len, default="")
    return len(wd), wd
def setup():
    """Print summary statistics for ``words`` and chart the 100 most common.

    Reads the module-level ``words`` list, prints the total word count, the
    number of distinct words and the longest word, then draws one labelled
    bar per word for the 100 most frequent words, adds a rotated caption and
    saves the canvas to ``analysis.png``.

    NOTE(review): the drawing functions used here (``size``, ``text``,
    ``rect``, ``push_matrix``, ...) are not defined in this file; presumably
    they come from a Processing-style environment such as py5 -- confirm.
    """
    # Canvas and drawing state.
    size(700, 1300)
    background(255)
    fill(0)
    stroke(255)
    text_align(RIGHT, CENTER)

    # Console summary.
    number_of_words = len(words)
    print(f"Emma contains {number_of_words} words")
    unique = set(words)
    print(f"Emma has {len(unique)} unique words.")
    w = max(words, key=len)
    print(f"The longest word is: {w} at length {len(w)}")

    # One row per word, 12 units apart: the label is anchored at x=70 and the
    # bar starts at x=80 with a width of one unit per ten occurrences.
    ranking = Counter(words).most_common(100)
    for row, (token, count) in enumerate(ranking):
        bar_top = row * 12 + 20
        text(token.capitalize(), 70, bar_top + 3)
        rect(80, bar_top, count / 10, 10)

    # Caption: move the origin, rotate by 90 degrees and draw larger text.
    push_matrix()
    translate(500, 300)
    rotate(radians(90))
    text_size(21)
    text_align(LEFT, CENTER)
    caption = f"Word frequencies for Jane Austen's novel \"Emma\"\nEmma contains {number_of_words} words\nand {len(unique)} unique words\n"
    text(caption, 0, 0)

    save("analysis.png")
def draw():
    """Draw callback; it only calls ``no_loop()``.

    NOTE(review): all rendering happens in ``setup``; ``no_loop()`` is
    presumably the Processing/py5 call that stops repeated ``draw`` calls --
    confirm against the sketch environment.
    """
    no_loop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment