# douglasgoodwin/janeausten.py

## janeausten.py
# EX09 _ textual analysis of Jane Austen's book "Emma"
# questions
# + How many words are in the novel?
# + what's the longest word? the shortest word? the average word length?
# + what are the top 100 words by count?

from collections import Counter

# get the remote file
import urllib.request
url = "https://gist.githubusercontent.com/douglasgoodwin/08c221e42f3b62173a476e019ac7a949/raw/8bb9852fa2ef81367486c9155d6686b184fceeb0/jane-austen_emma.txt"
urllib.request.urlretrieve(url, 'jane-austen_emma.txt')

# Read the whole novel into memory; the context manager closes the handle
# even if reading or decoding raises.
with open('jane-austen_emma.txt', encoding="utf8") as file:
    data = file.read()

# Substrings removed from every token so that e.g. "emma," and "emma" are
# counted as the same word.
# NOTE(review): the two mojibake literals are kept exactly as this script had
# them; "\u201c" and "\u2018" are the curly opening quotes those sequences
# presumably stood for, which is what a UTF-8 decoded text would contain.
_STRIP_MARKS = (".", ",", ":", ";", "\"", "!", "â€œ", "â€˜", "\u201c", "\u2018", "*")

words = []
# Lower-case first so differently capitalised spellings collapse into one
# entry.  split() without an argument breaks on any run of whitespace, so
# words on either side of a line break are not fused together and repeated
# spaces do not produce empty tokens.
for word in data.lower().split():
    for mark in _STRIP_MARKS:
        word = word.replace(mark, "")
    if word:  # a token made only of punctuation is not a word
        words.append(word)

def longest_word(l):
    """Return ``(length, word)`` for the longest item in ``l``.

    When several items share the greatest length, the earliest one wins.
    If ``l`` is empty, or holds only zero-length items, the result is
    ``(0, "")``.
    """
    # max() keeps the first of several equally long candidates.
    candidate = max(l, key=len, default="")
    size = len(candidate)
    if size == 0:
        return 0, ""
    return size, candidate


def setup():
    """Print summary statistics for ``words`` and draw the frequency chart.

    Prints the total word count, the number of distinct words and the
    longest word, then draws one labelled bar for each of the 100 most
    common words, adds a rotated caption and calls ``save("analysis.png")``.
    """
    # Canvas and drawing state.
    size(700,1300)
    background(255)
    fill(0)
    stroke(255)
    text_align(RIGHT,CENTER)

    # Console report.
    number_of_words = len(words)
    print(f"Emma contains {number_of_words} words")

    unique = set(words)
    print(f"Emma has {len(unique)} unique words.")

    w = max(words, key=len)
    print(f"The longest word is: {w} at length {len(w)}")

    # One row per word: the label is right-aligned at x=70, the bar starts
    # at x=80 and is a tenth of the word's count wide.
    for row, (term, count) in enumerate(Counter(words).most_common(100)):
        top = row * 12
        text(term.capitalize(), 70, top + 23)
        rect(80, top + 20, count / 10, 10)

    # Caption, drawn after translating and rotating by 90 degrees.
    caption = f"Word frequencies for Jane Austen's novel \"Emma\"\nEmma contains {number_of_words} words\nand {len(unique)} unique words\n"
    push_matrix()
    translate(500,300)
    rotate(radians(90))
    text_size(21)
    text_align(LEFT,CENTER)
    text(caption, 0, 0)

    save("analysis.png")

def draw():
    """Call ``no_loop()``; all of this sketch's drawing is done in ``setup()``."""
    no_loop()
	# EX09 _ textual analysis of Jane Austen's book "Emma"
	# questions
	# + How many words are in the novel?
	# + what's the longest word? the shortest word? the average word length?
	# + what are the top 100 words by count?

	from collections import Counter

	# get the remote file
	import urllib.request
	url = "https://gist.githubusercontent.com/douglasgoodwin/08c221e42f3b62173a476e019ac7a949/raw/8bb9852fa2ef81367486c9155d6686b184fceeb0/jane-austen_emma.txt"
	urllib.request.urlretrieve(url, 'jane-austen_emma.txt')

	# Read the whole novel into memory; the context manager closes the handle
	# even if reading or decoding raises.
	with open('jane-austen_emma.txt', encoding="utf8") as file:
		data = file.read()

	# Substrings removed from every token so that e.g. "emma," and "emma" are
	# counted as the same word.
	# NOTE(review): the two mojibake literals are kept exactly as this script
	# had them; "\u201c" and "\u2018" are the curly opening quotes those
	# sequences presumably stood for, which is what a UTF-8 decoded text would
	# contain.
	_STRIP_MARKS = (".", ",", ":", ";", "\"", "!", "â€œ", "â€˜", "\u201c", "\u2018", "*")

	words = []
	# Lower-case first so differently capitalised spellings collapse into one
	# entry.  split() without an argument breaks on any run of whitespace, so
	# words on either side of a line break are not fused together and repeated
	# spaces do not produce empty tokens.
	for word in data.lower().split():
		for mark in _STRIP_MARKS:
			word = word.replace(mark, "")
		if word:  # a token made only of punctuation is not a word
			words.append(word)

	def longest_word(l):
		"""Return ``(length, word)`` for the longest item in ``l``.

		When several items share the greatest length, the earliest one wins.
		If ``l`` is empty, or holds only zero-length items, the result is
		``(0, "")``.
		"""
		# max() keeps the first of several equally long candidates.
		candidate = max(l, key=len, default="")
		size = len(candidate)
		if size == 0:
			return 0, ""
		return size, candidate


	def setup():
		"""Print summary statistics for ``words`` and draw the frequency chart.

		Prints the total word count, the number of distinct words and the
		longest word, then draws one labelled bar for each of the 100 most
		common words, adds a rotated caption and calls ``save("analysis.png")``.
		"""
		# Canvas and drawing state.
		size(700,1300)
		background(255)
		fill(0)
		stroke(255)
		text_align(RIGHT,CENTER)

		# Console report.
		number_of_words = len(words)
		print(f"Emma contains {number_of_words} words")

		unique = set(words)
		print(f"Emma has {len(unique)} unique words.")

		w = max(words, key=len)
		print(f"The longest word is: {w} at length {len(w)}")

		# One row per word: the label is right-aligned at x=70, the bar starts
		# at x=80 and is a tenth of the word's count wide.
		for row, (term, count) in enumerate(Counter(words).most_common(100)):
			top = row * 12
			text(term.capitalize(), 70, top + 23)
			rect(80, top + 20, count / 10, 10)

		# Caption, drawn after translating and rotating by 90 degrees.
		caption = f"Word frequencies for Jane Austen's novel \"Emma\"\nEmma contains {number_of_words} words\nand {len(unique)} unique words\n"
		push_matrix()
		translate(500,300)
		rotate(radians(90))
		text_size(21)
		text_align(LEFT,CENTER)
		text(caption, 0, 0)

		save("analysis.png")

	def draw():
		"""Call ``no_loop()`` and do nothing else."""
		no_loop()