### Text analysis of "The Story of Doctor Dolittle" (Project Gutenberg #501)
import string
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("stopwords")
nltk.download("vader_lexicon")
nltk.download("wordnet")  # required by WordNetLemmatizer below
### Pull the raw book text from Project Gutenberg
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
## Tokenize on word characters only; `text` is the cleaned, lowercased
## string built by the scrape-and-clean block at the bottom of this gist
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
tokens = [i.lower() for i in tokens]
## Longest word in the text (restore the commented-out tail to also print its length)
print("Longest word in text: " + max(tokens, key=len))  # + " is " + str(len(max(tokens, key=len))) + " characters long"
## Drop the run-together token "cutterigsloop" so the longest real word remains
tokens = [y for y in tokens if y != "cutterigsloop"]
def Flesch_Kincaid(text):
    ## Flesch-Kincaid grade level: 0.39 * (words/sentence) + 11.8 * (syllables/word) - 15.59
    sentences = text.split('.')
    avg_sentence_len = sum(len(x.split()) for x in sentences) / len(sentences)
    ## Crude syllable estimate: count every vowel character in the text
    syllables = sum(1 for ch in text if ch in "aeiouy")
    word_count = len(text.split(' '))
    mean_syllables_per_word = syllables / float(word_count)
    return (0.39 * avg_sentence_len) + (11.8 * mean_syllables_per_word) - 15.59
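## A quick usage sketch, assuming `text` already holds the cleaned book
## string produced by the scrape-and-clean block at the bottom of this gist:
print("Flesch-Kincaid grade level: " + str(round(Flesch_Kincaid(text), 2)))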
def text_processing(input_text):
    ## Tokenize, lemmatize, then drop English stopwords and lemmatizer artifacts
    tokens = tokenizer.tokenize(input_text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(i) for i in tokens]
    stops = set(stopwords.words('english'))
    values = [i for i in tokens if i not in stops]
    ## the default (noun) lemmatizer turns "was" into "wa" and "us" into "u"
    weird = ["wa", "u"]
    values = [i for i in values if i not in weird]
    return values
values = text_processing(text)
## Strip punctuation from the raw string before the word-level replacements below
text = text.translate(str.maketrans('', '', string.punctuation))
## Map the animal characters' names to their species
animals = {"polynesia": "parrot", "gubgub": "pig", "cheechee": "monkey",
           "tootoo": "owl", "pushmipullyu": "two-headed unicorn",
           "whitey": "mouse", "jip": "dog", "dabdab": "duck",
           "toggle": "horse", "cheapside": "sparrow"}
text = ' '.join([animals.get(i, i) for i in text.split()])
with open("../data/Animal_Names.txt", "r") as f:
animals = f.readlines()
animals = [x.split("\n")[0] for x in animals]
Animals_in_Text = set(animals) & set(tokens)
print("Doctor Dolittle interacts with " + str(len(Animals_in_Text)) + " different kinds of animals")
## Keep only the tokens that name animals
values = [i for i in values if i in Animals_in_Text]
freq = nltk.FreqDist(values)
from collections import Counter
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
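## A minimal word-cloud sketch from the animal frequencies above; the size
## and styling here are assumptions, not part of the original gist.
wc = WordCloud(width=800, height=400, background_color="white")
wc.generate_from_frequencies(freq)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()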
## Split into chapters
Chapters = text.split("chapter")
count = 1
def Chapter_Sentiment(Chapter_Text):
    def Average(lst):
        return float(sum(lst)) / len(lst)
    ## Score each sentence with VADER and return the mean compound score
    sid = SentimentIntensityAnalyzer()
    sentences = Chapter_Text.split('.')
    sentiment_scores = []
    for item in sentences:
        sentiment = sid.polarity_scores(item)
        sentiment_value = sentiment['compound']
        sentiment_scores.append(sentiment_value)
    return Average(sentiment_scores)
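## A usage sketch (assumed, not in the original gist): score each chapter
## and plot the trajectory; Chapters[1:] skips the text before the first split.
chapter_scores = [Chapter_Sentiment(c) for c in Chapters[1:]]
plt.plot(range(1, len(chapter_scores) + 1), chapter_scores)
plt.xlabel("Chapter")
plt.ylabel("Mean VADER compound score")
plt.show()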
with open("../data/War_Terms.txt", "r") as f:
war_terms = f.readlines()
war_terms = [x.split("\n")[0] for x in war_terms]
War_Terms_in_Text = set(war_terms) & set(tokens)
print('Number of war terms used in the story: '+ str(len(War_Terms_in_Text)))
values = [i for i in tokens if i in War_Terms_in_Text]
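## A small sketch mirroring the animal-frequency step: count how often each
## war term appears (the top-10 cutoff here is an assumption).
war_freq = nltk.FreqDist(values)
print(war_freq.most_common(10))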
## Print every sentence that mentions "death"
sentences = text.split('.')
for item in sentences:
    if "death" in item:
        print(item)
### Scrape and clean the book text (this produces the `text` used above)
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
text = soup.find_all(text=True)
text = str(text)
## Normalize whitespace and escape sequences, drop underscores, lowercase
text = text.replace("\n", " ").replace("\r", " ").replace("\\r", " ").replace("\\n", " ").replace("_", "").lower()
## Keep only the story body, between the first chapter heading and the end marker
text = text.split("the first chapter")[1].split("illustration: the end")[0]
with open("../data/Doctor_Dolittle.txt", "w") as f:
    f.write(text)