### Text analysis of "The Story of Doctor Dolittle" (Project Gutenberg #501)
import string
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("stopwords")
nltk.download("vader_lexicon")
nltk.download("wordnet")  # required by WordNetLemmatizer below
### Pull the raw book text from Project Gutenberg
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
## Tokenize on word characters only; `text` is the cleaned, lowercased
## string built by the scrape-and-clean block at the bottom of this gist
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
tokens = [i.lower() for i in tokens]
## Longest word in the text (restore the commented-out tail to also print its length)
print("Longest word in text: " + max(tokens, key=len))  # + " is " + str(len(max(tokens, key=len))) + " characters long"
## Drop the run-together token "cutterigsloop" so the longest real word remains
tokens = [y for y in tokens if y != "cutterigsloop"]
def Flesch_Kincaid(text):
    ## Flesch-Kincaid grade level: 0.39 * (words/sentence) + 11.8 * (syllables/word) - 15.59
    sentences = text.split('.')
    avg_sentence_len = sum(len(x.split()) for x in sentences) / len(sentences)
    ## Crude syllable estimate: count every vowel character in the text
    syllables = sum(1 for ch in text if ch in "aeiouy")
    word_count = len(text.split(' '))
    mean_syllables_per_word = syllables / float(word_count)
    return (0.39 * avg_sentence_len) + (11.8 * mean_syllables_per_word) - 15.59
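## A quick usage sketch, assuming `text` already holds the cleaned book
## string produced by the scrape-and-clean block at the bottom of this gist:
print("Flesch-Kincaid grade level: " + str(round(Flesch_Kincaid(text), 2)))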
def text_processing(input_text):
    ## Tokenize, lemmatize, then drop English stopwords and lemmatizer artifacts
    tokens = tokenizer.tokenize(input_text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(i) for i in tokens]
    stops = set(stopwords.words('english'))
    values = [i for i in tokens if i not in stops]
    ## the default (noun) lemmatizer turns "was" into "wa" and "us" into "u"
    weird = ["wa", "u"]
    values = [i for i in values if i not in weird]
    return values
values = text_processing(text)
## Strip punctuation from the raw string before the word-level replacements below
text = text.translate(str.maketrans('', '', string.punctuation))
## Map the animal characters' names to their species
animals = {"polynesia": "parrot", "gubgub": "pig", "cheechee": "monkey",
           "tootoo": "owl", "pushmipullyu": "two-headed unicorn",
           "whitey": "mouse", "jip": "dog", "dabdab": "duck",
           "toggle": "horse", "cheapside": "sparrow"}
text = ' '.join([animals.get(i, i) for i in text.split()])
with open("../data/Animal_Names.txt", "r") as f:
animals = f.readlines()
animals = [x.split("\n")[0] for x in animals]
Animals_in_Text = set(animals) & set(tokens)
print("Doctor Dolittle interacts with " + str(len(Animals_in_Text)) + " different kinds of animals")
## Keep only the tokens that name animals
values = [i for i in values if i in Animals_in_Text]
freq = nltk.FreqDist(values)
from collections import Counter
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
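## A minimal word-cloud sketch from the animal frequencies above; the size
## and styling here are assumptions, not part of the original gist.
wc = WordCloud(width=800, height=400, background_color="white")
wc.generate_from_frequencies(freq)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()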
## Split into chapters
Chapters = text.split("chapter")
count = 1
def Chapter_Sentiment(Chapter_Text):
    def Average(lst):
        return float(sum(lst)) / len(lst)
    ## Score each sentence with VADER and return the mean compound score
    sid = SentimentIntensityAnalyzer()
    sentences = Chapter_Text.split('.')
    sentiment_scores = []
    for item in sentences:
        sentiment = sid.polarity_scores(item)
        sentiment_value = sentiment['compound']
        sentiment_scores.append(sentiment_value)
    return Average(sentiment_scores)
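## A usage sketch (assumed, not in the original gist): score each chapter
## and plot the trajectory; Chapters[1:] skips the text before the first split.
chapter_scores = [Chapter_Sentiment(c) for c in Chapters[1:]]
plt.plot(range(1, len(chapter_scores) + 1), chapter_scores)
plt.xlabel("Chapter")
plt.ylabel("Mean VADER compound score")
plt.show()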
with open("../data/War_Terms.txt", "r") as f:
war_terms = f.readlines()
war_terms = [x.split("\n")[0] for x in war_terms]
War_Terms_in_Text = set(war_terms) & set(tokens)
print('Number of war terms used in the story: '+ str(len(War_Terms_in_Text)))
values = [i for i in tokens if i in War_Terms_in_Text]
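## A small sketch mirroring the animal-frequency step: count how often each
## war term appears (the top-10 cutoff here is an assumption).
war_freq = nltk.FreqDist(values)
print(war_freq.most_common(10))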
## Print every sentence that mentions "death"
sentences = text.split('.')
for item in sentences:
    if "death" in item:
        print(item)
### Scrape and clean the book text (this produces the `text` used above)
url = 'http://www.gutenberg.org/files/501/501-0.txt'
res = requests.get(url)
html_page = res.content
soup = BeautifulSoup(html_page, 'html.parser')
text = soup.find_all(text=True)
text = str(text)
## Normalize whitespace and escape sequences, drop underscores, lowercase
text = text.replace("\n", " ").replace("\r", " ").replace("\\r", " ").replace("\\n", " ").replace("_", "").lower()
## Keep only the story body, between the first chapter heading and the end marker
text = text.split("the first chapter")[1].split("illustration: the end")[0]
with open("../data/Doctor_Dolittle.txt", "w") as f:
    f.write(text)