This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import request

def get_book(url):
    """Download a plain-text resource and return it as a Unicode string.

    Parameters
    ----------
    url : str
        Address of a plain-text file (here, Project Gutenberg .txt editions).

    Returns
    -------
    str
        The full response body decoded as UTF-8.
    """
    # Context manager guarantees the HTTP connection is closed even if
    # .read() or .decode() raises — the original leaked the response object.
    with request.urlopen(url) as response:
        return response.read().decode('utf8')
# Fetch the raw Gutenberg e-texts once at import time.
# NOTE(review): module-level network calls with no error handling — a single
# transient HTTP failure aborts the whole script; consider retry/caching.
notes_from_underground_raw = get_book("http://www.gutenberg.org/files/600/600.txt")
crime_and_punishment_raw = get_book("http://www.gutenberg.org/files/2554/2554-0.txt")
the_idiot_raw = get_book("http://www.gutenberg.org/files/2638/2638-0.txt")
the_possessed_raw = get_book("http://www.gutenberg.org/files/8117/8117-0.txt")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

def get_book_contents(book_raw):
    """Strip the Project Gutenberg boilerplate from a raw e-text.

    Parameters
    ----------
    book_raw : str
        Full Gutenberg e-text, including the license header and footer.

    Returns
    -------
    str
        The text between the "START OF ..." and "END OF ..." markers.

    Raises
    ------
    ValueError
        If either marker cannot be found (previously this crashed with an
        opaque AttributeError on ``None``).
    """
    start = re.compile(r"START OF (?:THE|THIS) PROJECT GUTENBERG", flags=re.IGNORECASE)
    end = re.compile(r"END OF (?:THE|THIS) PROJECT GUTENBERG", flags=re.IGNORECASE)
    book_start = start.search(book_raw)
    book_end = end.search(book_raw)
    if book_start is None or book_end is None:
        raise ValueError("Project Gutenberg START/END markers not found")
    # Slice from just after the START marker up to (not including) the END
    # marker.  The original sliced to book_end.span()[1], which wrongly kept
    # the matched "END OF THE PROJECT GUTENBERG" footer text in the body.
    return book_raw[book_start.end():book_end.start()]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk

# English stop-word list from NLTK (requires the 'stopwords' corpus download).
stop_words = nltk.corpus.stopwords.words('english')

def clean(book, stop_words):
    """Lower-case *book*, tokenize on word characters, and build a one-column
    DataFrame of tokens.  (The stop-word removal step is truncated in this
    view — only the tokenization portion is visible here.)
    """
    book = book.lower()
    #tokenizing
    # \w+ keeps only alphanumeric runs, so punctuation is dropped entirely.
    book_tokens_clean = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(book)
    # NOTE(review): relies on a module-level `pd` (pandas) that is not
    # imported in this chunk — confirm pandas is imported elsewhere.
    book_clean = pd.DataFrame(book_tokens_clean, columns = ['word'])
    #removing stop words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from afinn import Afinn

afinn = Afinn()

def afinn_context(book_bigrams):
    """Score each bigram with AFINN, negating the second word's score when
    the first word is a negator (module-level ``negations`` list).

    Mutates *book_bigrams* in place (adds a 'score' column) and returns it.
    """
    # AFINN sentiment of the second word of every pair.
    raw_scores = book_bigrams['word2'].apply(afinn.score)
    # Vectorized sign flip wherever the preceding word negates the meaning.
    is_negated = book_bigrams['word1'].isin(negations)
    book_bigrams['score'] = raw_scores.where(~is_negated, -raw_scores)
    return book_bigrams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NRC word-emotion association lexicon (EmoLex), loaded from a local CSV.
# NOTE(review): "classifaction" is misspelled but is used consistently as
# this DataFrame's column name throughout the file, so it is kept as-is.
NRC = pd.read_csv("NRC.csv", names=["word", "sentiment", "classifaction"])
# NOTE(review): 'suprise' is likely a typo for 'surprise' (the NRC emotion
# name) — verify against the sentiment values actually present in NRC.csv
# before fixing, since a mismatch here would silently drop that emotion.
NRC_sentiments = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
'positive', 'sadness', 'suprise', 'trust']
def nrc_classify(word):
    """Return the list of NRC classification values recorded for *word*
    (empty list when the word is absent from the lexicon)."""
    matching_rows = NRC['word'] == word
    return NRC.loc[matching_rows, 'classifaction'].tolist()
def nrc_clean(book_nrc):
    """Attach NRC classifications to each bigram's second word and drop rows
    with no classification.  (Body is truncated in this view — no return
    statement is visible here.)
    """
    # Look up every second word's NRC classifications (possibly-empty lists).
    book_nrc['classifications'] = book_nrc['word2'].apply(nrc_classify)
    # Keep only rows whose word had at least one NRC classification;
    # .str.len() works element-wise on the list-valued column.
    book_nrc = book_nrc[book_nrc['classifications'].str.len() > 0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tokenize each novel's cleaned text.  `notes`, `crime`, `idiot`, `possessed`
# and `brothers` are defined elsewhere in the file (not visible in this chunk).
notes_tokens = nltk.word_tokenize(notes)
crime_tokens = nltk.word_tokenize(crime)
idiot_tokens = nltk.word_tokenize(idiot)
possessed_tokens = nltk.word_tokenize(possessed)
brothers_tokens = nltk.word_tokenize(brothers)
# Re-join tokens with spaces and split on '.' to approximate sentences.
# NOTE(review): this is a crude sentence splitter (abbreviations like "Mr."
# break it); nltk.sent_tokenize on the raw text would be more accurate.
notes_str = " ".join(notes_tokens)
notes_sentences = notes_str.split(".")
crime_str = " ".join(crime_tokens)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup

#get_contents removes extraneous empty lists returned from .find_all() in line 16
def get_contents(tag):
    """Return the second child (index 1) of a BeautifulSoup tag.

    Raises IndexError if the tag has fewer than two children.
    """
    return tag.contents[1]

# Accumulator for scraped book titles (string dtype avoids object inference).
titles = pd.Series(dtype = "string")
#Loops through each webpage in the goodreads list
# NOTE(review): loop body is truncated in this view; np.arange(1, 12) iterates
# pages 1..11 inclusive.
for i in np.arange(1, 12):
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Goodreads has over 50 pages of titles of Dostoevsky books.
#All relevant titles are on the first two pages.
page1_url = "https://www.goodreads.com/author/list/3137322.Fyodor_Dostoyevsky?page=1&per_page=30"
page2_url = "https://www.goodreads.com/author/list/3137322.Fyodor_Dostoyevsky?page=2&per_page=30"
# NOTE(review): these responses are never closed; wrapping each urlopen in a
# `with` (or contextlib.closing) would avoid leaking the connections.
page1 = request.urlopen(page1_url)
page2 = request.urlopen(page2_url)
# Parse with the stdlib html.parser backend (no lxml dependency).
# NOTE(review): the page2 soup is truncated in this view.
page1_soup = BeautifulSoup(page1, "html.parser")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fix the global NumPy RNG so the 60-title sample is reproducible.
np.random.seed(0)
# Draw 60 distinct positional indices, then select those rows by position.
chosen_positions = np.random.choice(titles.shape[0], size=60, replace=False)
sample_titles = titles.iloc[chosen_positions]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stop-word list that deliberately keeps simple negators, so the bigram
# context step can still see them as first words.
keep_words = ['no', 'not']
stop_words_context = [word for word in stop_words if word not in keep_words]
# Words that flip the sentiment of the token that follows them.
negations = ['not', 'no', 'never', 'without']

def bigram(book):
    """Clean *book* (keeping negators) and return its adjacent word pairs as
    a two-column DataFrame with columns 'word1' and 'word2'."""
    cleaned = clean(book, stop_words_context)
    word_pairs = nltk.bigrams(cleaned['word'])
    return pd.DataFrame(list(word_pairs), columns=['word1', 'word2'])
OlderNewer