josht-jpg / getting_from_Gutenberg
Getting Notes from the Underground from Gutenberg.org
from urllib import request

def get_book(url):
    response = request.urlopen(url)
    return response.read().decode('utf8')

notes_from_underground_raw = get_book("http://www.gutenberg.org/files/600/600.txt")
crime_and_punishment_raw = get_book("http://www.gutenberg.org/files/2554/2554-0.txt")
the_idiot_raw = get_book("http://www.gutenberg.org/files/2638/2638-0.txt")
the_possessed_raw = get_book("http://www.gutenberg.org/files/8117/8117-0.txt")
josht-jpg / remove_extraneous
Removing extraneous text from raw files
import re

def get_book_contents(book_raw):
    start = re.compile(r"START OF (?:THE|THIS) PROJECT GUTENBERG", flags=re.IGNORECASE)
    end = re.compile(r"END OF (?:THE|THIS) PROJECT GUTENBERG", flags=re.IGNORECASE)
    book_start = start.search(book_raw)
    book_end = end.search(book_raw)
    #slice from just after the start marker to just before the end marker
    #(using span()[1] on the end marker would keep the marker text itself)
    return book_raw[book_start.span()[1]:book_end.span()[0]]
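For context, a minimal usage sketch (the trimmed variable names are my own, carried over from the first gist's *_raw variables):

notes_from_underground = get_book_contents(notes_from_underground_raw)
crime_and_punishment = get_book_contents(crime_and_punishment_raw)
the_idiot = get_book_contents(the_idiot_raw)
the_possessed = get_book_contents(the_possessed_raw)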
josht-jpg / cleaning
Removing stop words and tokenizing
import nltk
import pandas as pd

stop_words = nltk.corpus.stopwords.words('english')

def clean(book, stop_words):
    book = book.lower()
    #tokenizing: keep word characters only
    book_tokens_clean = nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(book)
    book_clean = pd.DataFrame(book_tokens_clean, columns=['word'])
    #removing stop words
    book_clean = book_clean[~book_clean['word'].isin(stop_words)]
    return book_clean
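A quick usage sketch, assuming notes_from_underground is the trimmed text from the previous step:

notes_clean = clean(notes_from_underground, stop_words)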
josht-jpg / applying_afinn
Beginning with AFINN
from afinn import Afinn

afinn = Afinn()

def afinn_context(book_bigrams):
    #score the second word of each bigram with AFINN
    book_bigrams['score'] = book_bigrams['word2'].apply(afinn.score)
    #flip the score when the first word is a negation
    #('negations' is defined in the providing_context gist below)
    book_bigrams['score'] = book_bigrams.apply(
        lambda x: x['score'] * -1 if x['word1'] in negations else x['score'],
        axis=1)
    return book_bigrams
josht-jpg / NRC analysis
NRC = pd.read_csv("NRC.csv", names=["word", "sentiment", "classification"])

NRC_sentiments = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
                  'positive', 'sadness', 'surprise', 'trust']

def nrc_classify(word):
    return NRC[NRC['word'] == word].loc[:, 'classification'].tolist()

def nrc_clean(book_nrc):
    book_nrc['classifications'] = book_nrc['word2'].apply(nrc_classify)
    #keep only rows whose second word appears in the NRC lexicon
    book_nrc = book_nrc[book_nrc['classifications'].str.len() > 0]
    return book_nrc
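A sketch of how the NRC step might be applied and summed per category; 'notes_bigrams' (built by the bigram() function in the providing_context gist below) and the column-wise sum are my assumptions, not the author's confirmed code:

notes_nrc = nrc_clean(notes_bigrams)
#each 'classifications' list holds one association flag per NRC category,
#in the same order as NRC_sentiments (assuming the standard ten-rows-per-word lexicon)
notes_sentiment_totals = pd.DataFrame(notes_nrc['classifications'].tolist(),
                                      columns=NRC_sentiments).sum()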
josht-jpg / AdaBoost_setup
notes_tokens = nltk.word_tokenize(notes)
crime_tokens = nltk.word_tokenize(crime)
idiot_tokens = nltk.word_tokenize(idiot)
possessed_tokens = nltk.word_tokenize(possessed)
brothers_tokens = nltk.word_tokenize(brothers)

#normalize spacing, then split each novel into sentences
notes_str = " ".join(notes_tokens)
notes_sentences = notes_str.split(".")
crime_str = " ".join(crime_tokens)
crime_sentences = crime_str.split(".")
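The gist stops mid-pattern; downstream, the sentences are presumably labeled by novel and fed to AdaBoost. A hypothetical sketch with scikit-learn (the vectorizer, split, and default model settings are my assumptions, not the author's confirmed pipeline):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#hypothetical: label each sentence with the novel it came from
sentences = notes_sentences + crime_sentences
labels = ['notes'] * len(notes_sentences) + ['crime'] * len(crime_sentences)

X = CountVectorizer().fit_transform(sentences)
X_train, X_test, y_train, y_test = train_test_split(X, labels, random_state=0)

model = AdaBoostClassifier().fit(X_train, y_train)
print(model.score(X_test, y_test))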
josht-jpg / getting_goodreads_titles
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

#get_contents removes extraneous empty lists returned from .find_all() in line 16
def get_contents(tag):
    return tag.contents[1]

titles = pd.Series(dtype="string")

#Loops through each webpage in the goodreads list
for i in np.arange(1, 12):
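    #The gist is truncated here. A hypothetical loop body: fetch page i of the
    #Goodreads list and append its book titles. 'list_url' is a placeholder URL
    #and the 'bookTitle' selector is my assumption about the page markup; the
    #author's get_contents helper presumably slots into this step.
    list_url = "https://www.goodreads.com/list/show/..."  #placeholder, not the author's URL
    page = request.urlopen(list_url + "?page=" + str(i))
    soup = BeautifulSoup(page, "html.parser")
    page_titles = [tag.get_text(strip=True)
                   for tag in soup.find_all("a", class_="bookTitle")]
    titles = pd.concat([titles, pd.Series(page_titles, dtype="string")])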
josht-jpg / filtering_out_Dostoevsky_titles
#Goodreads has over 50 pages of titles of Dostoevsky books.
#All relevant titles are on the first two pages.
page1_url = "https://www.goodreads.com/author/list/3137322.Fyodor_Dostoyevsky?page=1&per_page=30"
page2_url = "https://www.goodreads.com/author/list/3137322.Fyodor_Dostoyevsky?page=2&per_page=30"
page1 = request.urlopen(page1_url)
page2 = request.urlopen(page2_url)
page1_soup = BeautifulSoup(page1, "html.parser")
page2_soup = BeautifulSoup(page2, "html.parser")
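A sketch of collecting the author's title strings from both pages, assuming Goodreads marks titles with <a class="bookTitle"> (my assumption about the markup):

dostoevsky_titles = [tag.get_text(strip=True)
                     for soup in (page1_soup, page2_soup)
                     for tag in soup.find_all("a", class_="bookTitle")]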
josht-jpg / sampling_titles
np.random.seed(0)
sample_titles = titles.iloc[np.random.choice(titles.shape[0], size=60, replace=False)]
josht-jpg / providing_context
keep_words = ['no', 'not']
stop_words_context = [w for w in stop_words if w not in keep_words]
negations = ['not', 'no', 'never', 'without']

def bigram(book):
    #clean while keeping negation words, then pair consecutive tokens
    book_context = clean(book, stop_words_context)
    book_bigrams = pd.DataFrame(list(nltk.bigrams(book_context['word'])),
                                columns=['word1', 'word2'])
    return book_bigrams
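An end-to-end sketch tying the context pieces together, assuming notes_from_underground is the trimmed text from the earlier steps:

notes_bigrams = bigram(notes_from_underground)
notes_scored = afinn_context(notes_bigrams)
print(notes_scored['score'].sum())  #e.g. a book-level AFINN total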