This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import csv | |
import sys | |
import datetime | |
import urllib.request | |
from tqdm import tqdm | |
# Headers for HTTP requests | |
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from nltk.stem import SnowballStemmer | |
nltk.download('stopwords') | |
nltk.download('wordnet') | |
# Remove all stopwords | |
stop_words = stopwords.words('english') | |
def remove_stopwords(tokenized_sentences): | |
for sentence in tokenized_sentences: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim | |
import string | |
# Uses gensim to process the sentences | |
def sentence_to_words(sentences): | |
for sentence in sentences: | |
sentence_tokenized = gensim.utils.simple_preprocess(sentence, | |
deacc=True, | |
min_len=2, | |
max_len=15) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
# Returns the text from an HTML file based on specified tags
def parse_html(html_path): | |
with open(html_path, 'r') as fr: | |
html_content = fr.read() | |
soup = BeautifulSoup(html_content, 'html.parser') | |
# Check that file is valid HTML | |
if not soup.find(): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
from boilerpy3 import extractors | |
# Condenses runs of newline / carriage-return characters into a single newline, dropping empty lines
def condense_newline(text):
    """Collapse consecutive line breaks in *text* into single newlines.

    The text is split on every ``\\n`` or ``\\r`` character, empty pieces
    are discarded, and the remaining pieces are joined with ``\\n``. As a
    consequence, leading and trailing line breaks are removed and ``\\r\\n``
    pairs become a single ``\\n``.

    Args:
        text: The string to normalize.

    Returns:
        The non-empty lines of *text* joined by ``\\n`` (an empty string
        when *text* contains no non-empty lines).
    """
    # Splitting on each individual break character yields empty strings
    # between adjacent breaks; filtering them out is what condenses runs.
    non_empty_lines = [piece for piece in re.split(r'\n|\r', text) if piece]
    return '\n'.join(non_empty_lines)
# Returns the text from an HTML file
def parse_html(html_path): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
nltk.download('punkt') | |
import matplotlib.pyplot as plt | |
from nltk.util import ngrams | |
from nltk.tokenize import word_tokenize | |
# Helper method for generating n-grams | |
def extract_ngrams_sentences(sentences, num): | |
all_grams = [] | |
for sentence in sentences: |