Brandon Ko brandonko

## csv_url_cleaner.py
import os
import re
import csv
import sys
import datetime
import urllib.request
from tqdm import tqdm

# Headers for HTTP requests
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',

## text_preprocessing_per_word.py
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
nltk.download('wordnet')

# Remove all stopwords
stop_words = stopwords.words('english')
def remove_stopwords(tokenized_sentences):
    for sentence in tokenized_sentences:

## text_processing_tokenization.py
import gensim
import string

# Uses gensim to process the sentences
def sentence_to_words(sentences):
    """Tokenize each sentence in ``sentences`` with gensim's simple_preprocess.

    NOTE(review): this excerpt ends right after the tokenization call, so
    what happens to each token list (yield / append / return) is not visible
    here -- confirm against the full file.
    """
    for sentence in sentences:
        # deacc=True requests accent removal; min_len / max_len bound the
        # length of the tokens that are kept (per the gensim documentation).
        sentence_tokenized = gensim.utils.simple_preprocess(sentence,
                                                            deacc=True,
                                                            min_len=2,
                                                            max_len=15)

## html_to_text_beautifulsoup.py
from bs4 import BeautifulSoup

# Returns the text from an HTML file based on specified tags
def parse_html(html_path):
    with open(html_path, 'r') as fr:
        html_content = fr.read()
        soup = BeautifulSoup(html_content, 'html.parser')

        # Check that file is valid HTML
        if not soup.find():

## html_to_text.py
import os
import re
from boilerpy3 import extractors

# Condenses all repeating newline characters into one single newline character
def condense_newline(text):
    """Collapse runs of line breaks in ``text`` into single newlines.

    The text is split on every newline or carriage-return character. Empty
    pieces (from consecutive, leading, or trailing line breaks) are dropped,
    and the remaining pieces are joined back together with a newline.
    """
    pieces = re.split('\n|\r', text)
    non_empty = [piece for piece in pieces if piece]
    return '\n'.join(non_empty)

# Returns the text from an HTML file
def parse_html(html_path):

## large_ngram_cleaning.py
import nltk
nltk.download('punkt')
import matplotlib.pyplot as plt
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Helper method for generating n-grams
def extract_ngrams_sentences(sentences, num):
    all_grams = []
    for sentence in sentences:
	import os
	import re
	import csv
	import sys
	import datetime
	import urllib.request
	from tqdm import tqdm

	# Headers for HTTP requests
	hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from nltk.stem import SnowballStemmer
	nltk.download('stopwords')
	nltk.download('wordnet')

	# Remove all stopwords
	stop_words = stopwords.words('english')
	def remove_stopwords(tokenized_sentences):
	for sentence in tokenized_sentences:
	import gensim
	import string

	# Uses gensim to process the sentences
	def sentence_to_words(sentences):
	for sentence in sentences:
	sentence_tokenized = gensim.utils.simple_preprocess(sentence,
	deacc=True,
	min_len=2,
	max_len=15)
	from bs4 import BeautifulSoup

	# Returns the text from a HTML file based on specified tags
	def parse_html(html_path):
	with open(html_path, 'r') as fr:
	html_content = fr.read()
	soup = BeautifulSoup(html_content, 'html.parser')

	# Check that file is valid HTML
	if not soup.find():
	import os
	import re
	from boilerpy3 import extractors

	# Condenses all repeating newline characters into one single newline character
	def condense_newline(text):
	    """Collapse runs of line breaks in ``text`` into single newlines.

	    The text is split on every newline or carriage-return character,
	    empty pieces are discarded, and the rest are joined with a newline.

	    The pattern previously read '\\n\\|\\r' (an escaped pipe), which only
	    matched the literal sequence newline, '|', carriage return and so
	    never split ordinary line breaks; the alternation is restored here.
	    """
	    return '\n'.join(p for p in re.split('\n|\r', text) if p)

	# Returns the text from a HTML file
	def parse_html(html_path):
	import nltk
	nltk.download('punkt')
	import matplotlib.pyplot as plt
	from nltk.util import ngrams
	from nltk.tokenize import word_tokenize

	# Helper method for generating n-grams
	def extract_ngrams_sentences(sentences, num):
	all_grams = []
	for sentence in sentences: