Skip to content

Instantly share code, notes, and snippets.

View brandonko's full-sized avatar

Brandon Ko brandonko

View GitHub Profile
@brandonko
brandonko / csv_url_cleaner.py
Created September 17, 2020 19:27
Script for cleaning URLs in a csv file
import os
import re
import csv
import sys
import datetime
import urllib.request
from tqdm import tqdm
# Headers for HTTP requests
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
@brandonko
brandonko / text_preprocessing_per_word.py
Created September 3, 2020 03:52
Stop Word Removal, Lemmatization, and Stemming
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
nltk.download('wordnet')
# Remove all stopwords
stop_words = stopwords.words('english')
def remove_stopwords(tokenized_sentences):
for sentence in tokenized_sentences:
@brandonko
brandonko / text_processing_tokenization.py
Last active September 3, 2020 03:52
Text Preprocessing (Punctuation, Capitalization, and Tokenization)
import gensim
import string
# Uses gensim to process the sentences
def sentence_to_words(sentences):
for sentence in sentences:
sentence_tokenized = gensim.utils.simple_preprocess(sentence,
deacc=True,
min_len=2,
max_len=15)
@brandonko
brandonko / html_to_text_beautifulsoup.py
Last active April 18, 2024 19:46
BeautifulSoup HTML Text Extractor
from bs4 import BeautifulSoup
# Returns the text from an HTML file based on specified tags
def parse_html(html_path):
with open(html_path, 'r') as fr:
html_content = fr.read()
soup = BeautifulSoup(html_content, 'html.parser')
# Check that file is valid HTML
if not soup.find():
@brandonko
brandonko / html_to_text.py
Last active September 2, 2020 19:00
Extracts the text from HTML files
import os
import re
from boilerpy3 import extractors
# Collapses any run of newline / carriage-return characters into one '\n'
def condense_newline(text):
    """Return *text* with every sequence of line breaks reduced to a single newline.

    Splitting on both '\\n' and '\\r' and discarding the empty segments also
    drops leading and trailing line breaks from the result.
    """
    segments = re.split('\n|\r', text)
    non_empty = (segment for segment in segments if segment)
    return '\n'.join(non_empty)
# Returns the text from an HTML file
def parse_html(html_path):
@brandonko
brandonko / large_ngram_cleaning.py
Last active September 3, 2020 03:39
NLP Dataset Preprocessing using Large N-Grams
import nltk
nltk.download('punkt')
import matplotlib.pyplot as plt
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# Helper method for generating n-grams
def extract_ngrams_sentences(sentences, num):
all_grams = []
for sentence in sentences: