@bdewilde
bdewilde / basic_chunking.py
Last active April 12, 2019 15:18
basic regular expression chunker and chunk-getter
def chunk_tagged_sents(tagged_sents):
    from nltk.chunk import regexp
    # define a chunk "grammar", i.e. chunking rules
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN.*>+}  # noun phrase
        PP: {<IN><NP>}                # prepositional phrase
        VP: {<MD>?<VB.*><NP|PP>}      # verb phrase
        CLAUSE: {<NP><VP>}            # full clause
        """
    # chunk each tagged sentence with the rules above
    chunker = regexp.RegexpParser(grammar)
    chunked_sents = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
    return chunked_sents
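The "chunk-getter" the description promises is cut off in this preview; a minimal sketch of one, assuming it just pulls out subtrees with a given chunk label (the name get_chunks and its signature are mine):

def get_chunks(chunked_sents, chunk_type='NP'):
    all_chunks = []
    for chunked_sent in chunked_sents:
        # .label() in NLTK 3; older NLTK versions used the .node attribute
        chunks = [' '.join(word for word, tag in subtree.leaves())
                  for subtree in chunked_sent.subtrees()
                  if subtree.label() == chunk_type]
        all_chunks.append(chunks)
    return all_chunks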
@bdewilde
bdewilde / basic_pos_tagging.py
Created April 15, 2013 02:57
super basic pos-tagging of tokenized sentences
def pos_tag_sents(tokenized_sents):
    from nltk.tag import pos_tag
    # tag each pre-tokenized sentence with its parts of speech
    tagged_sents = [pos_tag(sent) for sent in tokenized_sents]
    return tagged_sents
@bdewilde
bdewilde / basic_text_tokenization_and_normalization.py
Last active December 16, 2015 03:18
tokenize document into sentences of normalized+filtered words
def tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma'):
    import nltk.corpus
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import sent_tokenize, word_tokenize
    from string import punctuation
    # use NLTK's default set of english stop words
    stops_list = nltk.corpus.stopwords.words('english')
    # lemmatize by default, otherwise fall back to Porter stemming
    normalize_word = (WordNetLemmatizer().lemmatize if normalize == 'lemma'
                      else PorterStemmer().stem)
    # sentence- then word-tokenize, lowercase, filter, and normalize
    return [[normalize_word(word) for word in map(str.lower, word_tokenize(sent))
             if word not in punctuation and not (filter_stopwords and word in stops_list)]
            for sent in sent_tokenize(doc)]
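Together with the two gists above, this forms a small NLP pipeline: tokenize, pos-tag, chunk. A quick usage sketch (mine, not from the gists), assuming NLTK's punkt, tagger, wordnet, and stopwords data have been downloaded:

from nltk.tokenize import sent_tokenize, word_tokenize

doc = "The quick brown fox jumped over the lazy dog."
# the pos tagger and chunker want raw tokens, so skip normalization there
tokenized_sents = [word_tokenize(sent) for sent in sent_tokenize(doc)]
chunked_sents = chunk_tagged_sents(pos_tag_sents(tokenized_sents))
# the normalized tokens are better suited to bag-of-words work
normalized_sents = tokenize_and_normalize_doc(doc)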
@bdewilde
bdewilde / basic_text_cleaning.py
Last active December 15, 2015 22:29
basic procedure for cleaning text in preparation for natural language processing
def clean_text(text):
    from nltk import clean_html
    import re
    # strip html markup with handy NLTK function
    # (note: removed in NLTK 3, which points you to BeautifulSoup instead)
    text = clean_html(text)
    # remove digits with regular expression
    text = re.sub(r'\d', ' ', text)
    # remove any patterns matching standard url format
    text = re.sub(r'(https?://|www\.)\S+', ' ', text)
    # collapse the whitespace left behind by the substitutions
    text = re.sub(r'\s+', ' ', text).strip()
    return text
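A quick sanity check of what the function does (example string mine):

raw = '<p>Meeting at 10am: details at http://example.com/agenda</p>'
print(clean_text(raw))  # the tags, digits, and url should all be gone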
@bdewilde
bdewilde / friedman_article_basic_scrape.py
Last active December 15, 2015 22:29
bare-bones example for scraping article text from a website
import bs4
import requests
# GET html from NYT server, and parse it
response = requests.get('http://www.nytimes.com/2013/04/07/opinion/sunday/friedman-weve-wasted-our-timeout.html')
soup = bs4.BeautifulSoup(response.text)
article = ''
# select all tags containing article text, then extract the text from each
# (NYT marked body paragraphs this way at the time; the markup may have changed)
for paragraph in soup.find_all('p', itemprop='articleBody'):
    article += paragraph.get_text() + '\n'
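This pairs naturally with the cleaning and tokenizing gists above; one way to wire them together (my sketch, not from the gist):

# run the scraped article text through the earlier gists for downstream NLP
cleaned = clean_text(article)
normalized_sents = tokenize_and_normalize_doc(cleaned)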
@bdewilde
bdewilde / get_xmen_abilities
Created November 27, 2012 17:24
Get list of X-men abilities, put it together with other methods in main()
# take parsed HTML for an X-man's Wikipedia page
# return list of abilities
def get_xmen_abilities(soup):
    infobox = soup.find('table', class_='infobox')
    if infobox is not None:
        abilities = infobox.find('th', text='Abilities')
        if abilities is not None:
            abilities_list = abilities.next_sibling.next_sibling.find_all(text=True)
            abilities_list = [item.strip('\n') for item in abilities_list
                              if item != '' and item != '\n']
            return abilities_list
@bdewilde
bdewilde / get_xmen
Created November 27, 2012 16:59
Get a dictionary of X-Men names as keys and Wikipedia URLs as corresponding values
import requests
import bs4
# search Wikipedia for a subject or provide its URL, return the parsed HTML
# spoof the user-agent, let's pretend we're Firefox :)
def wikipedia_search(subject, url=False):
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko)'}
    if url is False:
        response = requests.get('http://en.wikipedia.org/w/index.php',
                                params={'search': subject},
                                headers=headers)
    else:
        response = requests.get(url, headers=headers)
    return bs4.BeautifulSoup(response.text)
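The get_xmen_abilities gist above says to "put it together with other methods in main()"; a hedged sketch of what that might look like, assuming a get_xmen() helper that returns the name-to-URL dictionary promised by this gist's description:

def main():
    # {X-man name: Wikipedia URL}, per the get_xmen gist (not shown in the preview)
    xmen = get_xmen()
    for name, url in xmen.items():
        soup = wikipedia_search(name, url=url)
        print(name, get_xmen_abilities(soup))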
@bdewilde
bdewilde / knnBest_kaggleDigitRecognition
Created October 29, 2012 21:56
optimal implementation of a weighted knn model
# weighted k-nearest neighbors package
library(kknn)
# load the training and test data sets
train <- read.csv("train.csv", header=TRUE)
test <- read.csv("test.csv", header=TRUE)
# train the optimal kknn model (k and kernel found by cross-validation)
model <- kknn(as.factor(label) ~ ., train, test, k=9, kernel="triangular")
results <- model$fitted.values
# save the class predictions in a column vector
write(as.numeric(levels(results))[results], file="kknn_submission.csv", ncolumns=1)
@bdewilde
bdewilde / knnWeighted_kaggleDigitRecognition
Created October 29, 2012 21:37
weighted knn model, with k=1:15 and three kernels
# weighted k-nearest neighbors package
library(kknn)
# load the training data set
train <- read.csv("train.csv", header=TRUE)
# remove near-zero variance features
library(caret)
badCols <- nearZeroVar(train[, -1])
train <- train[, -(badCols+1)]
# cross-validate over k=1:15 and three kernels, per the gist's description
# (the kernel choices below are a guess; the preview cuts off before this step)
model <- train.kknn(as.factor(label) ~ ., train, kmax=15,
                    kernel=c("rectangular", "triangular", "gaussian"))
@bdewilde
bdewilde / knnRemoveZeroVarCols_kaggleDigitRecognizer
Created October 27, 2012 16:31
how to remove features with near zero variance, not useful for discriminating classes
# helpful functions for classification/regression training
# http://cran.r-project.org/web/packages/caret/index.html
library(caret)
# get indices of data.frame columns (pixels) with low variance
badCols <- nearZeroVar(train)
print(paste("Fraction of nearZeroVar columns:", round(length(badCols) / length(train), 4)))
# remove those "bad" columns from the training and cross-validation sets
train <- train[, -badCols]
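For reference, scikit-learn offers a rough Python analogue of nearZeroVar (it thresholds on raw variance rather than caret's frequency-ratio rules); a sketch, not part of the original gist:

import pandas as pd
from sklearn.feature_selection import VarianceThreshold

train = pd.read_csv("train.csv")
# drop the label column, then drop near-constant pixel columns
selector = VarianceThreshold(threshold=0.01)  # threshold is a guess
X = selector.fit_transform(train.drop(columns="label"))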