def chunk_tagged_sents(tagged_sents):
    from nltk.chunk import regexp
    # define a chunk "grammar", i.e. chunking rules
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN.*>+}  # noun phrase
        PP: {<IN><NP>}                # prepositional phrase
        VP: {<MD>?<VB.*><NP|PP>}      # verb phrase
        CLAUSE: {<NP><VP>}            # full clause
        """
    # loop twice so the later CLAUSE rule can use the NP/VP chunks found on the first pass
    chunker = regexp.RegexpParser(grammar, loop=2)
    return [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
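
A quick end-to-end sketch (not part of the original snippet): it assumes the pos_tag_sents helper defined further down this page, a hypothetical pre-tokenized sentence, and that the required NLTK models are already downloaded.

tokenized_sents = [['Little', 'cats', 'sleep', 'on', 'the', 'warm', 'mat']]
tagged_sents = pos_tag_sents(tokenized_sents)
chunked_sents = chunk_tagged_sents(tagged_sents)
# each chunked sentence is an nltk Tree; print one to inspect its NP/PP/VP/CLAUSE chunks
print(chunked_sents[0])
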
def pos_tag_sents(tokenized_sents):
    from nltk.tag import pos_tag
    # run NLTK's default part-of-speech tagger over each tokenized sentence
    tagged_sents = [pos_tag(sent) for sent in tokenized_sents]
    return tagged_sents
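
pos_tag depends on NLTK's pretrained tagger model, which has to be fetched once per machine; the same goes for the tokenizer, stop word, and WordNet data used by the other snippets on this page. A one-time setup sketch (resource names as used by recent NLTK releases):

import nltk
nltk.download('punkt')                       # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger')  # default POS tagger model
nltk.download('stopwords')                   # stop word lists
nltk.download('wordnet')                     # lemmatizer data
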
def tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma'):
    import nltk.corpus
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import sent_tokenize, word_tokenize
    from string import punctuation
    # use NLTK's default set of english stop words
    stops_list = nltk.corpus.stopwords.words('english')
    # tokenize into sentences, then words; lowercase and drop bare punctuation tokens
    tokens = [word.lower() for sent in sent_tokenize(doc) for word in word_tokenize(sent) if word not in punctuation]
    if filter_stopwords:
        tokens = [tok for tok in tokens if tok not in stops_list]
    # normalize remaining tokens by lemmatizing (default) or stemming
    normalizer = WordNetLemmatizer().lemmatize if normalize == 'lemma' else PorterStemmer().stem
    return [normalizer(tok) for tok in tokens]
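
An illustrative call on a made-up sentence; outputs are approximate, but they show how lemmatization keeps dictionary words while stemming produces rougher truncations.

doc = "The striped bats were hanging on their feet."
print(tokenize_and_normalize_doc(doc))                    # e.g. ['striped', 'bat', 'hanging', 'foot']
print(tokenize_and_normalize_doc(doc, normalize='stem'))  # e.g. ['stripe', 'bat', 'hang', 'feet']
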
def clean_text(text):
    import re
    from bs4 import BeautifulSoup
    # strip html markup (NLTK's clean_html was removed in NLTK 3.x; bs4's get_text is the usual replacement)
    text = BeautifulSoup(text, 'html.parser').get_text()
    # remove digits with regular expression
    text = re.sub(r'\d', ' ', text)
    # remove any patterns matching standard url format (a simple, intentionally imperfect pattern)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    return text
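
A quick check on a made-up scrap of markup; output shown approximately, and the leftover whitespace is expected, since matches are replaced with spaces rather than deleted.

html = '<p>Call 555-0100 or visit https://example.com for details.</p>'
print(clean_text(html))  # roughly: 'Call  -  or visit  for details.'
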
import bs4
import requests

# GET html from NYT server, and parse it
response = requests.get('http://www.nytimes.com/2013/04/07/opinion/sunday/friedman-weve-wasted-our-timeout.html')
soup = bs4.BeautifulSoup(response.text, 'html.parser')
article = ''
# select all tags containing article text, then extract and concatenate the text from each
# (this selector matched NYT markup when the snippet was written; page layouts change)
for paragraph in soup.find_all('p', itemprop='articleBody'):
    article += paragraph.get_text()
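
From here, the scraped article text could be handed to the helper functions defined elsewhere on this page, for example:

text = clean_text(article)
tokens = tokenize_and_normalize_doc(text)
print(tokens[:20])
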
# take parsed HTML for an X-man's Wikipedia page
# return list of abilities
def get_xmen_abilities(soup):
    infobox = soup.find('table', class_='infobox')
    if infobox is not None:
        abilities = infobox.find('th', string='Abilities')
        if abilities is not None:
            # the abilities live in the <td> two siblings over
            # (the intervening sibling is the whitespace between <th> and <td>)
            abilities_list = abilities.next_sibling.next_sibling.find_all(string=True)
            abilities_list = [item.strip('\n') for item in abilities_list if item not in ('', '\n')]
            return abilities_list
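
A self-contained way to sanity-check the function without hitting Wikipedia, using a stripped-down stand-in for a real infobox. Note the newline between the <th> and <td>, which the double next_sibling hop expects.

import bs4
fake_page = '''<table class="infobox"><tr><th>Abilities</th>
<td>Healing factor<br/>Adamantium claws</td></tr></table>'''
soup = bs4.BeautifulSoup(fake_page, 'html.parser')
print(get_xmen_abilities(soup))  # ['Healing factor', 'Adamantium claws']
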
import requests
import bs4

# search Wikipedia for a subject, or fetch its page directly if a URL is given;
# return the parsed HTML
# spoof the user-agent, let's pretend we're Firefox :)
def wikipedia_search(subject, url=False):
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko)'}
    if url is False:
        response = requests.get('http://en.wikipedia.org/w/index.php',
                                params={'search': subject},
                                headers=headers)
    else:
        # here, `subject` is assumed to be a full Wikipedia URL
        response = requests.get(subject, headers=headers)
    return bs4.BeautifulSoup(response.text, 'html.parser')
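
Putting the two Wikipedia helpers together (requires network access, and Wikipedia's page markup may have changed since these snippets were written, in which case the infobox lookup simply returns None):

soup = wikipedia_search('Wolverine (character)')
print(get_xmen_abilities(soup))
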
# load the test dataset
test <- read.csv("test.csv", header=TRUE)
# train the optimal kknn model and predict the test set labels
model <- kknn(as.factor(label) ~ ., train, test, k=9, kernel="triangular")
results <- model$fitted.values
# save the class predictions in a column vector
# (as.numeric(levels(f))[f] is the standard idiom for converting a factor to numeric)
write(as.numeric(levels(results))[results], file="kknn_submission.csv", ncolumns=1)

# weighted k-nearest neighbors package
library(kknn)
# load the training data set
train <- read.csv("train.csv", header=TRUE)
# remove near-zero variance features
library(caret)
badCols <- nearZeroVar(train[, -1])  # exclude the label in column 1
train <- train[, -(badCols + 1)]     # shift indices by 1 to account for the label column

# helpful functions for classification/regression training
# http://cran.r-project.org/web/packages/caret/index.html
library(caret)
# get indices of data.frame columns (pixels) with low variance
badCols <- nearZeroVar(train)
print(paste("Fraction of nearZeroVar columns:", round(length(badCols) / length(train), 4)))
# remove those "bad" columns from the training and cross-validation sets
train <- train[, -badCols]