def chunk_tagged_sents(tagged_sents):
    from nltk.chunk import regexp
    # define a chunk "grammar", i.e. chunking rules
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN.*>+}  # noun phrase
        PP: {<IN><NP>}                # prepositional phrase
        VP: {<MD>?<VB.*><NP|PP>}      # verb phrase
        CLAUSE: {<NP><VP>}            # full clause
        """
    # loop twice so the later CLAUSE rule can use the NP/VP chunks found on the first pass
    chunker = regexp.RegexpParser(grammar, loop=2)
    return [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]
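
A quick end-to-end sketch (not part of the original snippet): it assumes the pos_tag_sents helper defined further down this page, a hypothetical pre-tokenized sentence, and that the required NLTK models are already downloaded.

tokenized_sents = [['Little', 'cats', 'sleep', 'on', 'the', 'warm', 'mat']]
tagged_sents = pos_tag_sents(tokenized_sents)
chunked_sents = chunk_tagged_sents(tagged_sents)
# each chunked sentence is an nltk Tree; print one to inspect its NP/PP/VP/CLAUSE chunks
print(chunked_sents[0])
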
def pos_tag_sents(tokenized_sents):
    from nltk.tag import pos_tag
    # run NLTK's default part-of-speech tagger over each tokenized sentence
    tagged_sents = [pos_tag(sent) for sent in tokenized_sents]
    return tagged_sents
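
pos_tag depends on NLTK's pretrained tagger model, which has to be fetched once per machine; the same goes for the tokenizer, stop word, and WordNet data used by the other snippets on this page. A one-time setup sketch (resource names as used by recent NLTK releases):

import nltk
nltk.download('punkt')                       # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger')  # default POS tagger model
nltk.download('stopwords')                   # stop word lists
nltk.download('wordnet')                     # lemmatizer data
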
def tokenize_and_normalize_doc(doc, filter_stopwords=True, normalize='lemma'):
    import nltk.corpus
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import sent_tokenize, word_tokenize
    from string import punctuation
    # use NLTK's default set of english stop words
    stops_list = nltk.corpus.stopwords.words('english')
    # tokenize into sentences, then words; lowercase and drop bare punctuation tokens
    tokens = [word.lower() for sent in sent_tokenize(doc) for word in word_tokenize(sent) if word not in punctuation]
    if filter_stopwords:
        tokens = [tok for tok in tokens if tok not in stops_list]
    # normalize remaining tokens by lemmatizing (default) or stemming
    normalizer = WordNetLemmatizer().lemmatize if normalize == 'lemma' else PorterStemmer().stem
    return [normalizer(tok) for tok in tokens]
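
An illustrative call on a made-up sentence; outputs are approximate, but they show how lemmatization keeps dictionary words while stemming produces rougher truncations.

doc = "The striped bats were hanging on their feet."
print(tokenize_and_normalize_doc(doc))                    # e.g. ['striped', 'bat', 'hanging', 'foot']
print(tokenize_and_normalize_doc(doc, normalize='stem'))  # e.g. ['stripe', 'bat', 'hang', 'feet']
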
def clean_text(text):
    import re
    from bs4 import BeautifulSoup
    # strip html markup (NLTK's clean_html was removed in NLTK 3.x; bs4's get_text is the usual replacement)
    text = BeautifulSoup(text, 'html.parser').get_text()
    # remove digits with regular expression
    text = re.sub(r'\d', ' ', text)
    # remove any patterns matching standard url format (a simple, intentionally imperfect pattern)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    return text
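
A quick check on a made-up scrap of markup; output shown approximately, and the leftover whitespace is expected, since matches are replaced with spaces rather than deleted.

html = '<p>Call 555-0100 or visit https://example.com for details.</p>'
print(clean_text(html))  # roughly: 'Call  -  or visit  for details.'
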
import bs4
import requests

# GET html from NYT server, and parse it
response = requests.get('http://www.nytimes.com/2013/04/07/opinion/sunday/friedman-weve-wasted-our-timeout.html')
soup = bs4.BeautifulSoup(response.text, 'html.parser')
article = ''
# select all tags containing article text, then extract and concatenate the text from each
# (this selector matched NYT markup when the snippet was written; page layouts change)
for paragraph in soup.find_all('p', itemprop='articleBody'):
    article += paragraph.get_text()
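
From here, the scraped article text could be handed to the helper functions defined elsewhere on this page, for example:

text = clean_text(article)
tokens = tokenize_and_normalize_doc(text)
print(tokens[:20])
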
# take parsed HTML for an X-man's Wikipedia page
# return list of abilities
def get_xmen_abilities(soup):
    infobox = soup.find('table', class_='infobox')
    if infobox is not None:
        abilities = infobox.find('th', string='Abilities')
        if abilities is not None:
            # the abilities live in the <td> two siblings over
            # (the intervening sibling is the whitespace between <th> and <td>)
            abilities_list = abilities.next_sibling.next_sibling.find_all(string=True)
            abilities_list = [item.strip('\n') for item in abilities_list if item not in ('', '\n')]
            return abilities_list
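
A self-contained way to sanity-check the function without hitting Wikipedia, using a stripped-down stand-in for a real infobox. Note the newline between the <th> and <td>, which the double next_sibling hop expects.

import bs4
fake_page = '''<table class="infobox"><tr><th>Abilities</th>
<td>Healing factor<br/>Adamantium claws</td></tr></table>'''
soup = bs4.BeautifulSoup(fake_page, 'html.parser')
print(get_xmen_abilities(soup))  # ['Healing factor', 'Adamantium claws']
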
import requests
import bs4

# search Wikipedia for a subject, or fetch its page directly if a URL is given;
# return the parsed HTML
# spoof the user-agent, let's pretend we're Firefox :)
def wikipedia_search(subject, url=False):
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko)'}
    if url is False:
        response = requests.get('http://en.wikipedia.org/w/index.php',
                                params={'search': subject},
                                headers=headers)
    else:
        # here, `subject` is assumed to be a full Wikipedia URL
        response = requests.get(subject, headers=headers)
    return bs4.BeautifulSoup(response.text, 'html.parser')
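
Putting the two Wikipedia helpers together (requires network access, and Wikipedia's page markup may have changed since these snippets were written, in which case the infobox lookup simply returns None):

soup = wikipedia_search('Wolverine (character)')
print(get_xmen_abilities(soup))
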
# load the test dataset
test <- read.csv("test.csv", header=TRUE)
# train the optimal kknn model and predict the test set labels
model <- kknn(as.factor(label) ~ ., train, test, k=9, kernel="triangular")
results <- model$fitted.values
# save the class predictions in a column vector
# (as.numeric(levels(f))[f] is the standard idiom for converting a factor to numeric)
write(as.numeric(levels(results))[results], file="kknn_submission.csv", ncolumns=1)

# weighted k-nearest neighbors package
library(kknn)
# load the training data set
train <- read.csv("train.csv", header=TRUE)
# remove near-zero variance features
library(caret)
badCols <- nearZeroVar(train[, -1])  # exclude the label in column 1
train <- train[, -(badCols + 1)]     # shift indices by 1 to account for the label column

# helpful functions for classification/regression training
# http://cran.r-project.org/web/packages/caret/index.html
library(caret)
# get indices of data.frame columns (pixels) with low variance
badCols <- nearZeroVar(train)
print(paste("Fraction of nearZeroVar columns:", round(length(badCols) / length(train), 4)))
# remove those "bad" columns from the training and cross-validation sets
train <- train[, -badCols]