Skip to content

Instantly share code, notes, and snippets.

@agness
Created January 20, 2014 23:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agness/8531498 to your computer and use it in GitHub Desktop.
StumbleUpon evergreen webpages ML challenge submission from http://www.laurenatphysics.com/2013/11/followup-to-stumbleupon-challenge.html
import csv
import numpy as np
import scipy as scipy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import LancasterStemmer,SnowballStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk import word_tokenize
from sklearn import preprocessing,metrics,cross_validation
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import SelectPercentile, SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge
class SnowballTokenizer(object):
    """Callable tokenizer for TfidfVectorizer: Snowball-stem every token."""

    def __init__(self):
        # English Snowball stemmer; stopwords are stemmed like any other token.
        self._stemmer = EnglishStemmer(ignore_stopwords=False)

    def __call__(self, doc):
        tokens = word_tokenize(doc)
        return [self._stemmer.stem(tok) for tok in tokens]
class LTokenizer10(object):
    """Callable tokenizer: Lancaster-stem the first (up to) 10 tokens of a doc."""

    def __init__(self):
        self.lstem = LancasterStemmer()

    def __call__(self, doc):
        # Python slicing clamps to the sequence length, so the original's
        # explicit len()-vs-10 bounds check was redundant: temp[:10] is
        # already correct for documents shorter than 10 tokens.
        return [self.lstem.stem(t) for t in word_tokenize(doc)[:10]]
class LTokenizer15(object):
    """Callable tokenizer: Lancaster-stem the first (up to) 15 tokens of a doc."""

    def __init__(self):
        self.lstem = LancasterStemmer()

    def __call__(self, doc):
        # Slicing clamps automatically, so no explicit length check is needed
        # for documents with fewer than 15 tokens (the original if/else was
        # redundant).
        return [self.lstem.stem(t) for t in word_tokenize(doc)[:15]]
def main():
    """Train the two-level stacked ensemble and write the Kaggle submission.

    Level 0: several classifiers over different TF-IDF views of the data,
    each producing out-of-fold CV probabilities plus test probabilities.
    Level 1: ridge regression stacker over the level-0 probability columns.
    Writes (urlid, label) rows to Solution.csv.
    """
    trainlabels, testlabels, words, chars, URL, first10, first15 = GetLabelsAndText()
    n_train = len(trainlabels)
    # Feature matrices were built over train+test together; split them apart.
    trainwords, testwords = words[:n_train], words[n_train:]
    trainchars, testchars = chars[:n_train], chars[n_train:]
    trainURL, testURL = URL[:n_train], URL[n_train:]
    trainfirst10, testfirst10 = first10[:n_train], first10[n_train:]
    trainfirst15, testfirst15 = first15[:n_train], first15[n_train:]
    kf = StratifiedKFold(trainlabels, n_folds=5, indices=True)
    # Level-0 models (hyperparameters presumably hand-tuned -- see linked post).
    cvLogWords, testLogWords = ModelLogistic(2.4, kf, trainwords, trainlabels, testwords)
    cvMultWords, testMultWords = ModelMultinomial(10, 0.025, kf, trainwords, trainlabels, testwords)
    cvLogChars, testLogChars = ModelLogistic(1.45, kf, trainchars, trainlabels, testchars)
    cvRFChars, testRFChars = ModelRandomForest(200, kf, trainchars, trainlabels, testchars)
    cvURL, testURL = ModelLogistic(2.0, kf, trainURL, trainlabels, testURL)
    cv10, test10 = ModelLogistic(0.8, kf, trainfirst10, trainlabels, testfirst10)
    cv15, test15 = ModelLogistic(0.8, kf, trainfirst15, trainlabels, testfirst15)
    cvlabels = ModelNone(kf, trainlabels)
    # Level-1 stacker: one column per level-0 model.
    X = np.vstack((cvLogWords, cvMultWords, cvLogChars, cvRFChars, cvURL, cv10, cv15)).T
    y = cvlabels
    Xtest = np.vstack((testLogWords, testMultWords, testLogChars, testRFChars, testURL, test10, test15)).T
    model = Ridge(alpha=500)
    model.fit(X, y)
    outputs = model.predict(Xtest)
    # BUG FIX: the original called a bare `vstack`, which is not defined in
    # this module (NameError at runtime); qualify it as np.vstack.
    final = np.vstack((testlabels.T.astype(int), outputs.T.astype(float))).T
    # 'wb' mode is the Python 2 csv convention (matches the 'rb' reads above).
    # Properly close the file instead of the original `open_file_object = 1`.
    out_file = open('Solution.csv', "wb")
    try:
        writer = csv.writer(out_file)
        writer.writerow(['urlid', 'label'])
        for row in final:
            writer.writerow(row)
    finally:
        out_file.close()
def GetLabelsAndText():
    """Read train.tsv/test.tsv and build all TF-IDF feature views.

    Returns (trainlabels, testlabels, Words, Chars, URL, First10, First15);
    every feature matrix stacks the train rows first, then the test rows,
    so callers split them by len(trainlabels).
    """
    # Python 2 csv idioms throughout: binary file mode and reader.next()
    # to consume the header row.
    csv_file_object = csv.reader(open('train.tsv','rb'),delimiter='\t')
    header = csv_file_object.next()
    trainlabels=[]
    text=[]
    URL = []
    for row in csv_file_object:
        text.append(row[2])          # boilerplate/body text column
        trainlabels.append(row[26])  # evergreen label -- assumes column 26; TODO confirm against train.tsv schema
        # Extract the bare hostname from the page URL; empty string if no match.
        temp = re.search('http://(.+?)/',row[0])
        if temp:
            URL.append(temp.group(1))
        else:
            URL.append('')
    trainlabels = np.array(trainlabels).astype(int)
    csv_file_object = csv.reader(open('test.tsv','rb'),delimiter='\t')
    header = csv_file_object.next()
    testlabels=[]
    for row in csv_file_object:
        text.append(row[2])
        testlabels.append(row[1])  # urlid column, kept for the submission file
        temp = re.search('http://(.+?)/',row[0])
        if temp:
            URL.append(temp.group(1))
        else:
            URL.append('')
    testlabels = np.array(testlabels)
    text = np.array(text)
    URL = np.array(URL)
    # NOTE(review): `text` is now a fixed-width numpy string array; the
    # in-place assignments below silently truncate any replacement longer
    # than the widest original entry. Presumably harmless here since the
    # substitutions never lengthen the string -- verify if patterns change.
    temptext = text.copy()
    for i in range(text.shape[0]):
        # Normalize for character n-grams: lowercase, map '.', ',' and
        # space runs to a single space.
        text[i] = re.sub('\.|\,| +',' ',text[i].lower())
    # Character-trigram TF-IDF view.
    vect = TfidfVectorizer(norm='l2',min_df=3,max_df=1.0,strip_accents='unicode',analyzer='char',ngram_range=(3,3),use_idf=1,smooth_idf=1,sublinear_tf=1)
    Chars = vect.fit_transform(text)
    # Restore the untouched text, then strip the raw JSON scaffolding
    # ("title"/"url"/"body" keys) before word-level tokenization.
    text = temptext
    for i in range(text.shape[0]):
        text[i] = re.sub('","url":"|{"title":"|","body":"|"}|\.|\,| +',' ',text[i].lower())
    # Word uni+bigram TF-IDF view, Snowball-stemmed, English stopwords removed.
    vect = TfidfVectorizer(norm='l2',stop_words='english',min_df=3,max_df=1.0,strip_accents='unicode',analyzer='word',ngram_range=(1,2),use_idf=1,smooth_idf=1,sublinear_tf=1,tokenizer=SnowballTokenizer())
    Words = vect.fit_transform(text)
    # Hostname view: default word tokenization over the extracted domains.
    vect = TfidfVectorizer(strip_accents = None,tokenizer = None,analyzer='word' )
    URL = vect.fit_transform(URL)
    # Unigram views over only the first 10 / 15 tokens of each document
    # (Lancaster-stemmed) -- presumably approximating title/lead-in text.
    vect = TfidfVectorizer(norm='l2',min_df=3,max_df=1.0,analyzer='word',ngram_range=(1,1),use_idf=1,smooth_idf=1,sublinear_tf=1,tokenizer=LTokenizer10())
    First10 = vect.fit_transform(text)
    vect = TfidfVectorizer(norm='l2',min_df=3,max_df=1.0,analyzer='word',ngram_range=(1,1),use_idf=1,smooth_idf=1,sublinear_tf=1,tokenizer=LTokenizer15())
    First15 = vect.fit_transform(text)
    return trainlabels, testlabels, Words, Chars, URL, First10, First15
def ModelNone(kf, y):
    """Reorder labels to match the out-of-fold prediction order.

    Concatenates y[cv] across the CV folds so the result aligns row-for-row
    with the stacked CV predictions returned by the Model* functions.
    """
    # Collect each fold's labels and concatenate. This replaces the original
    # fixed (5, len(y)/5) preallocation, which (a) silently assumed exactly
    # five equal-sized folds -- StratifiedKFold folds can differ by one --
    # and (b) broke under Python 3, where len(y)/5 is a float.
    folds = [y[cv] for train, cv in kf]
    return np.hstack(folds)
def ModelLogistic(cValue, kf, X, y, Xtest):
    """Logistic-regression level-0 model.

    cValue -- inverse regularization strength C.
    Returns (cv_predictions, test_predictions): out-of-fold positive-class
    probabilities for the training rows (concatenated in fold order), and
    probabilities for Xtest from a model refit on all training data.
    """
    # Accumulate per-fold predictions in a list. The original preallocated
    # a (5, len(y)/5) array, which assumed five equal-sized folds and used
    # Python-2-only integer division.
    fold_outputs = []
    for train, cv in kf:
        model = LogisticRegression(C=cValue)
        model.fit(X[train], y[train])
        fold_outputs.append(model.predict_proba(X[cv])[:, 1])
    # Refit on the full training set for the test predictions.
    model = LogisticRegression(C=cValue)
    model.fit(X, y)
    outputsTest = model.predict_proba(Xtest)[:, 1]
    return np.hstack(fold_outputs), outputsTest
def ModelMultinomial(kValue, alpha, kf, X, y, Xtest):
    """Multinomial naive-Bayes level-0 model with chi2 feature selection.

    kValue -- percentile of features kept by SelectPercentile (despite the
              'k' in the name, this is a percentile, not a k-best count).
    alpha  -- naive-Bayes smoothing parameter.
    Returns (cv_predictions, test_predictions) as in ModelLogistic.
    """
    FS = SelectPercentile(score_func=chi2, percentile=kValue)
    # Per-fold accumulation replaces the original fixed (5, len(y)/5)
    # preallocation (equal-fold assumption, Python-2-only integer division).
    fold_outputs = []
    for train, cv in kf:
        # Fit the selector on this fold's training split only, then apply
        # the same selection to the held-out split.
        X_new = FS.fit_transform(X[train], y[train])
        model = MultinomialNB(alpha=alpha)
        model.fit(X_new, y[train])
        fold_outputs.append(model.predict_proba(FS.transform(X[cv]))[:, 1])
    # Refit selector and model on all training data for the test predictions.
    X_new = FS.fit_transform(X, y)
    model = MultinomialNB(alpha=alpha)
    model.fit(X_new, y)
    outputsTest = model.predict_proba(FS.transform(Xtest))[:, 1]
    return np.hstack(fold_outputs), outputsTest
def ModelRandomForest(kChi, kf, X, y, Xtest):
    """Random-forest level-0 model with chi2 k-best feature selection.

    kChi -- number of features kept by SelectKBest.
    Returns (cv_predictions, test_predictions) as in ModelLogistic.
    NOTE(review): no random_state is set, so results vary run to run.
    """
    FS = SelectKBest(score_func=chi2, k=kChi)
    # Per-fold accumulation replaces the original fixed (5, len(y)/5)
    # preallocation (equal-fold assumption, Python-2-only integer division).
    fold_outputs = []
    for train, cv in kf:
        X_new = FS.fit_transform(X[train], y[train])
        model = RandomForestClassifier(n_estimators=1000)
        # Random forests require dense input; the TF-IDF matrices are sparse.
        model.fit(X_new.todense(), y[train])
        fold_outputs.append(model.predict_proba(FS.transform(X[cv].todense()))[:, 1])
    # Refit selector and model on all training data for the test predictions.
    X_new = FS.fit_transform(X, y)
    model = RandomForestClassifier(n_estimators=1000)
    model.fit(X_new.todense(), y)
    outputsTest = model.predict_proba(FS.transform(Xtest.todense()))[:, 1]
    return np.hstack(fold_outputs), outputsTest
# Script entry point: run the full pipeline when executed directly.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment