
@Newmu
Created December 11, 2014 01:14
~0.96 on Kaggle IMDB using stupid learning instead of "deep learning"
import numpy as np
import pandas as pd
from lxml import html
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer


def clean(text):
    # Strip HTML markup from a review, lowercase it, and trim whitespace.
    return html.fromstring(text).text_content().lower().strip()
# Load the Kaggle IMDB train/test TSVs.
tr_data = pd.read_csv('/media/datasets/kaggle_imdb/labeledTrainData.tsv', delimiter='\t')
te_data = pd.read_csv('/media/datasets/kaggle_imdb/testData.tsv', delimiter='\t')

# Clean the training reviews and pull out the binary sentiment labels.
trX = [clean(text) for text in tr_data['review'].values]
trY = tr_data['sentiment'].values

# TF-IDF over unigrams and bigrams, dropping terms seen in fewer than 10 documents.
vect = TfidfVectorizer(min_df=10, ngram_range=(1, 2))
trX = vect.fit_transform(trX)

# Plain logistic regression on the sparse TF-IDF features.
model = LR()
model.fit(trX, trY)
# Clean and vectorize the test reviews with the already-fitted vectorizer.
ids = te_data['id'].values
teX = [clean(text) for text in te_data['review'].values]
teX = vect.transform(teX)

# Predicted probability of positive sentiment for each test review.
pr_teX = model.predict_proba(teX)[:, 1]

# Write the Kaggle submission file.
pd.DataFrame({'id': ids, 'sentiment': pr_teX}).to_csv('test.csv', index=False)
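A quick way to sanity-check the ~0.96 figure before submitting is to hold out part of the labeled training data and score it with ROC AUC, the metric Kaggle uses for this competition. The sketch below is not part of the original gist; it assumes tr_data and clean() are defined as above and refits the vectorizer on the training split only, so the validation fold never influences the idf weights.

# Hedged sketch: hold-out ROC AUC estimate (assumes tr_data and clean() from above).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

texts = [clean(text) for text in tr_data['review'].values]
labels = tr_data['sentiment'].values

# Split the raw texts first, then fit TF-IDF only on the training portion.
tr_texts, va_texts, tr_y, va_y = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

va_vect = TfidfVectorizer(min_df=10, ngram_range=(1, 2))
va_model = LogisticRegression()
va_model.fit(va_vect.fit_transform(tr_texts), tr_y)

va_pr = va_model.predict_proba(va_vect.transform(va_texts))[:, 1]
print('hold-out ROC AUC: %.4f' % roc_auc_score(va_y, va_pr))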