Skip to content

Instantly share code, notes, and snippets.

@Newmu
Last active August 29, 2015 14:19
Show Gist options
  • Save Newmu/fd18ffd002af49b9b208 to your computer and use it in GitHub Desktop.
Save Newmu/fd18ffd002af49b9b208 to your computer and use it in GitHub Desktop.
95.05% accuracy on Yelp
import json
import numpy as np
from time import time
from matplotlib import pyplot as plt
import random
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer
def len_filter(text, max_len=1014):
    """Truncate *text* at a word boundary so it fits in about *max_len* chars.

    The text is split on single spaces and the longest prefix of words whose
    space-joined length stays below ``max_len`` is returned.  The final word
    of the text gets one character of slack, so a text whose total length is
    exactly ``max_len`` is returned unchanged.

    Args:
        text: The string to truncate.
        max_len: Character budget for the result (previously ignored in
            favour of a hard-coded 1014; the default keeps that behaviour).

    Returns:
        The kept words re-joined with single spaces; ``''`` if even the
        first word does not fit.
    """
    words = text.split(' ')
    # Every word costs its own length plus one separating space ...
    lens = [len(word) + 1 for word in words]
    # ... except that the first word has no space before it, and the last
    # word is granted one extra character (for a single word both apply).
    lens[0] -= 1
    lens[-1] -= 1
    # Running totals: entry k is the budget used once word k is included.
    lens = np.cumsum(lens).tolist()
    words = [w for w, l in zip(words, lens) if l < max_len]
    return ' '.join(words)
def load(review_json_path, ntrain=10000, ntest=10000):
    """Build a balanced, shuffled train/test split of review texts.

    The file is read line by line, each line being parsed as a JSON object
    with ``'stars'`` and ``'text'`` entries.  Reviews with exactly 3 stars or
    with fewer than 100 characters of text are skipped; the rest are
    truncated with ``len_filter`` and collected as positive (more than 3
    stars) or negative (fewer than 3 stars).  Reading stops as soon as both
    collections hold at least half of ``ntrain + ntest`` reviews.

    Args:
        review_json_path: Path to a file containing one JSON review per line.
        ntrain: Number of training examples requested.
        ntest: Number of test examples requested.

    Returns:
        A tuple ``(trX, teX, trY, teY)``: lists of training texts, test
        texts, training labels and test labels.  Labels are ``1.`` for
        positive and ``0.`` for negative reviews.  The last ``ntest`` shuffled
        examples form the test set, the remainder the training set.

    Raises:
        ValueError: If the file does not contain enough usable positive or
            negative reviews to draw a balanced sample.
    """
    # Fixed seed so the sampling and shuffling below are reproducible.
    random.seed(42)
    total = ntrain + ntest
    half = total // 2
    pos_text = []
    neg_text = []
    # The context manager closes the file even if parsing a row fails.
    with open(review_json_path) as f:
        for i, row in enumerate(f):
            if min(len(pos_text), len(neg_text)) * 2 >= total:
                break
            data = json.loads(row)
            if data['stars'] != 3 and len(data['text']) >= 100:
                text = len_filter(data['text'])
                if data['stars'] > 3:
                    pos_text.append(text)
                else:
                    neg_text.append(text)
            # Progress report: row index and size of the balanced pool so far.
            # Formatted by hand so the output is the same on Python 2 and 3.
            if i % 10000 == 0:
                print('%s %s' % (i, min(len(pos_text), len(neg_text)) * 2))
    if min(len(pos_text), len(neg_text)) < half:
        raise ValueError(
            'not enough reviews for a balanced sample: need %d per class, '
            'found %d positive and %d negative'
            % (half, len(pos_text), len(neg_text)))
    text = random.sample(pos_text, half) + random.sample(neg_text, half)
    labels = [1.] * half + [0.] * half
    # Shuffle an index array so texts and labels are permuted identically.
    idxs = np.arange(len(text))
    random.shuffle(idxs)
    text = [text[idx] for idx in idxs]
    labels = [labels[idx] for idx in idxs]
    # Split by an explicit index: slicing with ``-ntest`` would put every
    # example in the test set when ``ntest`` is 0.
    split = max(len(text) - ntest, 0)
    trX = text[:split]
    teX = text[split:]
    trY = labels[:split]
    teY = labels[split:]
    return trX, teX, trY, teY
if __name__ == "__main__":
    # Review file handed to load(), which parses it one JSON object per line.
    review_json_path = '/home/alec/datasets/yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'
    trX, teX, trY, teY = load(review_json_path, ntrain=500000, ntest=50000)

    # Vectorize: unigram + bigram TF-IDF features, fitted on the training
    # texts only and then applied to the test texts.
    vect_start = time()
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
    trX = vectorizer.fit_transform(trX)
    teX = vectorizer.transform(teX)
    print(trX.shape)
    print('%s %s' % ('time to vect', time() - vect_start))

    # Train the logistic-regression classifier on the TF-IDF features.
    fit_start = time()
    classifier = LR(C=8.)
    classifier.fit(trX, trY)
    print('%s %s' % ('time to model', time() - fit_start))

    # Report accuracy on the training set first, then on the held-out set.
    tr_pred = classifier.predict(trX)
    te_pred = classifier.predict(teX)
    for truth, predicted in ((trY, tr_pred), (teY, te_pred)):
        print(metrics.accuracy_score(truth, predicted))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment