Skip to content

Instantly share code, notes, and snippets.

@Newmu
Last active August 29, 2015 14:19
Show Gist options
  • Save Newmu/b1a3e35c090998c5e254 to your computer and use it in GitHub Desktop.
Save Newmu/b1a3e35c090998c5e254 to your computer and use it in GitHub Desktop.
98.76% on dbpedia - TUFS dataset
import os
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
def len_filter(text, max_len=1014):
    """Truncate *text* on word boundaries to a character budget.

    The text is split on single spaces and the leading words are kept for
    as long as the running length of the re-joined string stays below
    ``max_len``.

    Cutoff semantics: a text of at most ``max_len`` characters is returned
    unchanged; a longer text is cut to its longest word-prefix that is
    strictly shorter than ``max_len`` characters.

    Args:
        text: The string to truncate.
        max_len: Character budget. Defaults to 1014.

    Returns:
        The kept words joined back together with single spaces.
    """
    words = text.split(' ')
    # Each word costs its own length plus one separating space ...
    lens = [len(word) + 1 for word in words]
    # ... except that the first word has no space in front of it,
    lens[0] -= 1
    # and the final word is charged one character less again, which is what
    # lets a text of exactly max_len characters through untouched.
    lens[-1] -= 1
    # Running length of the joined string after each word.
    ends = np.cumsum(lens).tolist()
    # Honour the max_len argument instead of a hard-coded 1014.
    kept = [word for word, end in zip(words, ends) if end < max_len]
    return ' '.join(kept)
def load(path):
    """Read a header-less CSV and return ``(texts, labels)``.

    Column 0 is read as the label and shifted down by one, column 1 and
    column 2 are joined with a single space and passed through
    ``len_filter`` to form the text of each row.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A tuple ``(text, label)`` of two lists with one entry per row.
    """
    frame = pd.read_csv(path, header=None)
    # Subtracting the float 1. yields float labels, shifted down by one.
    targets = (frame[0].values - 1.).tolist()
    headings = frame[1].values.tolist()
    contents = frame[2].values.tolist()
    documents = []
    for heading, content in zip(headings, contents):
        documents.append(len_filter(heading + ' ' + content))
    return documents, targets
if __name__ == '__main__':
    # Train and evaluate a TF-IDF + logistic-regression text classifier.
    # print is called with one parenthesised argument throughout so the
    # script runs unchanged on both Python 2 and Python 3.

    # Directory holding train.csv and test.csv; machine-specific path.
    data_dir = '/home/alec/datasets/tufs/dbpedia/'
    trX, trY = load(os.path.join(data_dir, 'train.csv'))
    teX, teY = load(os.path.join(data_dir, 'test.csv'))
    print('%d ntrain' % len(trX))
    print('%d ntest' % len(teX))

    # Vectorize: the vocabulary is fitted on the training split only and
    # then reused to transform the test split.
    t = time()
    vect = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)
    trX = vect.fit_transform(trX)
    teX = vect.transform(teX)
    print('%d features' % trX.shape[1])
    print('%.2f seconds to vectorize' % (time() - t))

    # Fit the classifier and report how long fitting took.
    t = time()
    model = LR(C=10.)
    model.fit(trX, trY)
    print('%.2f seconds to model' % (time() - t))

    # Report accuracy on both splits as a percentage.
    tr_pred = model.predict(trX)
    te_pred = model.predict(teX)
    print('%.2f train accuracy' % (metrics.accuracy_score(trY, tr_pred) * 100.))
    print('%.2f test accuracy' % (metrics.accuracy_score(teY, te_pred) * 100.))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment