Skip to content

Instantly share code, notes, and snippets.

@abolibibelot
Created June 25, 2013 15:31
Show Gist options
  • Save abolibibelot/5859432 to your computer and use it in GitHub Desktop.
Save abolibibelot/5859432 to your computer and use it in GitHub Desktop.
Naive scikit-learn benchmark
# Path of the labelled corpus to benchmark against.
fname = '/Users/yann/temp/products.txt'
#vec = CountVectorizer(input='content',analyzer=str.split,tokenizer=None)

# Each input line is parsed as "<label>,<text>": everything before the first
# comma becomes the class label, the remainder (which may itself contain
# commas, and keeps its trailing newline) becomes the raw text sample.
features = []
y = []
# `with` guarantees the file handle is released, even if a line fails to parse.
with open(fname, 'r') as source:
    for line in source:
        if not line.strip():
            # A blank line (typically a trailing newline at end of file) holds
            # no sample; skipping it avoids a ValueError on the unpack below.
            continue
        # A non-blank line without a comma is still an error and raises here.
        label, text = line.split(',', 1)
        features.append(text)
        y.append(label)
# Timestamped progress message, emitted right before the split starts.
print("%s splitting data sets" % (datetime.now(),))

# Partition samples and labels together; test_size=.3 requests a 30% validation
# share and the fixed random_state makes the partition repeatable across runs.
split = train_test_split(features, y, test_size=.3, random_state=42)
text_train, text_validation, target_train, target_validation = split
#X = CountVectorizer(input='content',analyzer=str.split,tokenizer=None).fit_transform(text_train)
#XValid = CountVectorizer(input='content',analyzer=str.split,tokenizer=None).fit_transform(text_validation)
# (display name, estimator) pairs to benchmark. The display name is what gets
# printed next to each score, so it must match the estimator it labels.
classifiers = [
    ("Passive Aggressive", PassiveAggressiveClassifier(C=1, n_iter=5)),
    ("Linear SVC", LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                             intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
                             random_state=None, tol=0.0001, verbose=0)),
    ("Multinomial NB", MultinomialNB()),
    ("Bernoulli NB", BernoulliNB()),
]
# Fit and score every candidate on the same train/validation partition.
for name, clf in classifiers:
    # A fresh vectorizer per classifier: str.split is used as the analyzer, so
    # each document is tokenized on whitespace only.
    vectorizer = CountVectorizer(input='content', analyzer=str.split, tokenizer=None)
    pipeline = Pipeline([('vec', vectorizer), ('clf', clf)])

    print('*************')
    print(name)

    # Train on the training texts, then report the pipeline's score on the
    # held-out validation texts.
    pipeline.fit(text_train, target_train)
    validation_score = pipeline.score(text_validation, target_validation)
    print("score: %s" % (validation_score,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment