Skip to content

Instantly share code, notes, and snippets.

@adammenges
Last active August 29, 2015 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adammenges/f92ec779a9bf7edb64d5 to your computer and use it in GitHub Desktop.
Save adammenges/f92ec779a9bf7edb64d5 to your computer and use it in GitHub Desktop.
import string, sklearn, random
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
def tok(m):
    """Split the document string `m` into tokens on runs of whitespace.

    This is passed to `CountVectorizer` as its `analyzer` callable, so it
    replaces the vectorizer's built-in tokenisation entirely.

    Args:
        m: a single document as a string.

    Returns:
        A list of whitespace-delimited tokens (empty for an empty or
        all-whitespace string).
    """
    return m.split()
def generate_model(docs, labels):
    """Grid-search an SVM text classifier and return the fitted search.

    The estimator is a three-step pipeline: bag-of-words counts (tokenised
    by `tok`), TF-IDF re-weighting, and an `SVC` with probability estimates
    switched on. The grid covers a linear kernel over several `C` values
    and an RBF kernel over `C` x `gamma`, scored by accuracy on 5
    stratified folds.

    Args:
        docs: sequence of raw text documents (strings).
        labels: class labels, one per document. They are also used to
            build the stratified folds.

    Returns:
        Whatever `GridSearchCV.fit` returns: the fitted search object,
        refit on the full data with the best parameters (`refit=True`).
    """
    pipeline_svm = Pipeline([
        ('bow', CountVectorizer(analyzer=tok)),
        ('tfidf', TfidfTransformer()),
        # probability=True is what makes predict_proba available later.
        ('classifier', SVC(probability=True)),
    ])

    # The `classifier__` prefix routes each parameter to the pipeline's
    # 'classifier' step. gamma is only searched for the RBF kernel.
    param_svm = [
        {
            'classifier__C': [1, 10, 100, 1000],
            'classifier__kernel': ['linear'],
        },
        {
            'classifier__C': [1, 10, 100, 1000],
            'classifier__gamma': [0.001, 0.0001],
            'classifier__kernel': ['rbf'],
        },
    ]

    # NOTE(review): GridSearchCV and StratifiedKFold come from the legacy
    # `sklearn.grid_search` / `sklearn.cross_validation` modules imported by
    # this file; the fold object is therefore built from the labels up
    # front. Newer scikit-learn releases moved both to
    # `sklearn.model_selection` with a different StratifiedKFold signature.
    grid_svm = GridSearchCV(
        pipeline_svm,
        param_grid=param_svm,
        refit=True,
        n_jobs=-1,  # use every available core
        scoring='accuracy',
        cv=StratifiedKFold(labels, n_folds=5),
    )
    return grid_svm.fit(docs, labels)
def random_word(length=3):
    """Return a random word of `length` lowercase ASCII letters.

    Uses `string.ascii_lowercase` rather than `string.lowercase`: the
    latter exists only on Python 2 and varies with the locale, while
    `ascii_lowercase` is available on both Python 2 and 3.

    Args:
        length: number of letters in the word (default 3).

    Returns:
        A string of `length` characters drawn uniformly from a-z.
    """
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))
def random_doc():
    """Return a synthetic document: ten random words joined by single spaces."""
    words = [random_word() for _ in range(10)]
    return ' '.join(words)
# Demo driver: fit the grid search on synthetic data. Documents and labels
# are both drawn at random, so this exercises the pipeline end to end
# rather than producing a meaningful classifier.
N_SAMPLES = 2000
docs = [random_doc() for _ in range(N_SAMPLES)]
labels = [random.choice(['A', 'B']) for _ in range(N_SAMPLES)]
model = generate_model(docs, labels)
# predict_proba takes an iterable of documents. A bare string would be
# iterated character by character (or rejected outright by the vectorizer),
# so the single query document is wrapped in a list.
model.predict_proba(["foo bar"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment