Created
January 24, 2019 03:08
-
-
Save yuriybash/3073d144a7dcc92c05d33e5cbb6675f1 to your computer and use it in GitHub Desktop.
pickling_an_sklearn_classifier_w_custom_transformer_attempt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto | |
import datetime | |
import pickle | |
import sys | |
import yaml | |
from os.path import dirname, join | |
from sklearn.externals import joblib | |
import pandas as pd | |
import numpy as np | |
import random as rnd | |
from scipy import sparse | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.naive_bayes import GaussianNB, MultinomialNB | |
from sklearn.linear_model import LogisticRegression, Perceptron | |
from sklearn.linear_model import SGDClassifier | |
from sklearn.metrics import classification_report | |
from sklearn.neural_network.multilayer_perceptron import MLPClassifier | |
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict | |
from sklearn.pipeline import FeatureUnion, Pipeline | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.svm import SVC, LinearSVC | |
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one item out of its input by key.

    ``transform`` returns ``data_dict[self.key]``; ``fit`` learns nothing.
    """

    def __init__(self, key):
        """Remember ``key``, the subscript used later by ``transform``."""
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return this instance unchanged."""
        return self

    def transform(self, data_dict):
        """Return the item stored under ``self.key`` in ``data_dict``."""
        selected = data_dict[self.key]
        return selected
def generate_model():
    """Fit the title/url text classifier and dump it to ``model.pkl``.

    Files are looked up relative to the parent of this file's directory:
    ``config_single_model.yml`` is parsed with ``yaml.safe_load`` and
    ``data/data.csv`` is read into a DataFrame. The ``title`` and ``url``
    columns are each TF-IDF vectorized (unigrams, at most 500 features),
    concatenated with a ``FeatureUnion`` and fed to ``MultinomialNB``.
    The model is fitted on the 75% training part of a fixed-seed split
    with ``noneng`` as the target, then written with ``joblib.dump`` to
    ``model.pkl`` (a path relative to the current working directory).

    Returns:
        None.

    Raises:
        Whatever ``open``, ``yaml.safe_load``, ``pd.read_csv`` or the
        scikit-learn calls raise; nothing is caught here.
    """
    base_dir = dirname(dirname(__file__))

    # NOTE(review): ``config`` is never used below. The load is kept so a
    # missing or malformed config file still fails exactly as before.
    with open(join(base_dir, 'config_single_model.yml')) as f:
        config = yaml.safe_load(f)

    with open(join(base_dir, 'data/data.csv')) as f:
        data_df = pd.read_csv(f)

    def _text_branch(column):
        # One FeatureUnion branch: select a column, then TF-IDF it.
        # ngram_range is a tuple, as documented by scikit-learn; the
        # original passed a list, which is not the documented type.
        return Pipeline([
            ('selector', ItemSelector(key=column)),
            ('vec', TfidfVectorizer(ngram_range=(1, 1), max_features=500)),
        ])

    pipeline = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('title', _text_branch('title')),
                ('url', _text_branch('url')),
            ],
        )),
        ('estimator_cls', MultinomialNB(alpha=20.0)),
    ])

    # Only the training part is used; the held-out 25% is discarded.
    X_train, _, Y_train, _ = train_test_split(
        data_df[['title', 'url']],
        data_df['noneng'],
        test_size=0.25, random_state=42
    )

    clf = pipeline.fit(X_train, Y_train)

    # NOTE(review): pickle stores ItemSelector by qualified name. When this
    # file is run as a script that name is ``__main__.ItemSelector``, so
    # whatever loads model.pkl must be able to resolve it.
    joblib.dump(clf, 'model.pkl')
# Script entry point: train and persist the model when run directly.
if __name__ == '__main__':
    generate_model()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment