# Import pipeline
from sklearn.pipeline import Pipeline
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Import other preprocessing modules
# Note: SimpleImputer (sklearn.impute) replaces the old sklearn.preprocessing.Imputer,
# which was removed in scikit-learn 0.22
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import chi2, SelectKBest
# Select 300 best features
chi_k = 300
# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion
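# combine_text_columns and NUMERIC_COLUMNS come from the course helper code and are
# not defined in this gist. The sketch below is a hypothetical stand-in: NUMERIC_COLUMNS
# names the numeric budget columns, and combine_text_columns joins every remaining
# (text) column into one space-separated string per row.
NUMERIC_COLUMNS = ['FTE', 'Total']  # assumption: the numeric columns in the budget data

def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS):
    """Concatenate all non-numeric columns into a single text string per row."""
    to_drop = set(to_drop) & set(data_frame.columns)
    text_data = data_frame.drop(columns=to_drop).fillna('')
    return text_data.apply(lambda row: ' '.join(row.astype(str)), axis=1)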
# Select the text and numeric columns as separate inputs for the FeatureUnion
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)
# Create the token pattern: alphanumeric runs followed by whitespace
TOKENS_ALPHANUMERIC = r'[A-Za-z0-9]+(?=\s+)'
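# For example, re.findall(TOKENS_ALPHANUMERIC, 'pre k funding 2500 ') would return
# ['pre', 'k', 'funding', '2500']: each alphanumeric run must be followed by whitespace.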
# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer
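# SparseInteractions is a custom transformer from the winning competition solution and
# is not part of scikit-learn; it is not defined in this gist. The class below is a
# minimal sketch of one plausible implementation: it appends products of feature-column
# combinations (up to `degree`) to the sparse matrix, i.e. interaction terms that stay sparse.
from itertools import combinations
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class SparseInteractions(BaseEstimator, TransformerMixin):
    def __init__(self, degree=2):
        self.degree = degree

    def fit(self, X, y=None):
        # Nothing to learn: interactions are computed column-wise at transform time
        return self

    def transform(self, X):
        if not sparse.isspmatrix_csc(X):
            X = sparse.csc_matrix(X)
        out_mat = [X]
        for deg in range(2, self.degree + 1):
            for combo in combinations(range(X.shape[1]), deg):
                interaction = X[:, combo[0]]
                for col in combo[1:]:
                    interaction = interaction.multiply(X[:, col])
                out_mat.append(interaction)
        return sparse.hstack(out_mat).tocsr()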
# Instantiate the winning model pipeline: pl
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                # Fill missing numeric values (mean strategy by default)
                ('imputer', SimpleImputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                # alternate_sign=False keeps hashed counts non-negative so chi2 can be
                # applied; it replaces the removed non_negative=True parameter
                ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                 alternate_sign=False, norm=None,
                                                 binary=False, ngram_range=(1, 2))),
                # Keep only the chi_k text features most associated with the labels
                ('dim_red', SelectKBest(chi2, k=chi_k))
            ]))
        ]
    )),
    # Add pairwise interaction terms between the combined sparse features
    ('int', SparseInteractions(degree=2)),
    # Scale each feature to [-1, 1] without densifying the sparse matrix
    ('scale', MaxAbsScaler()),
    # One-vs-rest logistic regression handles the multilabel targets
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
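# Example usage (sketch; X_train, y_train and holdout are assumed to exist):
# X_train is a DataFrame with NUMERIC_COLUMNS plus free-text columns, and y_train
# holds the binarized multilabel targets expected by OneVsRestClassifier.
# pl.fit(X_train, y_train)
# predictions = pl.predict_proba(holdout)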