-
-
Save mprzybyla123/9af0d3f9ff09e8a04ddca8d1e14aa01e to your computer and use it in GitHub Desktop.
classification-models
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import libraries | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.model_selection import train_test_split | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn import svm | |
from sklearn import metrics | |
import pandas as pd | |
# text and numeric classes that use sklearn base libaries | |
class TextTransformer(BaseEstimator, TransformerMixin):
    """Select a single text column from a DataFrame.

    Acts as a pass-through column selector so a downstream text
    vectorizer (e.g. TfidfVectorizer) receives raw documents.

    Parameters
    ----------
    key : str
        Name of the DataFrame column to extract.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        # Stateless selector — there is nothing to learn from the data.
        return self

    def transform(self, X):
        # Return a 1-D Series of documents (not a 2-D frame), which is
        # the shape text vectorizers expect.
        return X[self.key]
class NumberTransformer(BaseEstimator, TransformerMixin):
    """Select a single numeric column from a DataFrame.

    Pass-through column selector that keeps the output two-dimensional,
    as required by estimators downstream of a FeatureUnion.

    Parameters
    ----------
    key : str
        Name of the DataFrame column to extract.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Stateless selector — there is nothing to learn from the data.
        return self

    def transform(self, X):
        # Double brackets keep the result a (n_samples, 1) DataFrame
        # rather than a 1-D Series.
        return X[[self.key]]
# Load the modeling dataset.
df = pd.read_csv('/Users/data.csv')

# Sanity-check the first 5 rows.  In a script the bare `df.head()` of the
# original was a no-op (its return value was discarded), so print it.
print(df.head())

# Term-frequency / inverse-document-frequency vectorizer: transforms raw
# token counts into an L2-normalized matrix of term-importance weights.
vec_tdidf = TfidfVectorizer(ngram_range=(1, 1), analyzer='word', norm='l2')

# Pipeline: select the 'Text_Feature' column, then TF-IDF vectorize it.
color_text = Pipeline([
    ('transformer', TextTransformer(key='Text_Feature')),
    ('vectorizer', vec_tdidf),
])

# Column selectors for the three numeric features.
test_numeric = Pipeline([
    ('transformer', NumberTransformer(key='Confirmed_Test')),
])
recovery_numeric = Pipeline([
    ('transformer', NumberTransformer(key='Confirmed_Recovery')),
])
new_numeric = Pipeline([
    ('transformer', NumberTransformer(key='Confirmed_New')),
])

# Combine the text and numeric features into one feature matrix.
features = FeatureUnion([
    ('Text_Feature', color_text),
    ('Confirmed_Test', test_numeric),
    ('Confirmed_Recovery', recovery_numeric),
    ('Confirmed_New', new_numeric),
])

# Choose exactly ONE classifier.  The original assigned all five in a row,
# so only the last one (KNeighborsClassifier) ever took effect; the
# alternatives are kept commented out for easy swapping.
# clf = RandomForestClassifier()
# clf = MultinomialNB()
# clf = LogisticRegression()
# clf = svm.SVC()
clf = KNeighborsClassifier()

# Unite the feature union and the classifier into a single pipeline.
pipe = Pipeline([
    ('features', features),
    ('clf', clf),
])

# NOTE(review): the original built `predicted_dummies = pd.get_dummies(...)`
# here but never used it — training fits directly on the raw label column
# below — so the dead computation has been removed.

# Split the data: isolate the features from the predicted field.
text_numeric_features = ['Text_Feature', 'Confirmed_Test',
                         'Confirmed_Recovery', 'Confirmed_New']
predictor = 'Text_Predictor'
X_train, X_test, y_train, y_test = train_test_split(
    df[text_numeric_features], df[predictor],
    test_size=0.25, random_state=42)

# Fit the model on the training split.
pipe.fit(X_train, y_train)

# Predict on the held-out test split.
preds = pipe.predict(X_test)

# Report accuracy on the test set.
print("Accuracy:", metrics.accuracy_score(y_test, preds))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment