Last active
October 31, 2019 14:25
-
-
Save rozanecm/ee8333741db42b10158b3e0aff3f22aa to your computer and use it in GitHub Desktop.
Pipeline blueprint for quickly building ML models.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa | |
# Column groups consumed by the transformers built further down in this
# blueprint. Fill each list with the matching column names before running.
cat_columns = []
# The categorical pipelines below read these two lists (not cat_columns);
# they differ only in how many SVD components are kept for each group.
small_size_cat_columns = []
large_size_cat_columns = []
num_columns = []
bool_columns = []
text_columns = []
# Passed as random_state to every randomized estimator in the blueprint so
# that runs are reproducible; change the value as needed.
seed = 42
from sklearn.pipeline import Pipeline | |
from sklearn.compose import ColumnTransformer | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.feature_extraction.text import HashingVectorizer | |
from sklearn.decomposition import TruncatedSVD | |
# Each entry is a (name, pipeline, columns) triple that is later handed to
# the ColumnTransformer.
transformers = []

# Columns in small_size_cat_columns: fill missing values with an empty
# string, one-hot encode (categories unseen during fit are ignored), then
# reduce the encoded matrix to 11 SVD components.
transformers.append(("small_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed))
                     ]),
                     small_size_cat_columns))

# Columns in large_size_cat_columns: same steps, but 25 SVD components.
transformers.append(("large_cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed))
                     ]),
                     large_size_cat_columns))

# Numeric columns: impute with the most frequent value, then standardize.
# `verbose` is deliberately not passed to SimpleImputer: the parameter was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it raises
# a TypeError.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent')),
                         ("num_transformer", StandardScaler())
                     ]),
                     num_columns))

# Boolean columns: only impute with the most frequent value.
transformers.append(("bool",
                     Pipeline(steps=[
                         ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                     ]),
                     bool_columns))
# Text transformers take an array-like parameter. If a list of columns were
# passed, the transformer would receive a dataframe and fail, so each text
# column gets its own transformer, selected by a single column name.
# To process some text columns with a different pipeline, define a separate
# pipeline for them and loop over a separate list of column names.
for col in text_columns:
    # Replace missing texts with an empty string in every frame.
    for frame in (X_train, X_test, train, test):
        frame[col] = frame[col].fillna("")

    text_pipeline = Pipeline(steps=[
        ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
        ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed)),
    ])
    # The column is given as a plain string (not a list) on purpose.
    transformers.append(("text_" + col, text_pipeline, col))
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Apply every registered transformer to its columns; columns not claimed by
# any transformer are dropped.
my_col_transformer = ColumnTransformer(
    transformers,
    remainder='drop',
    sparse_threshold=0.3,
    n_jobs=-1,
    transformer_weights=None,
)

# Full model: column preprocessing followed by a random forest regressor.
steps = [
    ("col_trans", my_col_transformer),
    ("rfr", RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=seed)),
]
my_pipe = Pipeline(steps, verbose=True)

# The algorithms need numbers, so booleans are replaced by 1/0 before both
# fitting and predicting.
bool_to_int = {True: 1, False: 0}
my_pipe.fit(X_train.replace(bool_to_int), y_train)
y_scores = my_pipe.predict(X_test.replace(bool_to_int))

# Report the mean absolute error on the held-out set.
print(mean_absolute_error(y_test, y_scores))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To save predictions. | |
# There must be a directory ../predictions for this to work as expected. | |
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa | |
import time | |
def _get_filename(my_name, timestamp): | |
return "../predictions/" + timestamp + " by " + my_name + ".csv" | |
def _save_description(authors_name, timestamp, submission_description): | |
f = open("../predictions/" + authors_name + ".txt","a") | |
f.write(timestamp + ": " + submission_description + '\n') | |
f.close() | |
def save_submission(submission_df, authors_name="rozanecm", description="no description.", index=False, header=False):
    """Write a submission to CSV and log its description.

    The CSV path is built by ``_get_filename`` from ``authors_name`` and the
    current local time; the same timestamp and ``description`` are then
    passed to ``_save_description``.

    Args:
        submission_df: object whose ``to_csv`` method writes the submission.
        authors_name: author tag used in the CSV name and the log file name.
        description: free-text note stored next to the timestamp.
        index: forwarded to ``to_csv``.
        header: forwarded to ``to_csv``.
    """
    now = time.strftime("%Y.%m.%d - %H:%M:%S")
    csv_path = _get_filename(authors_name, now)
    submission_df.to_csv(csv_path, index=index, header=header)
    _save_description(authors_name, now, description)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment