Skip to content

Instantly share code, notes, and snippets.

@rozanecm
Last active October 31, 2019 14:25
Show Gist options
  • Save rozanecm/ee8333741db42b10158b3e0aff3f22aa to your computer and use it in GitHub Desktop.
Pipeline blueprint to have fast access to creating ml models.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
# Column groups consumed by the ColumnTransformer pipeline below.
# Populate these with your dataset's column names before running.
cat_columns = []  # categorical columns — NOTE(review): the transformers below reference small_size_cat_columns / large_size_cat_columns, not this name; confirm which is intended
num_columns = []  # numeric columns (imputed + standard-scaled)
bool_columns = []  # boolean columns (imputed with the most frequent value)
text_columns = []  # free-text columns; each one gets its own hashing-vectorizer pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
# Per-column-group preprocessing pipelines, registered as ColumnTransformer
# (name, pipeline, columns) triples.
# NOTE(review): `seed`, `small_size_cat_columns` and `large_size_cat_columns`
# are not defined in this snippet — they must be supplied by the surrounding
# notebook/script (only `cat_columns` is defined at the top of the file).
transformers = []

# Low-cardinality categoricals: impute missing as "" -> one-hot -> SVD (11 dims).
transformers.append((
    "small_cat",
    Pipeline(steps=[
        ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
        ("one_hot", OneHotEncoder(handle_unknown='ignore')),
        ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed)),
    ]),
    small_size_cat_columns,
))

# High-cardinality categoricals: same recipe, but keep more SVD components (25).
transformers.append((
    "large_cat",
    Pipeline(steps=[
        ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
        ("one_hot", OneHotEncoder(handle_unknown='ignore')),
        ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed)),
    ]),
    large_size_cat_columns,
))

# Numerics: impute with the most frequent value, then standardize.
# The original passed verbose=1 to SimpleImputer; that parameter was
# deprecated in scikit-learn 1.1 and removed in 1.3, so it is dropped here.
transformers.append((
    "num",
    Pipeline(steps=[
        ("num_imputer", SimpleImputer(strategy='most_frequent')),
        ("num_transformer", StandardScaler()),
    ]),
    num_columns,
))

# Booleans: imputation only — no scaling/encoding needed.
transformers.append((
    "bool",
    Pipeline(steps=[
        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
    ]),
    bool_columns,
))
# Each text column gets its own transformer because text transformers take an
# array-like parameter: passing a list of columns would hand the transformer a
# DataFrame and fail, while a single column name yields a 1-D Series.
# If you don't want to process every text column with the same pipeline,
# define a separate pipeline per column and register it here instead.
for col in text_columns:
    # Replace missing text with "" so the vectorizer never sees NaN.
    # NOTE(review): X_train/X_test/train/test are assumed to be DataFrames
    # defined by the surrounding notebook — not visible in this snippet.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((
        transformer_name,
        Pipeline(steps=[
            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed)),
        ]),
        col,  # single column name (not a list) -> transformer receives a Series
    ))
# Combine all per-group transformers; columns not listed anywhere are dropped.
my_col_transformer = ColumnTransformer(
    transformers,
    remainder='drop',
    sparse_threshold=0.3,
    n_jobs=-1,
    transformer_weights=None,
)

from sklearn.ensemble import RandomForestRegressor

# Preprocessing followed by a random-forest regressor, as a single estimator.
steps = [
    ("col_trans", my_col_transformer),
    ("rfr", RandomForestRegressor(n_estimators=100,
                                  n_jobs=-1,
                                  random_state=seed)),
]
my_pipe = Pipeline(steps, verbose=True)
# The model needs numeric input, so booleans are mapped to 0/1 before
# fitting and predicting.
bool_to_int = {True: 1, False: 0}
my_pipe.fit(X_train.replace(bool_to_int), y_train)
y_scores = my_pipe.predict(X_test.replace(bool_to_int))

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_scores))
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
return "../predictions/" + timestamp + " by " + my_name + ".csv"
def _save_description(authors_name, timestamp, submission_description):
    """Append a "timestamp: description" line to ../predictions/<author>.txt.

    The ../predictions directory must already exist.
    """
    # `with` guarantees the handle is closed even if the write raises,
    # unlike the original open()/close() pair.
    with open("../predictions/" + authors_name + ".txt", "a") as f:
        f.write(timestamp + ": " + submission_description + '\n')
def save_submission(submission_df, authors_name="rozanecm", description = "no description.", index=False, header=False):
    """Save predictions as a timestamped CSV and log its description.

    Requires a ../predictions directory to exist.
    """
    # NOTE(review): the timestamp contains ':' characters, which are not valid
    # in Windows file names — fine on POSIX; confirm the target platform.
    stamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, stamp), index=index, header=header)
    _save_description(authors_name, stamp, description)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment