Skip to content

Instantly share code, notes, and snippets.

@rozanecm
Last active October 31, 2019 14:25
Show Gist options
  • Save rozanecm/ee8333741db42b10158b3e0aff3f22aa to your computer and use it in GitHub Desktop.
Pipeline blueprint to have fast access to creating ml models.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
# Column groups consumed by the ColumnTransformer pipeline below.
# Populate these with your dataset's column names before running.
cat_columns = []  # categorical columns — NOTE(review): the transformers below reference small_size_cat_columns / large_size_cat_columns, not this name; confirm which is intended
num_columns = []  # numeric columns (imputed + standard-scaled)
bool_columns = []  # boolean columns (imputed with the most frequent value)
text_columns = []  # free-text columns; each one gets its own hashing-vectorizer pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
# Per-column-group preprocessing pipelines, registered as ColumnTransformer
# (name, pipeline, columns) triples.
# NOTE(review): `seed`, `small_size_cat_columns` and `large_size_cat_columns`
# are not defined in this snippet — they must be supplied by the surrounding
# notebook/script (only `cat_columns` is defined at the top of the file).
transformers = []

# Low-cardinality categoricals: impute missing as "" -> one-hot -> SVD (11 dims).
transformers.append((
    "small_cat",
    Pipeline(steps=[
        ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
        ("one_hot", OneHotEncoder(handle_unknown='ignore')),
        ("svd", TruncatedSVD(n_components=11, n_iter=7, random_state=seed)),
    ]),
    small_size_cat_columns,
))

# High-cardinality categoricals: same recipe, but keep more SVD components (25).
transformers.append((
    "large_cat",
    Pipeline(steps=[
        ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
        ("one_hot", OneHotEncoder(handle_unknown='ignore')),
        ("svd", TruncatedSVD(n_components=25, n_iter=7, random_state=seed)),
    ]),
    large_size_cat_columns,
))

# Numerics: impute with the most frequent value, then standardize.
# The original passed verbose=1 to SimpleImputer; that parameter was
# deprecated in scikit-learn 1.1 and removed in 1.3, so it is dropped here.
transformers.append((
    "num",
    Pipeline(steps=[
        ("num_imputer", SimpleImputer(strategy='most_frequent')),
        ("num_transformer", StandardScaler()),
    ]),
    num_columns,
))

# Booleans: imputation only — no scaling/encoding needed.
transformers.append((
    "bool",
    Pipeline(steps=[
        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
    ]),
    bool_columns,
))
# Each text column gets its own transformer because text transformers take an
# array-like parameter: passing a list of columns would hand the transformer a
# DataFrame and fail, while a single column name yields a 1-D Series.
# If you don't want to process every text column with the same pipeline,
# define a separate pipeline per column and register it here instead.
for col in text_columns:
    # Replace missing text with "" so the vectorizer never sees NaN.
    # NOTE(review): X_train/X_test/train/test are assumed to be DataFrames
    # defined by the surrounding notebook — not visible in this snippet.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((
        transformer_name,
        Pipeline(steps=[
            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed)),
        ]),
        col,  # single column name (not a list) -> transformer receives a Series
    ))
# Combine all per-group transformers; columns not listed anywhere are dropped.
my_col_transformer = ColumnTransformer(
    transformers,
    remainder='drop',
    sparse_threshold=0.3,
    n_jobs=-1,
    transformer_weights=None,
)

from sklearn.ensemble import RandomForestRegressor

# Preprocessing followed by a random-forest regressor, as a single estimator.
steps = [
    ("col_trans", my_col_transformer),
    ("rfr", RandomForestRegressor(n_estimators=100,
                                  n_jobs=-1,
                                  random_state=seed)),
]
my_pipe = Pipeline(steps, verbose=True)
# The model needs numeric input, so booleans are mapped to 0/1 before
# fitting and predicting.
bool_to_int = {True: 1, False: 0}
my_pipe.fit(X_train.replace(bool_to_int), y_train)
y_scores = my_pipe.predict(X_test.replace(bool_to_int))

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_scores))
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
# source: https://gist.github.com/rozanecm/ee8333741db42b10158b3e0aff3f22aa
import time
def _get_filename(my_name, timestamp):
return "../predictions/" + timestamp + " by " + my_name + ".csv"
def _save_description(authors_name, timestamp, submission_description):
    """Append a "timestamp: description" line to ../predictions/<author>.txt.

    The ../predictions directory must already exist.
    """
    # `with` guarantees the handle is closed even if the write raises,
    # unlike the original open()/close() pair.
    with open("../predictions/" + authors_name + ".txt", "a") as f:
        f.write(timestamp + ": " + submission_description + '\n')
def save_submission(submission_df, authors_name="rozanecm", description = "no description.", index=False, header=False):
    """Save predictions as a timestamped CSV and log its description.

    Requires a ../predictions directory to exist.
    """
    # NOTE(review): the timestamp contains ':' characters, which are not valid
    # in Windows file names — fine on POSIX; confirm the target platform.
    stamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, stamp), index=index, header=header)
    _save_description(authors_name, stamp, description)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment