Lianne & Justin @ Just into Data liannewriting

## explore_data.py
import pandas as pd
# Load the Sberbank data. 'sberbank.csv' is the CSV found inside Kaggle's
# train.csv.zip, renamed after extraction.
df = pd.read_csv(filepath_or_buffer='sberbank.csv')

# Print the full (verbose) column-by-column summary of the frame.
df.info(verbose=True)

# Preview the first rows. This is a bare expression, so the result is only
# displayed when run in an interactive session / notebook cell.
df.head()

## define_ts_function_time_series.py
# Goal of the model:
#  Predict Global_active_power at a specified time in the future.
#   E.g., we want to predict what Global_active_power will be ten minutes from now.
#       We can use all the values from t-1, t-2, t-3, ..., t-history_length to predict t+10.


def create_ts_files(dataset,
                    start_index,
                    end_index,
                    history_length,

## transform_data.py
# list and drop columns that are less related to the target based on my judgment
cols_to_drop = ['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
# at the same time, rename the columns so they are understandable. Please read the UCI page (https://archive.ics.uci.edu/ml/datasets/bank+marketing) for details
df = df.drop(columns=cols_to_drop).rename(columns={'job': 'job_type', 'default': 'default_status',
                                                   'housing': 'housing_loan_status', 'loan': 'personal_loan_status',
                                                   'contact': 'contact_type', 'month': 'contact_month',
                                                   'day_of_week': 'contact_day_of_week', 'campaign': 'num_contacts',
                                                   'pdays': 'days_last_contact', 'previous': 'previous_contacts',
                                                   'poutcome': 'previous_outcome',


## load_data.py
import pandas as pd

# Prerequisite: download bank-additional.zip and extract it so that the path
# below resolves relative to the working directory.
# The file uses ';' as its field separator.
df = pd.read_csv(
    'bank-additional/bank-additional/bank-additional-full.csv',
    sep=';',
)

## set-up-pipeline.py
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier

# Two-step pipeline: the encoder runs first, then the classifier.
estimators = [
    # Step 'encoder': TargetEncoder with its default settings.
    ('encoder', TargetEncoder()),
    # Step 'clf': XGBoost classifier with a fixed seed. The objective function
    # can be customized through the `objective` parameter.
    ('clf', XGBClassifier(random_state=8)),
]
pipe = Pipeline(steps=estimators)

# Bare expression: shows the pipeline's repr when run as a notebook cell.
pipe

## set-up-hyperparameter-tuning.py
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

search_space = {
    'clf__max_depth': Integer(2,8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'clf__subsample': Real(0.5, 1.0),
    'clf__colsample_bytree': Real(0.5, 1.0),
    'clf__colsample_bylevel': Real(0.5, 1.0),
    'clf__colsample_bynode' : Real(0.5, 1.0),

## predict-probability.py
# NOTE(review): `opt` is not defined in this snippet -- presumably the fitted
# BayesSearchCV search over `pipe`; confirm against the tuning cell.
# Predictions for the test features. As a bare expression, the result is only
# displayed in an interactive session / notebook.
opt.predict(X_test)

# Predicted probabilities for the same test rows.
opt.predict_proba(X_test)

## split_train_test.py
from sklearn.model_selection import train_test_split

# The 'result' column is the target; every other column is a feature.
y = df['result']
X = df.drop(columns='result')

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=8,
)

## evaluate-score.py
# NOTE(review): `opt` is not defined in this snippet -- presumably the fitted
# BayesSearchCV search; confirm against the tuning cell.
# Best score recorded by the search (bare expression: displayed only in an
# interactive session / notebook).
opt.best_score_

# Score of the search object on the held-out test split.
opt.score(X_test, y_test)

## plot-feature-importance.py
from xgboost import plot_importance

# Pull the fitted XGBoost classifier out of the best pipeline found by the
# search. Look the step up by its name ('clf', as declared when the pipeline
# was built) instead of by position, so this keeps working if steps are
# inserted or reordered.
xgboost_model = opt.best_estimator_.named_steps['clf']

# Plot the classifier's feature importances.
plot_importance(xgboost_model)
	import pandas as pd
	# Load the Sberbank data. 'sberbank.csv' is the CSV found inside Kaggle's
	# train.csv.zip, renamed after extraction.
	df = pd.read_csv(filepath_or_buffer='sberbank.csv')

	# Print the full (verbose) column-by-column summary of the frame.
	df.info(verbose=True)

	# Preview the first rows. This is a bare expression, so the result is only
	# displayed when run in an interactive session / notebook cell.
	df.head()
	# Goal of the model:
	# Predict Global_active_power at a specified time in the future.
	# E.g., we want to predict what Global_active_power will be ten minutes from now.
	# We can use all the values from t-1, t-2, t-3, ..., t-history_length to predict t+10.


	def create_ts_files(dataset,
	start_index,
	end_index,
	history_length,
	# list and drop columns that are less related to the target based on my judgment
	cols_to_drop = ['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
	# at the same time, rename the columns so they are understandable. Please read the UCI page (https://archive.ics.uci.edu/ml/datasets/bank+marketing) for details
	df = df.drop(columns=cols_to_drop).rename(columns={'job': 'job_type', 'default': 'default_status',
	'housing': 'housing_loan_status', 'loan': 'personal_loan_status',
	'contact': 'contact_type', 'month': 'contact_month',
	'day_of_week': 'contact_day_of_week', 'campaign': 'num_contacts',
	'pdays': 'days_last_contact', 'previous': 'previous_contacts',
	'poutcome': 'previous_outcome',
	import pandas as pd

	# Prerequisite: download bank-additional.zip and extract it so that the path
	# below resolves relative to the working directory.
	# The file uses ';' as its field separator.
	df = pd.read_csv(
		'bank-additional/bank-additional/bank-additional-full.csv',
		sep=';',
	)
	from sklearn.pipeline import Pipeline
	from category_encoders.target_encoder import TargetEncoder
	from xgboost import XGBClassifier

	# Two-step pipeline: the encoder runs first, then the classifier.
	estimators = [
		# Step 'encoder': TargetEncoder with its default settings.
		('encoder', TargetEncoder()),
		# Step 'clf': XGBoost classifier with a fixed seed. The objective function
		# can be customized through the `objective` parameter.
		('clf', XGBClassifier(random_state=8)),
	]
	pipe = Pipeline(steps=estimators)

	# Bare expression: shows the pipeline's repr when run as a notebook cell.
	pipe
	from skopt import BayesSearchCV
	from skopt.space import Real, Categorical, Integer

	search_space = {
	'clf__max_depth': Integer(2,8),
	'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
	'clf__subsample': Real(0.5, 1.0),
	'clf__colsample_bytree': Real(0.5, 1.0),
	'clf__colsample_bylevel': Real(0.5, 1.0),
	'clf__colsample_bynode' : Real(0.5, 1.0),
	from sklearn.model_selection import train_test_split

	# The 'result' column is the target; every other column is a feature.
	y = df['result']
	X = df.drop(columns='result')

	# Hold out 20% of the rows for testing. The split is stratified on the target
	# and uses a fixed random_state so it is reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.2,
		stratify=y,
		random_state=8,
	)
	from xgboost import plot_importance

	# Pull the fitted XGBoost classifier out of the best pipeline found by the
	# search. Look the step up by its name ('clf', as declared when the pipeline
	# was built) instead of by position, so this keeps working if steps are
	# inserted or reordered.
	xgboost_model = opt.best_estimator_.named_steps['clf']

	# Plot the classifier's feature importances.
	plot_importance(xgboost_model)