Skip to content

Instantly share code, notes, and snippets.

View liannewriting's full-sized avatar

Lianne & Justin @ Just into Data liannewriting

View GitHub Profile
@liannewriting
liannewriting / explore_data.py
Last active July 16, 2023 20:52
data cleaning techniques in Python
import pandas as pd
# Load the CSV into a DataFrame for a first look at the data.
df = pd.read_csv('sberbank.csv') # renamed from the csv file within train.csv.zip on Kaggle
# Print the column summary; verbose=True asks pandas for the full per-column listing.
df.info(verbose=True)
# Trailing expression: shows the first rows when run as a notebook cell.
df.head()
@liannewriting
liannewriting / define_ts_function_time_series.py
Last active February 3, 2023 14:47
lstm-keras-tensorflow-time-series-202003
# Goal of the model:
# Predict Global_active_power at a specified time in the future.
# E.g., we want to predict what Global_active_power will be ten minutes from now.
# We can use all the values from t-1, t-2, t-3, ..., t-history_length to predict t+10.
def create_ts_files(dataset,
start_index,
end_index,
history_length,
@liannewriting
liannewriting / transform_data.py
Last active December 9, 2022 16:03
xgboost python machine learning
# Columns judged (by the author) to be less related to the target; they are
# dropped from the DataFrame before modelling.
cols_to_drop = [
    'duration',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]
# at the same time, rename the columns so they are understandable. Please read the UCI page (https://archive.ics.uci.edu/ml/datasets/bank+marketing) for details
df = df.drop(columns=cols_to_drop).rename(columns={'job': 'job_type', 'default': 'default_status',
'housing': 'housing_loan_status', 'loan': 'personal_loan_status',
'contact': 'contact_type', 'month': 'contact_month',
'day_of_week': 'contact_day_of_week', 'campaign': 'num_contacts',
'pdays': 'days_last_contact', 'previous': 'previous_contacts',
'poutcome': 'previous_outcome',
@liannewriting
liannewriting / load_data.py
Last active December 7, 2022 23:51
xgboost python machine learning
import pandas as pd
# Download bank-additional.zip and extract it first; the file below lives
# inside the extracted folder and uses ';' as its field separator.
df = pd.read_csv(
    'bank-additional/bank-additional/bank-additional-full.csv',
    sep=';',
)
@liannewriting
liannewriting / set-up-pipeline.py
Last active December 7, 2022 15:54
xgboost python machine learning
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
# Two named steps: a target encoder followed by the XGBoost classifier.
# The classifier's objective function can be customised through its
# `objective` parameter.
estimators = [
    ('encoder', TargetEncoder()),
    ('clf', XGBClassifier(random_state=8)),
]
pipe = Pipeline(steps=estimators)
# Trailing expression: renders the pipeline when run as a notebook cell.
pipe
@liannewriting
liannewriting / set-up-hyperparameter-tuning.py
Last active December 6, 2022 16:30
xgboost python machine learning
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
search_space = {
'clf__max_depth': Integer(2,8),
'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
'clf__subsample': Real(0.5, 1.0),
'clf__colsample_bytree': Real(0.5, 1.0),
'clf__colsample_bylevel': Real(0.5, 1.0),
'clf__colsample_bynode' : Real(0.5, 1.0),
@liannewriting
liannewriting / predict-probability.py
Last active December 5, 2022 16:19
xgboost python machine learning
# NOTE(review): `opt` and `X_test` are defined outside this snippet; `opt` is
# presumably the fitted hyperparameter search object -- confirm.
# Predictions for the held-out feature rows.
opt.predict(X_test)
# Per-class probability estimates for the same rows.
opt.predict_proba(X_test)
@liannewriting
liannewriting / split_train_test.py
Last active December 1, 2022 00:01
xgboost python machine learning
from sklearn.model_selection import train_test_split
# Separate the target column from the predictor columns.
y = df['result']
X = df.drop(columns='result')

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep its class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
    stratify=y,
)
@liannewriting
liannewriting / evaluate-score.py
Last active November 30, 2022 19:41
xgboost python machine learning
# NOTE(review): `opt`, `X_test` and `y_test` are defined outside this snippet;
# `opt` is presumably the fitted hyperparameter search object -- confirm.
# Best score recorded by the search.
opt.best_score_
# Score of the search object on the held-out test split.
opt.score(X_test, y_test)
@liannewriting
liannewriting / plot-feature-importance.py
Created November 30, 2022 15:26
xgboost python machine learning
from xgboost import plot_importance
# Pipeline steps are (name, estimator) pairs; take the second step of the
# best pipeline and pull the estimator out of its pair.
xgboost_step = opt.best_estimator_.steps[1]
xgboost_model = xgboost_step[-1]

# Draw the feature-importance chart for that model.
plot_importance(xgboost_model)