@vidit0210
Created March 2, 2020 13:49
----
Demand Forecasting Challenge
Data: https://www.kaggle.com/c/demand-forecasting-kernels-only/data
----
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Read the train data
train = pd.read_csv('train.csv')
# Create a Random Forest object
rf = RandomForestRegressor()
# Train a model
rf.fit(X=train[['store', 'item']], y=train['sales'])
----
Prepare for submission
----
# Read test and sample submission data
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
# Show the head() of the sample_submission
print(sample_submission.head())
# Get predictions for the test set
test['sales'] = rf.predict(test[['store', 'item']])
# Write test predictions using the sample_submission format
test[['id', 'sales']].to_csv('kaggle_submission.csv', index=False)
----
Train XGBoost models
----
import xgboost as xgb
# Create DMatrix on train data
dtrain = xgb.DMatrix(data=train[['store', 'item']],
                     label=train['sales'])
# Define xgboost parameters
params = {'objective': 'reg:linear',
          'max_depth': 2,
          'silent': 1}
# Train xgboost model
xg_depth_2 = xgb.train(params=params, dtrain=dtrain)
---
XGB with depth 8
---
import xgboost as xgb
# Create DMatrix on train data
dtrain = xgb.DMatrix(data=train[['store', 'item']],
                     label=train['sales'])
# Define xgboost parameters
params = {'objective': 'reg:linear',
          'max_depth': 8,
          'silent': 1}
# Train xgboost model
xg_depth_8 = xgb.train(params=params, dtrain=dtrain)
----
XGB with depth 15
----
import xgboost as xgb
# Create DMatrix on train data
dtrain = xgb.DMatrix(data=train[['store', 'item']],
                     label=train['sales'])
# Define xgboost parameters
params = {'objective': 'reg:linear',
          'max_depth': 15,
          'silent': 1}
# Train xgboost model
xg_depth_15 = xgb.train(params=params, dtrain=dtrain)
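Note: recent XGBoost releases deprecate 'reg:linear' in favor of 'reg:squarederror' and replace the 'silent' parameter with 'verbosity'; the parameters above match the older course environment.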
----
Explore overfitting XGBoost
----
from sklearn.metrics import mean_squared_error
dtrain = xgb.DMatrix(data=train[['store', 'item']])
dtest = xgb.DMatrix(data=test[['store', 'item']])
# For each of 3 trained models
for model in [xg_depth_2, xg_depth_8, xg_depth_15]:
    # Make predictions
    train_pred = model.predict(dtrain)
    test_pred = model.predict(dtest)
    # Calculate metrics
    mse_train = mean_squared_error(train['sales'], train_pred)
    mse_test = mean_squared_error(test['sales'], test_pred)
    print('MSE Train: {:.3f}. MSE Test: {:.3f}'.format(mse_train, mse_test))
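The point of the comparison: as max_depth grows from 2 to 15, the training MSE typically keeps falling while the test MSE flattens or rises again, which is the overfitting signal this exercise is meant to expose.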
-----
Kaggle Taxi fare
------
EDA Statistics
---
# Shapes of train and test data
print('Train shape:', train.shape)
print('Test shape:', test.shape)
# Train head()
print(train.head())
# Describe the target variable
print(train.fare_amount.describe())
# Train distribution of passengers within rides
print(train.passenger_count.value_counts())
----
EDA plots I
----
# Import matplotlib for plotting
import matplotlib.pyplot as plt
# Calculate the ride distance
train['distance_km'] = haversine_distance(train)
# Draw a scatterplot
plt.scatter(x=train['fare_amount'], y=train['distance_km'], alpha=0.5)
plt.xlabel('Fare amount')
plt.ylabel('Distance, km')
plt.title('Fare amount based on the distance')
# Limit on the distance
plt.ylim(0, 50)
plt.show()
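haversine_distance() is a helper provided by the exercise environment; a minimal sketch of what it could look like, assuming the usual taxi column names (pickup/dropoff latitude and longitude), is:
import numpy as np
def haversine_distance(df):
    # Great-circle distance in km between pickup and dropoff points
    # (hypothetical re-implementation; the course helper may differ).
    earth_radius_km = 6371
    lat1 = np.radians(df['pickup_latitude'])
    lon1 = np.radians(df['pickup_longitude'])
    lat2 = np.radians(df['dropoff_latitude'])
    lon2 = np.radians(df['dropoff_longitude'])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(a))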
-----
EDA plots II
----
# Create hour feature
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
train['hour'] = train.pickup_datetime.dt.hour
# Find median fare_amount for each hour
hour_price = train.groupby('hour', as_index=False)['fare_amount'].median()
# Plot the line plot
plt.plot(hour_price['hour'], hour_price['fare_amount'], marker='o')
plt.xlabel('Hour of the day')
plt.ylabel('Median fare amount')
plt.title('Fare amount based on day time')
plt.xticks(range(24))
plt.show()
-----
K-fold cross-validation
-----
# Import KFold
from sklearn.model_selection import KFold
# Create a KFold object
kf = KFold(n_splits=3, shuffle=True, random_state=123)
# Loop through each split
fold = 0
for train_index, test_index in kf.split(train):
    # Obtain training and testing folds
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))
    print('Medium interest listings in CV train: {}\n'.format(sum(cv_train.interest_level == 'medium')))
    fold += 1
----
# Import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# Create a StratifiedKFold object
str_kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
# Loop through each split
fold = 0
for train_index, test_index in str_kf.split(train, train['interest_level']):
    # Obtain training and testing folds
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))
    print('Medium interest listings in CV train: {}\n'.format(sum(cv_train.interest_level == 'medium')))
    fold += 1
-----
Time K-fold
----
# Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit
# Create TimeSeriesSplit object
time_kfold = TimeSeriesSplit(n_splits=3)
# Sort train data by date
train = train.sort_values('date')
# Iterate through each split
fold = 0
for train_index, test_index in time_kfold.split(train):
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
    print('Fold :', fold)
    print('Train date range: from {} to {}'.format(cv_train.date.min(), cv_train.date.max()))
    print('Test date range: from {} to {}\n'.format(cv_test.date.min(), cv_test.date.max()))
    fold += 1
----
Overall validation score
----
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
# Sort train data by date
train = train.sort_values('date')
# Initialize 3-fold time cross-validation
kf = TimeSeriesSplit(n_splits=3)
# Get MSE scores for each cross-validation split
mse_scores = get_fold_mse(train, kf)
print('Mean validation MSE: {:.5f}'.format(np.mean(mse_scores)))
----
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
# Sort train data by date
train = train.sort_values('date')
# Initialize 3-fold time cross-validation
kf = TimeSeriesSplit(n_splits=3)
# Get MSE scores for each cross-validation split
mse_scores = get_fold_mse(train, kf)
print('Mean validation MSE: {:.5f}'.format(np.mean(mse_scores)))
print('MSE by fold: {}'.format(mse_scores))
---
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
# Sort train data by date
train = train.sort_values('date')
# Initialize 3-fold time cross-validation
kf = TimeSeriesSplit(n_splits=3)
# Get MSE scores for each cross-validation split
mse_scores = get_fold_mse(train, kf)
print('Mean validation MSE: {:.5f}'.format(np.mean(mse_scores)))
print('MSE by fold: {}'.format(mse_scores))
print('Overall validation MSE: {:.5f}'.format(np.mean(mse_scores) + np.std(mse_scores)))
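get_fold_mse() is provided by the course environment; a minimal sketch of what it could look like, assuming a Random Forest on the store/item demand data (model choice and feature set are assumptions), is:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
def get_fold_mse(train, kf):
    # Hypothetical re-implementation: fit a model on each training fold
    # and compute MSE on the corresponding validation fold.
    mse_scores = []
    for train_index, test_index in kf.split(train):
        fold_train, fold_test = train.iloc[train_index], train.iloc[test_index]
        rf = RandomForestRegressor(random_state=123)
        rf.fit(fold_train[['store', 'item']], fold_train['sales'])
        preds = rf.predict(fold_test[['store', 'item']])
        mse_scores.append(mean_squared_error(fold_test['sales'], preds))
    return mse_scores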
-----
Arithmetical features
-----
# Look at the initial RMSE
print('RMSE before feature engineering:', get_kfold_rmse(train))
# Find the total area of the house
train['TotalArea'] = train['TotalBsmtSF'] + train['FirstFlrSF'] + train['SecondFlrSF']
# Look at the updated RMSE
print('RMSE with total area:', get_kfold_rmse(train))
-----
# Look at the initial RMSE
print('RMSE before feature engineering:', get_kfold_rmse(train))
# Find the total area of the house
train['TotalArea'] = train['TotalBsmtSF'] + train['FirstFlrSF'] + train['SecondFlrSF']
print('RMSE with total area:', get_kfold_rmse(train))
# Find the area of the garden
train['GardenArea'] = train['LotArea'] - train['FirstFlrSF']
print('RMSE with garden area:', get_kfold_rmse(train))
-----
# Look at the initial RMSE
print('RMSE before feature engineering:', get_kfold_rmse(train))
# Find the total area of the house
train['TotalArea'] = train['TotalBsmtSF'] + train['FirstFlrSF'] + train['SecondFlrSF']
print('RMSE with total area:', get_kfold_rmse(train))
# Find the area of the garden
train['GardenArea'] = train['LotArea'] - train['FirstFlrSF']
print('RMSE with garden area:', get_kfold_rmse(train))
# Find the total number of bathrooms
train['TotalBath'] = train['FullBath'] + train['HalfBath']
print('RMSE with number of bathrooms:', get_kfold_rmse(train))
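get_kfold_rmse() is also a course-provided helper; a minimal sketch under the assumption that it cross-validates a Random Forest on the numeric house-price features (with 'SalePrice' as the target) is:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
def get_kfold_rmse(train):
    # Hypothetical re-implementation: mean K-fold RMSE of a Random Forest
    # on the numeric columns; missing values are filled with 0 for simplicity.
    features = train.select_dtypes(include='number').columns.drop('SalePrice')
    kf = KFold(n_splits=5, shuffle=True, random_state=123)
    rmse_scores = []
    for train_index, test_index in kf.split(train):
        fold_train, fold_test = train.iloc[train_index], train.iloc[test_index]
        rf = RandomForestRegressor(random_state=123)
        rf.fit(fold_train[features].fillna(0), fold_train['SalePrice'])
        preds = rf.predict(fold_test[features].fillna(0))
        rmse_scores.append(np.sqrt(mean_squared_error(fold_test['SalePrice'], preds)))
    return np.mean(rmse_scores)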
-----
Date features
------
# Concatenate train and test together
taxi = pd.concat([train, test])
# Convert pickup date to datetime object
taxi['pickup_datetime'] = pd.to_datetime(taxi['pickup_datetime'])
# Create a day of week feature
taxi['dayofweek'] = taxi['pickup_datetime'].dt.dayofweek
# Create an hour feature
taxi['hour'] = taxi['pickup_datetime'].dt.hour
# Split back into train and test
new_train = taxi[taxi['id'].isin(train['id'])]
new_test = taxi[taxi['id'].isin(test['id'])]
-----
Label encoding
-----
# Concatenate train and test together
houses = pd.concat([train, test])
# Label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Create new features
houses['RoofStyle_enc'] = le.fit_transform(houses['RoofStyle'])
houses['CentralAir_enc'] = le.fit_transform(houses['CentralAir'])
# Look at new features
print(houses[['RoofStyle', 'RoofStyle_enc', 'CentralAir', 'CentralAir_enc']].head())
-------
One-Hot encoding
-----
# Concatenate train and test together
houses = pd.concat([train, test])
# Look at feature distributions
print(houses['RoofStyle'].value_counts(), '\n')
print(houses['CentralAir'].value_counts())
# Concatenate train and test together
houses = pd.concat([train, test])
# Label encode binary 'CentralAir' feature
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
houses['CentralAir_enc'] = le.fit_transform(houses['CentralAir'])
----
# Concatenate train and test together
houses = pd.concat([train, test])
# Label encode binary 'CentralAir' feature
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
houses['CentralAir_enc'] = le.fit_transform(houses['CentralAir'])
# Create One-Hot encoded features
ohe = pd.get_dummies(houses['RoofStyle'], prefix='RoofStyle')
# Concatenate OHE features to houses
houses = pd.concat([houses, ohe], axis=1)
# Look at OHE features
print(houses[[col for col in houses.columns if 'RoofStyle' in col]].head(3))
----
Mean target encoding
----
def test_mean_target_encoding(train, test, target, categorical, alpha=5):
    # Calculate global mean on the train data
    global_mean = train[target].mean()
    # Group by the categorical feature and calculate its properties
    train_groups = train.groupby(categorical)
    category_sum = train_groups[target].sum()
    category_size = train_groups.size()
    # Calculate smoothed mean target statistics
    train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
    # Apply statistics to the test data and fill new categories
    test_feature = test[categorical].map(train_statistics).fillna(global_mean)
    return test_feature.values
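The smoothing blends each category's mean target with the global mean: (category_sum + alpha * global_mean) / (category_size + alpha). For example, a category seen 3 times with target sum 2, global mean 0.4 and alpha 5 gets (2 + 5 * 0.4) / (3 + 5) = 0.5, so rare categories are pulled toward the global mean and produce less noisy encodings.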
----
def train_mean_target_encoding(train, target, categorical, alpha=5):
    # Create 5-fold cross-validation
    kf = KFold(n_splits=5, random_state=123, shuffle=True)
    train_feature = pd.Series(index=train.index)
    # For each fold's split
    for train_index, test_index in kf.split(train):
        cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
        # Calculate out-of-fold statistics and apply to cv_test
        cv_test_feature = test_mean_target_encoding(cv_train, cv_test, target, categorical, alpha)
        # Save new feature for this particular fold
        train_feature.iloc[test_index] = cv_test_feature
    return train_feature.values
----
def mean_target_encoding(train, test, target, categorical, alpha=5):
    # Get the train feature
    train_feature = train_mean_target_encoding(train, target, categorical, alpha)
    # Get the test feature
    test_feature = test_mean_target_encoding(train, test, target, categorical, alpha)
    # Return new features to add to the model
    return train_feature, test_feature
----
K-fold cross-validation
----
# Create 5-fold cross-validation
kf = KFold(n_splits=5, random_state=123, shuffle=True)
# For each fold's split
for train_index, test_index in kf.split(bryant_shots):
    cv_train, cv_test = bryant_shots.iloc[train_index], bryant_shots.iloc[test_index]
    # Create mean target encoded feature
    cv_train['game_id_enc'], cv_test['game_id_enc'] = mean_target_encoding(train=cv_train,
                                                                            test=cv_test,
                                                                            target='shot_made_flag',
                                                                            categorical='game_id',
                                                                            alpha=5)
    # Look at the encoding
    print(cv_train[['game_id', 'shot_made_flag', 'game_id_enc']].sample(n=1))
---
Beyond binary classification
----
# Create mean target encoded feature
train['RoofStyle_enc'], test['RoofStyle_enc'] = mean_target_encoding(train=train,
                                                                     test=test,
                                                                     target='SalePrice',
                                                                     categorical='RoofStyle',
                                                                     alpha=10)
# Look at the encoding
print(test[['RoofStyle', 'RoofStyle_enc']].drop_duplicates())
-----
Missing data - Kaggle Two Sigma
---
# Read dataframe
twosigma = pd.read_csv('twosigma_train.csv')
# Find the number of missing values in each column
print(twosigma.isnull().sum())
---
# Read DataFrame
twosigma = pd.read_csv('twosigma_train.csv')
# Find the number of missing values in each column
print(twosigma.isnull().sum())
# Look at the columns with the missing values
print(twosigma[['building_id', 'price']].head())
----
Impute missing data
----
# Import SimpleImputer
from sklearn.impute import SimpleImputer
# Create mean imputer
mean_imputer = SimpleImputer(strategy='mean')
# Price imputation
rental_listings[['price']] = mean_imputer.fit_transform(rental_listings[['price']])
----
# Import SimpleImputer
from sklearn.impute import SimpleImputer
# Create constant imputer
constant_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')
# building_id imputation
rental_listings[['building_id']] = constant_imputer.fit_transform(rental_listings[['building_id']])
----
Baseline based on the date
----
# Make sure pickup_datetime is a datetime, then get the pickup hour
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
train['hour'] = train['pickup_datetime'].dt.hour
test['hour'] = test['pickup_datetime'].dt.hour
# Calculate average fare_amount grouped by pickup hour
hour_groups = train.groupby('hour')['fare_amount'].mean()
# Make predictions on the test set
test['fare_amount'] = test.hour.map(hour_groups)
# Write predictions
test[['id','fare_amount']].to_csv('hour_mean_sub.csv', index=False)
----
from sklearn.ensemble import RandomForestRegressor
# Select only numeric features
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
            'dropoff_latitude', 'passenger_count', 'hour']
# Train a Random Forest model
rf = RandomForestRegressor()
rf.fit(train[features], train.fare_amount)
# Make predictions on the test data
test['fare_amount'] = rf.predict(test[features])
# Write predictions
test[['id','fare_amount']].to_csv('rf_sub.csv', index=False)
----
Model blending
----
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# Train a Gradient Boosting model
gb = GradientBoostingRegressor().fit(train[features], train.fare_amount)
# Train a Random Forest model
rf = RandomForestRegressor().fit(train[features], train.fare_amount)
# Make predictions on the test data
test['gb_pred'] = gb.predict(test[features])
test['rf_pred'] = rf.predict(test[features])
# Find mean of model predictions
test['blend'] = (test['gb_pred'] + test['rf_pred']) / 2
print(test[['gb_pred', 'rf_pred', 'blend']].head(3))
------
Model stacking I
-----
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# Split train data into two parts
part_1, part_2 = train_test_split(train, test_size=0.5, random_state=123)
# Train a Gradient Boosting model
gb = GradientBoostingRegressor().fit(part_1[features], part_1.fare_amount)
# Train a Random Forest model on Part 1
rf = RandomForestRegressor().fit(part_1[features], part_1.fare_amount)
----
# Make predictions on the Part 2 data
part_2['gb_pred'] = gb.predict(part_2[features])
part_2['rf_pred'] = rf.predict(part_2[features])
# Make predictions on the test data
test['gb_pred'] = gb.predict(test[features])
test['rf_pred'] = rf.predict(test[features])
-----
Model stacking II
-----
from sklearn.linear_model import LinearRegression
# Create linear regression model without the intercept
lr = LinearRegression(fit_intercept=False)
# Train 2nd level model on the Part 2 data
lr.fit(part_2[['gb_pred', 'rf_pred']], part_2.fare_amount)
# Make stacking predictions on the test data
test['stacking'] = lr.predict(test[['gb_pred', 'rf_pred']])
# Look at the model coefficients
print(lr.coef_)
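The two coefficients show how much weight the linear meta-model assigns to the Gradient Boosting and Random Forest predictions when forming the stacked estimate.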
---