# --- Data: Random Forest baseline and submission ---
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the train data
train = pd.read_csv('train.csv')

# Create a Random Forest object
rf = RandomForestRegressor()

# Train a model on the 'store' and 'item' columns to predict 'sales'
rf.fit(X=train[['store', 'item']], y=train['sales'])

# --- Prepare for submission ---
# Read test and sample submission data (read once; the repeated reads were redundant)
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Show the head() of the sample_submission to see the expected output columns
print(sample_submission.head())

# Get predictions for the test set
test['sales'] = rf.predict(test[['store', 'item']])

# Write test predictions using the sample_submission format
test[['id', 'sales']].to_csv('kaggle_submission.csv', index=False)
# --- Train XGBoost models: max_depth = 2 ---
import xgboost as xgb

# Create DMatrix on train data; training needs the target passed as the label
dtrain = xgb.DMatrix(data=train[['store', 'item']],
                     label=train['sales'])

# Define xgboost parameters
# NOTE(review): 'reg:linear' and 'silent' are legacy names in recent xgboost
# releases ('reg:squarederror' / 'verbosity') — confirm against the installed version.
params = {'objective': 'reg:linear',
          'max_depth': 2,
          'silent': 1}

# Train xgboost model
xg_depth_2 = xgb.train(params=params, dtrain=dtrain)
# --- Train XGBoost models: max_depth = 8 ---
import xgboost as xgb

# Create DMatrix on train data; training needs the target passed as the label
dtrain = xgb.DMatrix(data=train[['store', 'item']],
                     label=train['sales'])

# Define xgboost parameters
# NOTE(review): 'reg:linear' and 'silent' are legacy names in recent xgboost
# releases ('reg:squarederror' / 'verbosity') — confirm against the installed version.
params = {'objective': 'reg:linear',
          'max_depth': 8,
          'silent': 1}

# Train xgboost model
xg_depth_8 = xgb.train(params=params, dtrain=dtrain)
# --- Train XGBoost models: max_depth = 15 ---
import xgboost as xgb

# Create DMatrix on train data; training needs the target passed as the label
dtrain = xgb.DMatrix(data=train[['store', 'item']],
                     label=train['sales'])

# Define xgboost parameters
# NOTE(review): 'reg:linear' and 'silent' are legacy names in recent xgboost
# releases ('reg:squarederror' / 'verbosity') — confirm against the installed version.
params = {'objective': 'reg:linear',
          'max_depth': 15,
          'silent': 1}

# Train xgboost model
xg_depth_15 = xgb.train(params=params, dtrain=dtrain)
# --- Explore overfitting XGBoost ---
from sklearn.metrics import mean_squared_error

# Prediction only needs the features, so no label is attached here
dtrain = xgb.DMatrix(data=train[['store', 'item']])
dtest = xgb.DMatrix(data=test[['store', 'item']])

# For each of 3 trained models
for model in [xg_depth_2, xg_depth_8, xg_depth_15]:
    # Make predictions
    train_pred = model.predict(dtrain)
    test_pred = model.predict(dtest)

    # Calculate metrics; comparing the two exposes the train/test gap
    # NOTE(review): requires test['sales'] to hold true targets — confirm the
    # test frame used here is a labelled hold-out.
    mse_train = mean_squared_error(train['sales'], train_pred)
    mse_test = mean_squared_error(test['sales'], test_pred)
    print('MSE Train: {:.3f}. MSE Test: {:.3f}'.format(mse_train, mse_test))
# === Kaggle Taxi fare ===
# --- EDA statistics ---
# Shapes of train and test data
print('Train shape:', train.shape)
print('Test shape:', test.shape)

# Train head()
print(train.head())

# Describe the target variable
print(train.fare_amount.describe())

# Train distribution of passengers within rides
print(train.passenger_count.value_counts())
# --- EDA plots I ---
# NOTE(review): relies on `plt` (matplotlib.pyplot) and `haversine_distance`
# being provided elsewhere — neither is defined in this file.

# Calculate the ride distance
train['distance_km'] = haversine_distance(train)

# Draw a scatterplot
plt.scatter(x=train['fare_amount'], y=train['distance_km'], alpha=0.5)
plt.xlabel('Fare amount')
plt.ylabel('Distance, km')
plt.title('Fare amount based on the distance')

# Limit on the distance
plt.ylim(0, 50)

# Render and close the figure so later plots do not draw on the same axes
plt.show()
# --- EDA plots II ---
# NOTE(review): relies on `plt` (matplotlib.pyplot) being provided elsewhere.

# Create hour feature
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
train['hour'] = train.pickup_datetime.dt.hour

# Find median fare_amount for each hour
hour_price = train.groupby('hour', as_index=False)['fare_amount'].median()

# Plot the line plot
plt.plot(hour_price['hour'], hour_price['fare_amount'], marker='o')
plt.xlabel('Hour of the day')
plt.ylabel('Median fare amount')
plt.title('Fare amount based on day time')

# Render the figure
plt.show()
# --- K-fold cross-validation ---
# Import KFold
from sklearn.model_selection import KFold

# Create a KFold object
kf = KFold(n_splits=3, shuffle=True, random_state=123)

# Loop through each split, numbering the folds from 0
for fold, (train_index, test_index) in enumerate(kf.split(train)):
    # Obtain training and testing folds
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))
    # Count of 'medium' rows shows how evenly the plain split spreads the classes
    print('Medium interest listings in CV train: {}\n'.format(sum(cv_train.interest_level == 'medium')))
# --- Stratified K-fold cross-validation ---
# Import StratifiedKFold
from sklearn.model_selection import StratifiedKFold

# Create a StratifiedKFold object
str_kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

# Loop through each split, stratifying on the 'interest_level' column
for fold, (train_index, test_index) in enumerate(str_kf.split(train, train['interest_level'])):
    # Obtain training and testing folds
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]
    print('Fold: {}'.format(fold))
    print('CV train shape: {}'.format(cv_train.shape))
    print('Medium interest listings in CV train: {}\n'.format(sum(cv_train.interest_level == 'medium')))
# --- Time K-fold ---
from sklearn.model_selection import TimeSeriesSplit

# Create TimeSeriesSplit object
time_kfold = TimeSeriesSplit(n_splits=3)

# Sort train data by date so that the positional splits follow time order
train = train.sort_values('date')

# Iterate through each split
for fold, (train_index, test_index) in enumerate(time_kfold.split(train)):
    cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]

    print('Fold :', fold)
    print('Train date range: from {} to {}'.format(cv_train.date.min(), cv_train.date.max()))
    print('Test date range: from {} to {}\n'.format(cv_test.date.min(), cv_test.date.max()))
# --- Overall validation score ---
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Sort train data by date
train = train.sort_values('date')

# Initialize 3-fold time cross-validation
kf = TimeSeriesSplit(n_splits=3)

# Get MSE scores for each cross-validation split
# NOTE(review): `get_fold_mse` is not defined in this file — presumably it
# returns one MSE per fold; verify against its definition.
mse_scores = get_fold_mse(train, kf)

print('Mean validation MSE: {:.5f}'.format(np.mean(mse_scores)))
print('MSE by fold: {}'.format(mse_scores))
# Overall score as computed here: mean plus one standard deviation of the fold scores
print('Overall validation MSE: {:.5f}'.format(np.mean(mse_scores) + np.std(mse_scores)))
# --- Arithmetical features ---
# NOTE(review): `get_kfold_rmse` is not defined in this file; it is called on
# the current state of `train` after each new feature is added.

# Look at the initial RMSE (measured once, before any feature is added)
print('RMSE before feature engineering:', get_kfold_rmse(train))

# Find the total area of the house
train['TotalArea'] = train['TotalBsmtSF'] + train['FirstFlrSF'] + train['SecondFlrSF']
print('RMSE with total area:', get_kfold_rmse(train))

# Find the area of the garden
train['GardenArea'] = train['LotArea'] - train['FirstFlrSF']
print('RMSE with garden area:', get_kfold_rmse(train))

# Find the total number of bathrooms
train['TotalBath'] = train['FullBath'] + train['HalfBath']
print('RMSE with number of bathrooms:', get_kfold_rmse(train))
# --- Date features ---
# Stack train and test so both get the date features from the same code path
taxi = pd.concat([train, test])

# Parse the pickup timestamp into a datetime column
taxi['pickup_datetime'] = pd.to_datetime(taxi['pickup_datetime'])

# Derive day-of-week and hour-of-day from the parsed timestamp
taxi['dayofweek'] = taxi.pickup_datetime.dt.dayofweek
taxi['hour'] = taxi.pickup_datetime.dt.hour

# Recover the two original partitions by matching on 'id'
new_train = taxi.loc[taxi['id'].isin(train['id'])]
new_test = taxi.loc[taxi['id'].isin(test['id'])]
# --- Label encoding ---
# Stack train and test so the encoder sees every category value
houses = pd.concat([train, test])

# One encoder instance, re-fitted for each column in turn
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Add an integer-coded '<name>_enc' column for each categorical feature
for column in ('RoofStyle', 'CentralAir'):
    houses[column + '_enc'] = le.fit_transform(houses[column])

# Compare the raw and encoded values side by side
print(houses[['RoofStyle', 'RoofStyle_enc', 'CentralAir', 'CentralAir_enc']].head())
# --- One-Hot encoding ---
# Concatenate train and test together (done once; the repeated steps were redundant)
houses = pd.concat([train, test])

# Look at feature distributions
print(houses['RoofStyle'].value_counts(), '\n')

# Label encode binary 'CentralAir' feature
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
houses['CentralAir_enc'] = le.fit_transform(houses['CentralAir'])

# Create One-Hot encoded features
ohe = pd.get_dummies(houses['RoofStyle'], prefix='RoofStyle')

# Concatenate OHE features to houses
houses = pd.concat([houses, ohe], axis=1)

# Look at OHE features (every column whose name contains 'RoofStyle')
print(houses[[col for col in houses.columns if 'RoofStyle' in col]].head(3))
# --- Mean target encoding ---
def test_mean_target_encoding(train, test, target, categorical, alpha=5):
# Calculate global mean on the train data
global_mean = train[target].mean()
# Group by the categorical feature and calculate its properties
train_groups = train.groupby(categorical)
category_sum = train_groups[target].sum()
category_size = train_groups.size()
# Calculate smoothed mean target statistics
train_statistics = (category_sum + global_mean * alpha) / (category_size + alpha)
# Apply statistics to the test data and fill new categories
test_feature = test[categorical].map(train_statistics).fillna(global_mean)
return test_feature.values
def train_mean_target_encoding(train, target, categorical, alpha=5):
    """Build an out-of-fold mean target encoding for the rows of ``train``.

    ``train`` is split into 5 shuffled folds (fixed ``random_state=123``).
    Each fold is encoded with statistics computed on the other four folds via
    ``test_mean_target_encoding``, so a row's own target never contributes to
    its encoded value.

    Args:
        train: DataFrame holding ``target`` and ``categorical`` columns.
        target: Name of the target column.
        categorical: Name of the categorical column to encode.
        alpha: Smoothing weight forwarded to ``test_mean_target_encoding``.

    Returns:
        Array with one encoded value per row of ``train``, in row order.
    """
    # Create 5-fold cross-validation
    kf = KFold(n_splits=5, random_state=123, shuffle=True)
    # Explicit float dtype: an empty Series otherwise defaults to object dtype
    # on recent pandas versions
    train_feature = pd.Series(index=train.index, dtype=float)

    # For each folds split
    for train_index, test_index in kf.split(train):
        cv_train, cv_test = train.iloc[train_index], train.iloc[test_index]

        # Calculate out-of-fold statistics and apply to cv_test
        cv_test_feature = test_mean_target_encoding(cv_train, cv_test, target, categorical, alpha)

        # Save new feature for this particular fold (positional assignment)
        train_feature.iloc[test_index] = cv_test_feature
    return train_feature.values
def mean_target_encoding(train, test, target, categorical, alpha=5):
    """Return mean-target-encoded features for both ``train`` and ``test``.

    The train feature is computed out-of-fold by
    ``train_mean_target_encoding``; the test feature uses statistics from the
    whole of ``train`` via ``test_mean_target_encoding``.

    Args:
        train: DataFrame holding ``target`` and ``categorical`` columns.
        test: DataFrame holding the ``categorical`` column.
        target: Name of the target column in ``train``.
        categorical: Name of the categorical column to encode.
        alpha: Smoothing weight forwarded to both helpers.

    Returns:
        Tuple ``(train_feature, test_feature)`` of arrays, one value per row
        of the corresponding frame.
    """
    # Get the train feature
    train_feature = train_mean_target_encoding(train, target, categorical, alpha)

    # Get the test feature
    test_feature = test_mean_target_encoding(train, test, target, categorical, alpha)

    # Return new features to add to the model
    return train_feature, test_feature
# --- K-fold cross-validation with mean target encoding ---
# Create 5-fold cross-validation
kf = KFold(n_splits=5, random_state=123, shuffle=True)

# For each folds split
for train_index, test_index in kf.split(bryant_shots):
    # Copy the folds so the new column is written to independent frames
    cv_train = bryant_shots.iloc[train_index].copy()
    cv_test = bryant_shots.iloc[test_index].copy()

    # Create mean target encoded feature (alpha left at the function default)
    cv_train['game_id_enc'], cv_test['game_id_enc'] = mean_target_encoding(train=cv_train,
                                                                           test=cv_test,
                                                                           target='shot_made_flag',
                                                                           categorical='game_id')

    # Look at the encoding
    print(cv_train[['game_id', 'shot_made_flag', 'game_id_enc']].sample(n=1))
# --- Beyond binary classification ---
# Create mean target encoded feature
# NOTE(review): the original call was truncated after `train=train,`. The
# target column name 'SalePrice' and the default alpha are assumptions —
# confirm against the dataset before relying on this.
train['RoofStyle_enc'], test['RoofStyle_enc'] = mean_target_encoding(train=train,
                                                                     test=test,
                                                                     target='SalePrice',
                                                                     categorical='RoofStyle')

# Look at the encoding
print(test[['RoofStyle', 'RoofStyle_enc']].drop_duplicates())
# --- Missing data: Kaggle Two Sigma ---
# Read DataFrame (read once; the repeated read was redundant)
twosigma = pd.read_csv('twosigma_train.csv')

# Find the number of missing values in each column
print(twosigma.isnull().sum())

# Look at the columns with the missing values
print(twosigma[['building_id', 'price']].head())
# --- Impute missing data ---
# SimpleImputer provides both imputation strategies used below
from sklearn.impute import SimpleImputer

# Two imputers: column mean for numeric data, a fixed marker for identifiers
mean_imputer = SimpleImputer(strategy='mean')
constant_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')

# Fill missing 'price' values with the column mean
rental_listings[['price']] = mean_imputer.fit_transform(rental_listings[['price']])

# Fill missing 'building_id' values with the 'MISSING' marker
rental_listings[['building_id']] = constant_imputer.fit_transform(rental_listings[['building_id']])
# --- Baseline based on the date ---
# Get pickup hour from the pickup_datetime column
# (pd.to_datetime leaves an already-parsed column unchanged, so `.dt` is safe
# whether or not the column was parsed earlier)
train['hour'] = pd.to_datetime(train['pickup_datetime']).dt.hour
test['hour'] = pd.to_datetime(test['pickup_datetime']).dt.hour

# Calculate average fare_amount grouped by pickup hour
hour_groups = train.groupby('hour')['fare_amount'].mean()

# Make predictions on the test set: each ride gets the mean fare of its hour
test['fare_amount'] = test['hour'].map(hour_groups)

# Write predictions
test[['id','fare_amount']].to_csv('hour_mean_sub.csv', index=False)
# --- Random Forest baseline ---
from sklearn.ensemble import RandomForestRegressor

# Select only numeric features
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
            'dropoff_latitude', 'passenger_count', 'hour']

# Train a Random Forest model
rf = RandomForestRegressor().fit(train[features], train.fare_amount)

# Make predictions on the test data
test['fare_amount'] = rf.predict(test[features])

# Write predictions
test[['id','fare_amount']].to_csv('rf_sub.csv', index=False)
# --- Model blending ---
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Fit both regressors on the same features and target
gb = GradientBoostingRegressor()
gb.fit(train[features], train.fare_amount)

rf = RandomForestRegressor()
rf.fit(train[features], train.fare_amount)

# Store each model's test predictions in its own column
test['gb_pred'] = gb.predict(test[features])
test['rf_pred'] = rf.predict(test[features])

# Blend: equal-weight average of the two prediction columns
test['blend'] = (test['gb_pred'] + test['rf_pred']) / 2
print(test[['gb_pred', 'rf_pred', 'blend']].head(3))
# --- Model stacking I ---
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Split train data into two parts
part_1, part_2 = train_test_split(train, test_size=0.5, random_state=123)
# Work on an independent copy: new columns are added to part_2 below
part_2 = part_2.copy()

# Train a Gradient Boosting model on Part 1
gb = GradientBoostingRegressor().fit(part_1[features], part_1.fare_amount)

# Train a Random Forest model on Part 1
rf = RandomForestRegressor().fit(part_1[features], part_1.fare_amount)

# Make predictions on the Part 2 data (rows the 1st-level models did not see)
part_2['gb_pred'] = gb.predict(part_2[features])
part_2['rf_pred'] = rf.predict(part_2[features])

# Make predictions on the test data
test['gb_pred'] = gb.predict(test[features])
test['rf_pred'] = rf.predict(test[features])
# --- Model stacking II ---
from sklearn.linear_model import LinearRegression

# Create linear regression model without the intercept
lr = LinearRegression(fit_intercept=False)

# Train 2nd level model on the Part 2 data
lr.fit(part_2[['gb_pred', 'rf_pred']], part_2.fare_amount)

# Make stacking predictions on the test data
test['stacking'] = lr.predict(test[['gb_pred', 'rf_pred']])

# Look at the model coefficients (one weight per 1st-level prediction column)
print(lr.coef_)
