Skip to content

Instantly share code, notes, and snippets.

@afraenkel
Created June 16, 2019 22:23
Show Gist options
  • Save afraenkel/577df3687d09dd97c239699a8eef0d28 to your computer and use it in GitHub Desktop.
Save afraenkel/577df3687d09dd97c239699a8eef0d28 to your computer and use it in GitHub Desktop.
Groupwise leave one out
import pandas as pd
import numpy as np
# Download the California housing data used for the example.
# NOTE: import from the public `sklearn.datasets` namespace; the private
# `sklearn.datasets.california_housing` module path was deprecated and later
# removed from scikit-learn, so importing from it fails on current releases.
from sklearn.datasets import fetch_california_housing

d = fetch_california_housing()
df = pd.DataFrame(d['data'], columns=d['feature_names'])
df['price'] = d['target']
# Bucket rows into coarse regions by rounding latitude to the nearest degree;
# the `region` column is what the rest of the script uses as the group label.
df = df.assign(region=df['Latitude'].round())
# ---------------------------
# import regressor / group-wise data splitter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
# Out-of-group predictions are collected here; same index as the data.
# (`np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.)
preds = pd.Series(np.nan, index=df.index)

# Feature matrix, target vector, and the group label of every row.
variables = df.drop('price', axis=1)
target = df['price']
groups = df['region']

# Using one fold per distinct group makes GroupKFold act as a
# "leave one group out" iterator: each fold holds out a single group.
n_groups = groups.nunique()
group_kfold = GroupKFold(n_splits=n_groups)

# train = positions of the rows used for fitting the model
# test = positions of the rows on which the model predicts
for train, test in group_kfold.split(variables, target, groups):
    # define the model (could be different)
    mdl = RandomForestRegressor(n_estimators=50)
    # split() yields integer positions, not index labels, so select with
    # .iloc; .loc would only be correct for a default 0..n-1 index.
    # fit the model on train ("outside of group")
    mdl.fit(variables.iloc[train], target.iloc[train])
    # predict using model on test ("in group"); add it to preds
    preds.iloc[test] = mdl.predict(variables.iloc[test])
# calculate RMSE
def rmse(preds, target):
    """Return the root-mean-squared error between predictions and targets.

    Args:
        preds: Predicted values (NumPy array or pandas Series).
        target: True values, aligned element-wise with ``preds``.

    Returns:
        The square root of the mean of the squared differences.
    """
    # The square root is what distinguishes RMSE from plain MSE; without it
    # the result is in squared units of the target.
    return np.sqrt(np.mean((preds - target) ** 2))
# overall error across every held-out prediction
overall_error = rmse(preds, target)
print(overall_error)


def _group_error(frame):
    """Score one region's rows: its predictions against its true prices."""
    return rmse(frame['predictions'], frame['price'])


# line the predictions up with the targets and region labels, then score
# each region separately
results = pd.DataFrame({
    'predictions': preds,
    'price': target,
    'region': df['region'],
})
results.groupby('region').apply(_group_error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment