# afraenkel/groupwise_leave_one_out.py

## groupwise_leave_one_out.py
import pandas as pd
import numpy as np

# Download the California housing data used for this example.
# The loader is imported from the public ``sklearn.datasets`` namespace; the
# private ``sklearn.datasets.california_housing`` module path is not available
# in newer scikit-learn releases.

from sklearn.datasets import fetch_california_housing

d = fetch_california_housing()

# Feature columns plus the regression target in a single frame.
df = pd.DataFrame(d['data'], columns=d['feature_names'])
df['price'] = d['target']
# Group label: the latitude rounded to a whole number.
df = df.assign(region=df['Latitude'].round())

# ---------------------------
# import regressor / group-wise data splitter

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold

# Held-out predictions are collected here; same index as the data.
# ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
preds = pd.Series(np.nan, index=df.index)

# Create variables and target arrays; create a column of groups
variables = df.drop('price', axis=1)
target = df['price']
groups = df['region']

# Create a "leave one group out" iterator: one fold per distinct group
n_groups = df['region'].nunique()
group_kfold = GroupKFold(n_splits=n_groups)

# train = data used for fitting the model
# test = data on which the model predicts
for train, test in group_kfold.split(variables, target, groups):

    # define the model (could be different)
    mdl = RandomForestRegressor(n_estimators=50)

    # split() yields positional row numbers, so select rows with .iloc;
    # .loc would only be correct while the index happens to be 0..n-1.
    # fit the model on train ("outside of group")
    mdl.fit(variables.iloc[train], target.iloc[train])

    # predict using model on test ("in group"); add it to preds
    preds.iloc[test] = mdl.predict(variables.iloc[test])


# calculate RMSE

def rmse(preds, target):
    """Return the root-mean-squared error between predictions and targets.

    Args:
        preds: Predicted values; anything supporting element-wise
            subtraction with ``target`` (NumPy array, pandas Series, scalar).
        target: True values, compatible with ``preds``.

    Returns:
        The square root of the mean of the squared differences.
    """
    # The square root is what makes this RMSE rather than plain MSE.
    return np.sqrt(np.mean((preds - target) ** 2))

# overall error across all held-out predictions
print(rmse(preds, target))

# add preds to original dataframe; calculate the error by group
results = pd.concat([preds.rename('predictions'), target, df['region']], axis=1)
# Only the two columns the metric reads are handed to the applied function,
# so the grouping column is not operated on. The result is printed because a
# bare expression is computed and then discarded when run as a script.
by_region = results.groupby('region')[['predictions', 'price']].apply(
    lambda x: rmse(x['predictions'], x['price'])
)
print(by_region)
	import pandas as pd
	import numpy as np

	# Download the California housing data used for this example.
	# The loader is imported from the public ``sklearn.datasets`` namespace; the
	# private ``sklearn.datasets.california_housing`` module path is not
	# available in newer scikit-learn releases.

	from sklearn.datasets import fetch_california_housing

	d = fetch_california_housing()

	# Feature columns plus the regression target in a single frame.
	df = pd.DataFrame(d['data'], columns=d['feature_names'])
	df['price'] = d['target']
	# Group label: the latitude rounded to a whole number.
	df = df.assign(region=df['Latitude'].round())

	# ---------------------------
	# import regressor / group-wise data splitter

	from sklearn.ensemble import RandomForestRegressor
	from sklearn.model_selection import GroupKFold

	# Held-out predictions are collected here; same index as the data.
	# ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
	preds = pd.Series(np.nan, index=df.index)

	# Create variables and target arrays; create a column of groups
	variables = df.drop('price', axis=1)
	target = df['price']
	groups = df['region']

	# Create a "leave one group out" iterator: one fold per distinct group
	n_groups = df['region'].nunique()
	group_kfold = GroupKFold(n_splits=n_groups)

	# train = data used for fitting the model
	# test = data on which the model predicts
	for train, test in group_kfold.split(variables, target, groups):

		# define the model (could be different)
		mdl = RandomForestRegressor(n_estimators=50)

		# split() yields positional row numbers, so select rows with .iloc;
		# .loc would only be correct while the index happens to be 0..n-1.
		# fit the model on train ("outside of group")
		mdl.fit(variables.iloc[train], target.iloc[train])

		# predict using model on test ("in group"); add it to preds
		preds.iloc[test] = mdl.predict(variables.iloc[test])


	# calculate RMSE

	def rmse(preds, target):
		"""Return the root-mean-squared error between predictions and targets.

		Args:
			preds: Predicted values; anything supporting element-wise
				subtraction with ``target`` (NumPy array, pandas Series, scalar).
			target: True values, compatible with ``preds``.

		Returns:
			The square root of the mean of the squared differences.
		"""
		# The square root is what makes this RMSE rather than plain MSE.
		return np.sqrt(np.mean((preds - target) ** 2))

	# overall error across all held-out predictions
	print(rmse(preds, target))

	# add preds to original dataframe; calculate the error by group
	results = pd.concat([preds.rename('predictions'), target, df['region']], axis=1)
	# Only the two columns the metric reads are handed to the applied function,
	# so the grouping column is not operated on. The result is printed because
	# a bare expression is computed and then discarded when run as a script.
	by_region = results.groupby('region')[['predictions', 'price']].apply(
		lambda x: rmse(x['predictions'], x['price'])
	)
	print(by_region)