Skip to content

Instantly share code, notes, and snippets.

@ackjake
Created June 19, 2017 02:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ackjake/f4740dbd8b74d99348d952abed12ce1a to your computer and use it in GitHub Desktop.
Save ackjake/f4740dbd8b74d99348d952abed12ce1a to your computer and use it in GitHub Desktop.
def get_target_mean(df_train, df_test, var, target, NFOLDS, NOISE):
    """
    Create out-of-fold target averages for the categorical column ``var``.

    Each training row receives the mean of ``target`` for its category,
    computed only from the other folds, so a row's own target never
    contributes to its encoding. Each test row receives the average of the
    per-fold category means. Decreasing the number of folds and increasing
    the noise can help prevent over fitting the training set.

    Both frames are modified in place: a ``'mean_<var>'`` column is added
    and the original ``var`` column is dropped.

    Parameters
    ----------
    df_train : DataFrame containing the ``var`` and ``target`` columns.
    df_test : DataFrame containing the ``var`` column.
    var : name of the categorical column to encode.
    target : name of the column to average.
    NFOLDS : number of shuffled folds passed to ``KFold``.
    NOISE : upper bound of the multiplicative jitter applied to the
        training encodings. Each (fold, category) mean is scaled by
        ``1 + U(0, NOISE)``; ``0`` disables the jitter. The test
        encodings are never jittered.

    Returns
    -------
    (df_train, df_test) : the same two objects that were passed in.

    Notes
    -----
    Rows whose category is absent from the fitting folds (training) or
    from the whole training set (test) keep ``NaN`` in the new column.
    """
    mean_col = 'mean_{}'.format(var)
    df_train[mean_col] = np.nan
    df_test[mean_col] = np.nan
    # Held-out rows are written by position so that a non-unique index on
    # df_train cannot spill the encoding onto rows outside the fold.
    mean_col_pos = df_train.columns.get_loc(mean_col)

    # category -> list of the (un-jittered) means observed in each fold.
    # Only folds where the category was present contribute, so a category
    # missing from one fold does not poison its test-set average with NaN.
    fold_means_by_level = {}

    kf = KFold(NFOLDS, shuffle=True)
    for train_index, test_index in kf.split(df_train):
        cv_train = df_train.iloc[train_index]
        cv_test = df_train.iloc[test_index]
        fold_means = cv_train.groupby(var)[target].mean()

        for level, level_mean in fold_means.items():
            if np.isnan(level_mean):
                # No usable target values for this category in this fold.
                continue
            in_level = (cv_test[var] == level).values
            held_out_rows = test_index[in_level]
            jitter = 1 + np.random.uniform(0, NOISE)
            df_train.iloc[held_out_rows, mean_col_pos] = level_mean * jitter
            fold_means_by_level.setdefault(level, []).append(level_mean)

    # Test rows get the plain average of the per-fold means (no jitter).
    for level, level_means in fold_means_by_level.items():
        df_test.loc[df_test[var] == level, mean_col] = (
            sum(level_means) / float(len(level_means))
        )

    df_train.drop(var, axis=1, inplace=True)
    df_test.drop(var, axis=1, inplace=True)
    return df_train, df_test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment