Skip to content

Instantly share code, notes, and snippets.

@AdityaSoni19031997
Created May 19, 2018 13:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AdityaSoni19031997/43067c1b6be851c569fefb63db5f5ff9 to your computer and use it in GitHub Desktop.
Save AdityaSoni19031997/43067c1b6be851c569fefb63db5f5ff9 to your computer and use it in GitHub Desktop.
# This way we have randomness and are able to reproduce the behaviour within this cell.
np.random.seed(13)
def impact_coding(data, feature, target='y'):
'''
In this implementation we get the values and the dictionary as two different steps.
This is just because initially we were ignoring the dictionary as a result variable.
In this implementation the KFolds use shuffling. If you want reproducibility the cv
could be moved to a parameter.
'''
n_folds = 20
n_inner_folds = 10
impact_coded = pd.Series()
oof_default_mean = data[target].mean() # Gobal mean to use by default (you could further tune this)
kf = KFold(n_splits=n_folds, shuffle=True)
oof_mean_cv = pd.DataFrame()
split = 0
for infold, oof in kf.split(data[feature]):
impact_coded_cv = pd.Series()
kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
inner_split = 0
inner_oof_mean_cv = pd.DataFrame()
oof_default_inner_mean = data.iloc[infold][target].mean()
for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):
# The mean to apply to the inner oof split (a 1/n_folds % based on the rest)
oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()
impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(
lambda x: oof_mean[x[feature]]
if x[feature] in oof_mean.index
else oof_default_inner_mean
, axis=1))
# Also populate mapping (this has all group -> mean for all inner CV folds)
inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')
inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)
inner_split += 1
# Also populate mapping
oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')
oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
split += 1
impact_coded = impact_coded.append(data.iloc[oof].apply(
lambda x: inner_oof_mean_cv.loc[x[feature]].mean()
if x[feature] in inner_oof_mean_cv.index
else oof_default_mean
, axis=1))
return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean
# Apply the encoding to training and test data, and preserve the mapping
impact_coding_map = {}
for f in categorical_features:
print("Impact coding for {}".format(f))
train_data["impact_encoded_{}".format(f)], impact_coding_mapping, default_coding = impact_coding(train_data, f)
impact_coding_map[f] = (impact_coding_mapping, default_coding)
mapping, default_mean = impact_coding_map[f]
test_data["impact_encoded_{}".format(f)] = test_data.apply(lambda x: mapping[x[f]]
if x[f] in mapping
else default_mean
, axis=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment