Created
May 19, 2018 13:42
-
-
Save AdityaSoni19031997/43067c1b6be851c569fefb63db5f5ff9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Seed NumPy's global random generator: the folds are still shuffled, but the
# behaviour can be reproduced when this cell is re-run.
np.random.seed(13)
def impact_coding(data, feature, target='y', n_folds=20, n_inner_folds=10):
    '''
    Impact-code (target-encode) one column using nested K-fold cross-validation.

    For every outer fold, the per-category mean of `target` is estimated
    `n_inner_folds` times on the in-fold rows only; the average of those
    estimates is used to encode the rows held out by the outer fold.

    Both KFold splitters shuffle and are created without a random_state, so
    results vary between calls unless the caller seeds NumPy's global random
    generator beforehand.

    Parameters:
        data: DataFrame containing the `feature` and `target` columns.
        feature: name of the column to encode.
        target: name of the column whose mean is used as the encoding.
        n_folds: number of outer folds (default 20, as before).
        n_inner_folds: number of inner folds per outer fold (default 10, as before).

    Returns:
        A tuple (impact_coded, mapping, default_mean):
        - impact_coded: Series of encoded values carrying the index labels of
          `data` (in held-out-fold order, so assign it by index alignment).
        - mapping: Series indexed by category, the mean over all inner-fold
          estimates from all outer folds; meant for encoding unseen data.
        - default_mean: global mean of `target`, the fallback for categories
          that are missing from a mapping.

    Note: KFold rejects inputs with fewer rows than splits, so `data` needs at
    least `n_folds` rows and every in-fold subset at least `n_inner_folds`.
    '''
    # Global mean to use by default (you could further tune this).
    oof_default_mean = data[target].mean()
    kf = KFold(n_splits=n_folds, shuffle=True)

    impact_coded_parts = []  # encoded held-out rows, one Series per outer fold
    fold_mappings = []       # category -> inner-fold means, one frame per outer fold

    for infold, oof in kf.split(data[feature]):
        infold_data = data.iloc[infold]
        oof_default_inner_mean = infold_data[target].mean()
        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)

        # The inner splitter yields positions relative to infold_data, so they
        # must index infold_data (not data); otherwise the wrong rows, possibly
        # including held-out ones, would feed the estimate.
        inner_means = [
            infold_data.iloc[infold_inner].groupby(by=feature)[target].mean()
            for infold_inner, _ in kf_inner.split(infold_data)
        ]

        # One column per inner fold; a category absent from an inner fold
        # falls back to the mean of the current in-fold rows.
        inner_oof_mean_cv = pd.concat(
            inner_means, axis=1, keys=range(len(inner_means))
        ).fillna(oof_default_inner_mean)
        fold_mappings.append(inner_oof_mean_cv)

        # Encode the held-out rows with the average inner estimate; categories
        # never seen in-fold get the global mean.
        fold_lookup = inner_oof_mean_cv.mean(axis=1).to_dict()
        oof_values = data.iloc[oof][feature]
        impact_coded_parts.append(pd.Series(
            [fold_lookup.get(value, oof_default_mean) for value in oof_values],
            index=oof_values.index,
        ))

    impact_coded = pd.concat(impact_coded_parts)

    # Mapping over all outer folds; a category missing from an outer fold
    # contributes the global mean for that fold's columns.
    oof_mean_cv = pd.concat(
        fold_mappings, axis=1, keys=range(len(fold_mappings))
    ).fillna(oof_default_mean)

    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean
# Apply the encoding to training and test data, and preserve the mapping so
# the same category -> value table can be reused later.
impact_coding_map = {}
for f in categorical_features:
    print("Impact coding for {}".format(f))
    encoded_column = "impact_encoded_{}".format(f)
    train_data[encoded_column], impact_coding_mapping, default_coding = impact_coding(train_data, f)
    impact_coding_map[f] = (impact_coding_mapping, default_coding)

    # Encode the test column from the stored mapping; categories that were
    # not present in the training data fall back to the global target mean.
    mapping, default_mean = impact_coding_map[f]
    lookup = mapping.to_dict()
    test_data[encoded_column] = [lookup.get(value, default_mean) for value in test_data[f]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment