Consider a dataset `data` that is already one-hot encoded:
# --- Manual n-fold cross-validation of a logistic regression on numeric features ---
n_split = 5
len_split = int(data.shape[0] / n_split)

# Select only numerical columns for this example
data_num = data.select_dtypes(exclude=['object'])
# Shuffle the rows so each fold is a random sample of the data.
# NOTE(review): no random_state is set, so fold assignment (and the
# reported scores) differ run to run — pass random_state for reproducibility.
data_num = data_num.sample(frac=1)

# Initialize the 1st fold
scores = []
start = 0
stop = len_split
# Start the n-split fold CV
for k in range(n_split):  # k in [0, 1, 2, 3, 4]
    # Bug fix: when data.shape[0] is not a multiple of n_split, the
    # integer division above truncates and the trailing remainder rows
    # were never used for validation. Extend the last fold to the end.
    if k == n_split - 1:
        stop = data_num.shape[0]
    # iloc is positional, so this slices the k-th contiguous chunk of
    # the shuffled frame; dropping by index removes exactly those rows
    # from the training split (index labels are unique, just reordered).
    data_val = data_num.iloc[start:stop, :]
    data_train = data_num.drop(index=data_val.index)
    y_train = data_train['SalePriceBinary']
    X_train = data_train.drop(columns=['SalePriceBinary'])
    y_val = data_val['SalePriceBinary']
    X_val = data_val.drop(columns=['SalePriceBinary'])
    # Fit the scaler on the train split only, to avoid data leakage,
    # then apply the SAME fitted scaler to the validation split.
    robust = RobustScaler()
    X_train['GrLivArea'] = robust.fit_transform(X_train[['GrLivArea']])
    X_val['GrLivArea'] = robust.transform(X_val[['GrLivArea']])
    log = LogisticRegression(max_iter=1000)
    log.fit(X_train, y_train)
    # score() returns mean accuracy on the held-out fold
    scores.append(log.score(X_val, y_val))
    # Setup next fold
    start = stop
    stop += len_split
# Display the mean score and the scores' standard deviation
print(np.array(scores).mean())
print(np.array(scores).std())