Skip to content

Instantly share code, notes, and snippets.

@Keiku
Last active May 2, 2017 07:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Keiku/28af36ecbd89ee3fd05c1b19ef9d156f to your computer and use it in GitHub Desktop.
Save Keiku/28af36ecbd89ee3fd05c1b19ef9d156f to your computer and use it in GitHub Desktop.
Split a dataset into K-fold validation sets and collect the out-of-fold predictions.
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
# Demo 1: plain K-fold split, collecting the out-of-fold values in a
# single-column DataFrame.
X_train = np.random.random((10, 2))
y_train = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
column = "pred"
n_fold = 5
# One row per training sample; every cell starts as NaN and is filled in
# once its row has been part of a validation fold.
p_train = pd.DataFrame(index=range(len(X_train)), columns=[column])
# random_state is intentionally omitted: shuffle is off (the default), so
# the folds are already deterministic, and recent scikit-learn releases
# raise ValueError when random_state is set without shuffle=True.
kf = KFold(n_splits=n_fold)
for tr, te in kf.split(X_train):
    print(tr, te)
    X_tra, y_tra, X_val, y_val = X_train[tr], y_train[tr], X_train[te], y_train[te]
    # Placeholder for real model predictions on the validation fold.
    p_val = y_val
    # The index is 0..n-1, so the fold's positional indices are also labels.
    p_train.loc[te, column] = p_val
    print(p_train)
# [2 3 4 5 6 7 8 9] [0 1]
# pred
# 0 1
# 1 1
# 2 NaN
# 3 NaN
# 4 NaN
# 5 NaN
# 6 NaN
# 7 NaN
# 8 NaN
# 9 NaN
# [0 1 4 5 6 7 8 9] [2 3]
# pred
# 0 1
# 1 1
# 2 1
# 3 1
# 4 NaN
# 5 NaN
# 6 NaN
# 7 NaN
# 8 NaN
# 9 NaN
# [0 1 2 3 6 7 8 9] [4 5]
# pred
# 0 1
# 1 1
# 2 1
# 3 1
# 4 1
# 5 0
# 6 NaN
# 7 NaN
# 8 NaN
# 9 NaN
# [0 1 2 3 4 5 8 9] [6 7]
# pred
# 0 1
# 1 1
# 2 1
# 3 1
# 4 1
# 5 0
# 6 0
# 7 0
# 8 NaN
# 9 NaN
# [0 1 2 3 4 5 6 7] [8 9]
# pred
# 0 1
# 1 1
# 2 1
# 3 1
# 4 1
# 5 0
# 6 0
# 7 0
# 8 0
# 9 0
# Demo 2: stratified K-fold split, so each validation fold is drawn with
# respect to the class labels in y_train.
X_train = np.random.random((10, 2))
y_train = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
column = "pred"
n_fold = 5
# One row per training sample; every cell starts as NaN and is filled in
# once its row has been part of a validation fold.
p_train = pd.DataFrame(index=range(len(X_train)), columns=[column])
# random_state is intentionally omitted: shuffle is off (the default), so
# the folds are already deterministic, and recent scikit-learn releases
# raise ValueError when random_state is set without shuffle=True.
skf = StratifiedKFold(n_splits=n_fold)
for tr, te in skf.split(X_train, y_train):
    print(tr, te)
    X_tra, y_tra, X_val, y_val = X_train[tr], y_train[tr], X_train[te], y_train[te]
    # Placeholder for real model predictions on the validation fold.
    p_val = y_val
    # The index is 0..n-1, so the fold's positional indices are also labels.
    p_train.loc[te, column] = p_val
    print(p_train)
# [1 2 3 4 6 7 8 9] [0 5]
# pred
# 0 1
# 1 NaN
# 2 NaN
# 3 NaN
# 4 NaN
# 5 0
# 6 NaN
# 7 NaN
# 8 NaN
# 9 NaN
# [0 2 3 4 5 7 8 9] [1 6]
# pred
# 0 1
# 1 1
# 2 NaN
# 3 NaN
# 4 NaN
# 5 0
# 6 0
# 7 NaN
# 8 NaN
# 9 NaN
# [0 1 3 4 5 6 8 9] [2 7]
# pred
# 0 1
# 1 1
# 2 1
# 3 NaN
# 4 NaN
# 5 0
# 6 0
# 7 0
# 8 NaN
# 9 NaN
# [0 1 2 4 5 6 7 9] [3 8]
# pred
# 0 1
# 1 1
# 2 1
# 3 1
# 4 NaN
# 5 0
# 6 0
# 7 0
# 8 0
# 9 NaN
# [0 1 2 3 5 6 7 8] [4 9]
# pred
# 0 1
# 1 1
# 2 1
# 3 1
# 4 1
# 5 0
# 6 0
# 7 0
# 8 0
# 9 0
# Demo 3: plain K-fold split where the result frame also carries an "ID"
# column, so only the prediction column is written for each fold.
X_train = np.random.random((10, 2))
y_train = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
column = "pred"
n_fold = 5
# "ID" holds the letters A..J; the prediction column is not in the dict,
# so it is created empty (NaN) and filled in fold by fold.
p_train = pd.DataFrame(
    {"ID": list(string.ascii_uppercase)[0:10]},
    index=range(len(X_train)),
    columns=["ID", column],
)
# random_state is intentionally omitted: shuffle is off (the default), so
# the folds are already deterministic, and recent scikit-learn releases
# raise ValueError when random_state is set without shuffle=True.
kf = KFold(n_splits=n_fold)
for tr, te in kf.split(X_train):
    print(tr, te)
    X_tra, y_tra, X_val, y_val = X_train[tr], y_train[tr], X_train[te], y_train[te]
    # Placeholder for real model predictions on the validation fold.
    p_val = y_val
    # Write only the prediction column; the "ID" column is left untouched.
    p_train.loc[te, column] = p_val
    print(p_train)
# [2 3 4 5 6 7 8 9] [0 1]
# ID pred
# 0 A 1
# 1 B 1
# 2 C NaN
# 3 D NaN
# 4 E NaN
# 5 F NaN
# 6 G NaN
# 7 H NaN
# 8 I NaN
# 9 J NaN
# [0 1 4 5 6 7 8 9] [2 3]
# ID pred
# 0 A 1
# 1 B 1
# 2 C 1
# 3 D 1
# 4 E NaN
# 5 F NaN
# 6 G NaN
# 7 H NaN
# 8 I NaN
# 9 J NaN
# [0 1 2 3 6 7 8 9] [4 5]
# ID pred
# 0 A 1
# 1 B 1
# 2 C 1
# 3 D 1
# 4 E 1
# 5 F 0
# 6 G NaN
# 7 H NaN
# 8 I NaN
# 9 J NaN
# [0 1 2 3 4 5 8 9] [6 7]
# ID pred
# 0 A 1
# 1 B 1
# 2 C 1
# 3 D 1
# 4 E 1
# 5 F 0
# 6 G 0
# 7 H 0
# 8 I NaN
# 9 J NaN
# [0 1 2 3 4 5 6 7] [8 9]
# ID pred
# 0 A 1
# 1 B 1
# 2 C 1
# 3 D 1
# 4 E 1
# 5 F 0
# 6 G 0
# 7 H 0
# 8 I 0
# 9 J 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment