# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @ksv-muralidhar
# Created February 16, 2021 02:55
# Show Gist options
#   • Save ksv-muralidhar/8070fd4df850abac41320de661bd953d to your computer and use it in GitHub Desktop.
# Save ksv-muralidhar/8070fd4df850abac41320de661bd953d to your computer and use it in GitHub Desktop.
# stratified cv part 1
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit, KFold
# Build a reproducible synthetic 3-class dataset: 500 rows, 3 features
# (2 informative, 0 redundant), one cluster per class.
make_class = make_classification(
    n_samples=500,
    n_features=3,
    n_informative=2,
    n_redundant=0,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=11,
)
features, labels = make_class
# Feature columns are labelled by position (0, 1, 2); the class label is
# appended last under the name 'target'.
data = pd.DataFrame(features, columns=range(features.shape[1]))
data['target'] = labels
# Bare expression: the preview is only displayed in an interactive session.
data.head()
# Random 80/20 split with no `stratify` argument, then report the class
# proportions of the full data, the training part and the test part.
train_df, test_df = train_test_split(data, random_state=11, test_size=0.2)
report_sections = [
    f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{data["target"].value_counts() / len(data)}',
    f'PROPORTION OF TARGET IN THE TRAINING SET\n{train_df["target"].value_counts() / len(train_df)}',
    f'PROPORTION OF TARGET IN THE TEST SET\n{test_df["target"].value_counts() / len(test_df)}',
]
# Sections are separated by one blank line, as in a single concatenated string.
print('\n\n'.join(report_sections))
# Same 80/20 split, this time passing the target column as `stratify`,
# followed by the same three-part class-proportion report.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=11,
    stratify=data['target'],
)
labelled_frames = (
    ('ORIGINAL DATA', data),
    ('TRAINING SET', train_df),
    ('TEST SET', test_df),
)
print('\n\n'.join(
    f'PROPORTION OF TARGET IN THE {label}\n{frame["target"].value_counts() / len(frame)}'
    for label, frame in labelled_frames
))
# 3-fold shuffled cross-validation with plain KFold, reporting for every fold
# the relative size of each part and its class proportions.
kfold = KFold(n_splits=3,random_state=11,shuffle=True)
splits = kfold.split(data,data['target']) # each split has a train indexes and test indexes pair
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{data["target"].value_counts() / len(data)}\n\n')
for n,(train_index,test_index) in enumerate(splits):
    # Train and test indexes together cover every row used in this fold.
    n_rows = len(train_index) + len(test_index)
    # Select the target by name rather than by hard-coded column position, and
    # keep each heading paired with the indexes it actually describes.
    train_target = data['target'].iloc[train_index]
    test_target = data['target'].iloc[test_index]
    print(f'SPLIT NO {n+1}\nTRAINING SET SIZE: {np.round(len(train_index) / n_rows,2)}'+
          f'\tTEST SET SIZE: {np.round(len(test_index) / n_rows,2)}\nPROPORTION OF TARGET IN THE TRAINING SET\n'+
          f'{train_target.value_counts() / len(train_target)}\nPROPORTION OF TARGET IN THE TEST SET\n'+
          f'{test_target.value_counts() / len(test_target)}\n\n')
# 3-fold shuffled cross-validation with StratifiedKFold, reporting for every
# fold the relative size of each part and its class proportions.
kfold = StratifiedKFold(n_splits=3,shuffle=True,random_state=11)
#data['target'] IS THE VARIABLE USED FOR STRATIFIED SAMPLING.
splits = kfold.split(data,data['target'])
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{data["target"].value_counts() / len(data)}\n\n')
for n,(train_index,test_index) in enumerate(splits):
    # Train and test indexes together cover every row used in this fold.
    n_rows = len(train_index) + len(test_index)
    # Select the target by name rather than by hard-coded column position, and
    # keep each heading paired with the indexes it actually describes.
    train_target = data['target'].iloc[train_index]
    test_target = data['target'].iloc[test_index]
    print(f'SPLIT NO {n+1}\nTRAINING SET SIZE: {np.round(len(train_index) / n_rows,2)}'+
          f'\tTEST SET SIZE: {np.round(len(test_index) / n_rows,2)}\nPROPORTION OF TARGET IN THE TRAINING SET\n'+
          f'{train_target.value_counts() / len(train_target)}\nPROPORTION OF TARGET IN THE TEST SET\n'+
          f'{test_target.value_counts() / len(test_target)}\n\n')
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment