Created
February 16, 2021 02:55
-
-
Save ksv-muralidhar/8070fd4df850abac41320de661bd953d to your computer and use it in GitHub Desktop.
stratified cv part 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit, KFold

# Build a reproducible synthetic 3-class dataset: 500 samples, 3 features
# (2 informative, 0 redundant), one cluster per class.
make_class = make_classification(
    n_samples=500,
    n_features=3,
    n_redundant=0,
    n_informative=2,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=11,
)
features, labels = make_class  # (feature matrix, class labels)

# Feature columns are labelled 0..n_features-1; the class label is appended
# last as 'target'.
data = pd.DataFrame(features, columns=range(features.shape[1]))
data['target'] = labels
data.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Random 80/20 split with no stratification requested.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=11)

# Print the share of each target class in the full data and in each subset,
# so the three distributions can be compared side by side.
report_sections = [
    ("ORIGINAL DATA", data),
    ("TRAINING SET", train_df),
    ("TEST SET", test_df),
]
print("\n\n".join(
    f"PROPORTION OF TARGET IN THE {label}\n"
    f"{frame['target'].value_counts(normalize=True)}"
    for label, frame in report_sections
))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 80/20 split stratified on data['target'].
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data['target'],
    random_state=11,
)

# Print the share of each target class in the full data and in each subset,
# so the three distributions can be compared side by side.
report_sections = [
    ("ORIGINAL DATA", data),
    ("TRAINING SET", train_df),
    ("TEST SET", test_df),
]
print("\n\n".join(
    f"PROPORTION OF TARGET IN THE {label}\n"
    f"{frame['target'].value_counts(normalize=True)}"
    for label, frame in report_sections
))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Shuffled 3-fold cross-validation without stratification.
kfold = KFold(n_splits=3, random_state=11, shuffle=True)
# Each split is a (train indexes, test indexes) pair of positional indexes.
splits = kfold.split(data, data['target'])

target_col = data['target']
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{target_col.value_counts(normalize=True)}\n\n')

for n, (train_index, test_index) in enumerate(splits):
    fold_total = len(train_index) + len(test_index)
    train_share = np.round(len(train_index) / fold_total, 2)
    test_share = np.round(len(test_index) / fold_total, 2)
    # The training label must report the rows selected by train_index and the
    # test label the rows selected by test_index (positional, hence .iloc).
    print(
        f'SPLIT NO {n+1}\nTRAINING SET SIZE: {train_share}'
        f'\tTEST SET SIZE: {test_share}\n'
        f'PROPORTION OF TARGET IN THE TRAINING SET\n'
        f'{target_col.iloc[train_index].value_counts(normalize=True)}\n'
        f'PROPORTION OF TARGET IN THE TEST SET\n'
        f'{target_col.iloc[test_index].value_counts(normalize=True)}\n\n'
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Shuffled 3-fold cross-validation, stratified on the class label.
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)
# data['target'] IS THE VARIABLE USED FOR STRATIFIED SAMPLING.
splits = kfold.split(data, data['target'])

target_col = data['target']
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{target_col.value_counts(normalize=True)}\n\n')

for n, (train_index, test_index) in enumerate(splits):
    fold_total = len(train_index) + len(test_index)
    train_share = np.round(len(train_index) / fold_total, 2)
    test_share = np.round(len(test_index) / fold_total, 2)
    # The training label must report the rows selected by train_index and the
    # test label the rows selected by test_index (positional, hence .iloc).
    print(
        f'SPLIT NO {n+1}\nTRAINING SET SIZE: {train_share}'
        f'\tTEST SET SIZE: {test_share}\n'
        f'PROPORTION OF TARGET IN THE TRAINING SET\n'
        f'{target_col.iloc[train_index].value_counts(normalize=True)}\n'
        f'PROPORTION OF TARGET IN THE TEST SET\n'
        f'{target_col.iloc[test_index].value_counts(normalize=True)}\n\n'
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment