Skip to content

Instantly share code, notes, and snippets.

@dmitrysarov
Created January 9, 2019 09:59
Show Gist options
  • Save dmitrysarov/1d492519ec03a482813928b204adf67f to your computer and use it in GitHub Desktop.
Save dmitrysarov/1d492519ec03a482813928b204adf67f to your computer and use it in GitHub Desktop.
This can help when your data set is not balanced over one of its parameters, but you still have to split it into equally sized folds.
#iteratively fill fold with balancing probobilities of next item respectivly it size
#df - dataframe of subjects CT slices, different subjects can have different number of slices
#folds splits can't have same subject in it
value_count = df['subject'].value_counts() #count how much each subject have slices
number_of_folds = 5
np.random.seed(42)
for num, chunk in enumerate(np.array_split(value_count, range(number_of_folds, len(value_count), number_of_folds))):
# chunk - is sorted by count
if num == 0:
#initialize folds
rand_int = list(range(number_of_folds))
np.random.shuffle(rand_int)
folds = {i: rand_int.pop() for i in chunk.index}
folds_count = {folds[sbj]:chunk[sbj] for sbj in folds} # folds[sbj] - number of fold, chunk[sbj] - number of subject slices
weights = {f:1/c for f,c in folds_count.items()}
#normalize
weights = {k: v/sum(weights.values()) for k, v in weights.items()}
else:
#select fold number with respect to current fold "weight"
rand_int = np.random.choice(list(weights.keys()), size=len(chunk), p=list(weights.values()), replace=False).tolist()
local_folds = {i: rand_int.pop() for i in chunk.index}
folds.update(local_folds) #folds for current chunk
local_folds_count = {folds[sbj]:chunk[sbj]+folds_count[folds[sbj]] for sbj in local_folds} #add count for each fold from prev step
folds_count = local_folds_count
weights = {f:1/c for f,c in folds_count.items()}
#normalize
weights = {k: v/sum(weights.values()) for k, v in weights.items()}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment