Skip to content

Instantly share code, notes, and snippets.

@dmitrysarov
Created January 9, 2019 09:59
Show Gist options
  • Save dmitrysarov/1d492519ec03a482813928b204adf67f to your computer and use it in GitHub Desktop.
Save dmitrysarov/1d492519ec03a482813928b204adf67f to your computer and use it in GitHub Desktop.
This can help when your data set is not balanced over one of its parameters, but you still have to split it into equally sized folds.
#iteratively fill fold with balancing probobilities of next item respectivly it size
#df - dataframe of subjects CT slices, different subjects can have different number of slices
#folds splits can't have same subject in it
value_count = df['subject'].value_counts() #count how much each subject have slices
number_of_folds = 5
np.random.seed(42)
for num, chunk in enumerate(np.array_split(value_count, range(number_of_folds, len(value_count), number_of_folds))):
# chunk - is sorted by count
if num == 0:
#initialize folds
rand_int = list(range(number_of_folds))
np.random.shuffle(rand_int)
folds = {i: rand_int.pop() for i in chunk.index}
folds_count = {folds[sbj]:chunk[sbj] for sbj in folds} # folds[sbj] - number of fold, chunk[sbj] - number of subject slices
weights = {f:1/c for f,c in folds_count.items()}
#normalize
weights = {k: v/sum(weights.values()) for k, v in weights.items()}
else:
#select fold number with respect to current fold "weight"
rand_int = np.random.choice(list(weights.keys()), size=len(chunk), p=list(weights.values()), replace=False).tolist()
local_folds = {i: rand_int.pop() for i in chunk.index}
folds.update(local_folds) #folds for current chunk
local_folds_count = {folds[sbj]:chunk[sbj]+folds_count[folds[sbj]] for sbj in local_folds} #add count for each fold from prev step
folds_count = local_folds_count
weights = {f:1/c for f,c in folds_count.items()}
#normalize
weights = {k: v/sum(weights.values()) for k, v in weights.items()}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment