Skip to content

Instantly share code, notes, and snippets.

@catalystfrank
Created December 19, 2018 09:27
Show Gist options
  • Save catalystfrank/58ca4fc7e123cb146560f9c738b1fb4b to your computer and use it in GitHub Desktop.
Save catalystfrank/58ca4fc7e123cb146560f9c738b1fb4b to your computer and use it in GitHub Desktop.
protein fold
import numpy as np
import pandas as pd
import numpy.random as nr
def add_label_columns(frame, n_classes=28, target_column='Target'):
    """Add one 0/1 indicator column per class id to *frame*, in place.

    Each value in ``frame[target_column]`` is a string of space-separated
    class ids (e.g. ``"0 5 25"``).  For every class id ``k`` in
    ``range(n_classes)`` a column named ``str(k)`` is added, holding 1 where
    ``k`` appears among the row's ids and 0 otherwise.

    Returns the list of added column names, in class-id order.
    """
    # Split each target string once; whole-token membership keeps "1" from
    # matching inside "10" or "21".
    token_sets = frame[target_column].map(lambda text: set(text.split(' ')))
    label_columns = [str(class_id) for class_id in range(n_classes)]
    for name in label_columns:
        frame[name] = token_sets.map(lambda tokens, name=name: int(name in tokens))
    return label_columns


def assign_folds(frame, label_columns, n_folds=5, fold_column='inpool'):
    """Assign every labelled row of *frame* to a fold 1..n_folds, in place.

    Classes are processed from the least to the most frequent.  For each
    class the rows carrying it are spread as evenly as possible over the
    folds (per-fold counts differ by at most one when the quota can be met),
    taking into account rows already placed while handling rarer classes.
    Rows that carry none of *label_columns* keep the fold id 0.

    ``frame[fold_column]`` is (re)initialised to 0 before assignment.
    Randomness comes from ``numpy.random``; seed it for reproducible folds.

    Returns *frame* for convenience.
    """
    frame[fold_column] = 0
    counts = frame[label_columns].sum(axis=0)

    # Arrange folds starting from the least labelled class: rare classes have
    # the least slack, so they must be balanced before common ones.
    for label in counts.sort_values().index:
        # Integer quota per fold; the remainder goes to distinct random folds.
        base, extra = divmod(int(counts[label]), n_folds)
        quota = [base] * n_folds
        for fold in nr.choice(n_folds, extra, replace=False):
            quota[fold] += 1

        has_label = frame[label] == 1
        # Rows with this label that earlier (rarer) classes already placed.
        placed = frame.loc[has_label, fold_column].value_counts().to_dict()

        remaining = []
        for fold in range(n_folds):
            already = int(placed.get(fold + 1, 0))
            if quota[fold] < already:
                # The fold already exceeds its share; balancing is best-effort.
                print("Error Allocating Label: Could Not Balance")
            remaining.append(max(quota[fold] - already, 0))

        pending = has_label & (frame[fold_column] == 0)
        n_pending = int(pending.sum())
        if not n_pending:
            continue

        # One fold id per open slot, shuffled.  When some fold is over quota
        # there are more slots than pending rows, so only a prefix is used.
        slots = np.repeat(np.arange(1, n_folds + 1), remaining)
        frame.loc[pending, fold_column] = nr.permutation(slots)[:n_pending]
    return frame


if __name__ == "__main__":
    # Read In
    DF = pd.read_csv('train.csv', sep=',', header=0)
    # Config Fold Num
    Nfold = 5
    class_columns = add_label_columns(DF, 28)
    value_counts = DF[class_columns].sum(axis=0)
    class_order = list(value_counts.sort_values().index)
    # Arrange Fold From Least Labelled Class
    assign_folds(DF, class_columns, Nfold)
    # Check Balancing: per-fold number of samples of each class
    print(DF.groupby('inpool')[class_columns].sum())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment