Last active
May 12, 2016 05:03
-
-
Save rgerkin/f82937ab30122285bf449c71d6a615b7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cross_validation import ShuffleSplit  # NOTE(review): module renamed to sklearn.model_selection in >=0.18
from sklearn.ensemble import RandomForestRegressor

# Experiment constants for the feature-count sweep.
n_features = [1, 2, 3, 4, 5, 10, 33, 100, 333, 1000, 3333, 10000]  # The feature counts we agreed on.
n_splits = 10  # The number of cross-validation splits we agreed on.

# X_all is the matrix of all training and leaderboard molecule features,
# including the dilution "leak" column.  It has two consecutive rows per
# molecule (first the weaker concentration, then the stronger one), so the
# number of molecules is half the number of rows.
n_obs = int(X_all.shape[0] / 2)

# Fixed random_state reproduces the splits published on GitHub.
shuffle_split = ShuffleSplit(n_obs, n_splits, test_size=0.17, random_state=0)
class DoubleSS:
    """Train/test iterator that maps molecule-level ShuffleSplit indices onto
    the doubled row layout (two concentrations per molecule) and accomplishes:

    1) Both concentrations of a given molecule land on the same side of the
       split (either train or test, but never both).
    2) Training uses both concentrations, but the test set contains only the
       stronger concentration for descriptors 1-20, and only the 10^-3
       concentration for descriptor 0 (intensity).
    """

    # BUG FIX: the original was missing the `def` keyword on __init__,
    # which is a SyntaxError.
    def __init__(self, ss, col, concs):
        self.splits = ss    # The original molecule-level split from ShuffleSplit.
        self.col = col      # Descriptor index, e.g. 0 for intensity, 1 for pleasantness.
        self.concs = concs  # The concentration (log-dilution) at each row index.

    def __iter__(self):
        """Yield (train, test) row-index arrays in the doubled layout."""
        for train, test in self.splits:
            # Molecule i occupies rows 2*i (weaker) and 2*i+1 (stronger).
            train = np.concatenate((2 * train, 2 * train + 1))
            if self.col > 0:
                test = 2 * test + 1  # The second (higher) concentration of the pair.
            else:
                test = np.concatenate((2 * test, 2 * test + 1))
                test = test[self.concs[test] == -3]  # Intensity: always the 10^-3 concentration.
            yield train, test

    def __len__(self):
        return len(self.splits)
X = X_all[:, :-1]  # Remove the high/low dilution feature (i.e. remove the leak).
rs = np.zeros((21, len(n_features), n_splits))  # Correlations: descriptor x feature-count x split.
for col in range(21):  # 21 perceptual descriptors.
    observed = Y[:, col]  # Observed data for just this descriptor.
    # Add 1 extra feature when predicting intensity, because training will
    # rank log-dilution first, but log-dilution is always -3 in testing, so
    # that feature is worthless there.
    n_features_ = list(np.array(n_features) + (col == 0))
    # Convert the molecule-level splits into row-level splits matching the
    # challenge conditions (column -2 of X_all holds the concentrations).
    cv = DoubleSS(shuffle_split, col, X_all[:, -2])
    for j, (train, test) in enumerate(cv):
        # BUG FIX: the original printed `max_features`, which is not bound
        # until the inner loop below, raising NameError on the first pass;
        # print the split index for progress instead.
        print(col, j)
        # Not optimized over max_features, maximum tree depth, etc.  Also,
        # 10 estimators is well short of the point of diminishing returns.
        rfc = RandomForestRegressor(n_estimators=10, max_features='auto',
                                    oob_score=False, n_jobs=1, random_state=0)
        rfc.fit(X[train, :], observed[train])  # Fit all features on this split's training data.
        # Sort features by importance: index 0 is the most important,
        # index 1 the second most, etc.
        importance_ranks = np.argsort(rfc.feature_importances_)[::-1]
        for i, max_features in enumerate(n_features_):
            # Refit with only the top `max_features` features.
            rfc.fit(X[train, :][:, importance_ranks[:max_features]], observed[train])
            predicted = rfc.predict(X[test, :][:, importance_ranks[:max_features]])
            # Correlation between prediction and observation on the test split.
            rs[col, i, j] = np.corrcoef(predicted, observed[test])[1, 0]
    # BUG FIX: the original read from `rs4`, which is never defined; the
    # accumulator filled above is `rs`.
    means = rs[col, :, :].mean(axis=1)  # Mean correlation across splits.
    sems = rs[col, :, :].std(axis=1) / np.sqrt(n_splits)  # SEM across splits.
    plt.figure()
    plt.errorbar(n_features, means, yerr=sems)  # Mean correlation vs number of features.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment