@rgerkin
Last active May 12, 2016 05:03
from sklearn.cross_validation import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

n_features = [1,2,3,4,5,10,33,100,333,1000,3333,10000] # The numbers we agreed on.
n_splits = 10 # The number of splits we agreed on.
n_obs = int(X_all.shape[0]/2) # X_all is my matrix of all the training and leaderboard molecule features, including the leak.
# It has two (consecutive) rows for each molecule; the first is the weaker concentration and the second
# is the stronger one.
shuffle_split = ShuffleSplit(n_obs, n_splits, test_size=0.17, random_state=0) # This reproduces the splits I put on GitHub.
class DoubleSS:
    """This is a new train/test iterator which accomplishes three things:
    1) It puts both concentrations of a given molecule into the same side of the split (either train or test but not both).
    2) It trains on both concentrations, but puts only the stronger concentration into the test set for descriptors 1-20.
    3) For descriptor 0 (intensity), it puts only the 10^-3 concentration into the test set.
    """
    def __init__(self, ss, col, concs):
        self.splits = ss # The original split from ShuffleSplit
        self.col = col # The descriptor index, e.g. 0 for intensity, 1 for pleasantness
        self.concs = concs # The concentrations of the molecules at each index.

    def __iter__(self):
        for train, test in self.splits:
            train = np.concatenate((2*train, 2*train+1)) # Both concentrations of each training molecule.
            if self.col > 0:
                test = 2*test + 1 # The second (higher) concentration of the pair
            else:
                test = np.concatenate((2*test, 2*test+1))
                test = test[self.concs[test] == -3] # Always the 10^-3 concentration
            yield train, test

    def __len__(self):
        return len(self.splits)
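
# A quick, self-contained sanity check of the index mapping (illustrative only: the
# molecule count and log-dilution values below are made up, not the challenge data).
# Molecule i occupies rows 2*i (weaker) and 2*i+1 (stronger), so both of its rows
# should always land on the same side of a DoubleSS split.
_demo_concs = np.array([-5, -3, -5, -3, -7, -3]) # Hypothetical per-row log-dilutions for 3 molecules.
_demo_split = ShuffleSplit(3, n_iter=1, test_size=1, random_state=0)
for _tr, _te in DoubleSS(_demo_split, col=1, concs=_demo_concs):
    print(_tr, _te) # Train holds both rows of two molecules; test holds only the stronger row of the third.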

X = X_all[:,:-1] # Remove the high/low dilution feature (i.e. remove the leak).
rs = np.zeros((21, len(n_features), n_splits)) # Array to hold the correlations.
for col in range(21): # 21 descriptors
    observed = Y[:,col] # Observed data for just this descriptor.
    n_features_ = list(np.array(n_features)+(col==0)) # Add 1 extra feature when predicting intensity,
    # because training will rank log-dilution first,
    # but log-dilution=-3 always in testing, so this feature is worthless.
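    # For example (illustration only, using the agreed-on list above): when col == 0 this
    # gives [2,3,4,5,6,11,34,101,334,1001,3334,10001]; for every other descriptor it is
    # just n_features unchanged.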
    cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Convert the splits into something that matches the challenge conditions.
    for j, (train, test) in enumerate(cv):
        print(col, j) # Progress indicator: descriptor index and split index.
        rfc = RandomForestRegressor(n_estimators=10, max_features='auto',
                                    oob_score=False, n_jobs=1, random_state=0) # I have not optimized this by max_features, maximum tree depth, etc.
        # Also, 10 estimators is well short of the point of diminishing returns.
        rfc.fit(X[train,:], observed[train]) # Fit the training data for this split.
        importance_ranks = np.argsort(rfc.feature_importances_)[::-1] # Sort the features by importance.
        # Index 0 will be the most important; index 1 the second most, etc.
        for i, max_features in enumerate(n_features_):
            rfc.fit(X[train,:][:,importance_ranks[:max_features]], observed[train]) # Refit with only max_features features
            predicted = rfc.predict(X[test,:][:,importance_ranks[:max_features]]) # Get the prediction for this fit on the test split
            rs[col,i,j] = np.corrcoef(predicted, observed[test])[1,0] # Save the correlation between prediction and observed data in the test split
    means = rs[col,:,:].mean(axis=1) # The mean across splits
    sems = rs[col,:,:].std(axis=1)/np.sqrt(n_splits) # The SEM across splits
    plt.figure()
    plt.errorbar(n_features, means, yerr=sems) # Plot the mean correlation vs the number of features
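    # Optional polish, not in the original gist: n_features spans several orders of
    # magnitude, so a log x-axis plus labels makes the curves easier to read.
    plt.xscale('log')
    plt.xlabel('Number of features')
    plt.ylabel('Correlation (predicted vs. observed)')
    plt.title('Descriptor %d' % col)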