@rgerkin
Last active May 12, 2016 05:03
from sklearn.cross_validation import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

n_features = [1,2,3,4,5,10,33,100,333,1000,3333,10000] # The numbers we agreed on.
n_splits = 10 # The number of splits we agreed on.
n_obs = int(X_all.shape[0]/2) # X_all is my matrix of all the training and leaderboard molecule features, including the leak.
# It has two (consecutive) rows for each molecule; the first is the weaker concentration and the second
# is the stronger one.
shuffle_split = ShuffleSplit(n_obs, n_splits, test_size=0.17, random_state=0) # This reproduces the splits I put on GitHub.
class DoubleSS:
    """This is a new train/test iterator which accomplishes three things:
    1) It puts both concentrations of a given molecule into the same side of the split (either train or test but not both).
    2) It trains on both concentrations, but puts only the stronger concentration into the test set for descriptors 1-20.
    3) For descriptor 0 (intensity), it puts only the 10^-3 concentration into the test set.
    """
    def __init__(self, ss, col, concs):
        self.splits = ss # The original split from ShuffleSplit
        self.col = col # The descriptor index, e.g. 0 for intensity, 1 for pleasantness
        self.concs = concs # The concentrations of the molecules at each index.

    def __iter__(self):
        for train, test in self.splits:
            train = np.concatenate((2*train, 2*train+1)) # Both concentrations of each training molecule.
            if self.col > 0:
                test = 2*test + 1 # The second (higher) concentration of the pair
            else:
                test = np.concatenate((2*test, 2*test+1))
                test = test[self.concs[test] == -3] # Always the 10^-3 concentration
            yield train, test

    def __len__(self):
        return len(self.splits)
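
# A quick, self-contained sanity check of the index mapping (illustrative only: the
# molecule count and log-dilution values below are made up, not the challenge data).
# Molecule i occupies rows 2*i (weaker) and 2*i+1 (stronger), so both of its rows
# should always land on the same side of a DoubleSS split.
_demo_concs = np.array([-5, -3, -5, -3, -7, -3]) # Hypothetical per-row log-dilutions for 3 molecules.
_demo_split = ShuffleSplit(3, n_iter=1, test_size=1, random_state=0)
for _tr, _te in DoubleSS(_demo_split, col=1, concs=_demo_concs):
    print(_tr, _te) # Train holds both rows of two molecules; test holds only the stronger row of the third.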

X = X_all[:,:-1] # Remove the high/low dilution feature (i.e. remove the leak).
rs = np.zeros((21, len(n_features), n_splits)) # Array to hold the correlations.
for col in range(21): # 21 descriptors
    observed = Y[:,col] # Observed data for just this descriptor.
    n_features_ = list(np.array(n_features)+(col==0)) # Add 1 extra feature when predicting intensity,
    # because training will rank log-dilution first,
    # but log-dilution=-3 always in testing, so this feature is worthless.
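    # For example (illustration only, using the agreed-on list above): when col == 0 this
    # gives [2,3,4,5,6,11,34,101,334,1001,3334,10001]; for every other descriptor it is
    # just n_features unchanged.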
    cv = DoubleSS(shuffle_split, col, X_all[:,-2]) # Convert the splits into something that matches the challenge conditions.
    for j, (train, test) in enumerate(cv):
        print(col, j) # Progress indicator: descriptor index and split index.
        rfc = RandomForestRegressor(n_estimators=10, max_features='auto',
                                    oob_score=False, n_jobs=1, random_state=0) # I have not optimized this by max_features, maximum tree depth, etc.
        # Also, 10 estimators is well short of the point of diminishing returns.
        rfc.fit(X[train,:], observed[train]) # Fit the training data for this split.
        importance_ranks = np.argsort(rfc.feature_importances_)[::-1] # Sort the features by importance.
        # Index 0 will be the most important; index 1 the second most, etc.
        for i, max_features in enumerate(n_features_):
            rfc.fit(X[train,:][:,importance_ranks[:max_features]], observed[train]) # Refit with only max_features features
            predicted = rfc.predict(X[test,:][:,importance_ranks[:max_features]]) # Get the prediction for this fit on the test split
            rs[col,i,j] = np.corrcoef(predicted, observed[test])[1,0] # Save the correlation between prediction and observed data in the test split
    means = rs[col,:,:].mean(axis=1) # The mean across splits
    sems = rs[col,:,:].std(axis=1)/np.sqrt(n_splits) # The SEM across splits
    plt.figure()
    plt.errorbar(n_features, means, yerr=sems) # Plot the mean correlation vs the number of features
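    # Optional polish, not in the original gist: n_features spans several orders of
    # magnitude, so a log x-axis plus labels makes the curves easier to read.
    plt.xscale('log')
    plt.xlabel('Number of features')
    plt.ylabel('Correlation (predicted vs. observed)')
    plt.title('Descriptor %d' % col)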