Skip to content

Instantly share code, notes, and snippets.

@kingjr
Created November 20, 2014 11:20
Show Gist options
  • Save kingjr/ef64b63dbbde06455a2d to your computer and use it in GitHub Desktop.
Test to see how subsampling can increase decoding scores
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.lda import LDA
def add_information(X, SNR, prop):
    """Add a class-specific offset to a proportion of the features.

    Each informative feature receives a constant offset of magnitude
    ``SNR`` whose sign is drawn at random, because the effect can go
    either way on each specific feature:
    e.g. cortical_thickness(ASD) > cortical_thickness(healthy)
    but regionAsize(ASD) < regionAsize(healthy).

    Input
    -----
    X : np.array(n_subjects, n_features)
        The basic data. NOTE: modified in place and also returned.
    SNR : float
        Signal to Noise Ratio (magnitude of the offset added to each
        informative feature).
    prop : float
        Proportion of features containing information (0 <= prop <= 1).

    Returns
    -------
    X : np.array(n_subjects, n_features)
        The data, with the first ``int(n_features * prop)`` features
        shifted by +/- SNR.
    """
    n_informative = int(X.shape[1] * prop)
    # One random sign per informative feature: -1 or 1.
    directions = np.random.randint(0, 2, n_informative) * 2. - 1.
    # Vectorized equivalent of the per-feature loop: shift each
    # informative column by SNR in its drawn direction.
    X[:, :n_informative] += SNR * directions
    return X
# ---------- Decoding parameters
lda = LDA()  # linear discriminant classifier
scaler = StandardScaler()  # feature-wise normalization
clf = Pipeline([('scaler', scaler), ('lda', lda)])
# Arbitrary number of subsets used for the subsampling analyses
# (not to be confounded with the number of true subgroups).
n_subsampling = 40

# ---------- Experiment parameters
n_subjects = 900
# Number of features (cortex thickness, area size etc.)
n_feature = 180 + 32 + 148 + 148 + 148

# Basic anatomical data: assume it is approximately Gaussian
# across subjects.
X = np.random.randn(n_subjects, n_feature)
# Binary class of each subject (comments elsewhere suggest 1 = ASD).
y = np.random.randint(0, 2, n_subjects)

SNR = .4    # signal to noise ratio
prop = .10  # proportion of features with information
# ================ SCENARIO 1:
# homogeneous sampling: add similar information to all ASD
X[y == 1, :] = add_information(X[y == 1, :], SNR, prop)

# --- mean accuracy across folds on the full cohort
full_fold_scores = cross_val_score(clf, X, y=y, cv=10, scoring='accuracy')
scores = np.mean(full_fold_scores)

# --- score on smaller sample sizes: the KFold test splits provide
#     n_subsampling disjoint subject subsets
subscores = np.zeros((n_subsampling, 1))
sets = KFold(n=n_subjects, n_folds=n_subsampling)
for ii, (_, subset) in enumerate(sets):
    subset_scores = cross_val_score(clf, X[subset], y=y[subset],
                                    cv=5, scoring='accuracy')
    subscores[ii] = np.mean(subset_scores)
print("Accuracy with all subjects: {:.2%}".format(scores))
print("Accuracy with subsets: {:.2%}".format(np.mean(subscores)))
# ================= SCENARIO 2:
# heterogeneous sampling: add different information to ASD subgroups
n_sub = 5  # number of subgroups
# BUG FIX: use a copy -- ``y_subgroup = y`` aliased the two arrays, so
# writing subgroup labels into y_subgroup silently turned the binary
# decoding problem below (healthy vs ASD) into a multiclass one.
y_subgroup = y.copy()
# Assign each ASD subject to one of n_sub subgroups, labeled 1..n_sub.
# BUG FIX: randint(1, n_sub) excluded n_sub and thus produced only
# n_sub - 1 distinct subgroups.
y_subgroup[y > 0] = np.random.randint(1, n_sub + 1, np.sum(y > 0))
# Add a different random pattern of information to each ASD subgroup.
# BUG FIX: starting the loop at 1 leaves the healthy group (label 0)
# untouched, as the scenario description intends.
for sub in range(1, n_sub + 1):
    X[y_subgroup == sub, :] = add_information(X[y_subgroup == sub, :],
                                              SNR, prop)

# --- overall mean accuracy across folds
scores = np.mean(cross_val_score(clf, X, y=y, cv=10, scoring='accuracy'))
# --- score on smaller sample sizes
subscores = np.zeros((n_subsampling, 1))
sets = KFold(n=n_subjects, n_folds=n_subsampling)
for ii, (_, subset) in enumerate(sets):
    subscores[ii] = np.mean(cross_val_score(clf, X[subset], y=y[subset],
                                            cv=5, scoring='accuracy'))
print("Accuracy with all subjects: {:.2%}".format(scores))
print("Accuracy with subsets: {:.2%}".format(np.mean(subscores)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment