Created
November 20, 2014 11:20
-
-
Save kingjr/ef64b63dbbde06455a2d to your computer and use it in GitHub Desktop.
Test to see how subsampling can increase decoding scores
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.cross_validation import cross_val_score, KFold | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.lda import LDA | |
def add_information(X, SNR, prop):
    '''Add class-specific "information" to the first features of X.

    A constant offset of magnitude ``SNR`` is added to the first
    ``int(n_features * prop)`` columns of ``X``. Each affected feature is
    shifted in a random direction (+SNR or -SNR), because a group effect can
    go either way on a given feature: e.g. cortical_thickness(ASD) >
    cortical_thickness(healthy), but regionAsize(ASD) < regionAsize(healthy).

    Note: ``X`` is modified in place and also returned, so callers may either
    rely on the mutation or on the return value.

    Input
    -----
    X : np.array(n_subjects, n_features)
        The basic data.
    SNR : float
        Signal to Noise Ratio (magnitude of the added offset).
    prop : float
        Proportion of features containing information (0 <= prop <= 1).

    Returns
    -------
    X : np.array(n_subjects, n_features)
        The data, with information added to the first features.
    '''
    n_informative = int(X.shape[1] * prop)
    # One random sign per informative feature: each entry is -1. or +1.
    directions = np.random.randint(0, 2, n_informative) * 2. - 1.
    # Vectorized equivalent of the per-feature loop: broadcast the signed
    # offsets across all subjects at once.
    X[:, :n_informative] += SNR * directions
    return X
# ---------- Decoding parameters
lda = LDA()  # linear discriminant analysis classifier
scaler = StandardScaler()  # z-score each feature before classification
# Chain normalization and classification into a single estimator.
clf = Pipeline([('scaler', scaler), ('lda', lda)])
# Arbitrary number of subsets used for the subsampling analyses
# (not to be confounded with the number of true subgroups).
n_subsampling = 40
# ---------- Experiment parameters
n_subjects = 900  # number of subjects
# Number of features (cortical thickness, area size, etc.).
n_feature = 180 + 32 + 148 + 148 + 148

# Basic anatomical data: assume the data across subjects is roughly Gaussian.
X = np.random.randn(n_subjects, n_feature)
# Class of each subject (0 or 1, e.g. healthy vs ASD).
y = np.random.randint(0, 2, n_subjects)

SNR = .4    # signal-to-noise ratio of the injected effect
prop = .10  # proportion of features carrying information
# ================ SCENARIO 1:
# Homogeneous sampling: add similar information to all ASD subjects.
X[y == 1, :] = add_information(X[y == 1, :], SNR, prop)

# --- mean accuracy across folds on the full cohort
scores = np.mean(cross_val_score(clf, X, y=y, cv=10, scoring='accuracy'))

# --- mean accuracy when decoding each smaller subsample separately
subscores = np.array([
    np.mean(cross_val_score(clf, X[subset], y=y[subset],
                            cv=5, scoring='accuracy'))
    for _, subset in KFold(n=n_subjects, n_folds=n_subsampling)
])
print("Accuracy with all subjects: {:.2%}".format(scores))
print("Accuracy with subsets: {:.2%}".format(np.mean(subscores)))
# ================= SCENARIO 2:
# Heterogeneous sampling: add different information to each ASD subgroup.
n_sub = 5  # subgroup labels: 0 = control, 1..n_sub-1 = ASD subgroups

# BUG FIX: the original used `y_subgroup = y`, which aliases the label
# vector: writing the subgroup labels below silently turned the binary
# target `y` into a multiclass one, corrupting the decoding analyses.
y_subgroup = y.copy()
# randint(1, n_sub) draws labels 1..n_sub-1, so there are effectively
# n_sub - 1 distinct ASD subgroups.
y_subgroup[y > 0] = np.random.randint(1, n_sub, sum(y > 0))

# BUG FIX: start at 1 so that the control group (y_subgroup == 0) is left
# untouched; only the ASD subgroups receive (distinct, random) information.
for sub in range(1, n_sub):
    X[y_subgroup == sub, :] = add_information(X[y_subgroup == sub, :],
                                              SNR, prop)

# --- overall mean accuracy across folds
scores = np.mean(cross_val_score(clf, X, y=y, cv=10, scoring='accuracy'))

# --- score on smaller sample sizes:
subscores = np.zeros((n_subsampling, 1))
sets = KFold(n=n_subjects, n_folds=n_subsampling)
for ii, (_, subset) in enumerate(sets):
    subscores[ii] = np.mean(cross_val_score(clf, X[subset], y=y[subset],
                                            cv=5, scoring='accuracy'))
print("Accuracy with all subjects: {:.2%}".format(scores))
print("Accuracy with subsets: {:.2%}".format(np.mean(subscores)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment