@tansey
Created July 6, 2021 01:10
Post-selection inference example for AICc-based model screening
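For reference, the screening criterion used below is the small-sample corrected AIC (AICc). With maximized log-likelihood ln L, n observations, and k fitted parameters (the script passes df_model + 1 as k), the standard definition is

$$\mathrm{AICc} \;=\; -2\ln L \;+\; 2k \;+\; \frac{2k(k+1)}{n - k - 1},$$

which should match the aicc helper from statsmodels.tools.eval_measures used below; the model with the smallest AICc is selected in each trial.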
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
def generalized_liang_sim_xy(N=500, P=500, S=100):
    '''Generates data from a simple linear model'''
    X = (np.random.normal(size=(N,1)) + np.random.normal(size=(N,P))) / 2.
    w0 = np.random.normal(1, size=S//4)
    w1 = np.random.normal(2, size=S//4)
    w2 = np.random.normal(2, size=S//4)
    w3 = np.random.normal(2, size=S//4)
    y = (X[:,0:S:4].dot(w0) + X[:,1:S:4].dot(w1)
         + X[:,2:S:4].dot(w2) + X[:,3:S:4].dot(w3)
         + np.random.normal(0, 0.5, size=N))
    # Return X, y, and the binary true discovery labels
    return X, y, np.concatenate([np.ones(S), np.zeros(P-S)])
def powerset(iterable):
    '''powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)'''
    from itertools import chain, combinations
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
def enumerate_ols(X, y):
    '''Enumerates the powerset of covariate subsets and fits an OLS model to each non-empty subset.'''
    return [(np.array(variables), sm.OLS(y, X[:,variables]).fit())
            for variables in powerset(np.arange(X.shape[1]))
            if len(variables) > 0]
if __name__ == '__main__':
    n_trials = 100
    delta_threshold = 2
    N = 50
    P = 8
    S = 4
    null_p_values = []
    for trial in range(n_trials):
        print(trial)
        # Generate 50 samples with 8 covariates where the first 4 are true positives.
        # All covariates have correlation 0.5.
        X, y, _ = generalized_liang_sim_xy(N=N, P=P, S=S)
        # Try all the different models and evaluate each via AICc
        fits = enumerate_ols(X, y)
        scores = np.array([sm.tools.eval_measures.aicc(r.llf, r.nobs, r.df_model+1) for v, r in fits])
        # Select all models with an AICc delta of at most 2
        # deltas = scores - scores.min()
        # selected = deltas <= delta_threshold
        # Get the best model as selected by AICc
        best = np.argmin(scores)
        best_variables, best_fit = fits[best]
        # Add all p-values for null (index >= S) variables to the list
        print(best_fit.pvalues, best_variables, best_fit.pvalues[best_variables >= S])
        null_p_values.extend(best_fit.pvalues[best_variables >= S])
    null_p_values = np.array(null_p_values)

    # Get the sorted values from smallest to largest
    p_sorted = null_p_values[np.argsort(null_p_values)]

    # Get the theoretical and empirical CDFs
    theoretical = np.linspace(0, 1, len(p_sorted))
    empirical = (p_sorted[:,None] < theoretical[None]).mean(axis=0)

    plt.plot(theoretical, empirical, color='orange', label='Actual p-values')
    plt.plot([0,1], [0,1], color='black', label='Valid p-values')
    plt.ylim([0,1])
    plt.xlim([0,1])
    plt.xlabel('Theoretical null p-value CDF')
    plt.ylabel('Actual null p-value CDF')
    plt.legend(loc='lower right')
    plt.savefig('aicc-select.pdf', bbox_inches='tight')
    plt.close()
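As a possible complement to the visual CDF comparison, the pooled null p-values could also be checked against the Uniform(0, 1) distribution with a one-sample Kolmogorov-Smirnov test. A minimal sketch, assuming SciPy is available and null_p_values is the array built in the loop above:

from scipy import stats

# One-sample KS test of the pooled post-selection null p-values against Uniform(0, 1).
# A very small KS p-value would indicate that the naive post-selection p-values
# deviate from the uniform distribution that valid null p-values should follow.
ks_stat, ks_pvalue = stats.kstest(null_p_values, 'uniform')
print('KS statistic: %.4f, p-value: %.4g' % (ks_stat, ks_pvalue))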