Skip to content

Instantly share code, notes, and snippets.

@ce-carey
Last active July 13, 2021 18:59
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ce-carey/6480d6544f132829d9579b2a1f1455b4 to your computer and use it in GitHub Desktop.
Save ce-carey/6480d6544f132829d9579b2a1f1455b4 to your computer and use it in GitHub Desktop.
UKB Phenotypic Correlation Matrices
## UKB Phenotypic Correlation Matrices
##
## Generates a phenotypic correlation matrix for all GWAS'ed
## traits, residualized for all GWAS covariates. Also outputs
## a matrix of pairwise N's.
##
## Created by Caitlin E Carey (ccarey@broadinstitute.org)
##
## Last updated 9/27/2019
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import functools
import sys
# flush the buffer to std.out
print = functools.partial(print,flush=True)
# grab commandline input specifying sex to be analyzed
biosex=sys.argv[1]
# load the most current manifest file
manifest = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/UKBBmegaGWAS_manifest_downloaded20190910.tsv')
# grab all non-covar GWAS phenotypes in manifest
phenolist = [x for x in manifest.loc[manifest.Sex==biosex,"Phenotype Code"].values.tolist() if pd.notnull(x) and x not in ["age","sex"]]
phenolist = np.unique(phenolist)
# grab all participants in GWAS sample remove those who subsequently withdrew
sampleIDs = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.neale_gwas_samples.'+biosex+'.txt')][1:]
withdrawn = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.withdrawn_samples.txt')][1:]
sampleIDs = [int(x) for x in sampleIDs if x not in withdrawn]
# load all GWAS covariates
covars = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/megaGWAS_covars/ukb31063.neale_gwas_covariates.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
# load all relevant phenotype files
phesant = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.PHESANT_January_2019.'+biosex+'.tsv.bgz',dtype=object,index_col="s",compression="gzip")
finngen = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.FINNGEN_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
icd10 = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.ICD10_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
biomarkers = pd.read_csv('/psych/genetics_data/ccarey/UKBB/ukb_files/biomarkers_megaGWAS/ukb31063.biomarkers_gwas.'+biosex+'.tsv',dtype=object,index_col="s")
# grab all GWAS'ed phenotypes
phesant_phenos = np.intersect1d(phesant.columns,phenolist)
finngen_phenos = np.intersect1d(finngen.columns,phenolist)
icd10_phenos = np.intersect1d(icd10.columns,phenolist)
biomarkers_phenos = np.intersect1d(biomarkers.columns,phenolist)
filephenos = np.concatenate((phesant_phenos,finngen_phenos,icd10_phenos,biomarkers_phenos))
# extract data for all GWAS'ed phenotypes
phesant_extracted = phesant.loc[sampleIDs,phesant_phenos]
finngen_extracted = finngen.loc[sampleIDs,finngen_phenos]
icd10_extracted = icd10.loc[sampleIDs,icd10_phenos]
biomarkers_extracted = biomarkers.loc[sampleIDs,biomarkers_phenos]
manifest_phenos = pd.concat([phesant_extracted,finngen_extracted,icd10_extracted,biomarkers_extracted],axis=1)
# replace boolean strings with actual booleans
manifest_phenos.replace('TRUE',True,inplace=True)
manifest_phenos.replace('FALSE',False,inplace=True)
# function to residualize GWAS phenotypes by GWAS covars
def residualize(column):
temp = manifest_phenos[column].astype(float).to_frame().join(covars,how="left")
temp['y'] = temp[column]
model = sm.ols("y ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_isFemale + age_squared + age_squared_isFemale + isFemale", data=temp,missing="drop").fit()
return(model.resid)
# residualize all phenotypes
manifest_phenos_resid = manifest_phenos.apply(lambda x: residualize(x.name))
# add in GWAS'ed covariates
if biosex=="both_sexes":
isfemale_mod = sm.ols("isFemale.astype(int) ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_squared", data=covars,missing="drop").fit()
manifest_phenos_resid["isFemale"]=isfemale_mod.resid
age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + isFemale", data=covars,missing="drop").fit()
manifest_phenos_resid["age"]=age_mod.resid
else:
age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20", data=covars,missing="drop").fit()
manifest_phenos_resid["age"]=age_mod.resid
# generate residualized phenotypic correlation matrix
manifest_phenos_resid_corr = manifest_phenos_resid.corr()
print("correlation matrix generated")
# save resulting correlation matrix
manifest_phenos_resid_corr.to_csv('/psych/genetics_data/ccarey/UKBB/h2_corrmat/corrmat/'+biosex+'_resid_corrmat.csv')
print("saved corrmat to file")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment