Created
August 23, 2018 16:54
-
-
Save danking/9c9d77afb3ff318adc7bf79bb52d4f7d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hail as hl | |
import pickle | |
import time | |
import argparse | |
def flip_text(base):
    """Complement a single-base allele expression (strand flip).

    :param StringExpression base: expression holding one base character
    :return: the complementary base; any other value falls through unchanged
    :rtype: StringExpression
    """
    # Build the same switch chain as before, one .when() per base pair.
    complement_pairs = [('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C')]
    flipped = hl.switch(base)
    for original, complement in complement_pairs:
        flipped = flipped.when(original, complement)
    return flipped.default(base)
def annotate_beta(mt, ss_loc):
    """Add a strand-aware ``beta`` row field taken from the sumstats row.

    Matching mirrors the original logic exactly: when the dataset's alleles
    equal (ref, alt) directly or after strand-flipping both, beta is negated;
    when they equal the swapped (alt, ref) orientation (directly or flipped),
    beta is kept as-is; otherwise the field is missing.

    :param MatrixTable mt: dataset with an ``alleles`` row field
    :param ss_loc: sumstats row expression with ``ref``, ``alt``, ``beta``
    :return: ``mt`` with a new ``beta`` row field
    :rtype: MatrixTable
    """
    a0 = mt.alleles[0]
    a1 = mt.alleles[1]
    # Alleles agree with sumstats ref/alt, either directly or on the
    # opposite strand.
    same_orientation = (
        ((a0 == ss_loc.ref) & (a1 == ss_loc.alt)) |
        ((flip_text(a0) == ss_loc.ref) & (flip_text(a1) == ss_loc.alt)))
    # Alleles agree with sumstats in swapped (alt, ref) order.
    swapped_orientation = (
        ((a0 == ss_loc.alt) & (a1 == ss_loc.ref)) |
        ((flip_text(a0) == ss_loc.alt) & (flip_text(a1) == ss_loc.ref)))
    beta_expr = (hl.case()
                 .when(same_orientation, -1 * ss_loc.beta)
                 .when(swapped_orientation, ss_loc.beta)
                 .or_missing())
    return mt.annotate_rows(beta=beta_expr)
def specific_clumps(filename):
    """Read a PLINK-style ``.clumped`` file and key it by genomic locus.

    :param str filename: path to a whitespace-delimited clump file with
        ``CHR`` and ``BP`` columns
    :return: Table keyed by ``locus`` with all other fields dropped, suitable
        for ``hl.is_defined`` membership tests against a dataset's loci
    :rtype: Table
    """
    # Raw string for the delimiter regex: '\s' in a plain literal is an
    # invalid escape sequence (warns in modern CPython).
    clump = hl.import_table(filename, delimiter=r'\s+')
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP))).select()
    return clump
########################################################################
### initialize
# Phenotype short names; each one has matching per-phenotype sumstats /
# info / clump files on GCS, consumed by the PRS loop at the bottom.
phenos = ['height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil', 'eosinophil', 'basophil', 'lymphocyte',
          'rbc', 'mch', 'mcv', 'mchc', 'hb', 'ht', 'plt']
# Label for the combined run; used only to build the GCS paths below.
phenotype = 'ALL17'
sumstats_text_file = 'gs://armartin/disparities/pheno_31063_holdout_gwas_ALL17.clumped'
# Toggles: when False, the corresponding artifact is read back from its
# saved GCS location instead of being regenerated.
generate_prs_loci_table = False
prs_loci_table_location = 'gs://armartin/disparities/keytables/ukb-'+phenotype+'-pt-sumstats-locus-allele-keyed.kt'
generate_contig_row_dict = False
contig_row_dict_location = 'gs://armartin/disparities/contig_row_dict-'+phenotype
# Recode zero-padded contig names ('01'..'09') to unpadded ('1'..'9').
contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}
bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'
start = time.time()
# large block size because we read very little data (due to filtering & ignoring genotypes)
hl.init(branching_factor=10, min_block_size=2000)
################################################################################
### set up the sumstats table (chr, bp for union SNPs)
if (generate_prs_loci_table):
    # Build the locus-keyed table of clumped sumstats positions once and
    # persist it to GCS; later runs skip this branch and read it back below.
    t = hl.import_table(sumstats_text_file,
                        delimiter='\s+',
                        impute=True)
    t = t.select(locus = hl.locus(hl.str(t.CHR), t.BP))
    t = t.key_by('locus')
    t.write(prs_loci_table_location, overwrite=True)
# Read in either branch: the table must already exist when the flag is False.
ss = hl.read_table(prs_loci_table_location)
################################################################################
### determine the indices of the prs variants in bgen
if (generate_contig_row_dict):
    # Import BGEN rows only (no entry fields) so we can map each PRS locus
    # to its per-file row index (_row_fields is a private Hail option).
    mt = hl.methods.import_bgen(bgen_files,
                                [],
                                contig_recoding=contigs,
                                _row_fields=['file_row_idx'])
    # Keep only variants whose locus appears in the sumstats loci table.
    prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    print('about to collect')
    # remove all unnecessary data, dropping keys and other irrelevant fields
    prs_rows = prs_rows.key_by()
    prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
                               file_row_idx=prs_rows.file_row_idx)
    contig_row_list = prs_rows.collect()
    print('finished collecting')
    contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    print('reformed')
    # Group row indices by contig: {contig: [file_row_idx, ...]}.
    from collections import defaultdict
    contig_row_dict = defaultdict(list)
    for k, v in contig_reformed:
        contig_row_dict[k].append(v)
    print('dictionary created')
    # Persist the dict so later runs can skip the expensive collect above.
    with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
        pickle.dump(contig_row_dict, f)
else:
    # NOTE(review): pickle.load on remote data — acceptable only because the
    # file is produced by the branch above; never point this at untrusted input.
    with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
        contig_row_dict = pickle.load(f)
################################################################################
### Get true phenotypes from UKBB
phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
                             key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
# Map UKB field codes (column names in the PHESANT file) to the short
# phenotype names used in `phenos`.
gwas_phenos = {'50': 'height', '21001': 'bmi', '4080': 'sbp', '4079': 'dbp', '30000': 'wbc', '30130': 'monocyte',
               '30140': 'neutrophil', '30150': 'eosinophil', '30160': 'basophil', '30120': 'lymphocyte', '30010': 'rbc',
               '30050': 'mch', '30040': 'mcv', '30060': 'mchc', '30020': 'hb', '30030': 'ht', '30080': 'plt'}
# Keep only the 17 GWAS phenotype columns, then rename them.
phenotypes = phenotypes.select(*gwas_phenos.keys())
phenotypes = phenotypes.rename(gwas_phenos)
covariates = hl.import_table('gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
                             key='s', impute=True, types={'s': hl.tstr})
################################################################################
### Run the PRS using phenotype-specific clump variants
# Per-BGEN-file variant row indices so import_bgen only touches the PRS loci
# (_variants_per_file is a private Hail option; keys must match file paths).
contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
mt_all = hl.methods.import_bgen(bgen_files,
                                ['dosage'],
                                sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
                                contig_recoding=contigs,
                                _variants_per_file=contig_row_dict2,
                                _row_fields=[])
for pheno in phenos:
    print(pheno)
    # Per-phenotype holdout-GWAS summary stats, keyed by locus.
    ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
                         delimiter='\s+',
                         impute=True,
                         types={'beta': hl.tfloat, 'pval': hl.tfloat})
    ss = ss.key_by(locus = hl.locus(hl.str(ss.chr), hl.int(ss.pos)))
    #sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')
    # Table flagging which samples were held out of the GWAS (keyed by 's').
    gwas_holdout = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.info.txt.gz', delimiter='\s+').key_by('s')
    mt = mt_all.annotate_cols(true_pheno=phenotypes[mt_all.s][pheno]) # ok that phenos keyed on userId not s?
    mt = mt.annotate_cols(**covariates[mt.s])
    """
    To add:
    - Filter only to samples in holdout GWAS
    - Filter to rows in phenotype-specific clump file
    - Build PRS for 10 p-value thresholds
    - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
    """
    # filter to only samples held out from GWAS
    mt = mt.filter_cols(gwas_holdout[mt.s].gwas_holdout == 'holdout')
    mt = mt.annotate_rows(ss=ss[mt.locus])
    # Orient the sumstats beta to this dataset's alleles (see annotate_beta).
    mt = annotate_beta(mt, mt.ss)
    # P-value thresholds for the 10 PRS scores (s1 strictest .. s10 all SNPs).
    p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2, 's6': .05, 's7': .1, 's8': .2, 's9': .5, 's10': 1}
    pheno_clump = specific_clumps('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.clumped')
    # 1 when the variant is in this phenotype's clump file, else 0; used as a
    # multiplicative mask in the aggregation below.
    mt = mt.annotate_rows(clump_snps=hl.int(hl.is_defined(pheno_clump[mt.locus])))
    # filter rows to things that have p-values here
    # Per-sample PRS for each threshold: sum over clump SNPs passing the
    # p-value cutoff of (oriented beta) * dosage.
    annot_expr = {
        k: hl.agg.sum(mt.clump_snps * mt.beta * mt.dosage * hl.int(mt.ss.pval < v))
        for k, v in p_max.items()}
    mt = mt.annotate_cols(**annot_expr)
    # NOTE(review): write() without overwrite=True will fail if the output
    # table already exists — confirm intended for reruns.
    mt.cols().write('gs://armartin/disparities/UKB_' + pheno + '_PRS.ht')
    ht = hl.read_table('gs://armartin/disparities/UKB_' + pheno + '_PRS.ht')
    output_location = 'gs://armartin/disparities/UKB_' + pheno + '_PRS.txt.bgz'
    ht.export(output_location)
end = time.time()
print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment