# Gist by @danking, created August 23, 2018
import pickle
import time
from collections import defaultdict

import hail as hl
def flip_text(base):
    """
    :param StringExpression base: Expression of a single base
    :return: StringExpression of flipped base
    :rtype: StringExpression
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('C', 'G')
            .when('G', 'C')
            .default(base))
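
# Illustrative usage (not part of the pipeline): flip_text complements a base
# and passes anything unrecognized through unchanged, e.g.
#   hl.eval(flip_text(hl.literal('A')))  # 'T'
#   hl.eval(flip_text(hl.literal('N')))  # 'N'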
def annotate_beta(mt, ss_loc):
    # Reconcile allele order (and strand flips) between the genotype data and
    # the summary stats, annotating each row with the appropriately signed beta.
    mt = mt.annotate_rows(
        beta=hl.case()
        .when(((mt.alleles[0] == ss_loc.ref) &
               (mt.alleles[1] == ss_loc.alt)) |
              ((flip_text(mt.alleles[0]) == ss_loc.ref) &
               (flip_text(mt.alleles[1]) == ss_loc.alt)),
              -1 * ss_loc.beta)
        .when(((mt.alleles[0] == ss_loc.alt) &
               (mt.alleles[1] == ss_loc.ref)) |
              ((flip_text(mt.alleles[0]) == ss_loc.alt) &
               (flip_text(mt.alleles[1]) == ss_loc.ref)),
              ss_loc.beta)
        .or_missing())
    return mt
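
# Note on the sign convention above: beta is negated when the dataset's
# (ref, alt) pair matches the sumstats' (ref, alt) directly or after a strand
# flip, and kept as-is when the alleles are swapped. Since `dosage` counts the
# second allele, this implies the sumstats beta is reported for the ref allele;
# per the nt1/nt2 TODO further down, this is worth verifying against the
# sumstats' A1/A2 columns.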
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+')
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP))).select()
    return clump
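
# Assumed input for specific_clumps: whitespace-delimited clump output (e.g.
# from PLINK --clump) with at least CHR and BP columns; only the locus key is
# kept. Illustrative layout (column order beyond CHR and BP is an assumption):
#   CHR  F  SNP        BP      P        TOTAL  ...
#   1    1  rs1234567  752566  1.2e-09  45     ...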
########################################################################
### initialize
phenos = ['height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil', 'eosinophil', 'basophil', 'lymphocyte',
          'rbc', 'mch', 'mcv', 'mchc', 'hb', 'ht', 'plt']
phenotype = 'ALL17'
sumstats_text_file = 'gs://armartin/disparities/pheno_31063_holdout_gwas_ALL17.clumped'
generate_prs_loci_table = False
prs_loci_table_location = 'gs://armartin/disparities/keytables/ukb-'+phenotype+'-pt-sumstats-locus-allele-keyed.kt'
generate_contig_row_dict = False
contig_row_dict_location = 'gs://armartin/disparities/contig_row_dict-'+phenotype
contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}  # recode BGEN contigs '01'-'09' to '1'-'9'
bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'
start = time.time()
# large block size because we read very little data (due to filtering & ignoring genotypes)
hl.init(branching_factor=10, min_block_size=2000)
################################################################################
### set up the sumstats table (chr, bp for union SNPs)
if generate_prs_loci_table:
    t = hl.import_table(sumstats_text_file,
                        delimiter=r'\s+',
                        impute=True)
    t = t.select(locus=hl.locus(hl.str(t.CHR), t.BP))
    t = t.key_by('locus')
    t.write(prs_loci_table_location, overwrite=True)
ss = hl.read_table(prs_loci_table_location)
################################################################################
### determine the indices of the prs variants in bgen
if generate_contig_row_dict:
    mt = hl.methods.import_bgen(bgen_files,
                                [],
                                contig_recoding=contigs,
                                _row_fields=['file_row_idx'])
    prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    print('about to collect')
    # remove all unnecessary data, dropping keys and other irrelevant fields
    prs_rows = prs_rows.key_by()
    prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
                               file_row_idx=prs_rows.file_row_idx)
    contig_row_list = prs_rows.collect()
    print('finished collecting')
    contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    print('reformed')
    contig_row_dict = defaultdict(list)
    for k, v in contig_reformed:
        contig_row_dict[k].append(v)
    print('dictionary created')
    with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
        pickle.dump(contig_row_dict, f)
else:
    with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
        contig_row_dict = pickle.load(f)
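
# At this point contig_row_dict maps contig name -> list of file_row_idx values
# for the PRS variants in that contig's BGEN file, letting the final import
# seek directly to those rows instead of scanning every variant.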
################################################################################
### Get true phenotypes from UKBB
phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
                             key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
gwas_phenos = {'50': 'height', '21001': 'bmi', '4080': 'sbp', '4079': 'dbp', '30000': 'wbc', '30130': 'monocyte',
               '30140': 'neutrophil', '30150': 'eosinophil', '30160': 'basophil', '30120': 'lymphocyte', '30010': 'rbc',
               '30050': 'mch', '30040': 'mcv', '30060': 'mchc', '30020': 'hb', '30030': 'ht', '30080': 'plt'}
phenotypes = phenotypes.select(*gwas_phenos.keys())
phenotypes = phenotypes.rename(gwas_phenos)
covariates = hl.import_table('gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
                             key='s', impute=True, types={'s': hl.tstr})
################################################################################
### Run the PRS using phenotype-specific clump variants
contig_row_dict2 = {
    'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v
    for k, v in contig_row_dict.items()}
mt_all = hl.methods.import_bgen(bgen_files,
                                ['dosage'],
                                sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
                                contig_recoding=contigs,
                                _variants_per_file=contig_row_dict2,
                                _row_fields=[])
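
# `_variants_per_file` and `_row_fields` are underscore-prefixed (private,
# subject to change) import_bgen arguments; here the precomputed per-file row
# indices restrict the import to just the PRS loci.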
for pheno in phenos:
    print(pheno)
    ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
                         delimiter=r'\s+',
                         impute=True,
                         types={'beta': hl.tfloat, 'pval': hl.tfloat})
    ss = ss.key_by(locus=hl.locus(hl.str(ss.chr), hl.int(ss.pos)))
    # sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter=r'\s+').key_by('s')
    gwas_holdout = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.info.txt.gz',
                                   delimiter=r'\s+').key_by('s')
    mt = mt_all.annotate_cols(true_pheno=phenotypes[mt_all.s][pheno])  # ok that phenos keyed on userId not s?
    mt = mt.annotate_cols(**covariates[mt.s])
    """
    To add:
    - Filter only to samples in holdout GWAS
    - Filter to rows in phenotype-specific clump file
    - Build PRS for 10 p-value thresholds
    - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
    """
    # filter to only the samples held out from the GWAS
    mt = mt.filter_cols(gwas_holdout[mt.s].gwas_holdout == 'holdout')
    mt = mt.annotate_rows(ss=ss[mt.locus])
    mt = annotate_beta(mt, mt.ss)
    p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2,
             's6': 0.05, 's7': 0.1, 's8': 0.2, 's9': 0.5, 's10': 1}
    pheno_clump = specific_clumps('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.clumped')
    mt = mt.annotate_rows(clump_snps=hl.int(hl.is_defined(pheno_clump[mt.locus])))
    # keep only contributions from clumped SNPs whose p-values pass each threshold
    annot_expr = {
        k: hl.agg.sum(mt.clump_snps * mt.beta * mt.dosage * hl.int(mt.ss.pval < v))
        for k, v in p_max.items()}
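    # In effect, for each sample and threshold v this computes
    #   PRS_v = sum over variants of clump_snps * beta * dosage * [pval < v]
    # where clump_snps and the p-value indicator act as 0/1 masks, so only
    # clumped SNPs passing the threshold contribute to the score.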
    mt = mt.annotate_cols(**annot_expr)
    mt.cols().write('gs://armartin/disparities/UKB_' + pheno + '_PRS.ht')
    ht = hl.read_table('gs://armartin/disparities/UKB_' + pheno + '_PRS.ht')
    output_location = 'gs://armartin/disparities/UKB_' + pheno + '_PRS.txt.bgz'
    ht.export(output_location)

end = time.time()
print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))