# Gist by @danking, created August 23, 2018
import pickle
import time
from collections import defaultdict

import hail as hl
def flip_text(base):
    """
    :param StringExpression base: Expression of a single base
    :return: StringExpression of flipped base
    :rtype: StringExpression
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('C', 'G')
            .when('G', 'C')
            .default(base))
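
# Illustrative usage (not part of the pipeline): flip_text complements a base
# and passes anything unrecognized through unchanged, e.g.
#   hl.eval(flip_text(hl.literal('A')))  # 'T'
#   hl.eval(flip_text(hl.literal('N')))  # 'N'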
def annotate_beta(mt, ss_loc):
    # Reconcile allele order (and strand flips) between the genotype data and
    # the summary stats, annotating each row with the appropriately signed beta.
    mt = mt.annotate_rows(
        beta=hl.case()
        .when(((mt.alleles[0] == ss_loc.ref) &
               (mt.alleles[1] == ss_loc.alt)) |
              ((flip_text(mt.alleles[0]) == ss_loc.ref) &
               (flip_text(mt.alleles[1]) == ss_loc.alt)),
              -1 * ss_loc.beta)
        .when(((mt.alleles[0] == ss_loc.alt) &
               (mt.alleles[1] == ss_loc.ref)) |
              ((flip_text(mt.alleles[0]) == ss_loc.alt) &
               (flip_text(mt.alleles[1]) == ss_loc.ref)),
              ss_loc.beta)
        .or_missing())
    return mt
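
# Note on the sign convention above: beta is negated when the dataset's
# (ref, alt) pair matches the sumstats' (ref, alt) directly or after a strand
# flip, and kept as-is when the alleles are swapped. Since `dosage` counts the
# second allele, this implies the sumstats beta is reported for the ref allele;
# per the nt1/nt2 TODO further down, this is worth verifying against the
# sumstats' A1/A2 columns.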
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+')
    clump = clump.key_by(locus=hl.locus(hl.str(clump.CHR), hl.int(clump.BP))).select()
    return clump
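
# Assumed input for specific_clumps: whitespace-delimited clump output (e.g.
# from PLINK --clump) with at least CHR and BP columns; only the locus key is
# kept. Illustrative layout (column order beyond CHR and BP is an assumption):
#   CHR  F  SNP        BP      P        TOTAL  ...
#   1    1  rs1234567  752566  1.2e-09  45     ...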
########################################################################
### initialize
phenos = ['height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil', 'eosinophil', 'basophil', 'lymphocyte',
          'rbc', 'mch', 'mcv', 'mchc', 'hb', 'ht', 'plt']
phenotype = 'ALL17'
sumstats_text_file = 'gs://armartin/disparities/pheno_31063_holdout_gwas_ALL17.clumped'
generate_prs_loci_table = False
prs_loci_table_location = 'gs://armartin/disparities/keytables/ukb-'+phenotype+'-pt-sumstats-locus-allele-keyed.kt'
generate_contig_row_dict = False
contig_row_dict_location = 'gs://armartin/disparities/contig_row_dict-'+phenotype
contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}  # recode BGEN contigs '01'-'09' to '1'-'9'
bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'
start = time.time()
# large block size because we read very little data (due to filtering & ignoring genotypes)
hl.init(branching_factor=10, min_block_size=2000)
################################################################################
### set up the sumstats table (chr, bp for union SNPs)
if generate_prs_loci_table:
    t = hl.import_table(sumstats_text_file,
                        delimiter=r'\s+',
                        impute=True)
    t = t.select(locus=hl.locus(hl.str(t.CHR), t.BP))
    t = t.key_by('locus')
    t.write(prs_loci_table_location, overwrite=True)
ss = hl.read_table(prs_loci_table_location)
################################################################################
### determine the indices of the prs variants in bgen
if generate_contig_row_dict:
    mt = hl.methods.import_bgen(bgen_files,
                                [],
                                contig_recoding=contigs,
                                _row_fields=['file_row_idx'])
    prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    print('about to collect')
    # remove all unnecessary data, dropping keys and other irrelevant fields
    prs_rows = prs_rows.key_by()
    prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
                               file_row_idx=prs_rows.file_row_idx)
    contig_row_list = prs_rows.collect()
    print('finished collecting')
    contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    print('reformed')
    contig_row_dict = defaultdict(list)
    for k, v in contig_reformed:
        contig_row_dict[k].append(v)
    print('dictionary created')
    with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
        pickle.dump(contig_row_dict, f)
else:
    with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
        contig_row_dict = pickle.load(f)
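
# At this point contig_row_dict maps contig name -> list of file_row_idx values
# for the PRS variants in that contig's BGEN file, letting the final import
# seek directly to those rows instead of scanning every variant.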
################################################################################
### Get true phenotypes from UKBB
phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
                             key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
gwas_phenos = {'50': 'height', '21001': 'bmi', '4080': 'sbp', '4079': 'dbp', '30000': 'wbc', '30130': 'monocyte',
               '30140': 'neutrophil', '30150': 'eosinophil', '30160': 'basophil', '30120': 'lymphocyte', '30010': 'rbc',
               '30050': 'mch', '30040': 'mcv', '30060': 'mchc', '30020': 'hb', '30030': 'ht', '30080': 'plt'}
phenotypes = phenotypes.select(*gwas_phenos.keys())
phenotypes = phenotypes.rename(gwas_phenos)
covariates = hl.import_table('gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
                             key='s', impute=True, types={'s': hl.tstr})
################################################################################
### Run the PRS using phenotype-specific clump variants
contig_row_dict2 = {
    'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v
    for k, v in contig_row_dict.items()}
mt_all = hl.methods.import_bgen(bgen_files,
                                ['dosage'],
                                sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
                                contig_recoding=contigs,
                                _variants_per_file=contig_row_dict2,
                                _row_fields=[])
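
# `_variants_per_file` and `_row_fields` are underscore-prefixed (private,
# subject to change) import_bgen arguments; here the precomputed per-file row
# indices restrict the import to just the PRS loci.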
for pheno in phenos:
    print(pheno)
    ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
                         delimiter=r'\s+',
                         impute=True,
                         types={'beta': hl.tfloat, 'pval': hl.tfloat})
    ss = ss.key_by(locus=hl.locus(hl.str(ss.chr), hl.int(ss.pos)))
    # sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter=r'\s+').key_by('s')
    gwas_holdout = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.info.txt.gz',
                                   delimiter=r'\s+').key_by('s')
    mt = mt_all.annotate_cols(true_pheno=phenotypes[mt_all.s][pheno])  # ok that phenos keyed on userId not s?
    mt = mt.annotate_cols(**covariates[mt.s])
    """
    To add:
    - Filter only to samples in holdout GWAS
    - Filter to rows in phenotype-specific clump file
    - Build PRS for 10 p-value thresholds
    - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
    """
    # filter to only the samples held out from the GWAS
    mt = mt.filter_cols(gwas_holdout[mt.s].gwas_holdout == 'holdout')
    mt = mt.annotate_rows(ss=ss[mt.locus])
    mt = annotate_beta(mt, mt.ss)
    p_max = {'s1': 5e-8, 's2': 1e-6, 's3': 1e-4, 's4': 1e-3, 's5': 1e-2,
             's6': 0.05, 's7': 0.1, 's8': 0.2, 's9': 0.5, 's10': 1}
    pheno_clump = specific_clumps('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.clumped')
    mt = mt.annotate_rows(clump_snps=hl.int(hl.is_defined(pheno_clump[mt.locus])))
    # keep only contributions from clumped SNPs whose p-values pass each threshold
    annot_expr = {
        k: hl.agg.sum(mt.clump_snps * mt.beta * mt.dosage * hl.int(mt.ss.pval < v))
        for k, v in p_max.items()}
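    # In effect, for each sample and threshold v this computes
    #   PRS_v = sum over variants of clump_snps * beta * dosage * [pval < v]
    # where clump_snps and the p-value indicator act as 0/1 masks, so only
    # clumped SNPs passing the threshold contribute to the score.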
    mt = mt.annotate_cols(**annot_expr)
    mt.cols().write('gs://armartin/disparities/UKB_' + pheno + '_PRS.ht')
    ht = hl.read_table('gs://armartin/disparities/UKB_' + pheno + '_PRS.ht')
    output_location = 'gs://armartin/disparities/UKB_' + pheno + '_PRS.txt.bgz'
    ht.export(output_location)

end = time.time()
print("Success! Job was completed in %s" % time.strftime("%H:%M:%S", time.gmtime(end - start)))