ce-carey/README.md

## README.md

      
    Raw
  

              README.md
            
          
    PRS Scoring of the UK Biobank Using Hail

The run_prs.py script slack_utils.py was created to generate polygenic risk scores (PRS) in the UK Biobank (UKB) using Hail 0.2. Currently, I have hard-coded the file locations in-line with my own file structure and that of the Neale Lab UKB Round 2 GWAS.
NOTE: This is under active development and generally intended for my own use and the use of my collaborators. ADAPT AT YOUR OWN PERIL!!
How to Run


This script assumes that you have generated an input file using my process-sumstats and LDPred repos.


Install cloudtools and start up a cluster using the following code, where N is the number of preemptible workers you want to start up:


cluster start CLUSTERNAME --version devel --spark 2.2.0 --preemptible-worker-boot-disk-size=10 --worker-boot-disk-size=10 -p N


Submit the run_prs.py script to the cluster using the following code, where PHENOTYPE is the phenotype you want to score:

cluster submit ccarey run_prs.py --args "--phenotype PHENOTYPE"

Benchmarks

With 100 total workers (2 non-preemptible, 98 preemptible), it typically takes ~20 minutes to score the entire UK Biobank white European subsample, and costs ~$12.

  
## run_prs.py
import hail as hl
import pickle
import time
import argparse

def flip_text(base):
    """
    :param StringExpression base: Expression of a single base
    :return: StringExpression of flipped base
    :rtype: StringExpression
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('C', 'G')
            .when('G', 'C')
            .default(base))

parser = argparse.ArgumentParser()
parser.add_argument("--phenotype", help="name of the sumstat phenotype")
args = parser.parse_args()

########################################################################
### initialize
generate_sumstats_table = True
sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'
generate_contig_row_dict = True
contig_row_dict_location = 'gs://ukbb_prs/sumstats/keytables/contig_row_dict-'+phenotype

output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'

contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}

bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'


# large block size because we read very little data (due to filtering & ignoring genotypes)
hl.init(branching_factor=10, min_block_size=2000)

########################################################################
### set up the sumstats table
if (generate_sumstats_table):
    ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
    ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
              alleles=[ss.nt1,ss.nt2])
      .key_by('locus'))
    ss.write(sumstats_table_location, overwrite=True)

ss = hl.read_table(sumstats_table_location)

########################################################################
### determine the file locations of the prs variants
if (generate_contig_row_dict):
    mt = hl.methods.import_bgen(bgen_files,
                [],
                contig_recoding=contigs,
                _row_fields=['file_row_idx'])
    prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    print('about to collect')
    # remove all unnecessary data, dropping keys and other irrelevant fields
    prs_rows = prs_rows.key_by()
    prs_rows = prs_rows.select(prs_rows.locus.contig, prs_rows.file_row_idx)
    contig_row_list = prs_rows.collect()
    print('finished collecting')
    contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    print('reformed')
    from collections import defaultdict
    contig_row_dict = defaultdict(list)
    for k, v in contig_reformed:
        contig_row_dict[k].append(v)
    print('dictionary created')

    with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
        pickle.dump(contig_row_dict, f)
else:
    with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
        contig_row_dict = pickle.load(f)

########################################################################
### Run the PRS
contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
mt = hl.methods.import_bgen(bgen_files,
                ['dosage'],
                sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
                contig_recoding=contigs,
                _variants_per_file=contig_row_dict2,
                _row_fields=[])

sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')

mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
mt = mt.annotate_rows(ss=ss[mt.locus])

mt = mt.annotate_rows(
    beta = (hl.case()
        .when(((mt.alleles[0] == mt.ss.nt1) &
           (mt.alleles[1] == mt.ss.nt2)) |
          ((flip_text(mt.alleles[0]) == mt.ss.nt1) &
           (flip_text(mt.alleles[1]) == mt.ss.nt2)),
              (-1*mt.ss.ldpred_inf_beta))
        .when(((mt.alleles[0] == mt.ss.nt2) &
               (mt.alleles[1] == mt.ss.nt1)) |
              ((flip_text(mt.alleles[0]) == mt.ss.nt2) &
               (flip_text(mt.alleles[1]) == mt.ss.nt1)),
              mt.ss.ldpred_inf_beta)
        .or_missing()))

mt = mt.filter_rows(hl.is_defined(mt.beta))

mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))

mt.cols().export(output_location)
	import hail as hl
	import pickle
	import time
	import argparse

	def flip_text(base):
	"""
	:param StringExpression base: Expression of a single base
	:return: StringExpression of flipped base
	:rtype: StringExpression
	"""
	return (hl.switch(base)
	.when('A', 'T')
	.when('T', 'A')
	.when('C', 'G')
	.when('G', 'C')
	.default(base))

	parser = argparse.ArgumentParser()
	parser.add_argument("--phenotype", help="name of the sumstat phenotype")
	args = parser.parse_args()

	########################################################################
	### initialize
	generate_sumstats_table = True
	sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'
	generate_contig_row_dict = True
	contig_row_dict_location = 'gs://ukbb_prs/sumstats/keytables/contig_row_dict-'+phenotype

	output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'

	contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}

	bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'


	# large block size because we read very little data (due to filtering & ignoring genotypes)
	hl.init(branching_factor=10, min_block_size=2000)

	########################################################################
	### set up the sumstats table
	if (generate_sumstats_table):
	ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
	ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
	alleles=[ss.nt1,ss.nt2])
	.key_by('locus'))
	ss.write(sumstats_table_location, overwrite=True)

	ss = hl.read_table(sumstats_table_location)

	########################################################################
	### determine the file locations of the prs variants
	if (generate_contig_row_dict):
	mt = hl.methods.import_bgen(bgen_files,
	[],
	contig_recoding=contigs,
	_row_fields=['file_row_idx'])
	prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
	print('about to collect')
	# remove all unnecessary data, dropping keys and other irrelevant fields
	prs_rows = prs_rows.key_by()
	prs_rows = prs_rows.select(prs_rows.locus.contig, prs_rows.file_row_idx)
	contig_row_list = prs_rows.collect()
	print('finished collecting')
	contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
	print('reformed')
	from collections import defaultdict
	contig_row_dict = defaultdict(list)
	for k, v in contig_reformed:
	contig_row_dict[k].append(v)
	print('dictionary created')

	with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
	pickle.dump(contig_row_dict, f)
	else:
	with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
	contig_row_dict = pickle.load(f)

	########################################################################
	### Run the PRS
	contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
	mt = hl.methods.import_bgen(bgen_files,
	['dosage'],
	sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
	contig_recoding=contigs,
	_variants_per_file=contig_row_dict2,
	_row_fields=[])

	sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')

	mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
	mt = mt.annotate_rows(ss=ss[mt.locus])

	mt = mt.annotate_rows(
	beta = (hl.case()
	.when(((mt.alleles[0] == mt.ss.nt1) &
	(mt.alleles[1] == mt.ss.nt2)) \|
	((flip_text(mt.alleles[0]) == mt.ss.nt1) &
	(flip_text(mt.alleles[1]) == mt.ss.nt2)),
	(-1*mt.ss.ldpred_inf_beta))
	.when(((mt.alleles[0] == mt.ss.nt2) &
	(mt.alleles[1] == mt.ss.nt1)) \|
	((flip_text(mt.alleles[0]) == mt.ss.nt2) &
	(flip_text(mt.alleles[1]) == mt.ss.nt1)),
	mt.ss.ldpred_inf_beta)
	.or_missing()))

	mt = mt.filter_rows(hl.is_defined(mt.beta))

	mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))

	mt.cols().export(output_location)