danking/README.md

## README.md

      
    Raw
  

              README.md
            
          
    PRS Scoring of the UK Biobank Using Hail

The run_prs.py script slack_utils.py was created to generate polygenic risk scores (PRS) in the UK Biobank (UKB) using Hail 0.2. Currently, I have hard-coded the file locations in-line with my own file structure and that of the Neale Lab UKB Round 2 GWAS.
NOTE: This is under active development and generally intended for my own use and the use of my collaborators. ADAPT AT YOUR OWN PERIL!!
How to Run


This script assumes that you have generated an input file using my process-sumstats and LDPred repos.


Install cloudtools and start up a cluster using the following code, where N is the number of preemptible workers you want to start up:


cluster start CLUSTERNAME --version devel --spark 2.2.0 --preemptible-worker-boot-disk-size=10 --worker-boot-disk-size=10 -p N


Submit the run_prs.py script to the cluster using the following code, where PHENOTYPE is the phenotype you want to score:

cluster submit ccarey run_prs.py --args "--phenotype PHENOTYPE"

Benchmarks

With 100 total workers (2 non-preemptible, 98 preemptible), it typically takes ~20 minutes to score the entire UK Biobank white European subsample, and costs ~$12.

  
## run_prs.py
import hail as hl
import pickle
import time
import argparse

def flip_text(base):
    """
    :param StringExpression base: Expression of a single base
    :return: StringExpression of flipped base
    :rtype: StringExpression
    """
    return (hl.switch(base)
            .when('A', 'T')
            .when('T', 'A')
            .when('C', 'G')
            .when('G', 'C')
            .default(base))

parser = argparse.ArgumentParser()
parser.add_argument("--phenotype", help="name of the sumstat phenotype")
args = parser.parse_args()

########################################################################
### initialize
generate_sumstats_table = True
sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'

output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'

contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}

bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'


# large block size because we read very little data (due to filtering & ignoring genotypes)
hl.init(branching_factor=10, min_block_size=2000)

########################################################################
### set up the sumstats table
if (generate_sumstats_table):
    ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
    ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
              alleles=[ss.nt1,ss.nt2])
      .key_by('locus'))
    ss.write(sumstats_table_location, overwrite=True)

ss = hl.read_table(sumstats_table_location)

########################################################################
### do it

mt = hl.methods.import_bgen(bgen_files,
                ['dosage'],
                sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
                contig_recoding=contigs,
                variants={"locus": ss.locus, "alleles": ss.alleles})

sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')

mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
mt = mt.annotate_rows(ss=ss[mt.locus])

mt = mt.annotate_rows(
    beta = (hl.case()
        .when(((mt.alleles[0] == mt.ss.nt1) &
           (mt.alleles[1] == mt.ss.nt2)) |
          ((flip_text(mt.alleles[0]) == mt.ss.nt1) &
           (flip_text(mt.alleles[1]) == mt.ss.nt2)),
              (-1*mt.ss.ldpred_inf_beta))
        .when(((mt.alleles[0] == mt.ss.nt2) &
               (mt.alleles[1] == mt.ss.nt1)) |
              ((flip_text(mt.alleles[0]) == mt.ss.nt2) &
               (flip_text(mt.alleles[1]) == mt.ss.nt1)),
              mt.ss.ldpred_inf_beta)
        .or_missing()))

mt = mt.filter_rows(hl.is_defined(mt.beta))

mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))

mt.cols().export(output_location)
	import hail as hl
	import pickle
	import time
	import argparse

	def flip_text(base):
	"""
	:param StringExpression base: Expression of a single base
	:return: StringExpression of flipped base
	:rtype: StringExpression
	"""
	return (hl.switch(base)
	.when('A', 'T')
	.when('T', 'A')
	.when('C', 'G')
	.when('G', 'C')
	.default(base))

	parser = argparse.ArgumentParser()
	parser.add_argument("--phenotype", help="name of the sumstat phenotype")
	args = parser.parse_args()

	########################################################################
	### initialize
	generate_sumstats_table = True
	sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'

	output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'

	contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}

	bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'


	# large block size because we read very little data (due to filtering & ignoring genotypes)
	hl.init(branching_factor=10, min_block_size=2000)

	########################################################################
	### set up the sumstats table
	if (generate_sumstats_table):
	ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
	ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
	alleles=[ss.nt1,ss.nt2])
	.key_by('locus'))
	ss.write(sumstats_table_location, overwrite=True)

	ss = hl.read_table(sumstats_table_location)

	########################################################################
	### do it

	mt = hl.methods.import_bgen(bgen_files,
	['dosage'],
	sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
	contig_recoding=contigs,
	variants={"locus": ss.locus, "alleles": ss.alleles})

	sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')

	mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
	mt = mt.annotate_rows(ss=ss[mt.locus])

	mt = mt.annotate_rows(
	beta = (hl.case()
	.when(((mt.alleles[0] == mt.ss.nt1) &
	(mt.alleles[1] == mt.ss.nt2)) \|
	((flip_text(mt.alleles[0]) == mt.ss.nt1) &
	(flip_text(mt.alleles[1]) == mt.ss.nt2)),
	(-1*mt.ss.ldpred_inf_beta))
	.when(((mt.alleles[0] == mt.ss.nt2) &
	(mt.alleles[1] == mt.ss.nt1)) \|
	((flip_text(mt.alleles[0]) == mt.ss.nt2) &
	(flip_text(mt.alleles[1]) == mt.ss.nt1)),
	mt.ss.ldpred_inf_beta)
	.or_missing()))

	mt = mt.filter_rows(hl.is_defined(mt.beta))

	mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))

	mt.cols().export(output_location)