Skip to content

Instantly share code, notes, and snippets.

@ce-carey
Last active March 22, 2022 16:45
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save ce-carey/99e9911b12a352179fb947bef1606307 to your computer and use it in GitHub Desktop.
Save ce-carey/99e9911b12a352179fb947bef1606307 to your computer and use it in GitHub Desktop.
Hail PRS code

PRS Scoring of the UK Biobank Using Hail

The run_prs.py script slack_utils.py was created to generate polygenic risk scores (PRS) in the UK Biobank (UKB) using Hail 0.2. Currently, I have hard-coded the file locations in-line with my own file structure and that of the Neale Lab UKB Round 2 GWAS.

NOTE: This is under active development and generally intended for my own use and the use of my collaborators. ADAPT AT YOUR OWN PERIL!!

How to Run

  1. This script assumes that you have generated an input file using my process-sumstats and LDPred repos.

  2. Install cloudtools and start up a cluster using the following code, where N is the number of preemptible workers you want to start up:

cluster start CLUSTERNAME --version devel --spark 2.2.0 --preemptible-worker-boot-disk-size=10 --worker-boot-disk-size=10 -p N
  1. Submit the run_prs.py script to the cluster using the following code, where PHENOTYPE is the phenotype you want to score:
cluster submit ccarey run_prs.py --args "--phenotype PHENOTYPE"

Benchmarks

With 100 total workers (2 non-preemptible, 98 preemptible), it typically takes ~20 minutes to score the entire UK Biobank white European subsample, and costs ~$12.

import hail as hl
import pickle
import time
import argparse
def flip_text(base):
"""
:param StringExpression base: Expression of a single base
:return: StringExpression of flipped base
:rtype: StringExpression
"""
return (hl.switch(base)
.when('A', 'T')
.when('T', 'A')
.when('C', 'G')
.when('G', 'C')
.default(base))
parser = argparse.ArgumentParser()
parser.add_argument("--phenotype", help="name of the sumstat phenotype")
args = parser.parse_args()
########################################################################
### initialize
generate_sumstats_table = True
sumstats_table_location = 'gs://ukbb_prs/sumstats/keytables/ukb-'+phenotype+'-ldpred-sumstats-locus-allele-keyed.kt'
generate_contig_row_dict = True
contig_row_dict_location = 'gs://ukbb_prs/sumstats/keytables/contig_row_dict-'+phenotype
output_location = 'gs://ukbb_prs/prs/UKB_'+phenotype+'_PRS.txt'
contigs = {'0{}'.format(x):str(x) for x in range(1, 10)}
bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'
# large block size because we read very little data (due to filtering & ignoring genotypes)
hl.init(branching_factor=10, min_block_size=2000)
########################################################################
### set up the sumstats table
if (generate_sumstats_table):
ss = hl.import_table('gs://ukbb_prs/sumstats/UKB_'+phenotype+'_LDPred.txt', delimiter='\s+', impute=True)
ss = (ss.annotate(locus=hl.parse_locus(ss.chrom[6:] + ":" + hl.str(ss.pos)),
alleles=[ss.nt1,ss.nt2])
.key_by('locus'))
ss.write(sumstats_table_location, overwrite=True)
ss = hl.read_table(sumstats_table_location)
########################################################################
### determine the file locations of the prs variants
if (generate_contig_row_dict):
mt = hl.methods.import_bgen(bgen_files,
[],
contig_recoding=contigs,
_row_fields=['file_row_idx'])
prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
print('about to collect')
# remove all unnecessary data, dropping keys and other irrelevant fields
prs_rows = prs_rows.key_by()
prs_rows = prs_rows.select(prs_rows.locus.contig, prs_rows.file_row_idx)
contig_row_list = prs_rows.collect()
print('finished collecting')
contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
print('reformed')
from collections import defaultdict
contig_row_dict = defaultdict(list)
for k, v in contig_reformed:
contig_row_dict[k].append(v)
print('dictionary created')
with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
pickle.dump(contig_row_dict, f)
else:
with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
contig_row_dict = pickle.load(f)
########################################################################
### Run the PRS
contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
mt = hl.methods.import_bgen(bgen_files,
['dosage'],
sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
contig_recoding=contigs,
_variants_per_file=contig_row_dict2,
_row_fields=[])
sampleids = hl.import_table('gs://ukb31063-mega-gwas/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')
mt = mt.filter_cols(hl.is_defined(sampleids[mt.s]))
mt = mt.annotate_rows(ss=ss[mt.locus])
mt = mt.annotate_rows(
beta = (hl.case()
.when(((mt.alleles[0] == mt.ss.nt1) &
(mt.alleles[1] == mt.ss.nt2)) |
((flip_text(mt.alleles[0]) == mt.ss.nt1) &
(flip_text(mt.alleles[1]) == mt.ss.nt2)),
(-1*mt.ss.ldpred_inf_beta))
.when(((mt.alleles[0] == mt.ss.nt2) &
(mt.alleles[1] == mt.ss.nt1)) |
((flip_text(mt.alleles[0]) == mt.ss.nt2) &
(flip_text(mt.alleles[1]) == mt.ss.nt1)),
mt.ss.ldpred_inf_beta)
.or_missing()))
mt = mt.filter_rows(hl.is_defined(mt.beta))
mt = mt.annotate_cols(prs=hl.agg.sum(mt.beta * mt.dosage))
mt.cols().export(output_location)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment