Skip to content

Instantly share code, notes, and snippets.

@liangyy
Last active September 5, 2019 03:18
Show Gist options
  • Save liangyy/707d63c6102f403f5821f40a008e7c6a to your computer and use it in GitHub Desktop.
Save liangyy/707d63c6102f403f5821f40a008e7c6a to your computer and use it in GitHub Desktop.
GWAS-gold_standard_genes-intersection

Dependency

> sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)

locale:
[1] C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] dplyr_0.8.0.1     data.table_1.11.8

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.5 magrittr_1.5     assertthat_0.2.0 R6_2.3.0
 [5] pillar_1.3.1     glue_1.3.0       tibble_2.1.1     crayon_1.3.4
 [9] Rcpp_1.0.0       pkgconfig_2.0.2  rlang_0.3.4      purrr_0.2.5

Minimal test

$ bash run_gwas2gene.sh

Workflow, input files, and output files

Step 1

Take GWAS sumstat, LD blocks, and a list of gold standard gene:

  1. Only keep LD blocks with at least one GWAS signal (passing a threshold parameter --gwas_pval_lt). Set the top GWAS variant as leading variant (randomly pick one if tie).
  2. Filter out LD blocks which do not overlap with gold standard gene
  3. Extract all genes overlapping the LD block after filtering
  • GWAS summary statistics: /gpfs/data/im-lab/nas40t2/Data/SummaryResults/imputed_gwas_hg38_1.1/imputed_GLGC_Mc_LDL.txt.gz as example
  • Gene model: /gpfs/data/im-lab/nas40t2/yanyul/mv_from_scratch/repo_new/rotation-at-imlab/data/annotations_gencode_v26.tsv as example
  • LD block definition: /gpfs/data/im-lab/nas40t2/yanyul/mv_from_scratch/repo_new/rotation-at-imlab/analysis/allelic_heterogeneity/data/ld_independent_regions.txt as example
  • Set threshold to to call GWAS signal: 5e-8 in this case
  • Path to some sourced files inside the script: --source_path rlib.R

In output gwas_in_LDblock-GLGC_Mc_LDL.rds, you can find the list of LD blocks (along with GWAS leading variants) selected by the procedure and the list of selected genes. As shown in below

$ R
> out = readRDS('gwas_in_LDblock-GLGC_Mc_LDL.rds')
> head(out$gwas_leading_variant)
    cs_idx                lead_var   chr       pos region_start region_end
1  chr1_18   chr1_25442446_A_G_b38  chr1  25442446     25190354   27075376
2  chr1_34   chr1_55039974_G_T_b38  chr1  55039974     53760589   55947444
3  chr2_13   chr2_21041028_G_A_b38  chr2  21041028     20850730   23118512
4  chr2_27   chr2_43846742_T_C_b38  chr2  43846742     43082452   44086664
5  chr8_94  chr8_143948489_G_T_b38  chr8 143948489    143155464  145078481
6 chr11_70 chr11_116778201_G_C_b38 chr11 116778201    116512631  117876395
> head(out$extracted_genes)
             gene dist_to_tss dist_to_gene_body
1 ENSG00000006555      238707            238707
2 ENSG00000014164      407036            407036
3 ENSG00000058799     1150140           1150140
4 ENSG00000058804     1201114           1201114
5 ENSG00000060642     1345026           1345026
6 ENSG00000065989      674857            621999

Region means LD block. The distance computed is relative to GWAS leading variant.

Step 2 (it is not necessary for plotting ROC)

Post process the output in step 1 to link each LD block with gene (for locus-based analysis). See input details in run_gwas2gene.sh. The output file is gwas_in_LDblock-GLGC_Mc_LDL.by_locus.rds.

$ R
> out = readRDS('gwas_in_LDblock-GLGC_Mc_LDL.by_locus.rds')
> head(out)
             gene  chr    start      end strand      tss              lead_var
1 ENSG00000060642 chr1 26787472 26798398      + 26787472 chr1_25442446_A_G_b38
2 ENSG00000090273 chr1 26900238 26946862      + 26900238 chr1_25442446_A_G_b38
3 ENSG00000117614 chr1 25222679 25232502      - 25232502 chr1_25442446_A_G_b38
4 ENSG00000117616 chr1 25242237 25338213      - 25338213 chr1_25442446_A_G_b38
5 ENSG00000117632 chr1 25884181 25906991      - 25906991 chr1_25442446_A_G_b38
6 ENSG00000117640 chr1 25818640 25832942      + 25818640 chr1_25442446_A_G_b38
   cs_idx dist_to_gene_body dist_to_tss
1 chr1_18           1345026     1345026
2 chr1_18           1457792     1457792
3 chr1_18            209944      209944
4 chr1_18            104233      104233
5 chr1_18            441735      464545
6 chr1_18            376194      376194

Each row is a locus/gene pair. In some cases, two LD blocks can have the same gene because one gene can overlap with two nearby LD blocks.

library(optparse)
option_list = list(
make_option(c("-g", "--gwas"), type="character", default=NULL,
help="gwas leading variant and extracted gene list", metavar="character"),
make_option(c("-o", "--output"), type="character", default="test",
help="output", metavar="character"),
make_option(c("-e", "--gene_list"), type="character", default="test",
help="gene model", metavar="character"),
make_option(c("-r", "--region_based"), type="numeric", default=0,
help="if use region based method to assign a gene to a leading variant set to 1", metavar="character"),
make_option(c("-u", "--source_path"), type="character", default='rlib.R',
help="path to sourced files (default rlib.R)", metavar="character")
);
args_parser = OptionParser(option_list=option_list);
opt = parse_args(args_parser)
# source(paste0(path, '/../../scripts/mylib_doc.R'))
# source(paste0(path, '/../../scripts/mylib_generic.R'))
source(opt$source_path)
library(dplyr)
gene_map = read.table(opt$gene_list, header = T, stringsAsFactors = F) %>% filter(gene_type == 'protein_coding') %>% rename(gene = gene_id)
gwas = readRDS(opt$gwas)
gwas_lead = gwas$gwas_leading_variant
candidate_genes = gwas$extracted_genes
if(nrow(gwas_lead) == 0) {
saveRDS(gwas_lead, opt$output)
quit()
}
candidate_genes = left_join(candidate_genes, gene_map, by = 'gene')
if(opt$region_based != 1) {
variant_with_genes = assign_gene_to_variant(gwas_lead, candidate_genes$gene, candidate_genes$chromosome, candidate_genes$start, candidate_genes$end, candidate_genes$strand)
} else {
variant_with_genes = assign_gene_to_variant_by_region(gwas_lead, candidate_genes$gene, candidate_genes$chromosome, candidate_genes$start, candidate_genes$end, candidate_genes$strand)
}
saveRDS(variant_with_genes, opt$output)
library(optparse)
option_list = list(
make_option(c("-a", "--gwas"), type="character", default="test",
help="GWAS sumstat", metavar="character"),
make_option(c("-g", "--gene_list"), type="character", default="test",
help="gene model", metavar="character"),
make_option(c("-d", "--ldblock"), type="character", default="test",
help="ldblock file", metavar="character"),
make_option(c("-o", "--output"), type="character", default="test",
help="output", metavar="character"),
make_option(c("-l", "--gene_list_filter"), type="character", default=NULL,
help="a list of gene", metavar="character"),
make_option(c("-w", "--gwas_pval_gt"), type="numeric", default=NULL,
help="if only want to use large p-value (to mimic GWAS with low sample size)", metavar="character"),
make_option(c("-s", "--gwas_pval_lt"), type="numeric", default=5e-8,
help="conventional genome-wide cutoff on p-value", metavar="character"),
make_option(c("-u", "--source_path"), type="character", default='rlib.R',
help="path to sourced files (default rlib.R)", metavar="character")
);
args_parser = OptionParser(option_list=option_list);
opt = parse_args(args_parser);
# do region-based operation instead of variant-based with window
library(data.table)
options(datatable.fread.datatable = F)
library(dplyr)
# source(paste0(path, '/../../scripts/mylib_doc.R'))
# source(paste0(path, '/../../scripts/mylib_generic.R'))
source(opt$source_path)
get_gwas_lead_for_ldblock = function(filename, ldblock, cutoff_p_gt, cutoff_p_lt) {
df = fread(paste0('zcat ', filename, ' | cut -f 2,3,4,11'), header = T, sep = '\t')
if(nrow(df) == 0) {
return(data.frame())
}
if(!is.null(cutoff_p_gt)) {
df = df %>% filter(pvalue > cutoff_p_gt)
}
if(!is.null(cutoff_p_lt)) {
df = df %>% filter(pvalue < cutoff_p_lt)
}
ldblock = read.table(ldblock, header = T, stringsAsFactors = F)
df_all = data.frame()
for(i in 1 : nrow(ldblock)) {
# message(i)
sub = df %>% filter(chromosome == ldblock$chromosome[i]) %>% filter(position >= ldblock$start[i], position < ldblock$end[i])
if(nrow(sub) == 0) {
next
}
min_p = min(sub$pvalue)
sub = sub[sub$pvalue == min_p, ]
if(nrow(sub) > 1) {
sub = sub[sample(1: nrow(sub), size = 1), ]
}
sub$cs_idx = ldblock$region_name[i]
sub$start = ldblock$start[i]
sub$end = ldblock$end[i]
df_all = rbind(df_all, sub)
}
colnames(df_all) = c('lead_var', 'chr', 'pos', 'pval', 'cs_idx', 'region_start', 'region_end')
df_all[, c('cs_idx', 'lead_var', 'chr', 'pos', 'region_start', 'region_end')]
}
gene_map = read.table(opt$gene_list, header = T) %>% filter(gene_type == 'protein_coding')
gene_map$tss = get_tss(gene_map$start, gene_map$end, gene_map$strand)
gwas_lead = get_gwas_lead_for_ldblock(opt$gwas, opt$ldblock, opt$gwas_pval_gt, opt$gwas_pval_lt)
if(nrow(gwas_lead) == 0) {
saveRDS(list(gwas_leading_variant = gwas_lead, extracted_genes = data.frame()), opt$output)
quit()
}
if(!is.null(opt$gene_list_filter)) {
gene_list = read.table(opt$gene_list_filter, header = F, stringsAsFactors = F)$V1
gene_map_this = gene_map %>% filter(gene_id %in% gene_list)
gwas_lead = post_filter_region_by_causal_gene_position(gwas_lead, gene_map_this$chromosome, gene_map_this$start, gene_map_this$end)
}
region_pos = gwas_lead[, c('chr', 'region_start', 'region_end', 'cs_idx', 'pos')]
df_extracted_genes_tmp = data.frame()
if(nrow(gwas_lead) == 0) {
saveRDS(list(gwas_leading_variant = gwas_lead, extracted_genes = df_extracted_genes_tmp), opt$output)
quit()
}
for(i in 1 : nrow(gwas_lead)) {
sub = gene_map %>% filter(chromosome == region_pos$chr[i])
sub = sub %>% mutate(dist_to_gene_body = get_distance(start, end, region_pos$pos[i]), dist_to_tss = abs(tss - region_pos$pos[i])) %>% filter(check_two_region_overlap(region_pos$region_start[i], region_pos$region_end[i], start, end))
if(nrow(sub) == 0) {
next
}
sub$variant = gwas_lead$lead_var[i]
sub$cs_idx = gwas_lead$cs_idx[i]
df_extracted_genes_tmp = rbind(df_extracted_genes_tmp, sub)
}
df_extracted_genes_tmp = df_extracted_genes_tmp %>% rename(gene = gene_id) %>% group_by(gene) %>% summarize(dist_to_tss = min(dist_to_tss), dist_to_gene_body = min(dist_to_gene_body)) %>% ungroup()
saveRDS(list(gwas_leading_variant = gwas_lead, extracted_genes = df_extracted_genes_tmp), opt$output)
read_gz_and_merge = function(prefix, suffix, tissues, cmd = 'zcat < ', filter = NULL, filter_col = 'variant_id', ...) {
if(tools::file_ext(suffix) == 'gz') {
cmd = 'zcat < '
}
o = data.frame()
for(i in tissues) {
filename = paste0(prefix, i, suffix)
if(!file.exists(filename)) {
message(filename)
next
}
temp = fread(cmd = paste0(cmd, filename), header = T, fill = TRUE, ...)
if(nrow(temp) == 0) {
next
}
if(!is.null(filter)) {
temp = temp[temp[, filter_col] %in% filter, ]
}
if(nrow(temp) == 0) {
next
}
temp$tissue = i
o = rbind(o, temp)
}
o
}
post_filter_gwas_by_causal_gene_position = function(gwas_lead, causal_gene_chr, causal_gene_tss, dist_to_tss_cutoff = 1e6) {
loci_pos = gwas_lead[, c('chr', 'pos', 'lead_var')]
gene_map = data.frame(chr = causal_gene_chr, tss = causal_gene_tss)
gwas_loci_selected = c()
for(i in 1 : nrow(loci_pos)) {
sub = gene_map %>% filter(chr == loci_pos$chr[i])
sub = sub %>% mutate(dist_to_tss = abs(tss - loci_pos$pos[i]))
if(nrow(sub) == 0) {
next
}
if(sum(sub$dist_to_tss <= dist_to_tss_cutoff) == 0) {
next
}
gwas_loci_selected = c(gwas_loci_selected, as.character(loci_pos$lead_var[i]))
}
gwas_lead %>% filter(lead_var %in% gwas_loci_selected)
}
post_filter_gwas_by_causal_gene_position_by_gene_body = function(gwas_lead, causal_gene_chr, causal_gene_start, causal_gene_end, dist_to_gene_body_cutoff = 1e6) {
loci_pos = gwas_lead[, c('chr', 'pos', 'lead_var')]
gene_map = data.frame(chr = causal_gene_chr, start = causal_gene_start, end = causal_gene_end)
gwas_loci_selected = c()
for(i in 1 : nrow(loci_pos)) {
sub = gene_map %>% filter(chr == loci_pos$chr[i])
sub = sub %>% mutate(dist_to_gene_body = get_distance(start, end, loci_pos$pos[i]))
if(nrow(sub) == 0) {
next
}
if(sum(sub$dist_to_gene_body <= dist_to_gene_body_cutoff) == 0) {
next
}
gwas_loci_selected = c(gwas_loci_selected, as.character(loci_pos$lead_var[i]))
}
gwas_lead %>% filter(lead_var %in% gwas_loci_selected)
}
get_smr_by_heidi_cutoff = function(smr, heidi, cutoff = 1 / exp(1)) {
if(sum(!is.na(heidi) & heidi > cutoff) == 0) {
return(NA)
} else {
smr = smr[!is.na(heidi) & heidi > cutoff]
return(min(smr, na.rm = T))
}
}
assign_gene_to_variant = function(df_lead_var, gene, chr, start, end, strand) {
o = data.frame()
df_gene = data.frame(gene = gene, chr = chr, start = start, end = end, strand = strand, stringsAsFactors = F)
df_gene = df_gene %>% mutate(tss = get_tss(start, end, strand))
for(i in 1 : nrow(df_lead_var)) {
df_gene_sub = df_gene %>% filter(chr == df_lead_var$chr[i]) %>% mutate(lead_var = df_lead_var$lead_var[i])
df_gene_sub = df_gene_sub %>% mutate(dist_to_gene_body = get_distance(start, end, df_lead_var$pos[i]), dist_to_tss = abs(tss - df_lead_var$pos[i]))
o = rbind(o, df_gene_sub)
}
o
}
iterative_strategy = function(score_list, score_info, method_idx_in_order, gwas_var_with_genes, use_nearest_gene = T, debug = F) {
df_gene_candidates = score_list[[method_idx_in_order[1]]]
df_gene_candidates$score = 0
message('---')
message(length(unique(paste(gwas_var_with_genes$lead_var, gwas_var_with_genes$trait))), ' at the beginning')
if(debug == F) {
for(i in method_idx_in_order) {
temp = left_join(gwas_var_with_genes, score_list[[i]], by = c('gene_trait_pair', 'gene'))
if(score_info$method[i] == 'gt') {
temp_target = temp %>% filter(score != score_info$fill.na[i], score > score_info$per_locus_cutoff[i]) %>% group_by(lead_var, trait)
} else if (score_info$method[i] == 'lt'){
temp_target = temp %>% filter(score != score_info$fill.na[i], score < score_info$per_locus_cutoff[i]) %>% group_by(lead_var, trait)
}
if(score_info$method[i] == 'gt') {
temp_target = temp_target %>% summarize(score = max(score, na.rm = T)) %>% ungroup()
} else if (score_info$method[i] == 'lt'){
temp_target = temp_target %>% summarize(score = min(score, na.rm = T)) %>% ungroup()
}
temp = left_join(temp_target, temp, by = c('trait', 'lead_var', 'score')) %>% mutate(gene_trait_pair = paste(gene, '--', trait))
gwas_var_with_genes = gwas_var_with_genes %>% filter(!paste(lead_var, trait) %in% paste(temp$lead_var, temp$trait))
df_gene_candidates$score[df_gene_candidates$gene_trait_pair %in% temp$gene_trait_pair] = 1
message(length(unique(paste(gwas_var_with_genes$lead_var, gwas_var_with_genes$trait))), ' left')
}
message('---')
}
if((use_nearest_gene == T & nrow(gwas_var_with_genes) > 0) | debug == T) {
temp = gwas_var_with_genes %>% group_by(trait, lead_var) %>% summarize(dist_to_gene_body = min(dist_to_gene_body, na.rm = NA)) %>% ungroup()
temp = left_join(temp, gwas_var_with_genes, by = c('trait', 'lead_var', 'dist_to_gene_body')) %>% mutate(gene_trait_pair = paste(gene, '--', trait))
df_gene_candidates$score[df_gene_candidates$gene_trait_pair %in% temp$gene_trait_pair] = 1
}
df_gene_candidates
}
do_transformation = function(score, transform) {
if(is.na(transform) | transform == 'none') {
score
} else if(transform == '-log10') {
o = -log10(score)
o[is.infinite(o)] = max(o[!is.infinite(o)]) + 1
o
} else if(transform == '-') {
- score
}
}
perform_logistic_test = function(result, score_info, random_result_raw, causal_gene_trait_pairs) {
df_score = random_result_raw %>% select(lead_var, gene, trait, gene_trait_pair)
for(i in 1 : length(score_info$name)) {
df_score = left_join(df_score, result[[i]] %>% select(gene_trait_pair, gene, score) %>% mutate(score = do_transformation(score, score_info$transformation[i])), by = c('gene_trait_pair', 'gene'))
colnames(df_score)[which(colnames(df_score) == 'score')] = score_info$tag[i]
}
df_rank = random_result_raw %>% select(lead_var, gene, trait)
for(i in 1 : length(score_info$name)) {
df_sub = df_score[, c('lead_var', 'gene', 'trait', score_info$tag[i])]
colnames(df_sub)[4] = 'temp'
df_sub = df_sub %>% group_by(lead_var, trait) %>% mutate(rank = rank(-temp) - 1) %>% ungroup() %>% select(-temp)
colnames(df_sub)[4] = score_info$tag[i]
df_rank = left_join(df_rank, df_sub, by = c('lead_var', 'gene', 'trait'))
# colnames(df_rank)[which(colnames(df_rank) == 'rank')] = score_info$tag[i]
}
df_percentage = random_result_raw %>% select(lead_var, gene, trait)
for(i in 1 : length(score_info$name)) {
df_sub = df_score[, c('lead_var', 'gene', 'trait', score_info$tag[i])]
colnames(df_sub)[4] = 'temp'
df_sub = df_sub %>% group_by(lead_var, trait) %>% mutate(percentage = (rank(-temp) - 1) / n()) %>% ungroup() %>% select(-temp)
colnames(df_sub)[4] = score_info$tag[i]
df_percentage = left_join(df_percentage, df_sub, by = c('lead_var', 'gene', 'trait'))
# print(colnames(df_sub))
# colnames(df_rank)[which(colnames(df_percentage) == 'percentage')] = score_info$tag[i]
}
df_rank_proximity = random_result_raw %>% group_by(lead_var, trait) %>% mutate(rank_proximity = rank(dist_to_gene_body) - 1) %>% ungroup()
df_percentage_proximity = random_result_raw %>% group_by(lead_var, trait) %>% mutate(percentage_proximity = (rank(dist_to_gene_body) - 1) / n()) %>% ungroup()
y_is_omim = random_result_raw$gene_trait_pair %in% causal_gene_trait_pairs
# rank based
# glm(is_omim ~ proximity + other-scores, family = 'binomial')
df = data.frame(is_omim = y_is_omim, rank_proximity = df_rank_proximity$rank_proximity)
df = cbind(df, df_score[, score_info$tag])
formula = as.formula(paste('is_omim', '~', 'rank_proximity', '+', paste0(score_info$tag, collapse = ' + ')))
mod1 = glm(formula, family = 'binomial', data = df)
# glm(is_omim ~ proximity + other-scores, family = 'binomial')
df = data.frame(is_omim = y_is_omim, rank_proximity = df_rank_proximity$rank_proximity)
df = cbind(df, df_rank[, score_info$tag])
formula = as.formula(paste('is_omim', '~', 'rank_proximity', '+', paste0(score_info$tag, collapse = ' + ')))
mod2 = glm(formula, family = 'binomial', data = df)
# percentage based
# glm(is_omim ~ proximity + other-scores, family = 'binomial')
df = data.frame(is_omim = y_is_omim, percentage_proximity = df_percentage_proximity$percentage_proximity)
df = cbind(df, df_score[, score_info$tag])
formula = as.formula(paste('is_omim', '~', 'percentage_proximity', '+', paste0(score_info$tag, collapse = ' + ')))
mod3 = glm(formula, family = 'binomial', data = df)
# glm(is_omim ~ proximity + other-scores, family = 'binomial')
df = data.frame(is_omim = y_is_omim, percentage_proximity = df_percentage_proximity$percentage_proximity)
df = cbind(df, df_percentage[, score_info$tag])
formula = as.formula(paste('is_omim', '~', 'percentage_proximity', '+', paste0(score_info$tag, collapse = ' + ')))
mod4 = glm(formula, family = 'binomial', data = df)
return(list(rank_based = list(with_score = mod1, without_score = mod2), percentage_based = list(with_score = mod3, without_score = mod4)))
}
check_two_region_overlap = function(region_s, region_e, gene_s, gene_e) {
# case1
# region -----|-------|------
# gene |---------|
case1 = gene_s < region_e & gene_s >= region_s
# case2
# region -----|-------|------
# gene |---------|
case2 = gene_e < region_e & gene_e >= region_s
# case3
# region -----|-------|------
# gene |---------------|
case3 = gene_e >= region_e & gene_s <= region_s
case1 | case2 | case3
}
post_filter_region_by_causal_gene_position = function(gwas_lead, causal_gene_chr, causal_gene_start, causal_gene_end) {
region_pos = gwas_lead[, c('chr', 'region_start', 'region_end', 'cs_idx')]
gene_map = data.frame(chr = causal_gene_chr, start = causal_gene_start, end = causal_gene_end)
region_selected = c()
for(i in 1 : nrow(region_pos)) {
sub = gene_map %>% filter(chr == region_pos$chr[i])
sub = sub %>% mutate(if_in_region = check_two_region_overlap(region_pos$region_start[i], region_pos$region_end[i], start, end))
if(nrow(sub) == 0) {
next
}
if(sum(sub$if_in_region) == 0) {
next
}
region_selected = c(region_selected, as.character(region_pos$cs_idx[i]))
}
gwas_lead %>% filter(cs_idx %in% region_selected)
}
assign_gene_to_variant_by_region = function(df_lead_var, gene, chr, start, end, strand) {
o = data.frame()
df_gene = data.frame(gene = gene, chr = chr, start = start, end = end, strand = strand, stringsAsFactors = F)
df_gene = df_gene %>% mutate(tss = get_tss(start, end, strand))
for(i in 1 : nrow(df_lead_var)) {
df_gene_sub = df_gene %>% filter(chr == df_lead_var$chr[i]) %>% mutate(lead_var = df_lead_var$lead_var[i], cs_idx = df_lead_var$cs_idx[i])
df_gene_sub = df_gene_sub %>% filter(check_two_region_overlap(df_lead_var$region_start[i], df_lead_var$region_end[i], start, end))
df_gene_sub = df_gene_sub %>% mutate(dist_to_gene_body = get_distance(start, end, df_lead_var$pos[i]), dist_to_tss = abs(tss - df_lead_var$pos[i]))
o = rbind(o, df_gene_sub)
}
o
}
random_guess = function(score_list, gwas_var_with_genes) {
df_gene_candidates = score_list
df_gene_candidates$score = 0
gwas_var_with_genes = gwas_var_with_genes %>% group_by(lead_var, trait) %>% summarize(gene_trait_pair = gene_trait_pair[sample(1 : n(), size = 1)]) %>% ungroup()
df_gene_candidates$score[df_gene_candidates$gene_trait_pair %in% gwas_var_with_genes$gene_trait_pair] = 1
df_gene_candidates
}
compute_power_and_fdr = function(discovered_genes, true_genes) {
true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
nTP = length(true_positive_genes)
nT = length(true_genes)
nP = length(discovered_genes)
if(nP == 0) {
nP = 1
}
data.frame(power = nTP / nT, fdr = 1 - nTP / nP, nTP = nTP, nP = nP, nT = nT)
}
compute_tpr_and_fpr = function(tested_genes, discovered_genes, true_genes) {
true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
nTP = length(true_positive_genes)
nT = length(true_genes)
all_genes = union(tested_genes, true_genes)
not_true_genes = all_genes[!all_genes %in% true_genes]
nNT = length(not_true_genes)
false_positive_genes = not_true_genes[ not_true_genes %in% discovered_genes ]
nFP = length(false_positive_genes)
if(nNT == 0) {
nNT = 1
}
data.frame(tpr = nTP / nT, fpr = nFP / nNT, nTP = nTP, nT = nT, nFP = nFP, nNT = nNT)
}
get_distance = function(s, e, p) {
o = cbind(abs(s - p), abs(e - p))
o = apply(o, 1, min)
o[p <= e & p >= s] = 0
o
}
get_nearest_gene = function(var_id, gene_name, gene_chr, gene_start, gene_end) {
pos = get_chr_pos_sep(var_id)
o = data.frame()
for(i in 1 : length(var_id)) {
g = data.frame(gene = gene_name, chr = gene_chr, start = gene_start, end = gene_end)
g = g %>% filter(chr == pos$chr[i]) %>% mutate(dist = get_distance(start, end, pos$pos[i]))
dist_min = min(g$dist)
g = g %>% filter(dist == dist_min)
g$var = var_id[i]
o = rbind(o, g)
}
o
}
is_sig = function(score, threshold, method, tie = T) {
if(method == 'gt') {
if(tie == T) {
return(score >= threshold)
} else {
return(score > threshold)
}
} else if(method == 'lt') {
if(tie == T) {
return(score <= threshold)
} else {
return(score < threshold)
}
} else {
return(NA)
}
}
gen_fdr_power_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
if(is.null(cutoff)) {
true_cutoffs = sort(unique(score[gene %in% true_genes]))
} else {
true_cutoffs = cutoff
}
if(method == 'lt') {
true_cutoffs = sort(true_cutoffs, decreasing = T)
}
df_curve = data.frame()
for(i in true_cutoffs) {
positive_genes = gene[is_sig(score, i, method)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = T
df_curve = rbind(df_curve, sub)
positive_genes = gene[is_sig(score, i, method, tie = F)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = F
df_curve = rbind(df_curve, sub)
}
df_curve = df_curve %>% mutate(precision = 1 - fdr, recall = power)
df_curve$precision[df_curve$nP == 0] = 1
if(is.null(cutoff)) {
df_curve = df_curve[-nrow(df_curve), ]
}
df_curve
}
gen_fdr_power_curve_fixed_joint = function(true_genes, gene, score_vary, score_fix, cutoff_fix, method_vary = 'gt', method_fix = 'gt', by = 1, cutoff = NULL) {
if(is.null(cutoff)) {
true_cutoffs = sort(unique(score_vary[gene %in% true_genes]))
} else {
true_cutoffs = cutoff
}
if(method_vary == 'lt') {
true_cutoffs = sort(true_cutoffs, decreasing = T)
}
df_curve = data.frame()
for(i in true_cutoffs) {
positive_genes = gene[is_sig(score_vary, i, method_vary) & is_sig(score_fix, cutoff_fix, method_fix, tie = F)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = T
df_curve = rbind(df_curve, sub)
positive_genes = gene[is_sig(score_vary, i, method_vary, tie = F) & is_sig(score_fix, cutoff_fix, method_fix, tie = F)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = F
df_curve = rbind(df_curve, sub)
}
df_curve = df_curve %>% mutate(precision = 1 - fdr, recall = power)
df_curve$precision[df_curve$nP == 0] = 1
df_curve[-nrow(df_curve), ]
}
gen_fdr_power_curve_joint = function(true_genes, gene, score1, score2, method1 = 'gt', method2 = 'gt', by1 = 10, by2 = 10, cutoff1 = NULL, cutoff2 = NULL) {
if(is.null(cutoff1)) {
true_cutoffs1 = sort(unique(score1[gene %in% true_genes]))
true_cutoffs_sub1 = true_cutoffs1[seq(1, length(true_cutoffs1), by = by1)]
} else {
true_cutoffs_sub1 = cutoff1
}
if(is.null(cutoff2)) {
true_cutoffs2 = sort(unique(score2[gene %in% true_genes]))
true_cutoffs_sub2 = true_cutoffs2[seq(1, length(true_cutoffs2), by = by2)]
} else {
true_cutoffs_sub2 = cutoff2
}
df_curve_join = data.frame()
for(i in true_cutoffs_sub1) {
for(j in true_cutoffs_sub2) {
positive_genes = gene[is_sig(score1, i, method1) & is_sig(score2, j, method2)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff1 = i
sub$cutoff2 = j
df_curve_join = rbind(df_curve_join, sub)
}
}
df_curve_join %>% mutate(precision = 1 - fdr, recall = power)
}
gen_roc_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
if(is.null(cutoff)) {
true_cutoffs = sort(unique(score[gene %in% true_genes]))
} else {
true_cutoffs = cutoff
}
if(method == 'lt') {
true_cutoffs = sort(true_cutoffs, decreasing = T)
}
df_curve = data.frame()
for(i in true_cutoffs) {
positive_genes = gene[is_sig(score, i, method)]
sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = T
df_curve = rbind(df_curve, sub)
positive_genes = gene[is_sig(score, i, method, tie = F)]
sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = F
df_curve = rbind(df_curve, sub)
}
# sub = compute_tpr_and_fpr(gene, union(gene, true_genes), true_genes)
# sub$cutoff = NA
# df_curve = rbind(df_curve, sub)
if(is.null(cutoff)) {
df_curve = df_curve[-1, ]
}
df_curve
}
# DEPRECATED
# Need to fix how to handle tie if want to use it
# gen_roc_curve_joint = function(true_genes, gene, score1, score2, method1 = 'gt', method2 = 'gt', by1 = 10, by2 = 10, cutoff1 = NULL, cutoff2 = NULL) {
# if(is.null(cutoff1)) {
# true_cutoffs1 = sort(unique(score1[gene %in% true_genes]))
# true_cutoffs_sub1 = true_cutoffs1[seq(1, length(true_cutoffs1), by = by1)]
# } else {
# true_cutoffs_sub1 = cutoff1
# }
# if(is.null(cutoff2)) {
# true_cutoffs2 = sort(unique(score2[gene %in% true_genes]))
# true_cutoffs_sub2 = true_cutoffs2[seq(1, length(true_cutoffs2), by = by2)]
# } else {
# true_cutoffs_sub2 = cutoff2
# }
# df_curve_join = data.frame()
# for(i in true_cutoffs_sub1) {
# for(j in true_cutoffs_sub2) {
# positive_genes = gene[is_sig(score1, i, method1) & is_sig(score2, j, method2)]
# sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
# sub$cutoff1 = i
# sub$cutoff2 = j
# df_curve_join = rbind(df_curve_join, sub)
# }
# }
# df_curve_join
# }
# END
process_susie_output = function(cs, var) {
dcs = fread(paste0('zcat < ', cs), header = T)
dvar = fread(paste0('zcat < ', var, '| grep -v \'\\-1\' |awk -v OFS="\t" \'{split($4,a,"_"); print $0,a[1],a[2]}\''), header = T)
ncol = ncol(dvar)
colnames(dvar)[(ncol - 1):ncol] = c('chr', 'pos')
# pos = get_chr_pos_sep(dvar$variant)
# dvar$chr = pos$chr
# dvar$pos = pos$pos
o = data.frame()
for(i in 1 : nrow(dcs)) {
sub = dvar %>% filter(cs == dcs$cs[i]) %>% filter(pos <= dcs$pos_max[i] & pos >= dcs$pos_min[i] & chr == dcs$chr[i]) %>% mutate(cs_idx = i)
o = rbind(o, sub)
}
o
}
match_credible_set = function(cs1, cs2, pip1, pip2) {
pip1_select = pip1 %>% filter(cluster_id %in% cs1$cluster)
pip2_select = pip2 %>% filter(cluster_id %in% cs2$cluster)
overlapped_var = intersect(pip1_select$variant_id, pip2_select$variant_id)
cs1_overlap = pip1_select %>% filter(variant_id %in% overlapped_var) %>% select(cluster_id, variant_id)
cs2_overlap = pip2_select %>% filter(variant_id %in% overlapped_var) %>% select(cluster_id, variant_id)
cs_map = unique(inner_join(cs1_overlap, cs2_overlap, by = 'variant_id', suffix = c('_1', '_2')) %>% select(cluster_id_1, cluster_id_2))
}
rare_var_to_gtexid = function(x) {
e = strsplit(x, ':')
f = lapply(e, function(x) {
paste(c(x, 'b38'), collapse = '_')
})
unlist(f)
}
get_chr_pos = function(x) {
x = as.character(x)
e = strsplit(x, '_')
f = lapply(e, function(x) {
paste(c(x[1:2]), collapse = '_')
})
unlist(f)
}
get_chr_pos_sep = function(x) {
x = as.character(x)
e = strsplit(x, '_')
f = lapply(e, function(x) {
x[1]
})
g = lapply(e, function(x) {
as.numeric(x[2])
})
o = data.frame(chr = unlist(f), pos = unlist(g))
o$chr = as.character(o$chr)
o
}
get_tss = function(start, end, strand) {
strand = as.character(strand)
o = start
o[strand == '-'] = end[strand == '-']
o
}
parse_annotation_ng2017 = function(annot) {
annot = as.character(annot)
e = strsplit(annot, ':')
data.frame(gene_name = unlist(lapply(e, function(x) { x[1] })), annotation = unlist(lapply(e, function(x) { x[2] })))
}
parse_finnish_vartest = function(gene_str, trait_str) {
idx = 1 : length(gene_str)
o = data.frame()
for(i in idx) {
gene_i = strsplit(gene_str[i], ';')[[1]]
trait_i = strsplit(trait_str[i], ';')[[1]]
ng = length(gene_i)
nt = length(trait_i)
gene_idx = rep(1 : ng, each = nt)
trait_idx = rep(1 : nt, ng)
ogene = gene_i[gene_idx]
otrait = trait_i[trait_idx]
o = rbind(o, data.frame(gene = ogene, trait = otrait, idx = i))
}
o
}
# Step 1
Rscript ldblock2gene.R \
--gwas /gpfs/data/im-lab/nas40t2/Data/SummaryResults/imputed_gwas_hg38_1.1/imputed_GLGC_Mc_LDL.txt.gz \
--gene_list /gpfs/data/im-lab/nas40t2/yanyul/mv_from_scratch/repo_new/rotation-at-imlab/data/annotations_gencode_v26.tsv \
--ldblock /gpfs/data/im-lab/nas40t2/yanyul/mv_from_scratch/repo_new/rotation-at-imlab/analysis/allelic_heterogeneity/data/ld_independent_regions.txt \
--output gwas_in_LDblock-GLGC_Mc_LDL.rds --gene_list_filter /gpfs/data/im-lab/nas40t2/yanyul/mv_from_scratch/repo_new/rotation-at-imlab/analysis/fdr_power_specificity/data/omim/trait-to-gene-omim.GLGC_Mc_LDL.txt \
--gwas_pval_lt 5e-8 \
--source_path rlib.R
# Step 2
Rscript annotate_random_precision.R \
--gwas gwas_in_LDblock-GLGC_Mc_LDL.rds \
--output gwas_in_LDblock-GLGC_Mc_LDL.by_locus.rds \
--gene_list /gpfs/data/im-lab/nas40t2/yanyul/mv_from_scratch/repo_new/rotation-at-imlab/data/annotations_gencode_v26.tsv \
--region_based 1 \
--source_path rlib.R
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment