Created
June 29, 2021 19:09
-
-
Save nievergeltlab/2370fac698b7a1d16051778ab88ee08b to your computer and use it in GitHub Desktop.
Extract summary data from GWAS to produce a forest plot. Contains code for pivoting alleles to the same strand across datasets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
args <- commandArgs(trailingOnly = TRUE) | |
snpresults <- args[1] | |
header_classes <- args[2] | |
descriptor <-args[3] | |
outfile <- args[4] | |
# snpresults="results_filtered/forest_plots/rs10266297" | |
# header_classes="results_filtered/forest_plots/header_classes.csv" | |
# descriptor="results_filtered/forest_plots/forestplot_descriptor.csv" | |
# outfile="results_filtered/forest_plots/rs10266297" | |
#Load metafor library and plyr (for mapped values) | |
library(metafor) | |
library(plyr) | |
library(data.table) | |
#study info | |
descriptors <- read.csv(descriptor,header=T,stringsAsFactors=F) | |
# if (anc != "all") | |
# { | |
# if (anc =="aam") | |
# { | |
# descriptors <- subset(descriptors,ancestry %in% c("aam","saf-afr" )) | |
# }else { | |
# descriptors <- subset(descriptors,ancestry == anc) | |
# } | |
# } | |
# if (gender == "males") | |
# { | |
# descriptors$study <- descriptors$study_males | |
# descriptors <- subset(descriptors, included_male == 1) | |
# } | |
# if (gender == "females") | |
# { | |
# descriptors$study <- descriptors$study_females | |
# descriptors <- subset(descriptors, included_fem == 1) | |
# } | |
#print(dim(descriptors)[1]) | |
headers <- fread(header_classes,data.table=F,header=F) | |
#Load data (should be in described format of file name followed by | |
d1 <- read.table(paste(snpresults,'_class1.use',sep=""),stringsAsFactors=F) | |
d2 <- read.table(paste(snpresults,'_class2.use',sep=""),stringsAsFactors=F) | |
d3 <- read.table(paste(snpresults,'_class3.use',sep=""),stringsAsFactors=F) | |
d4 <- read.table(paste(snpresults,'_class4.use',sep=""),stringsAsFactors=F) | |
names(d1) <- headers$V1[1:18] | |
names(d2) <- headers$V2[1:21] | |
names(d3) <- headers$V3[1:13] | |
names(d4) <- headers$V4[1:17] | |
d1$A1n <- d1$A1 | |
d1$A2n <- d1$AX | |
d2$A1n <- d2$A1 | |
d2$A2n <- d2$AX | |
d3$A1n <- d3$A1 | |
d3$A2n <- d3$A2 | |
d4$A1n <- d4$ALLELE1 | |
d4$A2n <- d4$ALLELE0 | |
d1$B <- d1$BETA | |
d1$S <- d1$SE | |
d1$SNP <- d1$ID | |
d2$B <- log(d2$OR) | |
d2$S <- d2$SE | |
d2$SNP <- d2$ID | |
d3$B <- log(d3$OR) | |
d3$S <- d3$SE | |
d4$B <- d4$BETA | |
d4$S <- d4$SE | |
d1$classd <- 1 | |
d2$classd <- 2 | |
d3$classd <- 3 | |
d4$classd <- 4 | |
dat0 <- rbind.fill(d1,d2,d3,d4) | |
dat0 <- subset(dat0,Study %in% descriptors$filename2) | |
#R is fucking retarded, this turns TRUE back to T ALLELE! | |
dat0$A1n <- as.factor(revalue(dat0$A1n, c("TRUE" = "T", "FALSE" = "F"))) | |
dat0$A2n <- as.factor(revalue(dat0$A2n, c("TRUE" = "T", "FALSE" = "F"))) | |
dat0$A1n <- as.character(dat0$A1n) | |
dat0$A2n <- as.character(dat0$A2n) | |
dat0$B2 <- NA | |
##Align alleles | |
#Set a reference for pivoting (Just choose row 1 in the data) | |
pivot=c(dat0[1,"A1n"],dat0[1,"A2n"]) | |
#this is not coded for C G alleles!! | |
#For every row, check allele alignment | |
for ( i in 1:dim(dat0)[1]) | |
{ | |
betaval=dat0[i,]$B | |
snp=c(dat0[i,"A1n"],dat0[i,"A2n"]) | |
flipped=mapvalues(snp, c("A", "C", "G", "T"), c("T", "G", "C", "A")) | |
if (pivot[1] == snp[1] & pivot[2] == snp[2]) | |
{ | |
#No changes need to be made, alleles already aligned | |
print(paste("No changes for", dat0[i,]$Study,sep=" ")) | |
} else if (pivot[1] == snp[2] & pivot[2] == snp[1]) | |
{ | |
#A1n and A2n need to be flipped, change sign of beta value | |
betaval=-1*betaval | |
print(paste("Direction flip for", dat0[i,]$Study,sep=" ")) | |
} else if ( !(flipped[1] %in% pivot) | !(flipped[2] %in% pivot)) | |
{ | |
print(paste("Study is unflippable", dat0[i,]$Study,sep=" ")) | |
#Check if allele might be flippable, if not, give NA value | |
betaval=NA | |
} else if (flipped[1] == snp[1] & flipped[2] == snp[2]) | |
{ | |
print(paste("Wrong strand but correct flip for ", dat0[i,]$Study,sep=" ")) | |
#It's on the wrong strand but nothing actually needs to be done | |
} else if (flipped[1] == snp[2] & flipped[2] == snp[1]) | |
{ | |
#It's on the wrong strand and needs to be flipped | |
print(paste("Direction and allele flip for", dat0[i,]$Study,sep=" ")) | |
betaval=-1*betaval | |
} else { | |
betaval=NA | |
} | |
#Now reassign adjusted beta | |
dat0[i,]$B2 <- betaval | |
} | |
sink(file=paste(outfile,'.log',sep='_')) | |
#Merge in study descriptors for the plot | |
descriptors$Study = descriptors$filename2 | |
dat <- merge(dat0,descriptors,by="Study") | |
evalfun <- function(x) | |
{ | |
eval(parse(text=x)) | |
} | |
dat$rescaling_factor2 <- sapply(dat$rescaling_factor,evalfun) | |
dat$B3 <- dat$B2 * as.numeric(dat$rescaling_factor2) * 100 | |
dat$SE3 <- dat$S * as.numeric(dat$rescaling_factor2) * 100 | |
dat <- dat[order(as.numeric(dat$N),decreasing=c(T)),] | |
#Which studys are NOT in the original data | |
print("Not included (missing data):") | |
print(descriptors[-which(descriptors$Study %in% dat$Study),]$Study) | |
print("N Studies analyzed:") | |
print(dim(dat)[1]) | |
#These will match if the number of inputs in the starting directory | |
#matches the number of inputs in the study description file | |
dat_qt <- subset(dat,Included.As == "Quantitative") | |
dat_cc <- subset(dat,Included.As != "Quantitative") | |
rma(yi=dat_qt$B3, sei=dat_qt$SE3, mods= ~ I(Trauma.Exposure.Sampled ==1) ,data=dat_qt) | |
meta_res_qt <- rma(yi=dat_qt$B3,sei=dat_qt$SE3,method="FE",slab=dat_qt$abbr) | |
print("P value is") | |
meta_res_qt$pval | |
print("B value is") | |
meta_res_qt$estimate | |
save(meta_res_qt,file=paste(outfile,"_fp",'.R',sep='')) | |
pdf(paste(outfile,'_qt.pdf',sep=''),7,9) | |
forest(meta_res_qt) | |
dev.off() | |
meta_res_cc <- rma(yi=dat_cc$B2,sei=dat_cc$S,method="FE",slab=dat_cc$abbr) | |
print("P value is") | |
meta_res_cc$pval | |
print("B value is") | |
meta_res_cc$estimate | |
save(meta_res_cc,file=paste(outfile,"_fp",'.R',sep='')) | |
pdf(paste(outfile,'_cc.pdf',sep=''),7,9) | |
forest(meta_res_cc) | |
dev.off() | |
# #Write output file for metasoft | |
# dout <- unlist(matrix(c(dat$B, dat$SE),ncol=2,byrow=F)) | |
# write.table(t(c(outfile,t(dout + 0.0000001))),paste("results_cat/", outfile,"_",anc,'.msin',sep=''),quote=F,row.names=F,col.names=F) | |
# write.table(paste(dat$abbr,sep="" ),paste("results_cat/", outfile,"_",anc,'.msinstudynames',sep=''),quote=F,row.names=F,col.names=F) # Label by Study Number and abbr | |
# write.table(dat$caroline_order2,paste("results_cat/", outfile,"_",anc,'.msinstudyorder',sep=''),quote=F,row.names=F,col.names=F) #Order by number of cases | |
# print(warnings()) | |
# sum(dat$B * 1/dat$SE^2) / sum(1/dat$SE^2) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
echo "rs10266297,7 | |
rs10821140,9 | |
rs10821154,9 | |
rs146918648,6 | |
rs1504930,5 | |
rs2721802,7 | |
rs2721817,7 | |
rs2894699,7 | |
rs35888516,20 | |
rs4557006,2 | |
rs6896669,5 | |
rs71149745,7 | |
rs7264419,20 | |
rs72657988,1 | |
rs8059002,16" > may8_2020_hits.csv #Latino hit is just the overall anc tophit, but a few markers away | |
#Load summary GWAS data for forest plot SNPS | |
for fp in $(cat may8_2020_hits.csv ) | |
do | |
forestsnp=$(echo $fp | awk 'BEGIN {FS=","}{print $1}') | |
forestsnpchr=$(echo $fp | awk 'BEGIN {FS=","}{print $2}') | |
# zgrep -w -m1 $forestsnp results_cat/*PTSD*.assoc.gz_"$forestsnpchr".gz | sed 's/:/ /' | sed 's/results_cat\///g' | sed 's/\.gz_[0-9]*.gz/.gz/g' | awk 'BEGIN{OFS="\t"}{print}' | column -t > results_filtered/forest_plots/"$forestsnp"_class1.use | |
# zgrep -w -m1 $forestsnp results_cat/*.Case.assoc.gz_"$forestsnpchr".gz | sed 's/:/\t/' | sed 's/results_cat\///g' | sed 's/\.gz_[0-9]*.gz/.gz/g' | awk 'BEGIN{OFS="\t"}{print}' | column -t > results_filtered/forest_plots/"$forestsnp"_class2.use | |
# zgrep -w -m1 $forestsnp sumstats/bychr/*_"$forestsnpchr".gz | grep -v vetsa | grep -v ukbb | sed 's/:/ /' | sed 's/sumstats\/bychr\///g' | sed 's/\.gz_[0-9]*.gz/.gz/g' | column -t > results_filtered/forest_plots/"$forestsnp"_class3.use | |
# zgrep -w -m1 $forestsnp sumstats/bychr/*_"$forestsnpchr".gz | grep -E ukbb\|vetsa | sed 's/:/ /' | sed 's/sumstats\/bychr\///g' | sed 's/\.gz_[0-9]*.gz/.gz/g' | column -t > results_filtered/forest_plots/"$forestsnp"_class4.use | |
echo "Extracting data for $forestsnp $forestsnpchr" | |
Rscript 0_forest_plot.r results_filtered/forest_plots/"$forestsnp" results_filtered/forest_plots/header_classes.csv results_filtered/forest_plots/forestplot_descriptor.csv results_filtered/forest_plots/"$forestsnp" | |
done | |
snpresults="results_filtered/forest_plots/rs10266297" | |
header_classes="results_filtered/forest_plots/header_classes.csv" | |
descriptor="results_filtered/forest_plots/forestplot_descriptor.csv" | |
outfile="results_filtered/forest_plots/rs10266297" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment