###########
# PGC_GPRS.R
# Polygenic Score Analysis from PLINK PRS-CS or C-PT output
###########
# libraries
library(Hmisc)
library(plyr)
library(rms)
library(pROC)
library(MASS)
library(dplyr)
library(readr)
# Read phenotype/covariate data
prefix <- "myprefix"   # adjust prefix
PREVALENCE <- 0.01     # adjust prevalence
pheno <- read_tsv("my_covariates.tsv")   # adjust filename for covariates (e.g. principal components)
# Read PRS profile data
prs <- read_table("my_plink_output.profile")   # adjust filename
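# Note: the column names used below (FID, IID, PHENO, CNT, SCORESUM) assume a
# PLINK 1.9 .profile produced with --score ... sum, and PC1-PC10 in the
# covariate file; adjust the transmute() call if your headers differ.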
# Merge on IID
merged <- prs %>%
  left_join(pheno, by = "IID")
# Rename or create columns to match the desired output, based on your covariate file
ri <- merged %>%
  transmute(
    FID = FID,
    IID = IID,
    COUNT = CNT,                        # use CNT from the PRS file as the count
    PHENO = coalesce(PHENO.x, PHENO.y), # both inputs carry PHENO after the join; prefer the .profile value, fall back to the covariate file
    SCORE = SCORESUM,
    C1 = PC1,
    C2 = PC2,
    C4 = PC4,
    C5 = PC5,
    C6 = PC6,
    C7 = PC7,
    C8 = PC8,
    C9 = PC9,
    C10 = PC10
  )
# Write file
print(head(ri))
write_tsv(ri, paste0(prefix, "_sumprofile_cov.tsv"))
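# Optional sanity check (assumes IID uniquely identifies samples in both inputs):
# stop on duplicate IDs and report samples whose covariates were not matched in the join.
stopifnot(!any(duplicated(ri$IID)))
message(sum(is.na(ri$C1)), " samples have missing covariates after the join")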
# Functions
###### do not use this one: it seems outdated (possibly meant for quantitative phenotypes) and is untested
###### we use h2l_R2N instead
h2l_R2 <- function(k, r2, p) {
  # k baseline disease risk (population prevalence)
  # r2 from a linear regression model attributable to genomic profile risk score
  # p proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 36(3):214-24.
  x <- qnorm(1 - k)
  z <- dnorm(x)
  i <- z / k
  C <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  theta <- i * ((p - k) / (1 - k)) * (i * ((p - k) / (1 - k)) - x)
  h2l_R2 <- C * r2 / (1 + C * theta * r2)
}
se_h2l_R2 <- function(k, h2, se, p) {
  # k baseline disease risk (population prevalence)
  # h2 proportion of variance explained on the liability scale (from h2l_R2)
  # se standard error of the R2 from the linear regression model
  # p proportion of sample that are cases
  # calculates the standard error of the liability-scale R2
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 36(3):214-24.
  # SE on the liability scale (from a Taylor series expansion):
  # var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d being calculus differentiation
  x <- qnorm(1 - k)
  z <- dnorm(x)
  i <- z / k
  C <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  theta <- i * ((p - k) / (1 - k)) * (i * ((p - k) / (1 - k)) - x)
  se_h2l_R2 <- C * (1 - h2 * theta) * se
}
h2l_R2N <- function(k, r2n, p) {
  # k baseline disease risk (population prevalence)
  # r2n Nagelkerke's R2 attributable to genomic profile risk score
  # p proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 36(3):214-24.
  x <- qnorm(1 - k)
  z <- dnorm(x)
  i <- z / k
  cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  theta <- i * ((p - k) / (1 - k)) * (i * ((p - k) / (1 - k)) - x)
  e <- 1 - p^(2 * p) * (1 - p)^(2 * (1 - p))
  h2l_R2N <- cc * e * r2n / (1 + cc * e * theta * r2n)
}
h2l_AUC <- function(k, auc) {
  # k baseline disease risk
  # auc attributable to genomic profile risk score
  # calculates proportion of variance explained on the liability scale
  # from genroc at http://www.complextraitgenomics.com/software/
  # Wray NR, Yang J, Goddard ME, Visscher PM (2010) The Genetic Interpretation of Area under the ROC Curve in Genomic Profiling. PLoS Genet 6(2): e1000864
  T0 <- qnorm(1 - k)
  z <- dnorm(T0)
  i <- z / k
  v <- -i * (k / (1 - k))
  q <- qnorm(auc)
  h2l_AUC <- 2 * q^2 / ((v - i)^2 + q^2 * i * (i - T0) + v * (v - T0)) # eq 4
}
h2l_CS <- function(k, cs, p) {
  # k baseline disease risk
  # cs Cox & Snell R2 attributable to genomic profile risk score
  # p proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 36(3):214-24.
  T0 <- qnorm(1 - k)
  z <- dnorm(T0)
  cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  h2l_CS <- cs * cc
}
# main program
### light edits of s.ripke
#Pd=c("0.00000005","0.000001","0.0001","0.001","0.01","0.05","0.1","0.2","0.5","1")
Pd = c("1")
name = "myprefix"
FILE = "INTEMPL."
nj = 1   # number of files
#nj = 10 # number of files
# This output file gives a range of measures, including those we consider incorrect (the *_wrong columns); use the second output file below instead
O = data.frame("name","Pd","file","N","Propcase","NKr2","NKr2_wrong","pval","h2l_r2","h2l_r2n","h2l_auc","h2l_cs","h2l_r2n_wrong","h2l_auc_wrong","auc","aucvF","aucvR","aucv","auc_wrong",
               "ORD","ORDL","ORDH","ORDchk","ORDchkL","ORDchkH")
write.table(O, "PGRS_chk.csv", row.names = F, col.names = F, sep = ",") # header only; nothing is appended to this file below
# This output file gives the measures that should be used
O = data.frame("name","Pd","N","Propcase","NKr2","pval","PopRisk","h2l_r2n","se_h2l_r2","AUC","OR10decile","ORL95","ORH95","Ncase","Ncontrol","Coeff_with_cov")
write.table(O, paste0(prefix, ".poly.out.txt"), row.names = F, col.names = F, sep = " ") # header; results are appended to this file below
file <- paste0(prefix, "_sumprofile_cov.tsv")
pdf(paste(FILE, "pdf", sep = "")) # open a pdf device for any plots; closed with dev.off() at the end
read.table(file, header = T) -> ri
head(ri)
K = 0.015 # baseline disorder risk for schizophrenia; note this is set separately from PREVALENCE above
### normalize the score
(ri$SCORE - mean(ri$SCORE)) / sd(ri$SCORE) -> ri$NSCORE
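# NSCORE is the z-standardised score, so regression coefficients on NSCORE
# below are per standard deviation of the PRS.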
ri$PHENO1 = ri$PHENO - 1
vars = var(ri$SCORE)
print(vars)
P = sum(ri$PHENO1) / length(ri$PHENO1) # proportion of target sample that are cases
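# Optional guard (assumes PHENO uses PLINK's 1 = control / 2 = case coding, so
# PHENO1 above is 0/1); remove if your phenotype is coded differently.
stopifnot(all(ri$PHENO %in% c(1, 2)))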
######################################
### Stephan's code, included for comparison
## here statistics with covariates:
#library(rms)
lrm(PHENO ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri) -> go
## here for covariates only
lrm(PHENO ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri) -> go_cov
str = summary(lm(NSCORE ~ PHENO + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri))
ricase = ri[ri$PHENO1 == 1, ]
ricont = ri[ri$PHENO1 == 0, ]
strcase = summary(lm(NSCORE ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ricase))
strcont = summary(lm(NSCORE ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ricont))
### here the stats values:
go$stats -> go_s
go_cov$stats -> go_cov_s
## Stephan: here I subtract the "C" value of go_cov_s from the "C" value of go_s (the same way as for the R2 values)
C_diff = go_s["C"] - go_cov_s["C"]    # AUC = C_diff + 0.5
r2_diff = go_s["R2"] - go_cov_s["R2"]
### new code = the correct approach
### logistic models
tstF = glm(PHENO1 ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri, family = binomial(logit)) # full model
tstS = glm(PHENO1 ~ NSCORE, data = ri, family = binomial(logit))                                               # score only
tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri, family = binomial(logit))          # covariates only
tst0 = glm(PHENO1 ~ 1, data = ri, family = binomial(logit))                                                    # intercept only
coeff_w_cov = tstF$coefficients["NSCORE"]
#library(pROC)
aucvF = auc(ri$PHENO1, tstF$linear.predictors)
aucvR = auc(ri$PHENO1, tstR$linear.predictors)
aucvS = auc(ri$PHENO1, tstS$linear.predictors)
auc_wrong = aucvF - aucvR + 0.5           # AUC for the score; incorrect
aucv = pnorm(qnorm(aucvF) - qnorm(aucvR)) # this is not correct either
# aucvS may be approximately the correct value (but without covariates)
# Cox & Snell R2
N = length(ri$PHENO1)
NCA = sum(ri$PHENO1 == 1)
NCO = sum(ri$PHENO1 == 0)
LLF = logLik(tstF)
LLR = logLik(tstR)
LL0 = logLik(tst0)
CSv = 1 - exp((2 / N) * (LLR[1] - LLF[1]))
CS  = 1 - exp((2 / N) * (LL0[1] - LLF[1]))
# Nagelkerke's R2
NK0 <- CS / (1 - exp((2 / N) * LL0[1]))
NKv <- CSv / (1 - exp((2 / N) * LLR[1]))
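# CS / NK0 compare the full model against the intercept-only model, while
# CSv / NKv compare it against the covariates-only model, so NKv is the
# Nagelkerke R2 attributable to the score itself; NKv is the value reported below.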
# p-value: likelihood-ratio (deviance) test of the score, full model vs covariates-only model
devdiff = tstR$deviance - tstF$deviance
df = tstR$df.residual - tstF$df.residual
pval = pchisq(devdiff, df, lower.tail = F)
# linear model R2 *********************************************
std_y = ri$PHENO1
ri$std_y = (std_y - mean(std_y)) / sd(std_y)
lmf = lm(std_y ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri)
lmr = lm(std_y ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri)
lm0 = lm(std_y ~ 1, data = ri) # use the same standardised response as lmf/lmr
R2v = 1 - exp((2 / N) * (logLik(lmr)[1] - logLik(lmf)[1]))
R2  = 1 - exp((2 / N) * (logLik(lm0)[1] - logLik(lmf)[1]))
# standard error of R2v
# from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
np = 1 # number of parameters (here only the score)
vr = 4 / length(std_y) * R2v * (1 - R2v)^2 * (1 - (2 * np + 3) / length(std_y))
# calculate liability-scale R2
h2l_r2 = h2l_R2(K, R2v, P) # linear model
# SE on the liability scale (from a Taylor series expansion)
# var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d: calculus differentiation
se_h2l_r2 = se_h2l_R2(K, h2l_r2, vr^0.5, P)
h2l_r2n = h2l_R2N(K, NKv, P)   # Nagelkerke
h2l_auc = h2l_AUC(K, aucvS[1]) # AUC
h2l_cs = h2l_CS(K, CSv, P)     # Cox & Snell
h2l_r2n_wrong = h2l_R2N(K, r2_diff, P)
h2l_auc_wrong = h2l_AUC(K, C_diff + 0.5)
# make deciles
oNSCORE = ri$NSCORE[order(ri$NSCORE)]
oPHENO1 = ri$PHENO1[order(ri$NSCORE)]
rio = ri[order(ri$NSCORE), ]
N10 = round(N / 10)
dumv = matrix(0, length(oNSCORE), 9) # dummy variables
for (zi in 1:9) {
  fst = length(oNSCORE) - zi * N10 + 1
  lst = length(oNSCORE) - zi * N10 + N10
  dumv[fst:lst, zi] = 1
}
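# Column zi of dumv flags the zi-th decile counting down from the top of the
# score distribution; the rows left all-zero are the bottom decile, which acts
# as the reference, so exp() of the first dummy coefficient below is the
# top-vs-bottom-decile odds ratio.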
tstF = glm(PHENO1 ~ dumv + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = rio, family = binomial(logit)) # decile model with covariates
tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = rio, family = binomial(logit))        # covariates only
tst0 = glm(PHENO1 ~ 1, data = rio, family = binomial(logit))                                                  # intercept only
tstchk = glm(PHENO1 ~ dumv, data = rio, family = binomial(logit))                                             # deciles only, no covariates
ORD = exp(tstF$coefficients[2])
ORDL = exp(tstF$coefficients[2] - 1.96 * summary(tstF)$coefficients[2, 2])
ORDH = exp(tstF$coefficients[2] + 1.96 * summary(tstF)$coefficients[2, 2])
ORDchk = exp(tstchk$coefficients[2])
ORDchkL = exp(tstchk$coefficients[2] - 1.96 * summary(tstchk)$coefficients[2, 2])
ORDchkH = exp(tstchk$coefficients[2] + 1.96 * summary(tstchk)$coefficients[2, 2])
# output
# name = name of cohort
# Pd = p-value cutoff for the discovery cohort
# file = input filename
# N = total sample size
# P = proportion of the sample that are cases
# NKv = Nagelkerke's R2
# pval = p-value of the R2
# CWC = coefficient with covariates (for direction of effect)
# K = population risk of disease, used in converting NKv to the liability scale
# h2l_r2n = proportion of variance explained by the score on the liability scale, calculated from NKv
# aucvS = what we think is the most appropriate estimate of the AUC attributable to the score
# ORD = the odds ratio comparing the top to the bottom decile
# ORDL = lower 95% CI of the ORD
# ORDH = upper 95% CI of the ORD
O=data.frame("INTEMPL",Pd,N,P,NKv,pval,K,h2l_r2n,se_h2l_r2,aucvS,ORD,ORDL,ORDH,NCA,NCO,coeff_w_cov) | |
write.table(O,paste0(prefix,".poly.out.txt"),row.names=F,col.names=F,sep=" ",append=T) | |