Skip to content

Instantly share code, notes, and snippets.

@alicebraun
Last active May 31, 2025 11:36
Show Gist options
  • Save alicebraun/4b320d4df46adb0e0139f45dca3b71b1 to your computer and use it in GitHub Desktop.
Save alicebraun/4b320d4df46adb0e0139f45dca3b71b1 to your computer and use it in GitHub Desktop.
PGS-output.R
###########
#PGC_GPRS.R
#Polygenic Score Analysis from PLINK PRS-CS or C-PT output
###########
# libraries
library(Hmisc)
library(plyr)
library(rms)
library(pROC)
library(MASS)
library(dplyr)
library(readr)
# Read phenotype/covariate data
prefix <- "myprefix" #adjust prefix
PREVALENCE <- 0.01 #adjust prevalence
pheno <- read_tsv("my_covariates.tsv") # adjust filename for covariates (e.g. principal components)
# Read PRS profile data
prs <- read_table("my_plink_output.profile") # adjust filename
# Merge on IID
merged <- prs %>%
left_join(pheno, by = "IID")
# Rename or create columns to match desired output based on your covariate file
ri <- merged %>%
transmute(
FID = FID,
IID = IID,
COUNT = CNT, # use CNT from PRS as count
PHENO = PHENO.x %||% PHENO.y, # handle if both exist
SCORE = SCORESUM,
C1 = PC1,
C2 = PC2,
C4 = PC4,
C5 = PC5,
C6 = PC6,
C7 = PC7,
C8 = PC8,
C9 = PC9,
C10 = PC10
)
# Write file
print(head(ri))
write_tsv(ri, paste0(prefix,"_sumprofile_cov.tsv"))
# Functions
###### do not use this one, it seems outdated, maybe for quantitative phenotypes but untested
###### we use h2l_R2N instead
h2l_R2 <- function(k, r2, p) {
# K baseline disease risk
# r2 from a linear regression model attributable to genomic profile risk score
# P proportion of sample that are cases
# calculates proportion of variance explained on the liability scale
#from ABC at http://www.complextraitgenomics.com/software/
#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
x= qnorm(1-K)
z= dnorm(x)
i=z/K
C= k*(1-k)*k*(1-k)/(z^2*p*(1-p))
theta= i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
h2l_R2 = C*r2 / (1 + C*theta*r2)
}
se_h2l_R2 <- function(k,h2,se, p) {
# K baseline disease risk
# r2 from a linear regression model attributable to genomic profile risk score
# P proportion of sample that are cases
# calculates proportion of variance explained on the liability scale
#from ABC at http://www.complextraitgenomics.com/software/
#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
#SE on the liability (From a Taylor series expansion)
#var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2*var(R2v) with d being calculus differentiation
x= qnorm(1-K)
z= dnorm(x)
i=z/K
C= k*(1-k)*k*(1-k)/(z^2*p*(1-p))
theta= i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
se_h2l_R2 = C*(1-h2*theta)*se
}
h2l_R2N <- function(k, r2n, p) {
# k baseline disease risk
# r2n Nagelkerkes attributable to genomic profile risk score
# proportion of sample that are cases
# calculates proportionexplained on the liability scale
#from ABC at http://www.complextraitgenomics.com/software/
#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
x <- qnorm(1 - k)
z <- dnorm(x)
i <- z / k
cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
theta <- i * ((p - k)/(1 - k)) * (i * ((p - k) / ( 1 - k)) - x)
e <- 1 - p^(2 * p) * (1 - p)^(2 * (1 - p))
h2l_R2N <- cc * e * r2n / (1 + cc * e * theta * r2n)
}
h2l_AUC <- function(k,auc) {
# k baseline disease risk
# auc attributable to genomic profile risk score
# calculates proportion of variance explained on the liability scale
#from genroc at http://www.complextraitgenomics.com/software/
#Wray NR, Yang J, Goddard ME, Visscher PM (2010) The Genetic Interpretation of Area under the ROC Curve in Genomic Profiling. PLoS Genet 6(2): e1000864
T0 <- qnorm(1 - k)
z <- dnorm(T0)
i <- z / k
v <- -i * (k / (1-k))
q <- qnorm(auc)
h2l_AUC <- 2 * q^2 / ((v - i)^2 + q^2 * i * (i - T0) + v * (v - T0)) # eq 4
}
h2l_CS <- function(k,cs,p) {
# k baseline disease risk
# cs Cox Snell R2 attributable to genomic profile risk score
# calculates proportion of variance explained on the liability scale
#from ABC at http://www.complextraitgenomics.com/software/
#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
T0 <- qnorm(1 - k)
z <- dnorm(T0)
cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
h2l_CS <- cs*cc
}
# main program
### light edits of s.ripke
#Pd=c("0.00000005","0.000001","0.0001","0.001","0.01","0.05","0.1","0.2","0.5","1")
Pd=c("1")
name="myprefix"
FILE="INTEMPL."
nj=1 # number of files
#nj=10 # number of files
#This output file gives a range of measures including those we con
O=data.frame("name","Pd","file","N","Propcase","NKr2","NKr2_wrong","pval","h2l_r2","h2l_r2n","h2l_auc","h2l_cs","h2l_r2n_wrong","h2l_auc_wrong","auc","aucvF","aucvR","aucv","auc_wrong",
"ORD","ORDL","ORDH","ORDchk","ORDchkL","ORDchkH")
write.table(O,"PGRS_chk.csv",row.names=F,col.names=F,sep=",")
#This output file gives the measures that should be used
O=data.frame("name","Pd","N","Propcase","NKr2","pval","PopRisk","h2l_r2n","se_h2l_r2","AUC","OR10decile","ORL95","ORH95","Ncase","Ncontrol","Coeff_with_cov")
write.table(O,"INTEMPL.poly.out.tprefixxt",row.names=F,col.names=F,sep=" ")
file <- paste0(prefix,"_sumprofile_cov.tsv")
pdf(paste(FILE,"pdf",sep=""))
read.table(file,head=T)->ri
head(ri)
K=0.015 # baseline disorder risk for schizophrenia
### normalize the score
(ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)->ri$NSCORE
ri$PHENO1=ri$PHENO-1
vars = var(ri$SCORE)
print (vars)
P=sum(ri$PHENO1)/length(ri$PHENO1) # proportion of target sample that are cases
######################################
###Stephans code included for comparison
## here statistics with covariates:
#library(rms)
lrm(PHENO ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri )-> go
## here for only covariates
lrm(PHENO ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri )-> go_cov
str=summary(lm(NSCORE~ PHENO + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri ))
ricase=ri[ri$PHENO1==1,]
ricont=ri[ri$PHENO1==0,]
strcase=summary(lm(NSCORE~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ricase ))
strcont=summary(lm(NSCORE~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ricont ))
### here the stats values:
go$stats -> go_s
go_cov$stats -> go_cov_s
## Stephan: here I substract the "C" values of go_cov_s from the "C" value of go_s (so the same way I do for the R2-values.
C_diff = go_s["C"] - go_cov_s["C"] #AUC = C_diff+0.5
r2_diff = go_s["R2"] - go_cov_s["R2"]
###new code = RIGHT
###logistic models
tstF = glm(PHENO1 ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri,family = binomial(logit)) # logit model
tstS = glm(PHENO1 ~ NSCORE, data = ri,family = binomial(logit)) # logit model
tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri,family = binomial(logit)) # logit model
tst0 = glm(PHENO1 ~ 1 , data = ri,family = binomial(logit)) # logit model
coeff_w_cov = tstF$coefficients["NSCORE"]
#library(pROC)
aucvF = auc(ri$PHENO1,tstF$linear.predictors)
aucvR = auc(ri$PHENO1,tstR$linear.predictors)
aucvS = auc(ri$PHENO1,tstS$linear.predictors)
auc_wrong=aucvF-aucvR +0.5 #auc for score incorrect
aucv=pnorm(qnorm(aucvF)-qnorm(aucvR)) #this is not correct either
#aucvS may be approximately close to correct value (but without covariate)
#Cox&Snell R2
N=length(ri$PHENO1)
NCA=sum(ri$PHENO1==1)
NCO=sum(ri$PHENO1==0)
LLF=logLik(tstF)
LLR=logLik(tstR)
LL0=logLik(tst0)
CSv=1-exp((2/N)*(LLR[1]-LLF[1]))
CS=1-exp((2/N)*(LL0[1]-LLF[1]))
#Nagelkerkes R2
NK0<-CS/(1-exp((2/N)*LL0[1]))
NKv<-CSv/(1-exp((2/N)*LLR[1]))
#pvalue
devdiff=tstR$deviance-tstF$deviance
df=tstR$df.residual-tstF$df.residual
pval=pchisq(devdiff,df,lower.tail=F)
#linear model R2 *********************************************
std_y=ri$PHENO1
ri$std_y=(std_y-mean(std_y))/sd(std_y)
lmf=lm(std_y ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri)
lmr=lm(std_y ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri)
lm0=lm(std_y~1)
R2v=1-exp((2/N)*(logLik(lmr)[1]-logLik(lmf)[1]))
R2=1-exp((2/N)*(logLik(lm0)[1]-logLik(lmf)[1]))
#standard error of R2v
#from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
np=1 #number of paramters
vr=4/length(std_y)*R2v*(1-R2v)^2*(1-(2*np+3)/length(std_y))
# calculate liability R2
h2l_r2 = h2l_R2(K,R2v,P) # linear model
#SE on the liability (From a Taylor series expansion)
#var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2*var(R2v) with d: calculus differentiation
se_h2l_r2=se_h2l_R2(K,h2l_r2,vr^.5,P)
h2l_r2n = h2l_R2N(K,NKv,P) #nagelkerkes
h2l_auc = h2l_AUC(K,aucvS[1]) # auc
h2l_cs = h2l_CS(K,CSv,P) # Cox & Snell
h2l_r2n_wrong=h2l_R2N(K,r2_diff,P)
h2l_auc_wrong=h2l_AUC(K,C_diff+0.5)
#make deciles
oNSCORE=ri$NSCORE[order(ri$NSCORE)]
oPHENO1=ri$PHENO1[order(ri$NSCORE)]
rio=ri[order(ri$NSCORE),]
N10=round(N/10)
dumv=matrix(0,length(oNSCORE),9) #dummy varaible
for (zi in 1:9) {
fst=length(oNSCORE)-zi*N10+1
lst=length(oNSCORE)-zi*N10+N10
dumv[fst:lst,zi]=1
}
tstF = glm(PHENO1 ~ dumv + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, , data = rio,family = binomial(logit)) # logit model
tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, , data = rio,family = binomial(logit)) # logit model
tst0 = glm(PHENO1 ~ 1 , data = rio,family = binomial(logit)) # logit model
tstchk = glm(PHENO1 ~ dumv , data = rio,family = binomial(logit)) # logit model
ORD=exp(tstF$coefficients[2])
ORDL=exp(tstF$coefficients[2]-1.96*summary(tstF)$coefficients[2,2])
ORDH=exp(tstF$coefficients[2]+1.96*summary(tstF)$coefficients[2,2])
ORDchk=exp(tstchk$coefficients[2])
ORDchkL=exp(tstchk$coefficients[2]-1.96*summary(tstchk)$coefficients[2,2])
ORDchkH=exp(tstchk$coefficients[2]+1.96*summary(tstchk)$coefficients[2,2])
#output
#name = name of cohort
#Pd = p-value cutoff for discovery cohort
#file = input filename
#N = total sample size
#P = proportion of sample that are cases
#NKv Nagelkerkes R2
#pval - pvalue of the R2
#CWC - coefficients with covariates (for direction of effect)
#K population risk of disease used in converting NKv to liability scale
#h2l_r2n - proportion of variance explained by the score on the liability scale calculated from NKv
#aucvS - what we think is the most appropriate estimate of AUC attributed to the score
#ORD - the odds ratio when comparing top to bottom decile
#ORDL - lower CI of the ORD
#ORDH - upper CI of the ORD
O=data.frame("INTEMPL",Pd,N,P,NKv,pval,K,h2l_r2n,se_h2l_r2,aucvS,ORD,ORDL,ORDH,NCA,NCO,coeff_w_cov)
write.table(O,paste0(prefix,".poly.out.txt"),row.names=F,col.names=F,sep=" ",append=T)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment