alicebraun/PGS-output.R

## PGS-output.R
###########
#PGC_GPRS.R
#Polygenic Score Analysis from PLINK PRS-CS or C-PT output
###########

# libraries
library(Hmisc)
library(plyr)
library(rms)
library(pROC)
library(MASS)
library(dplyr)
library(readr)


# Read phenotype/covariate data
prefix <- "myprefix" #adjust prefix
PREVALENCE <- 0.01 #adjust prevalence

pheno <- read_tsv("my_covariates.tsv")  # adjust filename for covariates (e.g. principal components)

# Read PRS profile data
prs <- read_table("my_plink_output.profile")  # adjust filename


# Merge on IID
merged <- prs %>%
  left_join(pheno, by = "IID")

# Rename or create columns to match desired output based on your covariate file
ri <- merged %>%
  transmute(
    FID = FID,
    IID = IID,
    COUNT = CNT,            # use CNT from PRS as count
    PHENO = PHENO.x %||% PHENO.y,  # handle if both exist
    SCORE = SCORESUM,
    C1 = PC1,
    C2 = PC2,
    C4 = PC4,
    C5 = PC5,
    C6 = PC6,
    C7 = PC7,
    C8 = PC8,
    C9 = PC9,
    C10 = PC10
  )

# Write file
print(head(ri))
write_tsv(ri, paste0(prefix,"_sumprofile_cov.tsv"))


# Functions

###### do not use this one, it seems outdated, maybe for quantitative phenotypes but untested
###### we use h2l_R2N instead
h2l_R2 <- function(k, r2, p) {
  # K baseline disease risk
  # r2 from a linear regression model attributable to genomic profile risk score
  # P proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  #from ABC at http://www.complextraitgenomics.com/software/
  #Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  x= qnorm(1-K)
  z= dnorm(x)
  i=z/K
  C= k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta= i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  h2l_R2 = C*r2 / (1 + C*theta*r2)
}

se_h2l_R2 <- function(k,h2,se, p) {
  # K baseline disease risk
  # r2 from a linear regression model attributable to genomic profile risk score
  # P proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  #from ABC at http://www.complextraitgenomics.com/software/
  #Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.

  #SE on the liability (From a Taylor series expansion)
  #var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2*var(R2v) with d being calculus differentiation
  x= qnorm(1-K)
  z= dnorm(x)
  i=z/K
  C= k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta= i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  se_h2l_R2 = C*(1-h2*theta)*se
}


h2l_R2N <- function(k, r2n, p) {
  # k baseline disease risk
  # r2n Nagelkerkes attributable to genomic profile risk score
  # proportion of sample that are cases
  # calculates proportionexplained on the liability scale
  #from ABC at http://www.complextraitgenomics.com/software/
  #Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  x <- qnorm(1 - k)
  z <- dnorm(x)
  i <- z / k
  cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  theta <- i * ((p - k)/(1 - k)) * (i * ((p - k) / ( 1 - k)) - x)
  e <- 1 - p^(2 * p) * (1 - p)^(2 * (1 - p))
  h2l_R2N <- cc * e * r2n / (1 + cc * e * theta * r2n)
}
h2l_AUC <-  function(k,auc) {
  # k baseline disease risk
  # auc attributable to genomic profile risk score
  # calculates proportion of variance explained on the liability scale
  #from genroc at http://www.complextraitgenomics.com/software/
  #Wray NR, Yang J, Goddard ME, Visscher PM (2010) The Genetic Interpretation of Area under the ROC Curve in Genomic Profiling. PLoS Genet 6(2): e1000864
  T0 <- qnorm(1 - k)
  z  <- dnorm(T0)
  i  <- z / k
  v  <- -i * (k / (1-k))
  q <- qnorm(auc)
  h2l_AUC <- 2 * q^2 / ((v - i)^2 + q^2 * i * (i - T0) + v * (v - T0)) # eq 4
}
h2l_CS <-  function(k,cs,p) {
  # k baseline disease risk
  # cs Cox Snell R2 attributable to genomic profile risk score
  # calculates proportion of variance explained on the liability scale
  #from ABC at http://www.complextraitgenomics.com/software/
  #Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  T0 <- qnorm(1 - k)
  z  <- dnorm(T0)
  cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
  h2l_CS <- cs*cc
}


# main program
### light edits of s.ripke


#Pd=c("0.00000005","0.000001","0.0001","0.001","0.01","0.05","0.1","0.2","0.5","1")
Pd=c("1")

name="myprefix"
FILE="INTEMPL."
nj=1  # number of files
#nj=10  # number of files

#This output file gives a range of measures including those we con
O=data.frame("name","Pd","file","N","Propcase","NKr2","NKr2_wrong","pval","h2l_r2","h2l_r2n","h2l_auc","h2l_cs","h2l_r2n_wrong","h2l_auc_wrong","auc","aucvF","aucvR","aucv","auc_wrong",
             "ORD","ORDL","ORDH","ORDchk","ORDchkL","ORDchkH")
write.table(O,"PGRS_chk.csv",row.names=F,col.names=F,sep=",")

#This output file gives the measures that should be used
O=data.frame("name","Pd","N","Propcase","NKr2","pval","PopRisk","h2l_r2n","se_h2l_r2","AUC","OR10decile","ORL95","ORH95","Ncase","Ncontrol","Coeff_with_cov")
write.table(O,"INTEMPL.poly.out.tprefixxt",row.names=F,col.names=F,sep=" ")

file <- paste0(prefix,"_sumprofile_cov.tsv")


pdf(paste(FILE,"pdf",sep=""))

    read.table(file,head=T)->ri
    head(ri)

    K=0.015 # baseline disorder risk for schizophrenia

    ### normalize the score
    (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)->ri$NSCORE
    ri$PHENO1=ri$PHENO-1

    vars = var(ri$SCORE)
    print (vars)

    P=sum(ri$PHENO1)/length(ri$PHENO1) # proportion of target sample that are cases


       ######################################
      ###Stephans code included for comparison
      ## here statistics with covariates:
      #library(rms)
      lrm(PHENO ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri )-> go

      ## here for only covariates
      lrm(PHENO ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri )-> go_cov

      str=summary(lm(NSCORE~ PHENO + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri ))

      ricase=ri[ri$PHENO1==1,]
      ricont=ri[ri$PHENO1==0,]

      strcase=summary(lm(NSCORE~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ricase ))
      strcont=summary(lm(NSCORE~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ricont ))


      ### here the stats values:
      go$stats -> go_s
      go_cov$stats -> go_cov_s

      ## Stephan: here I substract the "C" values of go_cov_s from the "C" value of go_s (so the same way I do for the R2-values.
      C_diff = go_s["C"] - go_cov_s["C"]  #AUC = C_diff+0.5
      r2_diff = go_s["R2"] - go_cov_s["R2"]

      ###new code = RIGHT
      ###logistic models
      tstF = glm(PHENO1 ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri,family = binomial(logit)) # logit model
      tstS = glm(PHENO1 ~ NSCORE, data = ri,family = binomial(logit)) # logit model
      tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri,family = binomial(logit)) # logit model
      tst0 = glm(PHENO1 ~ 1 , data = ri,family = binomial(logit)) # logit model


      coeff_w_cov = tstF$coefficients["NSCORE"]

      #library(pROC)
      aucvF = auc(ri$PHENO1,tstF$linear.predictors)
      aucvR = auc(ri$PHENO1,tstR$linear.predictors)
      aucvS = auc(ri$PHENO1,tstS$linear.predictors)

      auc_wrong=aucvF-aucvR +0.5  #auc for score incorrect
      aucv=pnorm(qnorm(aucvF)-qnorm(aucvR))  #this is not correct either
      #aucvS may be approximately close to correct value (but without covariate)

      #Cox&Snell R2
      N=length(ri$PHENO1)
      NCA=sum(ri$PHENO1==1)
      NCO=sum(ri$PHENO1==0)
      LLF=logLik(tstF)
      LLR=logLik(tstR)
      LL0=logLik(tst0)

      CSv=1-exp((2/N)*(LLR[1]-LLF[1]))
      CS=1-exp((2/N)*(LL0[1]-LLF[1]))

      #Nagelkerkes R2
      NK0<-CS/(1-exp((2/N)*LL0[1]))
      NKv<-CSv/(1-exp((2/N)*LLR[1]))

      #pvalue
      devdiff=tstR$deviance-tstF$deviance
      df=tstR$df.residual-tstF$df.residual
      pval=pchisq(devdiff,df,lower.tail=F)

      #linear model R2 *********************************************
      std_y=ri$PHENO1
      ri$std_y=(std_y-mean(std_y))/sd(std_y)

      lmf=lm(std_y ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri)
      lmr=lm(std_y ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri)
      lm0=lm(std_y~1)

      R2v=1-exp((2/N)*(logLik(lmr)[1]-logLik(lmf)[1]))
      R2=1-exp((2/N)*(logLik(lm0)[1]-logLik(lmf)[1]))

      #standard error of R2v
      #from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
      np=1    #number of paramters
      vr=4/length(std_y)*R2v*(1-R2v)^2*(1-(2*np+3)/length(std_y))

      # calculate liability R2
      h2l_r2 = h2l_R2(K,R2v,P) # linear model
      #SE on the liability (From a Taylor series expansion)
      #var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2*var(R2v) with d: calculus differentiation
      se_h2l_r2=se_h2l_R2(K,h2l_r2,vr^.5,P)

      h2l_r2n = h2l_R2N(K,NKv,P) #nagelkerkes
      h2l_auc = h2l_AUC(K,aucvS[1])  # auc
      h2l_cs = h2l_CS(K,CSv,P)   # Cox & Snell
      h2l_r2n_wrong=h2l_R2N(K,r2_diff,P)
      h2l_auc_wrong=h2l_AUC(K,C_diff+0.5)

      #make deciles
      oNSCORE=ri$NSCORE[order(ri$NSCORE)]
      oPHENO1=ri$PHENO1[order(ri$NSCORE)]
      rio=ri[order(ri$NSCORE),]
      N10=round(N/10)


      dumv=matrix(0,length(oNSCORE),9) #dummy varaible
      for (zi in 1:9) {
        fst=length(oNSCORE)-zi*N10+1
        lst=length(oNSCORE)-zi*N10+N10
        dumv[fst:lst,zi]=1
      }

      tstF = glm(PHENO1 ~ dumv + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, , data = rio,family = binomial(logit)) # logit model
      tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, , data = rio,family = binomial(logit)) # logit model
      tst0 = glm(PHENO1 ~ 1 , data = rio,family = binomial(logit)) # logit model
      tstchk = glm(PHENO1 ~ dumv , data = rio,family = binomial(logit)) # logit model


      ORD=exp(tstF$coefficients[2])
      ORDL=exp(tstF$coefficients[2]-1.96*summary(tstF)$coefficients[2,2])
      ORDH=exp(tstF$coefficients[2]+1.96*summary(tstF)$coefficients[2,2])
      ORDchk=exp(tstchk$coefficients[2])
      ORDchkL=exp(tstchk$coefficients[2]-1.96*summary(tstchk)$coefficients[2,2])
      ORDchkH=exp(tstchk$coefficients[2]+1.96*summary(tstchk)$coefficients[2,2])


      #output
      #name = name of cohort
      #Pd = p-value cutoff for discovery cohort
      #file = input filename
      #N = total sample size
      #P = proportion of sample that are cases
      #NKv Nagelkerkes R2
      #pval - pvalue of the R2
      #CWC - coefficients with covariates (for direction of effect)
      #K population risk of disease used in converting NKv to liability scale
      #h2l_r2n - proportion of variance explained by the score on the liability scale calculated from NKv
      #aucvS - what we think is the most appropriate estimate of AUC attributed to the score
      #ORD - the odds ratio when comparing top to bottom decile
      #ORDL - lower CI of the ORD
      #ORDH - upper CI of the ORD

      O=data.frame("INTEMPL",Pd,N,P,NKv,pval,K,h2l_r2n,se_h2l_r2,aucvS,ORD,ORDL,ORDH,NCA,NCO,coeff_w_cov)
      write.table(O,paste0(prefix,".poly.out.txt"),row.names=F,col.names=F,sep=" ",append=T)
	###########
	#PGC_GPRS.R
	#Polygenic Score Analysis from PLINK PRS-CS or C-PT output
	###########

	# libraries
	library(Hmisc)
	library(plyr)
	library(rms)
	library(pROC)
	library(MASS)
	library(dplyr)
	library(readr)


	# Read phenotype/covariate data
	prefix <- "myprefix" #adjust prefix
	PREVALENCE <- 0.01 #adjust prevalence

	pheno <- read_tsv("my_covariates.tsv") # adjust filename for covariates (e.g. principal components)

	# Read PRS profile data
	prs <- read_table("my_plink_output.profile") # adjust filename


	# Merge on IID
	merged <- prs %>%
	left_join(pheno, by = "IID")

	# Rename or create columns to match desired output based on your covariate file
	ri <- merged %>%
	transmute(
	FID = FID,
	IID = IID,
	COUNT = CNT, # use CNT from PRS as count
	PHENO = PHENO.x %\|\|% PHENO.y, # handle if both exist
	SCORE = SCORESUM,
	C1 = PC1,
	C2 = PC2,
	C4 = PC4,
	C5 = PC5,
	C6 = PC6,
	C7 = PC7,
	C8 = PC8,
	C9 = PC9,
	C10 = PC10
	)

	# Write file
	print(head(ri))
	write_tsv(ri, paste0(prefix,"_sumprofile_cov.tsv"))


	# Functions

	###### do not use this one, it seems outdated, maybe for quantitative phenotypes but untested
	###### we use h2l_R2N instead
	h2l_R2 <- function(k, r2, p) {
	# K baseline disease risk
	# r2 from a linear regression model attributable to genomic profile risk score
	# P proportion of sample that are cases
	# calculates proportion of variance explained on the liability scale
	#from ABC at http://www.complextraitgenomics.com/software/
	#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
	x= qnorm(1-K)
	z= dnorm(x)
	i=z/K
	C= k(1-k)k(1-k)/(z^2p*(1-p))
	theta= i((p-k)/(1-k))(i*((p-k)/(1-k))-x)
	h2l_R2 = Cr2 / (1 + Ctheta*r2)
	}

	se_h2l_R2 <- function(k,h2,se, p) {
	# K baseline disease risk
	# r2 from a linear regression model attributable to genomic profile risk score
	# P proportion of sample that are cases
	# calculates proportion of variance explained on the liability scale
	#from ABC at http://www.complextraitgenomics.com/software/
	#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.

	#SE on the liability (From a Taylor series expansion)
	#var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2*var(R2v) with d being calculus differentiation
	x= qnorm(1-K)
	z= dnorm(x)
	i=z/K
	C= k(1-k)k(1-k)/(z^2p*(1-p))
	theta= i((p-k)/(1-k))(i*((p-k)/(1-k))-x)
	se_h2l_R2 = C(1-h2theta)*se
	}


	h2l_R2N <- function(k, r2n, p) {
	# k baseline disease risk
	# r2n Nagelkerkes attributable to genomic profile risk score
	# proportion of sample that are cases
	# calculates proportionexplained on the liability scale
	#from ABC at http://www.complextraitgenomics.com/software/
	#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
	x <- qnorm(1 - k)
	z <- dnorm(x)
	i <- z / k
	cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
	theta <- i * ((p - k)/(1 - k)) * (i * ((p - k) / ( 1 - k)) - x)
	e <- 1 - p^(2 * p) * (1 - p)^(2 * (1 - p))
	h2l_R2N <- cc * e * r2n / (1 + cc * e * theta * r2n)
	}
	h2l_AUC <- function(k,auc) {
	# k baseline disease risk
	# auc attributable to genomic profile risk score
	# calculates proportion of variance explained on the liability scale
	#from genroc at http://www.complextraitgenomics.com/software/
	#Wray NR, Yang J, Goddard ME, Visscher PM (2010) The Genetic Interpretation of Area under the ROC Curve in Genomic Profiling. PLoS Genet 6(2): e1000864
	T0 <- qnorm(1 - k)
	z <- dnorm(T0)
	i <- z / k
	v <- -i * (k / (1-k))
	q <- qnorm(auc)
	h2l_AUC <- 2 * q^2 / ((v - i)^2 + q^2 * i * (i - T0) + v * (v - T0)) # eq 4
	}
	h2l_CS <- function(k,cs,p) {
	# k baseline disease risk
	# cs Cox Snell R2 attributable to genomic profile risk score
	# calculates proportion of variance explained on the liability scale
	#from ABC at http://www.complextraitgenomics.com/software/
	#Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
	T0 <- qnorm(1 - k)
	z <- dnorm(T0)
	cc <- k * (1 - k) * k * (1 - k) / (z^2 * p * (1 - p))
	h2l_CS <- cs*cc
	}



	# main program
	### light edits of s.ripke


	#Pd=c("0.00000005","0.000001","0.0001","0.001","0.01","0.05","0.1","0.2","0.5","1")
	Pd=c("1")

	name="myprefix"
	FILE="INTEMPL."
	nj=1 # number of files
	#nj=10 # number of files

	#This output file gives a range of measures including those we con
	O=data.frame("name","Pd","file","N","Propcase","NKr2","NKr2_wrong","pval","h2l_r2","h2l_r2n","h2l_auc","h2l_cs","h2l_r2n_wrong","h2l_auc_wrong","auc","aucvF","aucvR","aucv","auc_wrong",
	"ORD","ORDL","ORDH","ORDchk","ORDchkL","ORDchkH")
	write.table(O,"PGRS_chk.csv",row.names=F,col.names=F,sep=",")

	#This output file gives the measures that should be used
	O=data.frame("name","Pd","N","Propcase","NKr2","pval","PopRisk","h2l_r2n","se_h2l_r2","AUC","OR10decile","ORL95","ORH95","Ncase","Ncontrol","Coeff_with_cov")
	write.table(O,"INTEMPL.poly.out.tprefixxt",row.names=F,col.names=F,sep=" ")

	file <- paste0(prefix,"_sumprofile_cov.tsv")


	pdf(paste(FILE,"pdf",sep=""))

	read.table(file,head=T)->ri
	head(ri)

	K=0.015 # baseline disorder risk for schizophrenia

	### normalize the score
	(ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)->ri$NSCORE
	ri$PHENO1=ri$PHENO-1

	vars = var(ri$SCORE)
	print (vars)

	P=sum(ri$PHENO1)/length(ri$PHENO1) # proportion of target sample that are cases


	######################################
	###Stephans code included for comparison
	## here statistics with covariates:
	#library(rms)
	lrm(PHENO ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri )-> go

	## here for only covariates
	lrm(PHENO ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri )-> go_cov

	str=summary(lm(NSCORE~ PHENO + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri ))

	ricase=ri[ri$PHENO1==1,]
	ricont=ri[ri$PHENO1==0,]

	strcase=summary(lm(NSCORE~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ricase ))
	strcont=summary(lm(NSCORE~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ricont ))


	### here the stats values:
	go$stats -> go_s
	go_cov$stats -> go_cov_s

	## Stephan: here I substract the "C" values of go_cov_s from the "C" value of go_s (so the same way I do for the R2-values.
	C_diff = go_s["C"] - go_cov_s["C"] #AUC = C_diff+0.5
	r2_diff = go_s["R2"] - go_cov_s["R2"]

	###new code = RIGHT
	###logistic models
	tstF = glm(PHENO1 ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri,family = binomial(logit)) # logit model
	tstS = glm(PHENO1 ~ NSCORE, data = ri,family = binomial(logit)) # logit model
	tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, data = ri,family = binomial(logit)) # logit model
	tst0 = glm(PHENO1 ~ 1 , data = ri,family = binomial(logit)) # logit model


	coeff_w_cov = tstF$coefficients["NSCORE"]

	#library(pROC)
	aucvF = auc(ri$PHENO1,tstF$linear.predictors)
	aucvR = auc(ri$PHENO1,tstR$linear.predictors)
	aucvS = auc(ri$PHENO1,tstS$linear.predictors)

	auc_wrong=aucvF-aucvR +0.5 #auc for score incorrect
	aucv=pnorm(qnorm(aucvF)-qnorm(aucvR)) #this is not correct either
	#aucvS may be approximately close to correct value (but without covariate)

	#Cox&Snell R2
	N=length(ri$PHENO1)
	NCA=sum(ri$PHENO1==1)
	NCO=sum(ri$PHENO1==0)
	LLF=logLik(tstF)
	LLR=logLik(tstR)
	LL0=logLik(tst0)

	CSv=1-exp((2/N)*(LLR[1]-LLF[1]))
	CS=1-exp((2/N)*(LL0[1]-LLF[1]))

	#Nagelkerkes R2
	NK0<-CS/(1-exp((2/N)*LL0[1]))
	NKv<-CSv/(1-exp((2/N)*LLR[1]))

	#pvalue
	devdiff=tstR$deviance-tstF$deviance
	df=tstR$df.residual-tstF$df.residual
	pval=pchisq(devdiff,df,lower.tail=F)

	#linear model R2 *********************************************
	std_y=ri$PHENO1
	ri$std_y=(std_y-mean(std_y))/sd(std_y)

	lmf=lm(std_y ~ NSCORE + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri)
	lmr=lm(std_y ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10 , data = ri)
	lm0=lm(std_y~1)

	R2v=1-exp((2/N)*(logLik(lmr)[1]-logLik(lmf)[1]))
	R2=1-exp((2/N)*(logLik(lm0)[1]-logLik(lmf)[1]))

	#standard error of R2v
	#from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
	np=1 #number of paramters
	vr=4/length(std_y)R2v(1-R2v)^2(1-(2np+3)/length(std_y))

	# calculate liability R2
	h2l_r2 = h2l_R2(K,R2v,P) # linear model
	#SE on the liability (From a Taylor series expansion)
	#var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2*var(R2v) with d: calculus differentiation
	se_h2l_r2=se_h2l_R2(K,h2l_r2,vr^.5,P)

	h2l_r2n = h2l_R2N(K,NKv,P) #nagelkerkes
	h2l_auc = h2l_AUC(K,aucvS[1]) # auc
	h2l_cs = h2l_CS(K,CSv,P) # Cox & Snell
	h2l_r2n_wrong=h2l_R2N(K,r2_diff,P)
	h2l_auc_wrong=h2l_AUC(K,C_diff+0.5)

	#make deciles
	oNSCORE=ri$NSCORE[order(ri$NSCORE)]
	oPHENO1=ri$PHENO1[order(ri$NSCORE)]
	rio=ri[order(ri$NSCORE),]
	N10=round(N/10)


	dumv=matrix(0,length(oNSCORE),9) #dummy varaible
	for (zi in 1:9) {
	fst=length(oNSCORE)-zi*N10+1
	lst=length(oNSCORE)-zi*N10+N10
	dumv[fst:lst,zi]=1
	}

	tstF = glm(PHENO1 ~ dumv + C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, , data = rio,family = binomial(logit)) # logit model
	tstR = glm(PHENO1 ~ C1 + C2 + C4 + C5 + C6 + C7 + C8 + C9 + C10, , data = rio,family = binomial(logit)) # logit model
	tst0 = glm(PHENO1 ~ 1 , data = rio,family = binomial(logit)) # logit model
	tstchk = glm(PHENO1 ~ dumv , data = rio,family = binomial(logit)) # logit model



	ORD=exp(tstF$coefficients[2])
	ORDL=exp(tstF$coefficients[2]-1.96*summary(tstF)$coefficients[2,2])
	ORDH=exp(tstF$coefficients[2]+1.96*summary(tstF)$coefficients[2,2])
	ORDchk=exp(tstchk$coefficients[2])
	ORDchkL=exp(tstchk$coefficients[2]-1.96*summary(tstchk)$coefficients[2,2])
	ORDchkH=exp(tstchk$coefficients[2]+1.96*summary(tstchk)$coefficients[2,2])



	#output
	#name = name of cohort
	#Pd = p-value cutoff for discovery cohort
	#file = input filename
	#N = total sample size
	#P = proportion of sample that are cases
	#NKv Nagelkerkes R2
	#pval - pvalue of the R2
	#CWC - coefficients with covariates (for direction of effect)
	#K population risk of disease used in converting NKv to liability scale
	#h2l_r2n - proportion of variance explained by the score on the liability scale calculated from NKv
	#aucvS - what we think is the most appropriate estimate of AUC attributed to the score
	#ORD - the odds ratio when comparing top to bottom decile
	#ORDL - lower CI of the ORD
	#ORDH - upper CI of the ORD

	O=data.frame("INTEMPL",Pd,N,P,NKv,pval,K,h2l_r2n,se_h2l_r2,aucvS,ORD,ORDL,ORDH,NCA,NCO,coeff_w_cov)
	write.table(O,paste0(prefix,".poly.out.txt"),row.names=F,col.names=F,sep=" ",append=T)