Skip to content

Instantly share code, notes, and snippets.

@kkholst
Last active March 23, 2017 08:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kkholst/1cedd0185219511189a7c130d7cf78ae to your computer and use it in GitHub Desktop.
Save kkholst/1cedd0185219511189a7c130d7cf78ae to your computer and use it in GitHub Desktop.
Epidemiology
Solutions to selected exercises...
ihd: Exercise 2.3
bcg: Exercise 2.4
estradiol.sas: Mediation exercise
estradiol.R : Mediation exercise
data bcgdata;
filename bcgfile url 'http://publicifsv.sund.ku.dk/~kkho/data/bcgalldata.txt';
infile bcgfile firstobs=2;
input age scar status $ n;
run;
proc print data=bcgdata (obs=50) ;
run ;
/*
To do a particular analysis we must select cases and a particular
set of controls with the where-statement, so the first analysis is
with the total set of controls.
We use the estimate statements to give the ORs of leprosy between
age-classes, using both the first and last age-class as reference.
Note that the s.e. of the log-odds-ratio is very large when we use
the first age-class with only 2 cases as reference.
*/
proc genmod data=bcgdata;
where status='case' or status='conall' ;
class age scar;
model status = age scar / dist=bin type3 ;
weight n;
estimate "BCG scar
" scar -1 0 / exp ;
estimate "Age 2 vs. 1" age -1 1 0 0 0 0 0 / exp ;
estimate "Age 3 vs. 1" age -1 0 1 0 0 0 0 / exp ;
estimate "Age 4 vs. 1" age -1 0 0 1 0 0 0 / exp ;
estimate "Age 5 vs. 1" age -1 0 0 0 1 0 0 / exp ;
estimate "Age 6 vs. 1" age -1 0 0 0 0 1 0 / exp ;
estimate "Age 7 vs. 1" age -1 0 0 0 0 0 1 / exp ;
run;
## Uncomment these to install some dependencies
## install.packages("devtools")
## devtools::load_all("kkholst/lava")
## install.packages("mets")
library(mets)
## Load the data:
estradiol <- read.table("http://publicifsv.sund.ku.dk/~kkho/data/estradiol.txt",header=TRUE)
##Alternatively, load data from local subdirectory 'data':
##estradiol <- read.table("data/estradiol.txt",header=TRUE)
## Direct effects
l1 <- lm(neocort ~ testosterone+estradiol+age, data=estradiol)
summary(l1)
l2 <- lm(estradiol ~ testosterone+age, data=estradiol)
summary(l2)
## Joint model
m <- lvm(neocort~estradiol+testosterone+age,
estradiol~testosterone+age)
e <- estimate(m,estradiol)
coef(effects(e, neocort~testosterone))
## Mediation proportion
estimate(e, function(x,...) {
ind <- x["neocort~estradiol"]*x["estradiol~testosterone"]
dir <- x["neocort~testosterone"]
total <- ind+dir
medprop <- ind/total
return(c(indirect=ind,direct=dir,total=total,mediationproportion=medprop))
}, vcov=vcov(e))
## Bootstrap confidence limits
f <- function(object,...) coef(effects(object,neocort~testosterone))[,1]
b <- bootstrap(e, fun=f, R=100, estimator="glm", mc.cores=4)
b
data estradiol;
filename efile url "http://publicifsv.sund.ku.dk/~kkho/data/estradiol.txt";
infile efile firstobs=2;
input neocort estra testo age mr;
run;
proc genmod data=estradiol plots=none;
model neocort = testo estra age /cl;
run; quit;
* The grouped data (slightly altered from the book by C&H) ;
data ihdfreq;
input eksp agr pyrs cases;
lpyrs = log( pyrs );
datalines;
1 0 346.87 2
1 1 979.34 12
1 2 699.14 14
0 0 560.13 4
0 1 1127.70 6
0 2 794.15 8
;
run;
* Make a Poisson regression of the rates ;
proc genmod data = ihdfreq ;
class eksp agr ;
model cases = eksp agr
/ dist = poisson offset = lpyrs type3 ;
estimate "low vs. high" eksp -1 1 / Exp ;
estimate "Rate E1 A0" intercept 1 eksp 1 0 agr 1 0 0 / Exp ;
run;
/* Q2
Read the individual records from the diet data
*/
data ihdindiv ;
* filename dietfile "/home/bendix/teach/Epi/KU-epi/data/diet.txt" ;
filename dietfile url "http://publicifsv.sund.ku.dk/~pka/epidata/diet.txt";
infile dietfile firstobs=2 ;
input id doe dox chd dob job month energy height weight fat fibre ;
* This is to read the date variables correctly in and print them reabably ;
informat doe dox dob mmddyy10.;
format doe dox dob ddmmyy10.;
* We must compute the exposure variable ;
exposure = ( energy < 2.75 ) ;
drop job month energy height weight fat fibre ;
run ;
* Take a look at the individual records ;
proc print data = ihdindiv (obs=20) ;
run ;
/* Q3
Include the two macros for time-splitting and tabulation ;
*/
options source2 ;
filename Lexispr url "http://BendixCarstensen.com/Lexis/Lexis.sas";
%inc Lexispr ;
filename PYtabpr url "http://BendixCarstensen.com/Lexis/PYtab.sas";
%inc PYtabpr ;
/* Q4
Timesplitting using the Lexis macro
*/
%Lexis( data = ihdindiv, /* Dataset with original data */
out = ihdsplit, /* Dataset with time-split data */
entry = doe, /* Date of entry */
exit = dox, /* Date of exit */
fail = chd, /* Event (failure) indicator */
origin = dob, /* Origin of the time-scale */
scale = 365.25, /* Convert from input scale to braks-scale */
breaks = 40 to 70 by 10, /* Where to split the time scale */
left = agr ); /* The name of the new age-variable */
/* Q6
We can use proc print to inspect the resulting records
*/
proc print data = ihdsplit (obs=20) ;
run ;
/* Q7
We can use %PYtab to tabulate person-years and detahs:
*/
%PYtab( data = ihdsplit,
class = exposure agr,
fail = chd,
risk = risk,
scale = 1000 ) ;
/* Q7a
But we can also use proc tabulate to replicate the table from C&H
in a readable form
*/
proc tabulate data=ihdsplit missing noseps formchar=" ---- ---" ;
class agr exposure ;
var risk chd ;
table agr="Age" all="Total",
( exposure="Exposure" all="Total" ) *
sum=" " * ( chd="D"*f=6. risk="Y"*f=8.1 )
/ rts = 15 ;
run ;
/* Q8
Now fit the model to the time-split data - results are as for the
grouped data */
proc genmod data = ihdsplit ;
class agr exposure ;
model chd = agr exposure / dist=poisson offset=lrisk ;
run ;
/* Q9
Add an interaction between age and exposure and check that you
get the same test for interaction as with the grouped data.
*/
proc genmod data = ihdsplit ;
class agr exposure ;
model chd = agr exposure agr*exposure / dist=poisson offset=lrisk type3 ;
run ;
/* Q10
We fit the main effects model to the grouped data (6 obs-dataset)
and observe that the deviance is the same as the type-3 test for
interaction based on the individual data
*/
proc genmod data = ihdfreq ;
class eksp agr ;
model cases = eksp agr
/ dist = poisson offset = lpyrs type3 ;
run;
/* We can split in smaller intervals and model age continuously */
%Lexis( data = ihdindiv, /* Dataset with original data */
out = ihdsplit, /* Dataset with time-split data */
entry = doe, /* Date of entry */
exit = dox, /* Date of exit */
fail = chd, /* Event (failure) indicator */
origin = dob, /* Origin of the time-scale */
scale = 365.25, /* Convert from input scale to braks-scale */
breaks = 40 to 70 by 1, /* Where to split the time scale */
left = age ); /* The name of the new age-variable */
/* And then use a spline model for the effect of age */
data ihdsplit ;
set ihdsplit ;
a45 = max( 0, age-45 ) ;
a50 = max( 0, age-50 ) ;
a55 = max( 0, age-55 ) ;
a60 = max( 0, age-60 ) ;
a65 = max( 0, age-65 ) ;
run ;
/* Making predictions requires extra observations with missing chd */
data ihdsplit ;
set ihdsplit end = last ;
prr = 0 ; * Indicator for prediction part of data ;
output ;
if last then
do exposure = 0, 1 ;
do age = 40 to 70 by 1 ;
prr = 1 ; * Indicator for prediction part of data ;
chd = . ;
a45 = max( 0, age-45 ) ;
a50 = max( 0, age-50 ) ;
a55 = max( 0, age-55 ) ;
a60 = max( 0, age-60 ) ;
a65 = max( 0, age-65 ) ;
output ;
end ;
end ;
run ;
/* Model using linear splines, outputting predictions */
proc genmod data = ihdsplit ;
class exposure ;
model chd = age a45 a50 a55 a60 a65 exposure
/ dist=poisson offset=lrisk type3 ;
output out = plrate ( where = (prr=1) )
xbeta = lograte
stdxbeta = selograte ;
run ;
data plrate ;
set plrate ;
* rates and confidence intervals ;
rate = exp( lograte ) * 1000 ;
rlo = exp( lograte - 1.96*selograte ) * 1000 ;
rhi = exp( lograte + 1.96*selograte ) * 1000 ;
* change to missing for expoure 0 / 1 ;
e1 = 1 ; e0 = 1 ;
if exposure = 0 then e1 = . ; else e0 = . ;
rate0 = rate*e0 ; rlo0 = rlo*e0 ; rhi0 = rhi*e0 ;
rate1 = rate*e1 ; rlo1 = rlo*e1 ; rhi1 = rhi*e1 ;
run ;
proc print data = plrate ;
where age < 46 ;
run ;
proc sgplot data = plrate ;
series x=age y=rate0 / lineattrs=(color=blue) legendlabel="un-exposed" name="r0";
band x=age lower=rlo0 upper=rhi0 / fillattrs=(color=blue transparency=.9);
series x=age y=rate1 / lineattrs=(color=red pattern=dash) legendlabel="exposed" name="r1";
band x=age lower=rlo1 upper=rhi1 / fillattrs=(color=red transparency=0.9);
xaxis label="Age" GRID VALUES = (40 to 70 by 5);
yaxis type=log logbase=10 logstyle=logexponent logvtype=expanded values=(1,2,5,10,20,50,100);
keylegend "r0" "r1";
run ;
/* proc gplot data = plrate ; */
/* plot ( rate0 rlo0 rhi0 ) * age / */
/* overlay haxis=axis1 vaxis=axis2 ; * un-exposed ; */
/* plot2 ( rate1 rlo1 rhi1 ) * age / */
/* overlay haxis=axis1 vaxis=axis2 ; * exposed ; */
/* axis1 order=(40 to 70 by 5) ; */
/* axis2 logbase=10 logstyle=expand order=(1,2,5,10,20,50,100) ; */
/* symbol1 w=5 i=join color=blue ; */
/* symbol2 w=2 i=join color=blue ; */
/* symbol3 w=2 i=join color=blue ; */
/* symbol4 w=5 i=join color=red ; */
/* symbol5 w=2 i=join color=red ; */
/* symbol6 w=2 i=join color=red ; */
/* run ; */
/* Data from Clayton & Hills Table 22.6, p. 221 */
## Alternatively, read
ihd <- read.table("http://publicifsv.sund.ku.dk/~kkho/data/ihd-tab.txt",header=TRUE)
## Alternatively, input the data directly:
ihd <- data.frame(exposed=c(1,1,1,0,0,0),
age=c("40-49","50-59","60-69","40-49","50-59","60-69"),
pyears=c(311.9,878.1,667.5,607.9,1272.1,888.9),
cases=c(2,12,14,4,5,8))
ihd$age <- relevel(ihd$age, ref="60-69") ## Use last (older) group as reference
## Model with constant rate within both exposure groups (Table 13.1)
g0 <- glm(cases ~ exposed+offset(log(pyears)),data=ihd,family=poisson)
summary(g0)
## Direct calculations gives exact same results:
y1 <- sum(subset(ihd,exposed==1)$pyears)
d1 <- sum(subset(ihd,exposed==1)$cases)
y0 <- sum(subset(ihd,exposed==0)$pyears)
d0 <- sum(subset(ihd,exposed==0)$cases)
r0 <- d0/y0 ## Rate in unexposed
r1 <- d1/y1 ## Rate in exposed
se0 <- 1/sqrt(d0) ## SE of log(r0)
se1 <- 1/sqrt(d1) ## SE of log(r0)
logrr = log(r1)-log(r0) ## Rate-ratio
selogrr <- (1/d0+1/d1)^.5 ## SE of RR
cbind(logr0=log(r0), logr0.se=se0, logrr=logrr, logrr=selogrr)
## Compare with poisson regression:
summary(g0)
## Q1:
## We fit the model with main effects of age and exposure, and we see
## we obtain the same estimates as in Clayton & Hill's book
g1 <- glm(cases ~ age + exposed+offset(log(pyears)),data=ihd,family=poisson)
summary(g1)
## Model without exposure effect
g2 <- glm(cases ~ age + offset(log(pyears)),data=ihd,family=poisson)
## LRT:
Q <- -2*(logLik(g2)-logLik(g1))
1-pchisq(Q,df=1)
## Wald test: 0.0048, LRT: 0.0040
/*
First read in the data as from the table in the book
*/
data ihd;
input exposed age pyrs cases;
lpyrs=log(pyrs);
cards;
0 2 311.9 2
0 1 878.1 12
0 0 667.5 14
1 2 607.9 4
1 1 1272.1 5
1 0 888.9 8
;
run;
proc format;
value exposedfmt 0='exposed(<2750kcals)'
1='unexposed(>=2750kcals)';
value agefmt 0='60-69'
1='50-59'
2='40-49';
run;
/* Table 22.6 in Clayton & Hills */
proc print data=ihd;
format age agefmt. exposed exposedfmt.;
run;
/*
* Comparison of rate within exposed and unexposed (see Table 13.1 Clayton & Hills)
*/
proc genmod data=ihd;
format exposed exposedfmt.;
class exposed;
model cases=exposed / dist=poisson offset=lpyrs;
estimate "exp. vs. non-exp." exposed 1 -1 / exp;
run;
/* For comparison we see that we get the same estimates and standard errors using the usual formulas: */
data ihdest;
pyears1=311.9+878.1+667.5;
pyears0=607.9+1272.1+888.9;
cases1=2+12+14;
cases0=4+5+8;
r1 = cases1/pyears1; /* rate in exposed */
r0 = cases0/pyears0; /* rate in unexposed */
lr0 =log(r0);
lrr =log(r1)-log(r0); /* log rate-ratio */
lrr_se=sqrt(1/cases1+1/cases0);
put "log(RR) = " lrr ", S.E = " lrr_se;
run;
/*
Q1:
We fit the model with main effects of age and exposure, and we see
we obtain the same estimates as in Clayton \& Hill's book
*/
proc genmod data=ihd;
format exposed exposedfmt. age agefmt.;
class exposed age;
model cases=exposed age / dist=poisson offset=lpyrs type3;
estimate "exp. vs. non-exp." exposed 1 -1 / exp;
estimate "50-59 vs. 40-49"
age 0 1 -1 / exp;
estimate "60-69 vs. 40-49"
age 1 0 -1 / exp;
run;
/* model without exposure effect */
proc genmod data=ihd;
format age agefmt.;
class age;
model cases=age / dist=poisson offset=lpyrs type3;
run;
/* LRT */
data ihdest;
Q=-2*(-16.0477-(-11.8982));
pval=1-cdf("chisq",Q,1);
put "P-value (LRT): " pval;
run;
/* Wald test: 0.0048, LRT: 0.0040. */
data twe;
filename testfile url "http://www.biostat.ku.dk/~pka/epidata/testis.txt";
* filename testfile "/folders/myfolders/testis.txt";
infile testfile firstobs=2;
input SON_AGE SON_KOH MOTH_AGE PARITY PYRS NONSEMI SEMI CASES;
lpyrs=log(pyrs);
par2 = (parity>=2);
* Merging the the first two cohorts ;
if son_koh=1973 then son_koh=1968;
* Grouping cohort to 3 levels ;
if son_koh=1950 or son_koh=1958 then son_kohny=1;
else if son_koh=1963 then son_kohny=2;
else if son_koh=1968 or son_koh=1973 then son_kohny=3;
run;
/*
Q10
*/
title "Q10 Crude RR for parity 2+ vs 1";
proc genmod data=twe;
class par2;
model cases = PAR2 / offset=lpyrs dist=poisson type3 ;
estimate "par 2+ vs par 1" PAR2 -1 1 / exp ;
run;
/*
Q11
*/
title "Q11a All testis cancers";
proc genmod data=twe ;
class SON_AGE SON_KOH MOTH_AGE parity;
model cases = SON_AGE SON_KOH MOTH_AGE PARity
/ offset=lpyrs dist=poi type3;
estimate "par 2 vs. par 1" parity -1 1 0 0 /exp;
estimate "par 3 vs. par 1" parity -1 0 1 0 /exp;
estimate "par 4+ vs. par 1" parity -1 0 0 1 /exp;
run;
title "Q11b Non seminoma testis cancers";
proc genmod data=twe;
class SON_AGE SON_KOH MOTH_AGE parity;
model nonsemi = SON_AGE SON_KOH MOTH_AGE PARity
/ offset=lpyrs dist=poi type3;
estimate "par 2 vs. par 1" parity -1 1 0 0 /exp;
estimate "par 3 vs. par 1" parity -1 0 1 0 /exp;
estimate "par 4+ vs. par 1" parity -1 0 0 1 /exp;
run;
/* Q12
Estimates concerning Mother"s age at first birth
*/
title "Q12: Mother"s age at boy"s birth - test of interaction" ;
proc genmod data=twe;
class SON_AGE SON_KOH MOTH_AGE par2;
model cases = SON_AGE SON_KOH MOTH_AGE par2 moth_age*par2
/ offset=lpyrs dist=poi type3 ;
run;
title "Q12: Mother"s age at boy"s birth - interaction effects" ;
proc genmod data=twe;
class SON_AGE SON_KOH MOTH_AGE par2;
model cases = SON_AGE SON_KOH MOTH_AGE moth_age*par2
/ offset=lpyrs dist=poi type3;
estimate "par2+ vs. 1, <20" moth_age*par2 -1 1 0 0 0 0 0 0 /exp;
estimate "par2+ vs. 1, 20-24" moth_age*par2 0 0 -1 1 0 0 0 0 /exp;
estimate "par2+ vs. 1, 25-29" moth_age*par2 0 0 0 0 -1 1 0 0 /exp;
estimate "par2+ vs. 1, 30+ " moth_age*par2 0 0 0 0 0 0 -1 1 /exp;
run;
/* Q13
Estimates concerning Birth cohort of the son
*/
title "Q13 Birth cohort of the son - test of interaction" ;
proc genmod data=twe ;
class SON_AGE MOTH_AGE son_kohny son_koh par2;
model cases=SON_AGE MOTH_AGE son_koh par2 son_kohny*par2
/offset=lpyrs dist=poi type3;
run;
title "Q13 Birth cohort of the son - interaction effects" ;
proc genmod data=twe ;
class SON_AGE MOTH_AGE son_kohny son_koh par2;
model cases=SON_AGE MOTH_AGE son_koh son_kohny*par2
/offset=lpyrs dist=poi type3;
estimate "par 2+ vs. 1, 1950-63" son_kohny*par2 -1 1 0 0 0 0 /exp;
estimate "par 2+ vs. 1, 1963-67" son_kohny*par2 0 0 -1 1 0 0 /exp;
estimate "par 2+ vs. 1, 1968-92" son_kohny*par2 0 0 0 0 -1 1 /exp;
run;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment