kkholst/ Epidemiology

## Epidemiology
Solutions to selected exercises...

ihd: Exercise 2.3
bcg: Exercise 2.4

estradiol.sas: Mediation exercise
estradiol.R  : Mediation exercise

## bcg.sas
data bcgdata;
filename bcgfile url 'http://publicifsv.sund.ku.dk/~kkho/data/bcgalldata.txt';
infile bcgfile firstobs=2;
input age scar status $ n;
run;

proc print data=bcgdata (obs=50) ;
run ;


/*
To do a particular analysis we must select cases and a particular
set of controls with the where-statement, so the first analysis is
with the total set of controls.

We use the estimate statements to give the ORs of leprosy between
age-classes, using both the first and last age-class as reference.
Note that the s.e. of the log-odds-ratio is very large when we use
the first age-class with only 2 cases as reference.
*/
proc genmod data=bcgdata;
where status='case' or status='conall' ;
class age scar;
model status = age scar / dist=bin type3 ;
weight n;
estimate "BCG scar
" scar -1 0 / exp ;
estimate "Age 2 vs. 1" age -1 1 0 0 0 0 0 / exp ;
estimate "Age 3 vs. 1" age -1 0 1 0 0 0 0 / exp ;
estimate "Age 4 vs. 1" age -1 0 0 1 0 0 0 / exp ;
estimate "Age 5 vs. 1" age -1 0 0 0 1 0 0 / exp ;
estimate "Age 6 vs. 1" age -1 0 0 0 0 1 0 / exp ;
estimate "Age 7 vs. 1" age -1 0 0 0 0 0 1 / exp ;
run;

## estradiol.R
## Uncomment these to install some dependencies
## install.packages("devtools")
## devtools::load_all("kkholst/lava")
## install.packages("mets")
library(mets)

## Load the data:
estradiol <- read.table("http://publicifsv.sund.ku.dk/~kkho/data/estradiol.txt",header=TRUE)
##Alternatively, load data from local subdirectory 'data':
##estradiol <- read.table("data/estradiol.txt",header=TRUE)

## Direct effects
l1 <- lm(neocort ~ testosterone+estradiol+age, data=estradiol)
summary(l1)

l2 <- lm(estradiol ~ testosterone+age, data=estradiol)
summary(l2)


## Joint model
m <- lvm(neocort~estradiol+testosterone+age,
        estradiol~testosterone+age)
e <- estimate(m,estradiol)

coef(effects(e, neocort~testosterone))


## Mediation proportion
estimate(e, function(x,...) {
    ind <- x["neocort~estradiol"]*x["estradiol~testosterone"]
    dir <- x["neocort~testosterone"]
    total <- ind+dir
    medprop <- ind/total
    return(c(indirect=ind,direct=dir,total=total,mediationproportion=medprop))
    }, vcov=vcov(e))


## Bootstrap confidence limits
f <- function(object,...) coef(effects(object,neocort~testosterone))[,1]
b <- bootstrap(e, fun=f, R=100, estimator="glm", mc.cores=4)
b

## estradiol.sas
data estradiol;
    filename efile url "http://publicifsv.sund.ku.dk/~kkho/data/estradiol.txt";
    infile efile firstobs=2;
    input neocort estra testo age mr;
run;

proc genmod data=estradiol plots=none;
    model neocort = testo estra age /cl;
run; quit;

## ihd-lexis-sol.sas
* The grouped data (slightly altered from the book by C&H) ;
data ihdfreq;
  input eksp agr pyrs cases;
  lpyrs = log( pyrs );
datalines;
1 0  346.87  2
1 1  979.34 12
1 2  699.14 14
0 0  560.13  4
0 1 1127.70  6
0 2  794.15  8
;
run;

* Make a Poisson regression of the rates ;
proc genmod  data = ihdfreq ;
  class eksp agr ;
  model cases = eksp agr
                / dist = poisson  offset = lpyrs  type3 ;
  estimate "low vs. high" eksp -1 1 / Exp ;
  estimate "Rate E1 A0" intercept 1 eksp 1 0 agr 1 0 0 / Exp ;
run;

/* Q2
   Read the individual records from the diet data
   */
data ihdindiv ;
* filename dietfile "/home/bendix/teach/Epi/KU-epi/data/diet.txt" ;
  filename dietfile url "http://publicifsv.sund.ku.dk/~pka/epidata/diet.txt";
    infile dietfile firstobs=2 ;
  input id doe dox chd dob job month energy height weight fat fibre ;
  * This is to read the date variables correctly in and print them reabably ;
  informat doe dox dob mmddyy10.;
    format doe dox dob ddmmyy10.;
  * We must compute the exposure variable ;
  exposure = ( energy < 2.75 ) ;
  drop job month energy height weight fat fibre ;
run ;

* Take a look at the individual records ;
proc print  data = ihdindiv (obs=20) ;
run ;

/* Q3
   Include the two macros for time-splitting and tabulation ;
   */
options source2 ;
filename Lexispr url "http://BendixCarstensen.com/Lexis/Lexis.sas";
%inc Lexispr ;
filename PYtabpr url "http://BendixCarstensen.com/Lexis/PYtab.sas";
%inc PYtabpr ;

/* Q4
   Timesplitting using the Lexis macro
   */
%Lexis( data   = ihdindiv,        /* Dataset with original data       */
        out    = ihdsplit,        /* Dataset with time-split data     */
        entry  = doe,             /* Date of entry                    */
        exit   = dox,             /* Date of exit                     */
        fail   = chd,             /* Event (failure) indicator        */
        origin = dob,             /* Origin of the time-scale         */
        scale  = 365.25,          /* Convert from input scale to braks-scale */
        breaks = 40 to 70 by 10,  /* Where to split the time scale    */
        left   = agr );           /* The name of the new age-variable */

/* Q6
   We can use proc print to inspect the resulting records
   */
proc print  data = ihdsplit (obs=20) ;
run ;

/* Q7
   We can use %PYtab to tabulate person-years and detahs:
   */
%PYtab( data  = ihdsplit,
        class = exposure agr,
        fail  = chd,
        risk  = risk,
        scale = 1000 ) ;

/* Q7a
   But we can also use proc tabulate to replicate the table from C&H
   in a readable form
   */
proc tabulate  data=ihdsplit  missing  noseps formchar=" ----   ---" ;
  class agr exposure ;
  var risk chd ;
  table agr="Age" all="Total",
        ( exposure="Exposure" all="Total" ) *
        sum=" " * ( chd="D"*f=6. risk="Y"*f=8.1 )
        / rts = 15 ;
run ;

/* Q8
   Now fit the model to the time-split data - results are as for the
   grouped data */
proc genmod  data = ihdsplit ;
  class agr exposure ;
  model chd = agr exposure / dist=poisson offset=lrisk ;
run ;

/* Q9
   Add an interaction between age and exposure and check that you
   get the same test for interaction as with the grouped data.
   */
proc genmod  data = ihdsplit ;
  class agr exposure ;
  model chd = agr exposure agr*exposure / dist=poisson offset=lrisk type3 ;
run ;

/* Q10
   We fit the main effects model to the grouped data (6 obs-dataset)
   and observe that the deviance is the same as the type-3 test for
   interaction based on the individual data
   */
proc genmod  data = ihdfreq ;
  class eksp agr ;
  model cases = eksp agr
                / dist = poisson  offset = lpyrs  type3 ;
run;

/* We can split in smaller intervals and model age continuously */
   %Lexis( data   = ihdindiv,        /* Dataset with original data       */
           out    = ihdsplit,        /* Dataset with time-split data     */
           entry  = doe,             /* Date of entry                    */
           exit   = dox,             /* Date of exit                     */
           fail   = chd,             /* Event (failure) indicator        */
           origin = dob,             /* Origin of the time-scale         */
           scale  = 365.25,          /* Convert from input scale to braks-scale */
           breaks = 40 to 70 by 1,   /* Where to split the time scale    */
           left   = age );           /* The name of the new age-variable */

/* And then use a spline model for the effect of age */
data ihdsplit ;
  set ihdsplit ;
  a45 = max( 0, age-45 ) ;
  a50 = max( 0, age-50 ) ;
  a55 = max( 0, age-55 ) ;
  a60 = max( 0, age-60 ) ;
  a65 = max( 0, age-65 ) ;
run ;

/* Making predictions requires extra observations with missing chd */
data ihdsplit ;
  set ihdsplit  end = last ;
  prr = 0 ; * Indicator for prediction part of data ;
  output ;
  if last then
    do exposure = 0, 1 ;
      do age = 40 to 70 by 1 ;
        prr = 1 ; * Indicator for prediction part of data ;
        chd = . ;
        a45 = max( 0, age-45 ) ;
        a50 = max( 0, age-50 ) ;
        a55 = max( 0, age-55 ) ;
        a60 = max( 0, age-60 ) ;
        a65 = max( 0, age-65 ) ;
        output ;
      end ;
    end ;
run ;

/* Model using linear splines, outputting predictions */
proc genmod  data = ihdsplit ;
  class exposure ;
  model chd = age a45 a50 a55 a60 a65 exposure
        / dist=poisson  offset=lrisk type3 ;
  output out = plrate  ( where = (prr=1) )
       xbeta =   lograte
    stdxbeta = selograte ;
run ;

data plrate ;
  set plrate ;
  * rates and confidence intervals ;
  rate = exp( lograte ) * 1000 ;
  rlo  = exp( lograte - 1.96*selograte ) * 1000 ;
  rhi  = exp( lograte + 1.96*selograte ) * 1000 ;
  * change to missing for expoure 0 / 1 ;
  e1 = 1 ; e0 = 1 ;
  if exposure = 0 then e1 = . ; else e0 = . ;
  rate0 = rate*e0 ; rlo0 = rlo*e0 ; rhi0 = rhi*e0 ;
  rate1 = rate*e1 ; rlo1 = rlo*e1 ; rhi1 = rhi*e1 ;
run ;

proc print data = plrate ;
  where age < 46 ;
run ;


proc sgplot data = plrate ;
  series x=age y=rate0 / lineattrs=(color=blue) legendlabel="un-exposed" name="r0";
  band x=age lower=rlo0 upper=rhi0 / fillattrs=(color=blue transparency=.9);
  series x=age y=rate1 / lineattrs=(color=red pattern=dash) legendlabel="exposed" name="r1";
  band x=age lower=rlo1 upper=rhi1 / fillattrs=(color=red transparency=0.9);
  xaxis label="Age" GRID VALUES = (40 to 70 by 5);
  yaxis type=log logbase=10 logstyle=logexponent logvtype=expanded values=(1,2,5,10,20,50,100);
  keylegend "r0" "r1";
run ;


/* proc gplot data = plrate ; */
/*   plot  ( rate0 rlo0 rhi0 ) * age / */
/*         overlay haxis=axis1 vaxis=axis2 ; * un-exposed ; */
/*   plot2 ( rate1 rlo1 rhi1 ) * age / */
/*         overlay haxis=axis1 vaxis=axis2 ; *    exposed ; */
/*   axis1 order=(40 to 70 by 5) ; */
/*   axis2 logbase=10 logstyle=expand order=(1,2,5,10,20,50,100) ; */
/*   symbol1 w=5 i=join color=blue ; */
/*   symbol2 w=2 i=join color=blue ; */
/*   symbol3 w=2 i=join color=blue ; */
/*   symbol4 w=5 i=join color=red ; */
/*   symbol5 w=2 i=join color=red ; */
/*   symbol6 w=2 i=join color=red ; */
/* run ; */

## ihd.R
/* Data from Clayton & Hills Table 22.6, p. 221 */
## Alternatively, read
ihd <- read.table("http://publicifsv.sund.ku.dk/~kkho/data/ihd-tab.txt",header=TRUE)
## Alternatively, input the data directly:
ihd <- data.frame(exposed=c(1,1,1,0,0,0),
                 age=c("40-49","50-59","60-69","40-49","50-59","60-69"),
                 pyears=c(311.9,878.1,667.5,607.9,1272.1,888.9),
                 cases=c(2,12,14,4,5,8))
ihd$age <- relevel(ihd$age, ref="60-69") ## Use last (older) group as reference

## Model with constant rate within both exposure groups (Table 13.1)
g0 <- glm(cases ~ exposed+offset(log(pyears)),data=ihd,family=poisson)
summary(g0)

## Direct calculations gives exact same results:
y1 <- sum(subset(ihd,exposed==1)$pyears)
d1 <- sum(subset(ihd,exposed==1)$cases)
y0 <- sum(subset(ihd,exposed==0)$pyears)
d0 <- sum(subset(ihd,exposed==0)$cases)

r0 <- d0/y0               ## Rate in unexposed
r1 <- d1/y1               ## Rate in exposed
se0 <- 1/sqrt(d0)            ## SE of log(r0)
se1 <- 1/sqrt(d1)            ## SE of log(r0)
logrr = log(r1)-log(r0)  ## Rate-ratio
selogrr <- (1/d0+1/d1)^.5 ## SE of RR
cbind(logr0=log(r0), logr0.se=se0, logrr=logrr, logrr=selogrr)

## Compare with poisson regression:
summary(g0)


## Q1:
## We fit the model with main effects of age and exposure, and we see
## we obtain the same estimates as in Clayton & Hill's book
g1 <- glm(cases ~ age + exposed+offset(log(pyears)),data=ihd,family=poisson)
summary(g1)

## Model without exposure effect
g2 <- glm(cases ~ age + offset(log(pyears)),data=ihd,family=poisson)

## LRT:
Q <- -2*(logLik(g2)-logLik(g1))
1-pchisq(Q,df=1)
## Wald test: 0.0048, LRT: 0.0040

## ihd.sas
/*
First read in the data as from the table in the book
*/
data ihd;
input exposed age pyrs cases;
lpyrs=log(pyrs);
cards;
0 2  311.9  2
0 1  878.1 12
0 0  667.5 14
1 2  607.9  4
1 1 1272.1  5
1 0  888.9  8
;
run;


proc format;
    value exposedfmt 0='exposed(<2750kcals)'
                     1='unexposed(>=2750kcals)';
    value agefmt     0='60-69'
                     1='50-59'
                     2='40-49';
run;

/* Table 22.6 in Clayton & Hills */
proc print data=ihd;
    format age agefmt. exposed exposedfmt.;
run;

/*
 * Comparison of rate within exposed and unexposed (see Table 13.1 Clayton & Hills)
 */
proc genmod data=ihd;
    format exposed exposedfmt.;
    class exposed;
    model cases=exposed / dist=poisson offset=lpyrs;
    estimate "exp. vs. non-exp." exposed 1 -1 / exp;
run;

/* For comparison we see that we get the same estimates and standard errors using the usual formulas: */
data ihdest;
	pyears1=311.9+878.1+667.5;
	pyears0=607.9+1272.1+888.9;
	cases1=2+12+14;
	cases0=4+5+8;
	r1    = cases1/pyears1; /* rate in exposed */
	r0    = cases0/pyears0; /* rate in unexposed */
	lr0   =log(r0);
	lrr   =log(r1)-log(r0); /* log rate-ratio */
	lrr_se=sqrt(1/cases1+1/cases0);
	put "log(RR) = " lrr ", S.E = " lrr_se;
run;

/*
Q1:
We fit the model with main effects of age and exposure, and we see
we obtain the same estimates as in Clayton \& Hill's book
*/
proc genmod data=ihd;
    format exposed exposedfmt. age agefmt.;
    class exposed age;
    model cases=exposed age / dist=poisson offset=lpyrs type3;
    estimate "exp. vs. non-exp." exposed 1 -1 / exp;
    estimate "50-59 vs. 40-49"
        age 0 1 -1 / exp;
    estimate "60-69 vs. 40-49"
        age 1 0 -1 / exp;
run;

/* model without exposure effect */
proc genmod data=ihd;
    format age agefmt.;
    class age;
    model cases=age / dist=poisson offset=lpyrs type3;
run;

/* LRT */
data ihdest;
	Q=-2*(-16.0477-(-11.8982));
	pval=1-cdf("chisq",Q,1);
	put "P-value (LRT): " pval;
run;
/* Wald test: 0.0048, LRT: 0.0040. */

## testis.sas
data twe;
    filename testfile url "http://www.biostat.ku.dk/~pka/epidata/testis.txt";
*    filename testfile "/folders/myfolders/testis.txt";
    infile testfile firstobs=2;
    input SON_AGE SON_KOH MOTH_AGE PARITY PYRS NONSEMI SEMI CASES;
    lpyrs=log(pyrs);
    par2 = (parity>=2);
* Merging the the first two cohorts ;
    if son_koh=1973 then son_koh=1968;
* Grouping cohort to 3 levels ;
    if son_koh=1950 or son_koh=1958 then son_kohny=1;
    else if son_koh=1963 then son_kohny=2;
    else if son_koh=1968 or son_koh=1973 then son_kohny=3;
run;

/*
Q10
*/
title "Q10 Crude RR for parity 2+ vs 1";
proc genmod data=twe;
    class par2;
    model cases = PAR2 / offset=lpyrs dist=poisson type3 ;
    estimate "par 2+ vs par 1" PAR2 -1 1 / exp ;
run;

/*
Q11
*/
title "Q11a All testis cancers";
proc genmod data=twe ;
    class SON_AGE SON_KOH MOTH_AGE parity;
    model cases = SON_AGE SON_KOH MOTH_AGE PARity
        / offset=lpyrs dist=poi type3;
    estimate "par 2 vs. par 1" parity -1 1 0 0 /exp;
    estimate "par 3 vs. par 1" parity -1 0 1 0 /exp;
    estimate "par 4+ vs. par 1" parity -1 0 0 1 /exp;
run;

title "Q11b Non seminoma testis cancers";
proc genmod data=twe;
    class SON_AGE SON_KOH MOTH_AGE parity;
    model nonsemi = SON_AGE SON_KOH MOTH_AGE PARity
        / offset=lpyrs dist=poi type3;
    estimate "par 2 vs. par 1" parity -1 1 0 0 /exp;
    estimate "par 3 vs. par 1" parity -1 0 1 0 /exp;
    estimate "par 4+ vs. par 1" parity -1 0 0 1 /exp;
run;

/* Q12
Estimates concerning Mother"s age at first birth
*/
title "Q12: Mother"s age at boy"s birth - test of interaction" ;
proc genmod data=twe;
    class SON_AGE SON_KOH MOTH_AGE par2;
    model cases = SON_AGE SON_KOH MOTH_AGE par2 moth_age*par2
        / offset=lpyrs dist=poi type3 ;
run;

title "Q12: Mother"s age at boy"s birth - interaction effects" ;
proc genmod data=twe;
    class SON_AGE SON_KOH MOTH_AGE par2;
    model cases = SON_AGE SON_KOH MOTH_AGE moth_age*par2
        / offset=lpyrs dist=poi type3;
    estimate "par2+ vs. 1, <20" moth_age*par2 -1 1 0 0 0 0 0 0 /exp;
    estimate "par2+ vs. 1, 20-24" moth_age*par2 0 0 -1 1 0 0 0 0 /exp;
    estimate "par2+ vs. 1, 25-29" moth_age*par2 0 0 0 0 -1 1 0 0 /exp;
    estimate "par2+ vs. 1, 30+ " moth_age*par2 0 0 0 0 0 0 -1 1 /exp;
run;

/* Q13
Estimates concerning Birth cohort of the son
*/
title "Q13 Birth cohort of the son - test of interaction" ;
proc genmod data=twe ;
    class SON_AGE MOTH_AGE son_kohny son_koh par2;
    model cases=SON_AGE MOTH_AGE son_koh par2 son_kohny*par2
        /offset=lpyrs dist=poi type3;
run;

title "Q13 Birth cohort of the son - interaction effects" ;
proc genmod data=twe ;
    class SON_AGE MOTH_AGE son_kohny son_koh par2;
    model cases=SON_AGE MOTH_AGE son_koh son_kohny*par2
        /offset=lpyrs dist=poi type3;
    estimate "par 2+ vs. 1, 1950-63" son_kohny*par2 -1 1 0 0 0 0 /exp;
    estimate "par 2+ vs. 1, 1963-67" son_kohny*par2 0 0 -1 1 0 0 /exp;
    estimate "par 2+ vs. 1, 1968-92" son_kohny*par2 0 0 0 0 -1 1 /exp;
run;
	Solutions to selected exercises...

	ihd: Exercise 2.3
	bcg: Exercise 2.4

	estradiol.sas: Mediation exercise
	estradiol.R : Mediation exercise
	data bcgdata;
	filename bcgfile url 'http://publicifsv.sund.ku.dk/~kkho/data/bcgalldata.txt';
	infile bcgfile firstobs=2;
	input age scar status $ n;
	run;

	proc print data=bcgdata (obs=50) ;
	run ;


	/*
	To do a particular analysis we must select cases and a particular
	set of controls with the where-statement, so the first analysis is
	with the total set of controls.

	We use the estimate statements to give the ORs of leprosy between
	age-classes, using both the first and last age-class as reference.
	Note that the s.e. of the log-odds-ratio is very large when we use
	the first age-class with only 2 cases as reference.
	*/
	proc genmod data=bcgdata;
	where status='case' or status='conall' ;
	class age scar;
	model status = age scar / dist=bin type3 ;
	weight n;
	estimate "BCG scar
	" scar -1 0 / exp ;
	estimate "Age 2 vs. 1" age -1 1 0 0 0 0 0 / exp ;
	estimate "Age 3 vs. 1" age -1 0 1 0 0 0 0 / exp ;
	estimate "Age 4 vs. 1" age -1 0 0 1 0 0 0 / exp ;
	estimate "Age 5 vs. 1" age -1 0 0 0 1 0 0 / exp ;
	estimate "Age 6 vs. 1" age -1 0 0 0 0 1 0 / exp ;
	estimate "Age 7 vs. 1" age -1 0 0 0 0 0 1 / exp ;
	run;
	## Uncomment these to install some dependencies
	## install.packages("devtools")
	## devtools::load_all("kkholst/lava")
	## install.packages("mets")
	library(mets)

	## Load the data:
	estradiol <- read.table("http://publicifsv.sund.ku.dk/~kkho/data/estradiol.txt",header=TRUE)
	##Alternatively, load data from local subdirectory 'data':
	##estradiol <- read.table("data/estradiol.txt",header=TRUE)

	## Direct effects
	l1 <- lm(neocort ~ testosterone+estradiol+age, data=estradiol)
	summary(l1)

	l2 <- lm(estradiol ~ testosterone+age, data=estradiol)
	summary(l2)


	## Joint model
	m <- lvm(neocort~estradiol+testosterone+age,
	estradiol~testosterone+age)
	e <- estimate(m,estradiol)

	coef(effects(e, neocort~testosterone))


	## Mediation proportion
	estimate(e, function(x,...) {
	ind <- x["neocort~estradiol"]*x["estradiol~testosterone"]
	dir <- x["neocort~testosterone"]
	total <- ind+dir
	medprop <- ind/total
	return(c(indirect=ind,direct=dir,total=total,mediationproportion=medprop))
	}, vcov=vcov(e))


	## Bootstrap confidence limits
	f <- function(object,...) coef(effects(object,neocort~testosterone))[,1]
	b <- bootstrap(e, fun=f, R=100, estimator="glm", mc.cores=4)
	b
	data estradiol;
	filename efile url "http://publicifsv.sund.ku.dk/~kkho/data/estradiol.txt";
	infile efile firstobs=2;
	input neocort estra testo age mr;
	run;

	proc genmod data=estradiol plots=none;
	model neocort = testo estra age /cl;
	run; quit;
	* The grouped data (slightly altered from the book by C&H) ;
	data ihdfreq;
	input eksp agr pyrs cases;
	lpyrs = log( pyrs );
	datalines;
	1 0 346.87 2
	1 1 979.34 12
	1 2 699.14 14
	0 0 560.13 4
	0 1 1127.70 6
	0 2 794.15 8
	;
	run;

	* Make a Poisson regression of the rates ;
	proc genmod data = ihdfreq ;
	class eksp agr ;
	model cases = eksp agr
	/ dist = poisson offset = lpyrs type3 ;
	estimate "low vs. high" eksp -1 1 / Exp ;
	estimate "Rate E1 A0" intercept 1 eksp 1 0 agr 1 0 0 / Exp ;
	run;

	/* Q2
	Read the individual records from the diet data
	*/
	data ihdindiv ;
	* filename dietfile "/home/bendix/teach/Epi/KU-epi/data/diet.txt" ;
	filename dietfile url "http://publicifsv.sund.ku.dk/~pka/epidata/diet.txt";
	infile dietfile firstobs=2 ;
	input id doe dox chd dob job month energy height weight fat fibre ;
	* This is to read the date variables correctly in and print them reabably ;
	informat doe dox dob mmddyy10.;
	format doe dox dob ddmmyy10.;
	* We must compute the exposure variable ;
	exposure = ( energy < 2.75 ) ;
	drop job month energy height weight fat fibre ;
	run ;

	* Take a look at the individual records ;
	proc print data = ihdindiv (obs=20) ;
	run ;

	/* Q3
	Include the two macros for time-splitting and tabulation ;
	*/
	options source2 ;
	filename Lexispr url "http://BendixCarstensen.com/Lexis/Lexis.sas";
	%inc Lexispr ;
	filename PYtabpr url "http://BendixCarstensen.com/Lexis/PYtab.sas";
	%inc PYtabpr ;

	/* Q4
	Timesplitting using the Lexis macro
	*/
	%Lexis( data = ihdindiv, /* Dataset with original data */
	out = ihdsplit, /* Dataset with time-split data */
	entry = doe, /* Date of entry */
	exit = dox, /* Date of exit */
	fail = chd, /* Event (failure) indicator */
	origin = dob, /* Origin of the time-scale */
	scale = 365.25, /* Convert from input scale to braks-scale */
	breaks = 40 to 70 by 10, /* Where to split the time scale */
	left = agr ); /* The name of the new age-variable */

	/* Q6
	We can use proc print to inspect the resulting records
	*/
	proc print data = ihdsplit (obs=20) ;
	run ;

	/* Q7
	We can use %PYtab to tabulate person-years and detahs:
	*/
	%PYtab( data = ihdsplit,
	class = exposure agr,
	fail = chd,
	risk = risk,
	scale = 1000 ) ;

	/* Q7a
	But we can also use proc tabulate to replicate the table from C&H
	in a readable form
	*/
	proc tabulate data=ihdsplit missing noseps formchar=" ---- ---" ;
	class agr exposure ;
	var risk chd ;
	table agr="Age" all="Total",
	( exposure="Exposure" all="Total" ) *
	sum=" " * ( chd="D"f=6. risk="Y"f=8.1 )
	/ rts = 15 ;
	run ;

	/* Q8
	Now fit the model to the time-split data - results are as for the
	grouped data */
	proc genmod data = ihdsplit ;
	class agr exposure ;
	model chd = agr exposure / dist=poisson offset=lrisk ;
	run ;

	/* Q9
	Add an interaction between age and exposure and check that you
	get the same test for interaction as with the grouped data.
	*/
	proc genmod data = ihdsplit ;
	class agr exposure ;
	model chd = agr exposure agr*exposure / dist=poisson offset=lrisk type3 ;
	run ;

	/* Q10
	We fit the main effects model to the grouped data (6 obs-dataset)
	and observe that the deviance is the same as the type-3 test for
	interaction based on the individual data
	*/
	proc genmod data = ihdfreq ;
	class eksp agr ;
	model cases = eksp agr
	/ dist = poisson offset = lpyrs type3 ;
	run;

	/* We can split in smaller intervals and model age continuously */
	%Lexis( data = ihdindiv, /* Dataset with original data */
	out = ihdsplit, /* Dataset with time-split data */
	entry = doe, /* Date of entry */
	exit = dox, /* Date of exit */
	fail = chd, /* Event (failure) indicator */
	origin = dob, /* Origin of the time-scale */
	scale = 365.25, /* Convert from input scale to braks-scale */
	breaks = 40 to 70 by 1, /* Where to split the time scale */
	left = age ); /* The name of the new age-variable */

	/* And then use a spline model for the effect of age */
	data ihdsplit ;
	set ihdsplit ;
	a45 = max( 0, age-45 ) ;
	a50 = max( 0, age-50 ) ;
	a55 = max( 0, age-55 ) ;
	a60 = max( 0, age-60 ) ;
	a65 = max( 0, age-65 ) ;
	run ;

	/* Making predictions requires extra observations with missing chd */
	data ihdsplit ;
	set ihdsplit end = last ;
	prr = 0 ; * Indicator for prediction part of data ;
	output ;
	if last then
	do exposure = 0, 1 ;
	do age = 40 to 70 by 1 ;
	prr = 1 ; * Indicator for prediction part of data ;
	chd = . ;
	a45 = max( 0, age-45 ) ;
	a50 = max( 0, age-50 ) ;
	a55 = max( 0, age-55 ) ;
	a60 = max( 0, age-60 ) ;
	a65 = max( 0, age-65 ) ;
	output ;
	end ;
	end ;
	run ;

	/* Model using linear splines, outputting predictions */
	proc genmod data = ihdsplit ;
	class exposure ;
	model chd = age a45 a50 a55 a60 a65 exposure
	/ dist=poisson offset=lrisk type3 ;
	output out = plrate ( where = (prr=1) )
	xbeta = lograte
	stdxbeta = selograte ;
	run ;

	data plrate ;
	set plrate ;
	* rates and confidence intervals ;
	rate = exp( lograte ) * 1000 ;
	rlo = exp( lograte - 1.96selograte ) 1000 ;
	rhi = exp( lograte + 1.96selograte ) 1000 ;
	* change to missing for expoure 0 / 1 ;
	e1 = 1 ; e0 = 1 ;
	if exposure = 0 then e1 = . ; else e0 = . ;
	rate0 = ratee0 ; rlo0 = rloe0 ; rhi0 = rhi*e0 ;
	rate1 = ratee1 ; rlo1 = rloe1 ; rhi1 = rhi*e1 ;
	run ;

	proc print data = plrate ;
	where age < 46 ;
	run ;


	proc sgplot data = plrate ;
	series x=age y=rate0 / lineattrs=(color=blue) legendlabel="un-exposed" name="r0";
	band x=age lower=rlo0 upper=rhi0 / fillattrs=(color=blue transparency=.9);
	series x=age y=rate1 / lineattrs=(color=red pattern=dash) legendlabel="exposed" name="r1";
	band x=age lower=rlo1 upper=rhi1 / fillattrs=(color=red transparency=0.9);
	xaxis label="Age" GRID VALUES = (40 to 70 by 5);
	yaxis type=log logbase=10 logstyle=logexponent logvtype=expanded values=(1,2,5,10,20,50,100);
	keylegend "r0" "r1";
	run ;




	/* proc gplot data = plrate ; */
	/* plot ( rate0 rlo0 rhi0 ) * age / */
	/* overlay haxis=axis1 vaxis=axis2 ; * un-exposed ; */
	/* plot2 ( rate1 rlo1 rhi1 ) * age / */
	/* overlay haxis=axis1 vaxis=axis2 ; * exposed ; */
	/* axis1 order=(40 to 70 by 5) ; */
	/* axis2 logbase=10 logstyle=expand order=(1,2,5,10,20,50,100) ; */
	/* symbol1 w=5 i=join color=blue ; */
	/* symbol2 w=2 i=join color=blue ; */
	/* symbol3 w=2 i=join color=blue ; */
	/* symbol4 w=5 i=join color=red ; */
	/* symbol5 w=2 i=join color=red ; */
	/* symbol6 w=2 i=join color=red ; */
	/* run ; */
	/* Data from Clayton & Hills Table 22.6, p. 221 */
	## Alternatively, read
	ihd <- read.table("http://publicifsv.sund.ku.dk/~kkho/data/ihd-tab.txt",header=TRUE)
	## Alternatively, input the data directly:
	ihd <- data.frame(exposed=c(1,1,1,0,0,0),
	age=c("40-49","50-59","60-69","40-49","50-59","60-69"),
	pyears=c(311.9,878.1,667.5,607.9,1272.1,888.9),
	cases=c(2,12,14,4,5,8))
	ihd$age <- relevel(ihd$age, ref="60-69") ## Use last (older) group as reference

	## Model with constant rate within both exposure groups (Table 13.1)
	g0 <- glm(cases ~ exposed+offset(log(pyears)),data=ihd,family=poisson)
	summary(g0)

	## Direct calculations gives exact same results:
	y1 <- sum(subset(ihd,exposed==1)$pyears)
	d1 <- sum(subset(ihd,exposed==1)$cases)
	y0 <- sum(subset(ihd,exposed==0)$pyears)
	d0 <- sum(subset(ihd,exposed==0)$cases)

	r0 <- d0/y0 ## Rate in unexposed
	r1 <- d1/y1 ## Rate in exposed
	se0 <- 1/sqrt(d0) ## SE of log(r0)
	se1 <- 1/sqrt(d1) ## SE of log(r0)
	logrr = log(r1)-log(r0) ## Rate-ratio
	selogrr <- (1/d0+1/d1)^.5 ## SE of RR
	cbind(logr0=log(r0), logr0.se=se0, logrr=logrr, logrr=selogrr)

	## Compare with poisson regression:
	summary(g0)


	## Q1:
	## We fit the model with main effects of age and exposure, and we see
	## we obtain the same estimates as in Clayton & Hill's book
	g1 <- glm(cases ~ age + exposed+offset(log(pyears)),data=ihd,family=poisson)
	summary(g1)

	## Model without exposure effect
	g2 <- glm(cases ~ age + offset(log(pyears)),data=ihd,family=poisson)

	## LRT:
	Q <- -2*(logLik(g2)-logLik(g1))
	1-pchisq(Q,df=1)
	## Wald test: 0.0048, LRT: 0.0040
	/*
	First read in the data as from the table in the book
	*/
	data ihd;
	input exposed age pyrs cases;
	lpyrs=log(pyrs);
	cards;
	0 2 311.9 2
	0 1 878.1 12
	0 0 667.5 14
	1 2 607.9 4
	1 1 1272.1 5
	1 0 888.9 8
	;
	run;


	proc format;
	value exposedfmt 0='exposed(<2750kcals)'
	1='unexposed(>=2750kcals)';
	value agefmt 0='60-69'
	1='50-59'
	2='40-49';
	run;

	/* Table 22.6 in Clayton & Hills */
	proc print data=ihd;
	format age agefmt. exposed exposedfmt.;
	run;

	/*
	* Comparison of rate within exposed and unexposed (see Table 13.1 Clayton & Hills)
	*/
	proc genmod data=ihd;
	format exposed exposedfmt.;
	class exposed;
	model cases=exposed / dist=poisson offset=lpyrs;
	estimate "exp. vs. non-exp." exposed 1 -1 / exp;
	run;

	/* For comparison we see that we get the same estimates and standard errors using the usual formulas: */
	data ihdest;
	pyears1=311.9+878.1+667.5;
	pyears0=607.9+1272.1+888.9;
	cases1=2+12+14;
	cases0=4+5+8;
	r1 = cases1/pyears1; /* rate in exposed */
	r0 = cases0/pyears0; /* rate in unexposed */
	lr0 =log(r0);
	lrr =log(r1)-log(r0); /* log rate-ratio */
	lrr_se=sqrt(1/cases1+1/cases0);
	put "log(RR) = " lrr ", S.E = " lrr_se;
	run;

	/*
	Q1:
	We fit the model with main effects of age and exposure, and we see
	we obtain the same estimates as in Clayton \& Hill's book
	*/
	proc genmod data=ihd;
	format exposed exposedfmt. age agefmt.;
	class exposed age;
	model cases=exposed age / dist=poisson offset=lpyrs type3;
	estimate "exp. vs. non-exp." exposed 1 -1 / exp;
	estimate "50-59 vs. 40-49"
	age 0 1 -1 / exp;
	estimate "60-69 vs. 40-49"
	age 1 0 -1 / exp;
	run;

	/* model without exposure effect */
	proc genmod data=ihd;
	format age agefmt.;
	class age;
	model cases=age / dist=poisson offset=lpyrs type3;
	run;

	/* LRT */
	data ihdest;
	Q=-2*(-16.0477-(-11.8982));
	pval=1-cdf("chisq",Q,1);
	put "P-value (LRT): " pval;
	run;
	/* Wald test: 0.0048, LRT: 0.0040. */
	data twe;
	filename testfile url "http://www.biostat.ku.dk/~pka/epidata/testis.txt";
	* filename testfile "/folders/myfolders/testis.txt";
	infile testfile firstobs=2;
	input SON_AGE SON_KOH MOTH_AGE PARITY PYRS NONSEMI SEMI CASES;
	lpyrs=log(pyrs);
	par2 = (parity>=2);
	* Merging the the first two cohorts ;
	if son_koh=1973 then son_koh=1968;
	* Grouping cohort to 3 levels ;
	if son_koh=1950 or son_koh=1958 then son_kohny=1;
	else if son_koh=1963 then son_kohny=2;
	else if son_koh=1968 or son_koh=1973 then son_kohny=3;
	run;

	/*
	Q10
	*/
	title "Q10 Crude RR for parity 2+ vs 1";
	proc genmod data=twe;
	class par2;
	model cases = PAR2 / offset=lpyrs dist=poisson type3 ;
	estimate "par 2+ vs par 1" PAR2 -1 1 / exp ;
	run;

	/*
	Q11
	*/
	title "Q11a All testis cancers";
	proc genmod data=twe ;
	class SON_AGE SON_KOH MOTH_AGE parity;
	model cases = SON_AGE SON_KOH MOTH_AGE PARity
	/ offset=lpyrs dist=poi type3;
	estimate "par 2 vs. par 1" parity -1 1 0 0 /exp;
	estimate "par 3 vs. par 1" parity -1 0 1 0 /exp;
	estimate "par 4+ vs. par 1" parity -1 0 0 1 /exp;
	run;

	title "Q11b Non seminoma testis cancers";
	proc genmod data=twe;
	class SON_AGE SON_KOH MOTH_AGE parity;
	model nonsemi = SON_AGE SON_KOH MOTH_AGE PARity
	/ offset=lpyrs dist=poi type3;
	estimate "par 2 vs. par 1" parity -1 1 0 0 /exp;
	estimate "par 3 vs. par 1" parity -1 0 1 0 /exp;
	estimate "par 4+ vs. par 1" parity -1 0 0 1 /exp;
	run;

	/* Q12
	Estimates concerning Mother"s age at first birth
	*/
	title "Q12: Mother"s age at boy"s birth - test of interaction" ;
	proc genmod data=twe;
	class SON_AGE SON_KOH MOTH_AGE par2;
	model cases = SON_AGE SON_KOH MOTH_AGE par2 moth_age*par2
	/ offset=lpyrs dist=poi type3 ;
	run;

	title "Q12: Mother"s age at boy"s birth - interaction effects" ;
	proc genmod data=twe;
	class SON_AGE SON_KOH MOTH_AGE par2;
	model cases = SON_AGE SON_KOH MOTH_AGE moth_age*par2
	/ offset=lpyrs dist=poi type3;
	estimate "par2+ vs. 1, <20" moth_age*par2 -1 1 0 0 0 0 0 0 /exp;
	estimate "par2+ vs. 1, 20-24" moth_age*par2 0 0 -1 1 0 0 0 0 /exp;
	estimate "par2+ vs. 1, 25-29" moth_age*par2 0 0 0 0 -1 1 0 0 /exp;
	estimate "par2+ vs. 1, 30+ " moth_age*par2 0 0 0 0 0 0 -1 1 /exp;
	run;

	/* Q13
	Estimates concerning Birth cohort of the son
	*/
	title "Q13 Birth cohort of the son - test of interaction" ;
	proc genmod data=twe ;
	class SON_AGE MOTH_AGE son_kohny son_koh par2;
	model cases=SON_AGE MOTH_AGE son_koh par2 son_kohny*par2
	/offset=lpyrs dist=poi type3;
	run;

	title "Q13 Birth cohort of the son - interaction effects" ;
	proc genmod data=twe ;
	class SON_AGE MOTH_AGE son_kohny son_koh par2;
	model cases=SON_AGE MOTH_AGE son_koh son_kohny*par2
	/offset=lpyrs dist=poi type3;
	estimate "par 2+ vs. 1, 1950-63" son_kohny*par2 -1 1 0 0 0 0 /exp;
	estimate "par 2+ vs. 1, 1963-67" son_kohny*par2 0 0 -1 1 0 0 /exp;
	estimate "par 2+ vs. 1, 1968-92" son_kohny*par2 0 0 0 0 -1 1 /exp;
	run;