tnagler/GAM-PCC-sim.md

## GAM-PCC-sim.md

      
    Raw
  

              GAM-PCC-sim.md
            
          
  title
  subtitle
  author
  date
  output
  
  
  Appendix to "Generalized Additive Models for Pair-Copula Constructions"
  Code for the simulation study
  Thibault Vatter and Thomas Nagler
  14 August, 2017
  github_document
  
  
Preliminary definitions

Required packages

library(gamCopula)
library(tibble)
library(dplyr)
library(purrr)
Smooth functions to be estimated

s_funs <- list(
    s1 = function(t) -1 / 4 + t / 2,
    s2 = function(t) sin(2 * pi * t) / 4,
    s3 = function(t) sin(6 * pi * t) / 4,
    s4 = function(t) rep(0, length(t)),
    s5 = function(t) rep(0, length(t))
)
Parameter vector to be estimated (first entry is intercept)

beta <- c(0, 1, 1, -1, 0, 0, 1, -1, -1, 0, 0) / 4
Creating a simulation model with randomly selected families

make_sim_model <- function(covariates, family_set) {
    n <- nrow(covariates)
    
    # 5-dimensional R-vine structure matrix
    d <- 5
    Matrix <- c(5, 2, 4, 1, 3,
                0, 2, 4, 3, 1,
                0, 0, 1, 4, 3,
                0, 0, 0, 4, 3,
                0, 0, 0, 0, 3)
    Matrix <- matrix(Matrix, d, d)
    # family matrix
    family <- matrix(0, d, d)
    
    # initialize model
    count <- 1
    model <- vector(mode = "list", length = d * (d - 1) / 2)
    
    # model formula for true model
    tmpform <- "B.1 + B.2 + B.3 + Z.1 + Z.2 + Z.3"
    for (i in 1:3) {
        tmpform <- paste0(tmpform, " + s(S.", i, ", k = 10, bs = 'cr')")
    }
    
    # select a family for each pair copula and set GAM relationship to true model
    for (tree in 1:(d - 1)) {
        for (pc in 1:(d - tree)) {
            # select a copula family
            family[d - tree + 1, pc] <- sample(family_set, 1)
            
            # spline approximation of the true functions
            S <- covariates[, substr(colnames(covariates), 1, 1) == "S"]
            y <- rowSums(sapply(1:5, function(i) s_funs[[i]](S[, i])))
            form <- as.formula(paste0("y ~", tmpform))
            approx <- mgcv::gam(form, data = cbind(y, covariates))
            
            # create a dummy gamBiCop object
            tmp <- gamBiCopFit(
                data = cbind(u1 = runif(n), u2 = runif(n), covariates), 
                formula = form, 
                family = 1, 
                n.iters = 1
            )$res
            
            # set to true model
            attr(tmp, "model") <- approx
            attr(tmp, "family") <- family[d - tree + 1, pc]
            tmp@model$coefficients[1] <- 0
            tmp@model$coefficients[2:7] <- beta[beta != 0]
            if (family[d - tree + 1, pc] == 2) {
                attr(tmp, "par2") <- 4
            }
            model[[count]] <- tmp
            count <- count + 1
        }
    }
    
    # return gamVine model
    gamVine(
        Matrix = Matrix,
        model  = model,
        covariates = colnames(covariates)
    )
}
Extracting results from fitted models

# evaluation grid
grid <- cbind(
    matrix(1, 50, 10),
    sapply(1:5, function(i) seq(0, 1, l = 50))
)
colnames(grid) <- c(paste0("B.", 1:5), paste0("Z.", 1:5), paste0("S.", 1:5))

# extract results for a single pair-copula fit
get_pc_results <- function(x) {
    gamBiCopPredict(x, newdata = grid, type = "terms", target = "calib") %>%
        map_df(~ tidy_terms(.)) %>%
        mutate(family = if (is.list(x)) x$family else x@family)
}

# tidy results returned from gamBiCopPredict
tidy_terms <- function(x) {
    # rename smooths to bare variable names
    colnames(x) <- gsub("(^s\\()|(\\))", "", colnames(x))
    # add intercept as column
    x <- as_tibble(cbind(intercept = attr(x, "constant"), x))
    
    # fill missing columns with 0
    x[setdiff(colnames(grid), colnames(x))] <- 0
    
    # standardize variable order
    x <- x[c("intercept", colnames(grid))]
    
    x %>%
        # in case one of the smooth covariates is selected as a linear term, 
        # it will not integrate to zero; hence, we need to subsume the 
        # means of all S.i into the intercept, and standardize the variables
        # accordingly
        mutate(intercept = intercept - mean(S.1 + S.2 + S.3 + S.4 + S.5)) %>%
        mutate_at(paste0("S.", 1:5), funs(. - mean(.))) %>%
        # add grid sequence for smooth covariates
        mutate(t = grid[, 15])
}
Simulation and fitting function for a single seed

do_one <- function(seed, n) {
    
    ## simulate from true model ----------------
    # simulate covariates
    B <- matrix(rbinom(n * 5, 1, 0.5), n, 5)
    Z <- matrix(rnorm(n * 5), n, 5)
    S <- matrix(runif(n * 5), n, 5)
    covariates <- data.frame(B = B, Z = Z, S = S)
    # create the gamVineCopula object
    family_set <- c(1:2, 1:2, 301:302, 401:402)  # allowed copula families
    GVC <- make_sim_model(covariates, family_set)
    # simulate data from gamVine model
    sim <- gamVineSimulate(n, GVC, newdata = covariates)
    
    ## fit models ----------------
    time_oracle <- system.time(
        fit_oracle <- gamVineSeqFit(
            data = cbind(sim, covariates),
            GVC = GVC,
            covariates = colnames(covariates),
            method = "FS",
            n.iters = 30
        )
    )[3]
    
    time_selection <- system.time(
        fit_selection <- gamVineCopSelect(
            data = sim,
            Matrix = GVC@Matrix,
            lin.covs = covariates[, 1:10],
            smooth.covs = covariates[, 11:15],
            simplified = TRUE,
            familyset = unique(family_set),
            familycrit = "AIC",
            rotations = FALSE,
            method = "FS",
            n.iters = 30,
            select.once = FALSE
        )
    )[3]
    
    ## extract results ----------------
    res_oracle <- map(fit_oracle@model[1:10], ~ get_pc_results(.)) %>%
        bind_rows(.id = "pc") %>%
        mutate(method = "oracle", time = time_oracle)
    res_selection <- map(fit_selection@model[1:10], ~ get_pc_results(.)) %>%
        bind_rows(.id = "pc") %>%
        mutate(method = "selection", time = time_selection)
    
    bind_rows(res_oracle, res_selection) %>%
        mutate(
            pc = as.numeric(pc),
            tree = as.factor(map_dbl(pc, ~ which.max(. <= c(4, 7, 9, 10)))),
            seed = seed,
            n = n
        )
}
Run the study

Required packages

library(doParallel)
Backend for parallel computing

cl <- makeCluster(detectCores(), outfile = "")
registerDoParallel(cl)
pkg <- c("gamCopula", "dplyr", "purrr", "tibble")
Parallelize over seeds

# n = 500
res_5h <- foreach(seed = 1:500, .packages = pkg, .export = ls()) %dopar% {
    message(seed)
    tryCatch(do_one(seed, n = 500), error = function(e) NULL)
}
saveRDS(res_5h, file = "GAM-PCC-sim-5h.rds")
# n = 5000
res_5k <- foreach(seed = 1:500, .packages = pkg, .export = ls()) %dopar% {
    message(seed)
    tryCatch(do_one(seed, n = 5000), error = function(e) NULL)
}
saveRDS(res_5k, file = "GAM-PCC-sim-5k.rds")
Tear down parallel backend

stopCluster(cl)
Analysis of results

Preliminaries

Required packages

library(tidyr)
library(ggplot2)
library(dichromat)
Colors

cols <- dichromat::colorschemes[[5]][c(2, 11)]
Combine results for both sample sizes

res <- bind_rows(
    readRDS("GAM-PCC-sim-5h.rds"),
    readRDS("GAM-PCC-sim-5k.rds")
)
Estimation accuracy

Linear covariates

res_linear <- res %>%
    # estimates for linear coefficients are constant for all t, we can focus on 
    # first value
    filter(t == 0) %>%
    # select linear covariates
    select(method, n, tree, seed, starts_with("B"), starts_with("Z")) %>%
    gather(term, value, -method, -n, -tree, -seed) %>%
    # rename terms to appropriate index of beta
    mutate(j = (substr(term, 1, 1) == "Z") * 5 + as.numeric(substr(term, 3, 3))) %>%
    # calculate mean estimates and quantiles
    group_by(j, n, method, tree) %>%
    summarize(
        mu = mean(value),
        q05 = quantile(value, 0.05),
        q95 = quantile(value, 0.95),
        true = beta[1 + first(j)]
    ) %>%
    ungroup()
res_linear %>%
    mutate(n = paste0("n == ", n)) %>%
    ggplot(aes(j, mu, alpha = tree, color = method)) +
    geom_segment(
        aes(x = j - 0.5, xend = j + 0.5, y = true, yend = true), 
        color = "grey40"
    ) +
    geom_point(position = position_dodge(0.9)) + 
    geom_errorbar(aes(ymin = q05, ymax = q95), position = position_dodge(0.9)) +
    facet_grid(n ~ method, labeller = label_parsed) +
    theme_bw() +
    theme(
        legend.position = "bottom",
        plot.margin = grid::unit(c(0, 0, 0, 0), "mm")
    ) +
    scale_color_manual(values = cols) +
    scale_alpha_manual(values = seq(1, 0.4, l = 4)) +
    scale_x_continuous(breaks = 1:10) +
    geom_vline(xintercept = 1:10 - 0.5, color = "grey90") +
    guides(color = FALSE) +
    ylab(quote(beta[j]))

Nonlinear covariates

res_nonlinear <- res %>%
    # select nonlinear covariates
    select(method, n, tree, seed, t, starts_with("S")) %>%
    gather(term, value, -method, -n, -tree, -seed, -t) %>%
    # calculate mean estimates and quantiles
    group_by(method, n, tree, term, t) %>%
    summarize(
        mu = mean(value),
        q05 = quantile(value, 0.05),
        q95 = quantile(value, 0.95)
    ) %>%
    group_by(n, tree, method, term) %>%
    # add true values
    mutate(
        true_fun = map_int(term, ~ as.integer(substr(., 3, 3))),
        true = map2_dbl(true_fun, t, ~ s_funs[[.x]](.y))
    ) %>%
    ungroup() %>%
    mutate(term = map_chr(true_fun, ~ paste0("s[", ., "]"))) # better plot labels
res_nonlinear %>%
    filter(tree == 1) %>%
    mutate(n = paste0("n == ", n)) %>%
    ggplot(aes(t)) +
    geom_ribbon(aes(ymin = q05, ymax = q95, alpha = 0.02)) +
    geom_line(aes(y = true), linetype = 2) +
    geom_line(aes(y = mu, color = method)) +
    facet_grid(n + method ~ term, labeller = label_parsed) +
    scale_color_manual(values = cols) +
    theme_bw() +
    theme(legend.position = "none") +
    ylab(quote(s[k])) +
    theme(plot.margin = grid::unit(c(0, 0, 0, 0), "mm")) +
    coord_cartesian(ylim = c(-0.4, 0.4)) +
    scale_x_continuous(breaks = c(0, 0.5, 1.0))

res_nonlinear %>%
    filter(tree == 4) %>%
    mutate(n = paste0("n == ", n)) %>%
    ggplot(aes(t)) +
    geom_ribbon(aes(ymin = q05, ymax = q95, alpha = 0.02)) +
    geom_line(aes(y = true), linetype = 2) +
    geom_line(aes(y = mu, color = method)) +
    facet_grid(n + method ~ term, labeller = label_parsed) +
    scale_color_manual(values = cols) +
    theme_bw() +
    theme(legend.position = "none") +
    theme(plot.margin = grid::unit(c(0, 0, 0, 0), "mm")) +
    ylab(quote(s[k])) +
    coord_cartesian(ylim = c(-0.4, 0.4)) +
    scale_x_continuous(breaks = c(0, 0.5, 1.0))

Family selection

Frequency of correctly selected families

res %>%
    filter(t == 0) %>%
    mutate(family = substr(family, 1, 1)) %>%
    group_by(tree, pc, n, seed) %>%
    select(method, family) %>%
    spread(method, family) %>%
    ungroup() %>%
    group_by(tree, n) %>%
    summarize(
        freq = mean(selection == oracle) * 100,
        se   = sd(selection == oracle) / sqrt(length(oracle)) * 100
    ) %>%
    gather(type, value, -n, -tree) %>%
    spread(tree, value) %>%
    knitr::kable(digits = 1)


n
type
1
2
3
4


500
freq
76.8
63.0
53.5
43.0


500
se
0.9
1.2
1.6
2.2


5000
freq
97.5
96.7
87.9
72.2


5000
se
0.3
0.5
1.0
2.0


True vs selected

# tree 1
tab1 <- res %>%
    filter(t == 0, tree == 1, n == 500) %>%
    mutate(family = substr(family, 1, 1)) %>%
    select(method, family, seed, pc) %>%
    spread(method, family) %>%
    select(selection, oracle) %>%
    table()
tab1 <- tab1 / sum(tab1) * 100
colnames(tab1) <- rownames(tab1) <- c("Gaussian", "Student t", "Clayton", "Gumbel")
tab1 <- rbind(tab1, colSums(tab1))
tab1 <- cbind(tab1, rowSums(tab1))
knitr::kable(tab1, digits = 1)


Gaussian
Student t
Clayton
Gumbel


Gaussian
18.2
0.7
4.3
5.1
28.4


Student t
4.8
23.4
2.8
3.4
34.4


Clayton
0.0
0.0
18.5
0.0
18.6


Gumbel
1.0
0.8
0.2
16.7
18.7


24.1
25.0
25.9
25.2
100.0


# tree 4
tab4 <- res %>%
    filter(t == 0, tree == 4, n == 500) %>%
    mutate(family = substr(family, 1, 1)) %>%
    select(method, family, seed, pc) %>%
    spread(method, family) %>%
    select(selection, oracle) %>%
    table()
tab4 <- tab4 / sum(tab4) * 100
colnames(tab4) <- rownames(tab4) <- c("Gaussian", "Student t", "Clayton", "Gumbel")
tab4 <- rbind(tab4, colSums(tab4))
tab4 <- cbind(tab4, rowSums(tab4))
knitr::kable(tab4, digits = 1)


Gaussian
Student t
Clayton
Gumbel


Gaussian
15.0
5.2
12.0
12.2
44.4


Student t
6.4
14.6
7.2
6.2
34.4


Clayton
1.4
0.6
4.6
0.2
6.8


Gumbel
2.8
2.2
0.6
8.8
14.4


25.6
22.6
24.4
27.4
100.0


Covariate selection

true_covs <- c(paste0("S.", 1:3), paste0("B.", 1:3), paste0("Z.", 1:3))
res %>%
    filter(method == "selection") %>%
    group_by(tree, pc, n, seed) %>%
    select(starts_with("S."), starts_with("B."), starts_with("Z.")) %>%
    summarize_all(funs(1 * any(. != 0))) %>%
    ungroup() %>%
    select(-pc, -seed) %>%
    gather(var, value, -tree, -n) %>%
    mutate(correct = (value == ifelse(var %in% true_covs, 1, 0))) %>%
    mutate(type = ifelse(substr(var, 1, 1) == "S", "s", "beta")) %>%
    group_by(tree, n, type) %>%
    summarize(
        freq = mean(correct) * 100,
        se = sd(correct) / sqrt(length(correct)) * 100
    ) %>%
    gather(what, value, -tree, -n, -type) %>%
    unite(tree_type, tree, type) %>%
    spread(tree_type, value) %>%
    knitr::kable(digits = 1)


n
what
1_beta
1_s
2_beta
2_s
3_beta
3_s
4_beta
4_s


500
freq
83.5
75.4
81.3
72.5
73.6
63.5
61.4
54.8


500
se
0.3
0.4
0.3
0.5
0.4
0.7
0.7
1.0


5000
freq
94.6
91.5
94.8
91.4
94.8
91.2
94.4
90.2


5000
se
0.2
0.3
0.2
0.3
0.2
0.4
0.3
0.6


Linear term selected correctly

res %>% 
  filter(method == "selection") %>%
  group_by(tree, pc, n, seed) %>%
  summarize(correct = ((var(diff(S.1)) < 1e-20) & any(S.1 != 0))) %>%
  ungroup() %>%
  group_by(tree, n) %>%
  summarize(
      freq = mean(correct) * 100,
      se   = sd(correct) / sqrt(length(correct))
  ) %>%
  gather(type, value, -tree, -n) %>%
  spread(tree, value) %>%
  knitr::kable(digits = 1)


n
type
1
2
3
4


500
freq
42.5
41.9
31.8
19.2


500
se
0.0
0.0
0.0
0.0


5000
freq
58.8
57.6
58.5
52.2


5000
se
0.0
0.0
0.0
0.0


Computing time

res %>%
    group_by(n, method) %>%
    mutate(minutes = time / 60) %>%
    summarize(
        sd      = sd(minutes),
        minutes = mean(minutes),
    ) %>%
    gather(type, value, -n, -method) %>%
    spread(method, value) %>%
    knitr::kable(digits = 1)


n
type
oracle
selection


500
minutes
0.4
12.8


500
sd
0.1
1.2


5000
minutes
2.0
68.8


5000
sd
0.5
6.4


Environment info

sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu/x86_64 (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2    dichromat_2.0-0 ggplot2_2.1.0   tidyr_0.6.3    
## [5] purrr_0.2.2.2   dplyr_0.7.2     tibble_1.3.3    gamCopula_0.0-3
## 
## loaded via a namespace (and not attached):
##  [1] pcaPP_1.9-72      Rcpp_0.12.12      highr_0.6        
##  [4] plyr_1.8.4        bindr_0.1         iterators_1.0.8  
##  [7] tools_3.3.1       digest_0.6.10     gtable_0.2.0     
## [10] evaluate_0.10.1   nlme_3.1-128      lattice_0.20-35  
## [13] pspline_1.0-18    mgcv_1.8-15       pkgconfig_2.0.1  
## [16] rlang_0.1.1       Matrix_1.2-7.1    foreach_1.4.3    
## [19] igraph_1.1.1      parallel_3.3.1    mvtnorm_1.0-6    
## [22] copula_0.999-17   stringr_1.1.0     knitr_1.16       
## [25] stats4_3.3.1      grid_3.3.1        ADGofTest_0.3    
## [28] glue_1.1.1        R6_2.2.0          VineCopula_2.1.2 
## [31] reshape2_1.4.2    magrittr_1.5      scales_0.4.0     
## [34] codetools_0.2-15  MASS_7.3-45       stabledist_0.7-1 
## [37] assertthat_0.1    colorspace_1.2-7  numDeriv_2016.8-1
## [40] labeling_0.3      stringi_1.1.2     network_1.13.0   
## [43] munsell_0.4.3     gsl_1.9-10.3      doParallel_1.0.10


## linear_terms-1.png

      
    Raw
  

              linear_terms-1.png
            
          
## nonlinear_terms_tree1-1.png

      
    Raw
  

              nonlinear_terms_tree1-1.png
            
          
## nonlinear_terms_tree4-1.png

      
    Raw
  

              nonlinear_terms_tree4-1.png
n	type	1	2	3	4
500	freq	76.8	63.0	53.5	43.0
500	se	0.9	1.2	1.6	2.2
5000	freq	97.5	96.7	87.9	72.2
5000	se	0.3	0.5	1.0	2.0
	Gaussian	Student t	Clayton	Gumbel
Gaussian	18.2	0.7	4.3	5.1	28.4
Student t	4.8	23.4	2.8	3.4	34.4
Clayton	0.0	0.0	18.5	0.0	18.6
Gumbel	1.0	0.8	0.2	16.7	18.7
	24.1	25.0	25.9	25.2	100.0
	Gaussian	Student t	Clayton	Gumbel
Gaussian	15.0	5.2	12.0	12.2	44.4
Student t	6.4	14.6	7.2	6.2	34.4
Clayton	1.4	0.6	4.6	0.2	6.8
Gumbel	2.8	2.2	0.6	8.8	14.4
	25.6	22.6	24.4	27.4	100.0
n	what	1_beta	1_s	2_beta	2_s	3_beta	3_s	4_beta	4_s
500	freq	83.5	75.4	81.3	72.5	73.6	63.5	61.4	54.8
500	se	0.3	0.4	0.3	0.5	0.4	0.7	0.7	1.0
5000	freq	94.6	91.5	94.8	91.4	94.8	91.2	94.4	90.2
5000	se	0.2	0.3	0.2	0.3	0.2	0.4	0.3	0.6
n	type	1	2	3	4
500	freq	42.5	41.9	31.8	19.2
500	se	0.0	0.0	0.0	0.0
5000	freq	58.8	57.6	58.5	52.2
5000	se	0.0	0.0	0.0	0.0