Skip to content

Instantly share code, notes, and snippets.

@tnagler
Last active September 25, 2019 14:32
Show Gist options
  • Save tnagler/ea8190914c44f9ac184ddd962cad75f7 to your computer and use it in GitHub Desktop.
Save tnagler/ea8190914c44f9ac184ddd962cad75f7 to your computer and use it in GitHub Desktop.
Appendix to "Generalized Additive Models for Pair-Copula Constructions": Code for the simulation study
title subtitle author date output
Appendix to "Generalized Additive Models for Pair-Copula Constructions"
Code for the simulation study
Thibault Vatter and Thomas Nagler
14 August, 2017
github_document

Preliminary definitions

Required packages

library(gamCopula)
library(tibble)
library(dplyr)
library(purrr)

Smooth functions to be estimated

s_funs <- list(
    s1 = function(t) -1 / 4 + t / 2,
    s2 = function(t) sin(2 * pi * t) / 4,
    s3 = function(t) sin(6 * pi * t) / 4,
    s4 = function(t) rep(0, length(t)),
    s5 = function(t) rep(0, length(t))
)

Parameter vector to be estimated (first entry is intercept)

beta <- c(0, 1, 1, -1, 0, 0, 1, -1, -1, 0, 0) / 4

Creating a simulation model with randomly selected families

make_sim_model <- function(covariates, family_set) {
    n <- nrow(covariates)
    
    # 5-dimensional R-vine structure matrix
    d <- 5
    Matrix <- c(5, 2, 4, 1, 3,
                0, 2, 4, 3, 1,
                0, 0, 1, 4, 3,
                0, 0, 0, 4, 3,
                0, 0, 0, 0, 3)
    Matrix <- matrix(Matrix, d, d)
    # family matrix
    family <- matrix(0, d, d)
    
    # initialize model
    count <- 1
    model <- vector(mode = "list", length = d * (d - 1) / 2)
    
    # model formula for true model
    tmpform <- "B.1 + B.2 + B.3 + Z.1 + Z.2 + Z.3"
    for (i in 1:3) {
        tmpform <- paste0(tmpform, " + s(S.", i, ", k = 10, bs = 'cr')")
    }
    
    # select a family for each pair copula and set GAM relationship to true model
    for (tree in 1:(d - 1)) {
        for (pc in 1:(d - tree)) {
            # select a copula family
            family[d - tree + 1, pc] <- sample(family_set, 1)
            
            # spline approximation of the true functions
            S <- covariates[, substr(colnames(covariates), 1, 1) == "S"]
            y <- rowSums(sapply(1:5, function(i) s_funs[[i]](S[, i])))
            form <- as.formula(paste0("y ~", tmpform))
            approx <- mgcv::gam(form, data = cbind(y, covariates))
            
            # create a dummy gamBiCop object
            tmp <- gamBiCopFit(
                data = cbind(u1 = runif(n), u2 = runif(n), covariates), 
                formula = form, 
                family = 1, 
                n.iters = 1
            )$res
            
            # set to true model
            attr(tmp, "model") <- approx
            attr(tmp, "family") <- family[d - tree + 1, pc]
            tmp@model$coefficients[1] <- 0
            tmp@model$coefficients[2:7] <- beta[beta != 0]
            if (family[d - tree + 1, pc] == 2) {
                attr(tmp, "par2") <- 4
            }
            model[[count]] <- tmp
            count <- count + 1
        }
    }
    
    # return gamVine model
    gamVine(
        Matrix = Matrix,
        model  = model,
        covariates = colnames(covariates)
    )
}

Extracting results from fitted models

# evaluation grid
grid <- cbind(
    matrix(1, 50, 10),
    sapply(1:5, function(i) seq(0, 1, l = 50))
)
colnames(grid) <- c(paste0("B.", 1:5), paste0("Z.", 1:5), paste0("S.", 1:5))

# extract results for a single pair-copula fit
get_pc_results <- function(x) {
    gamBiCopPredict(x, newdata = grid, type = "terms", target = "calib") %>%
        map_df(~ tidy_terms(.)) %>%
        mutate(family = if (is.list(x)) x$family else x@family)
}

# tidy results returned from gamBiCopPredict
tidy_terms <- function(x) {
    # rename smooths to bare variable names
    colnames(x) <- gsub("(^s\\()|(\\))", "", colnames(x))
    # add intercept as column
    x <- as_tibble(cbind(intercept = attr(x, "constant"), x))
    
    # fill missing columns with 0
    x[setdiff(colnames(grid), colnames(x))] <- 0
    
    # standardize variable order
    x <- x[c("intercept", colnames(grid))]
    
    x %>%
        # in case one of the smooth covariates is selected as a linear term, 
        # it will not integrate to zero; hence, we need to subsume the 
        # means of all S.i into the intercept, and standardize the variables
        # accordingly
        mutate(intercept = intercept - mean(S.1 + S.2 + S.3 + S.4 + S.5)) %>%
        mutate_at(paste0("S.", 1:5), funs(. - mean(.))) %>%
        # add grid sequence for smooth covariates
        mutate(t = grid[, 15])
}

Simulation and fitting function for a single seed

do_one <- function(seed, n) {
    
    ## simulate from true model ----------------
    # simulate covariates
    B <- matrix(rbinom(n * 5, 1, 0.5), n, 5)
    Z <- matrix(rnorm(n * 5), n, 5)
    S <- matrix(runif(n * 5), n, 5)
    covariates <- data.frame(B = B, Z = Z, S = S)
    # create the gamVineCopula object
    family_set <- c(1:2, 1:2, 301:302, 401:402)  # allowed copula families
    GVC <- make_sim_model(covariates, family_set)
    # simulate data from gamVine model
    sim <- gamVineSimulate(n, GVC, newdata = covariates)
    
    ## fit models ----------------
    time_oracle <- system.time(
        fit_oracle <- gamVineSeqFit(
            data = cbind(sim, covariates),
            GVC = GVC,
            covariates = colnames(covariates),
            method = "FS",
            n.iters = 30
        )
    )[3]
    
    time_selection <- system.time(
        fit_selection <- gamVineCopSelect(
            data = sim,
            Matrix = GVC@Matrix,
            lin.covs = covariates[, 1:10],
            smooth.covs = covariates[, 11:15],
            simplified = TRUE,
            familyset = unique(family_set),
            familycrit = "AIC",
            rotations = FALSE,
            method = "FS",
            n.iters = 30,
            select.once = FALSE
        )
    )[3]
    
    ## extract results ----------------
    res_oracle <- map(fit_oracle@model[1:10], ~ get_pc_results(.)) %>%
        bind_rows(.id = "pc") %>%
        mutate(method = "oracle", time = time_oracle)
    res_selection <- map(fit_selection@model[1:10], ~ get_pc_results(.)) %>%
        bind_rows(.id = "pc") %>%
        mutate(method = "selection", time = time_selection)
    
    bind_rows(res_oracle, res_selection) %>%
        mutate(
            pc = as.numeric(pc),
            tree = as.factor(map_dbl(pc, ~ which.max(. <= c(4, 7, 9, 10)))),
            seed = seed,
            n = n
        )
}

Run the study

Required packages

library(doParallel)

Backend for parallel computing

cl <- makeCluster(detectCores(), outfile = "")
registerDoParallel(cl)
pkg <- c("gamCopula", "dplyr", "purrr", "tibble")

Parallelize over seeds

# n = 500
res_5h <- foreach(seed = 1:500, .packages = pkg, .export = ls()) %dopar% {
    message(seed)
    tryCatch(do_one(seed, n = 500), error = function(e) NULL)
}
saveRDS(res_5h, file = "GAM-PCC-sim-5h.rds")
# n = 5000
res_5k <- foreach(seed = 1:500, .packages = pkg, .export = ls()) %dopar% {
    message(seed)
    tryCatch(do_one(seed, n = 5000), error = function(e) NULL)
}
saveRDS(res_5k, file = "GAM-PCC-sim-5k.rds")

Tear down parallel backend

stopCluster(cl)

Analysis of results

Preliminaries

Required packages

library(tidyr)
library(ggplot2)
library(dichromat)

Colors

cols <- dichromat::colorschemes[[5]][c(2, 11)]

Combine results for both sample sizes

res <- bind_rows(
    readRDS("GAM-PCC-sim-5h.rds"),
    readRDS("GAM-PCC-sim-5k.rds")
)

Estimation accuracy

Linear covariates

res_linear <- res %>%
    # estimates for linear coefficients are constant for all t, we can focus on 
    # first value
    filter(t == 0) %>%
    # select linear covariates
    select(method, n, tree, seed, starts_with("B"), starts_with("Z")) %>%
    gather(term, value, -method, -n, -tree, -seed) %>%
    # rename terms to appropriate index of beta
    mutate(j = (substr(term, 1, 1) == "Z") * 5 + as.numeric(substr(term, 3, 3))) %>%
    # calculate mean estimates and quantiles
    group_by(j, n, method, tree) %>%
    summarize(
        mu = mean(value),
        q05 = quantile(value, 0.05),
        q95 = quantile(value, 0.95),
        true = beta[1 + first(j)]
    ) %>%
    ungroup()
res_linear %>%
    mutate(n = paste0("n == ", n)) %>%
    ggplot(aes(j, mu, alpha = tree, color = method)) +
    geom_segment(
        aes(x = j - 0.5, xend = j + 0.5, y = true, yend = true), 
        color = "grey40"
    ) +
    geom_point(position = position_dodge(0.9)) + 
    geom_errorbar(aes(ymin = q05, ymax = q95), position = position_dodge(0.9)) +
    facet_grid(n ~ method, labeller = label_parsed) +
    theme_bw() +
    theme(
        legend.position = "bottom",
        plot.margin = grid::unit(c(0, 0, 0, 0), "mm")
    ) +
    scale_color_manual(values = cols) +
    scale_alpha_manual(values = seq(1, 0.4, l = 4)) +
    scale_x_continuous(breaks = 1:10) +
    geom_vline(xintercept = 1:10 - 0.5, color = "grey90") +
    guides(color = FALSE) +
    ylab(quote(beta[j]))

plot of chunk linear_terms

Nonlinear covariates

res_nonlinear <- res %>%
    # select nonlinear covariates
    select(method, n, tree, seed, t, starts_with("S")) %>%
    gather(term, value, -method, -n, -tree, -seed, -t) %>%
    # calculate mean estimates and quantiles
    group_by(method, n, tree, term, t) %>%
    summarize(
        mu = mean(value),
        q05 = quantile(value, 0.05),
        q95 = quantile(value, 0.95)
    ) %>%
    group_by(n, tree, method, term) %>%
    # add true values
    mutate(
        true_fun = map_int(term, ~ as.integer(substr(., 3, 3))),
        true = map2_dbl(true_fun, t, ~ s_funs[[.x]](.y))
    ) %>%
    ungroup() %>%
    mutate(term = map_chr(true_fun, ~ paste0("s[", ., "]"))) # better plot labels
res_nonlinear %>%
    filter(tree == 1) %>%
    mutate(n = paste0("n == ", n)) %>%
    ggplot(aes(t)) +
    geom_ribbon(aes(ymin = q05, ymax = q95, alpha = 0.02)) +
    geom_line(aes(y = true), linetype = 2) +
    geom_line(aes(y = mu, color = method)) +
    facet_grid(n + method ~ term, labeller = label_parsed) +
    scale_color_manual(values = cols) +
    theme_bw() +
    theme(legend.position = "none") +
    ylab(quote(s[k])) +
    theme(plot.margin = grid::unit(c(0, 0, 0, 0), "mm")) +
    coord_cartesian(ylim = c(-0.4, 0.4)) +
    scale_x_continuous(breaks = c(0, 0.5, 1.0))

plot of chunk nonlinear_terms_tree1

res_nonlinear %>%
    filter(tree == 4) %>%
    mutate(n = paste0("n == ", n)) %>%
    ggplot(aes(t)) +
    geom_ribbon(aes(ymin = q05, ymax = q95, alpha = 0.02)) +
    geom_line(aes(y = true), linetype = 2) +
    geom_line(aes(y = mu, color = method)) +
    facet_grid(n + method ~ term, labeller = label_parsed) +
    scale_color_manual(values = cols) +
    theme_bw() +
    theme(legend.position = "none") +
    theme(plot.margin = grid::unit(c(0, 0, 0, 0), "mm")) +
    ylab(quote(s[k])) +
    coord_cartesian(ylim = c(-0.4, 0.4)) +
    scale_x_continuous(breaks = c(0, 0.5, 1.0))

plot of chunk nonlinear_terms_tree4

Family selection

Frequency of correctly selected families

res %>%
    filter(t == 0) %>%
    mutate(family = substr(family, 1, 1)) %>%
    group_by(tree, pc, n, seed) %>%
    select(method, family) %>%
    spread(method, family) %>%
    ungroup() %>%
    group_by(tree, n) %>%
    summarize(
        freq = mean(selection == oracle) * 100,
        se   = sd(selection == oracle) / sqrt(length(oracle)) * 100
    ) %>%
    gather(type, value, -n, -tree) %>%
    spread(tree, value) %>%
    knitr::kable(digits = 1)
n type 1 2 3 4
500 freq 76.8 63.0 53.5 43.0
500 se 0.9 1.2 1.6 2.2
5000 freq 97.5 96.7 87.9 72.2
5000 se 0.3 0.5 1.0 2.0

True vs selected

# tree 1
tab1 <- res %>%
    filter(t == 0, tree == 1, n == 500) %>%
    mutate(family = substr(family, 1, 1)) %>%
    select(method, family, seed, pc) %>%
    spread(method, family) %>%
    select(selection, oracle) %>%
    table()
tab1 <- tab1 / sum(tab1) * 100
colnames(tab1) <- rownames(tab1) <- c("Gaussian", "Student t", "Clayton", "Gumbel")
tab1 <- rbind(tab1, colSums(tab1))
tab1 <- cbind(tab1, rowSums(tab1))
knitr::kable(tab1, digits = 1)
Gaussian Student t Clayton Gumbel
Gaussian 18.2 0.7 4.3 5.1 28.4
Student t 4.8 23.4 2.8 3.4 34.4
Clayton 0.0 0.0 18.5 0.0 18.6
Gumbel 1.0 0.8 0.2 16.7 18.7
24.1 25.0 25.9 25.2 100.0
# tree 4
tab4 <- res %>%
    filter(t == 0, tree == 4, n == 500) %>%
    mutate(family = substr(family, 1, 1)) %>%
    select(method, family, seed, pc) %>%
    spread(method, family) %>%
    select(selection, oracle) %>%
    table()
tab4 <- tab4 / sum(tab4) * 100
colnames(tab4) <- rownames(tab4) <- c("Gaussian", "Student t", "Clayton", "Gumbel")
tab4 <- rbind(tab4, colSums(tab4))
tab4 <- cbind(tab4, rowSums(tab4))
knitr::kable(tab4, digits = 1)
Gaussian Student t Clayton Gumbel
Gaussian 15.0 5.2 12.0 12.2 44.4
Student t 6.4 14.6 7.2 6.2 34.4
Clayton 1.4 0.6 4.6 0.2 6.8
Gumbel 2.8 2.2 0.6 8.8 14.4
25.6 22.6 24.4 27.4 100.0

Covariate selection

true_covs <- c(paste0("S.", 1:3), paste0("B.", 1:3), paste0("Z.", 1:3))
res %>%
    filter(method == "selection") %>%
    group_by(tree, pc, n, seed) %>%
    select(starts_with("S."), starts_with("B."), starts_with("Z.")) %>%
    summarize_all(funs(1 * any(. != 0))) %>%
    ungroup() %>%
    select(-pc, -seed) %>%
    gather(var, value, -tree, -n) %>%
    mutate(correct = (value == ifelse(var %in% true_covs, 1, 0))) %>%
    mutate(type = ifelse(substr(var, 1, 1) == "S", "s", "beta")) %>%
    group_by(tree, n, type) %>%
    summarize(
        freq = mean(correct) * 100,
        se = sd(correct) / sqrt(length(correct)) * 100
    ) %>%
    gather(what, value, -tree, -n, -type) %>%
    unite(tree_type, tree, type) %>%
    spread(tree_type, value) %>%
    knitr::kable(digits = 1)
n what 1_beta 1_s 2_beta 2_s 3_beta 3_s 4_beta 4_s
500 freq 83.5 75.4 81.3 72.5 73.6 63.5 61.4 54.8
500 se 0.3 0.4 0.3 0.5 0.4 0.7 0.7 1.0
5000 freq 94.6 91.5 94.8 91.4 94.8 91.2 94.4 90.2
5000 se 0.2 0.3 0.2 0.3 0.2 0.4 0.3 0.6

Linear term selected correctly

res %>% 
  filter(method == "selection") %>%
  group_by(tree, pc, n, seed) %>%
  summarize(correct = ((var(diff(S.1)) < 1e-20) & any(S.1 != 0))) %>%
  ungroup() %>%
  group_by(tree, n) %>%
  summarize(
      freq = mean(correct) * 100,
      se   = sd(correct) / sqrt(length(correct))
  ) %>%
  gather(type, value, -tree, -n) %>%
  spread(tree, value) %>%
  knitr::kable(digits = 1)
n type 1 2 3 4
500 freq 42.5 41.9 31.8 19.2
500 se 0.0 0.0 0.0 0.0
5000 freq 58.8 57.6 58.5 52.2
5000 se 0.0 0.0 0.0 0.0

Computing time

res %>%
    group_by(n, method) %>%
    mutate(minutes = time / 60) %>%
    summarize(
        sd      = sd(minutes),
        minutes = mean(minutes),
    ) %>%
    gather(type, value, -n, -method) %>%
    spread(method, value) %>%
    knitr::kable(digits = 1)
n type oracle selection
500 minutes 0.4 12.8
500 sd 0.1 1.2
5000 minutes 2.0 68.8
5000 sd 0.5 6.4

Environment info

sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu/x86_64 (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2    dichromat_2.0-0 ggplot2_2.1.0   tidyr_0.6.3    
## [5] purrr_0.2.2.2   dplyr_0.7.2     tibble_1.3.3    gamCopula_0.0-3
## 
## loaded via a namespace (and not attached):
##  [1] pcaPP_1.9-72      Rcpp_0.12.12      highr_0.6        
##  [4] plyr_1.8.4        bindr_0.1         iterators_1.0.8  
##  [7] tools_3.3.1       digest_0.6.10     gtable_0.2.0     
## [10] evaluate_0.10.1   nlme_3.1-128      lattice_0.20-35  
## [13] pspline_1.0-18    mgcv_1.8-15       pkgconfig_2.0.1  
## [16] rlang_0.1.1       Matrix_1.2-7.1    foreach_1.4.3    
## [19] igraph_1.1.1      parallel_3.3.1    mvtnorm_1.0-6    
## [22] copula_0.999-17   stringr_1.1.0     knitr_1.16       
## [25] stats4_3.3.1      grid_3.3.1        ADGofTest_0.3    
## [28] glue_1.1.1        R6_2.2.0          VineCopula_2.1.2 
## [31] reshape2_1.4.2    magrittr_1.5      scales_0.4.0     
## [34] codetools_0.2-15  MASS_7.3-45       stabledist_0.7-1 
## [37] assertthat_0.1    colorspace_1.2-7  numDeriv_2016.8-1
## [40] labeling_0.3      stringi_1.1.2     network_1.13.0   
## [43] munsell_0.4.3     gsl_1.9-10.3      doParallel_1.0.10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment