alexpghayes/contagion-quick-sim.R

## contagion-quick-sim.R
library(Matrix)
library(tidyverse)

# need these two packages that are not on CRAN
# if something goes wrong, let me know, i wrote them and will fix

# to install:
#
# > install.packages("pak")
# > pak::pkg_install("RoheLab/fastRG")
# > pak::pkg_install("RoheLab/vsp")

library(fastRG)
library(vsp)

##### part 1: simulation the underlying network --------------------------------

n <- 500  # number of nodes in the network

# network parameters
k <- 5
B <- matrix(0.1, nrow = k, ncol = k)
diag(B) <- 1
B

theta <- round(stats::rlnorm(n, 2))
pi <- c(1, 2, 4, 1, 1)

pop_dcsbm <- dcsbm(
  theta = theta,
  B = B,
  pi = pi,
  expected_density = 0.2
)

pop_dcsbm # population properties of the network

# A is the adjacency matrix of the underlying network, where contagion
# spreads along edges in this network

A <- sample_sparse(pop_dcsbm, poisson_edges = FALSE, allow_self_loops = FALSE)
A

##### part 2: simulate contagion on the network according to shalizi's model ---

num_periods <- 5
Y <- matrix(data = 0, ncol = num_periods, nrow = n)

Y[, 1] <- rbinom(n = n, size = 1, prob = 0.1)
Y

intercept <- 0.01
alpha <- 0.05
beta <- 0.1
gamma <- c(0.15, 0, 0, 0.05, 0.1)

C <- eigs_sym(pop_dcsbm)$vectors

for (t in 2:num_periods) {
  mu_t <- intercept + alpha * Y[, t - 1] + beta * A %*% Y[, t - 1] + C %*% gamma
  y_t <- mu_t + rnorm(n, sd = 0.1)
  Y[, t] <- drop(y_t)
}

# Y[i , t] is the infection level of node i at time t
Y

##### part 3: try to recover the parameters of the simulation ------------------

# most of this is data munging to get our simulated data into a nice shape
# once it's into a nice shape, all we need is linear regression

# (a) sanity check the case where we know the latent network parameters C

colnames(C) <- paste0("c", 1:k)
colnames(Y) <- paste0("y_", 1:k)
AY <- A %*% Y
colnames(AY) <- paste0("Ay_", 1:k)

X <- as.matrix(cbind(Y, AY, C))

# this is the data, just not quite in a format we can do linear regression
# on yet. we want the coefficient for a lagged Ay term, which measures
# how much infections spread on the network
X

fit_known_c <- as_tibble(X) %>%
  mutate(node = row_number()) %>%
  pivot_longer(
    cols = contains("y"),
    names_to = c("feature", "time"),
    names_sep = "_"
  ) %>%
  pivot_wider(
    names_from = "feature",
    values_from = "value"
  ) %>%
  arrange(node, time) %>%
  group_by(node) %>%
  mutate(
    y_lag = dplyr::lag(y),
    Ay_lag = dplyr::lag(Ay)
  ) %>%
  ungroup(node) %>%
  select(node, time, y, y_lag, Ay_lag, contains("c")) %>%
  select(y, y_lag, Ay_lag, contains("c")) %>%
  lm(y ~ ., data = .)

summary(fit_known_c)

# (b) now the case where we have to estimate the latent network parameters

fa <- vsp(A, rank = k)  # latent parameter estimation
fa

C_hat <- fa$u
colnames(C_hat) <- paste0("c", 1:k)

# copy-pasta of data munging now using estimates instead of population
# parameters

X <- as.matrix(cbind(Y, AY, C_hat))

fit <- as_tibble(X) %>%
  mutate(node = row_number()) %>%
  pivot_longer(
    cols = contains("y"),
    names_to = c("feature", "time"),
    names_sep = "_"
  ) %>%
  pivot_wider(
    names_from = "feature",
    values_from = "value"
  ) %>%
  arrange(node, time) %>%
  group_by(node) %>%
  mutate(
    y_lag = dplyr::lag(y),
    Ay_lag = dplyr::lag(Ay)
  ) %>%
  ungroup(node) %>%
  select(node, time, y, y_lag, Ay_lag, contains("c")) %>%
  select(y, y_lag, Ay_lag, contains("c")) %>%
  lm(y ~ ., data = .)

summary(fit)

# plot results

broom::tidy(fit_known_c, conf.int = TRUE) %>%
  mutate(true_value = c(intercept, alpha, beta, gamma)) %>%
  filter(!str_detect(term, "c[0-9]")) %>%
  ggplot(aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_pointrange(alpha = 0.5) +
  geom_point(aes(y = true_value), shape = 4, size = 4, color = "darkgreen") +
  coord_flip() +
  theme_classic() +
  labs(
    title = "95 percent confidence intervals for estimated parameters when C is known",
    x = "Parameter",
    y = "",
    caption = "True parameter value at the green X"
  )

broom::tidy(fit, conf.int = TRUE) %>%
  mutate(true_value = c(intercept, alpha, beta, gamma)) %>%
  filter(!str_detect(term, "c[0-9]")) %>%
  ggplot(aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_pointrange(alpha = 0.5) +
  geom_point(aes(y = true_value), shape = 4, size = 4, color = "darkgreen") +
  coord_flip() +
  theme_classic() +
  labs(
    title = "95 percent confidence intervals for estimated parameters when C is estimated",
    x = "Parameter",
    y = "",
    caption = "True parameter value at the green X"
  )
	library(Matrix)
	library(tidyverse)

	# need these two packages that are not on CRAN
	# if something goes wrong, let me know, i wrote them and will fix

	# to install:
	#
	# > install.packages("pak")
	# > pak::pkg_install("RoheLab/fastRG")
	# > pak::pkg_install("RoheLab/vsp")

	library(fastRG)
	library(vsp)

	##### part 1: simulation the underlying network --------------------------------

	n <- 500 # number of nodes in the network

	# network parameters
	k <- 5
	B <- matrix(0.1, nrow = k, ncol = k)
	diag(B) <- 1
	B

	theta <- round(stats::rlnorm(n, 2))
	pi <- c(1, 2, 4, 1, 1)

	pop_dcsbm <- dcsbm(
	theta = theta,
	B = B,
	pi = pi,
	expected_density = 0.2
	)

	pop_dcsbm # population properties of the network

	# A is the adjacency matrix of the underlying network, where contagion
	# spreads along edges in this network

	A <- sample_sparse(pop_dcsbm, poisson_edges = FALSE, allow_self_loops = FALSE)
	A

	##### part 2: simulate contagion on the network according to shalizi's model ---

	num_periods <- 5
	Y <- matrix(data = 0, ncol = num_periods, nrow = n)

	Y[, 1] <- rbinom(n = n, size = 1, prob = 0.1)
	Y

	intercept <- 0.01
	alpha <- 0.05
	beta <- 0.1
	gamma <- c(0.15, 0, 0, 0.05, 0.1)

	C <- eigs_sym(pop_dcsbm)$vectors

	for (t in 2:num_periods) {
	mu_t <- intercept + alpha * Y[, t - 1] + beta * A %% Y[, t - 1] + C %% gamma
	y_t <- mu_t + rnorm(n, sd = 0.1)
	Y[, t] <- drop(y_t)
	}

	# Y[i , t] is the infection level of node i at time t
	Y

	##### part 3: try to recover the parameters of the simulation ------------------

	# most of this is data munging to get our simulated data into a nice shape
	# once it's into a nice shape, all we need is linear regression

	# (a) sanity check the case where we know the latent network parameters C

	colnames(C) <- paste0("c", 1:k)
	colnames(Y) <- paste0("y_", 1:k)
	AY <- A %*% Y
	colnames(AY) <- paste0("Ay_", 1:k)

	X <- as.matrix(cbind(Y, AY, C))

	# this is the data, just not quite in a format we can do linear regression
	# on yet. we want the coefficient for a lagged Ay term, which measures
	# how much infections spread on the network
	X

	fit_known_c <- as_tibble(X) %>%
	mutate(node = row_number()) %>%
	pivot_longer(
	cols = contains("y"),
	names_to = c("feature", "time"),
	names_sep = "_"
	) %>%
	pivot_wider(
	names_from = "feature",
	values_from = "value"
	) %>%
	arrange(node, time) %>%
	group_by(node) %>%
	mutate(
	y_lag = dplyr::lag(y),
	Ay_lag = dplyr::lag(Ay)
	) %>%
	ungroup(node) %>%
	select(node, time, y, y_lag, Ay_lag, contains("c")) %>%
	select(y, y_lag, Ay_lag, contains("c")) %>%
	lm(y ~ ., data = .)

	summary(fit_known_c)

	# (b) now the case where we have to estimate the latent network parameters

	fa <- vsp(A, rank = k) # latent parameter estimation
	fa

	C_hat <- fa$u
	colnames(C_hat) <- paste0("c", 1:k)

	# copy-pasta of data munging now using estimates instead of population
	# parameters

	X <- as.matrix(cbind(Y, AY, C_hat))

	fit <- as_tibble(X) %>%
	mutate(node = row_number()) %>%
	pivot_longer(
	cols = contains("y"),
	names_to = c("feature", "time"),
	names_sep = "_"
	) %>%
	pivot_wider(
	names_from = "feature",
	values_from = "value"
	) %>%
	arrange(node, time) %>%
	group_by(node) %>%
	mutate(
	y_lag = dplyr::lag(y),
	Ay_lag = dplyr::lag(Ay)
	) %>%
	ungroup(node) %>%
	select(node, time, y, y_lag, Ay_lag, contains("c")) %>%
	select(y, y_lag, Ay_lag, contains("c")) %>%
	lm(y ~ ., data = .)

	summary(fit)

	# plot results

	broom::tidy(fit_known_c, conf.int = TRUE) %>%
	mutate(true_value = c(intercept, alpha, beta, gamma)) %>%
	filter(!str_detect(term, "c[0-9]")) %>%
	ggplot(aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
	geom_pointrange(alpha = 0.5) +
	geom_point(aes(y = true_value), shape = 4, size = 4, color = "darkgreen") +
	coord_flip() +
	theme_classic() +
	labs(
	title = "95 percent confidence intervals for estimated parameters when C is known",
	x = "Parameter",
	y = "",
	caption = "True parameter value at the green X"
	)

	broom::tidy(fit, conf.int = TRUE) %>%
	mutate(true_value = c(intercept, alpha, beta, gamma)) %>%
	filter(!str_detect(term, "c[0-9]")) %>%
	ggplot(aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
	geom_pointrange(alpha = 0.5) +
	geom_point(aes(y = true_value), shape = 4, size = 4, color = "darkgreen") +
	coord_flip() +
	theme_classic() +
	labs(
	title = "95 percent confidence intervals for estimated parameters when C is estimated",
	x = "Parameter",
	y = "",
	caption = "True parameter value at the green X"
	)