mbjoseph/bernoulli-occupancy.R

## bernoulli-occupancy.R
library(tidyverse)
library(rstan)
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)

# Simulate true occupancy states ------------------------------------------
# define a design matrix for site-level occupancy
n_site <- 200
m_psi <- 3    # m_psi is the number of columns in the site level design matrix
X_psi <- matrix(c(rep(1, n_site), rnorm((m_psi - 1) * n_site)),
                ncol = m_psi)
stopifnot(ncol(X_psi) >= 1)

# get occupancy probabilities
beta_psi <- rnorm(m_psi)
psi <- plogis(X_psi %*% beta_psi)
z <- rbinom(n_site, size = 1, prob = psi)


# Simulate imperfect detection/nondetection data --------------------------

# determine number of surveys per site
n_survey <- rpois(n_site, lambda = 4)
total_surveys <- sum(n_survey)

# define a survey-level design matrix for detection probabilities
m_p <- 2    # m_p is the number of survey-level covariates
beta_p <- rnorm(m_p)
X_p <- matrix(c(rep(1, total_surveys), rnorm((m_p - 1) * total_surveys)),
              ncol = m_p)
stopifnot(ncol(X_p) >= 1)
p <- plogis(X_p %*% beta_p)

# simulate observations
survey_df <- tibble(site = rep(1:n_site, n_survey)) %>%
  mutate(y = rbinom(n = total_surveys, size = 1, prob = z[site] * p))

# get start and end indices to extract slices of y for each site
start_idx <- rep(0, n_site)
end_idx <- rep(0, n_site)
for (i in 1:n_site) {
  if (n_survey[i] > 0) {
    site_indices <- which(survey_df$site == i)
    start_idx[i] <- site_indices[1]
    end_idx[i] <- site_indices[n_survey[i]]
  }
}

# create vector of whether any positive observations were seen at each site
any_seen <- rep(0, n_site)
for (i in 1:n_site) {
  if (n_survey[i] > 0) {
    any_seen[i] <- max(survey_df$y[start_idx[i]:end_idx[i]])
  }
}


# Bundle data for Stan ----------------------------------------------------
stan_d <- list(n_site = n_site,
               m_psi = m_psi,
               X_psi = X_psi,
               total_surveys = total_surveys,
               m_p = m_p,
               X_p = X_p,
               site = survey_df$site,
               y = survey_df$y,
               start_idx = start_idx,
               end_idx = end_idx,
               any_seen = any_seen,
               n_survey = n_survey)


# Fit model ---------------------------------------------------------------

m_init <- stan_model('bernoulli-occupancy.stan')
m_fit <- sampling(m_init, data = stan_d, pars = c('beta_psi', 'beta_p'))

# compare fits to true values
m_fit
beta_psi
beta_p

## bernoulli-occupancy.stan
data {
  // site-level occupancy covariates
  int<lower = 1> n_site;
  int<lower = 1> m_psi;
  matrix[n_site, m_psi] X_psi;

  // survey-level detection covariates
  int<lower = 1> total_surveys;
  int<lower = 1> m_p;
  matrix[total_surveys, m_p] X_p;

  // survey level information
  int<lower = 1, upper = n_site> site[total_surveys];
  int<lower = 0, upper = 1> y[total_surveys];
  int<lower = 0, upper = total_surveys> start_idx[n_site];
  int<lower = 0, upper = total_surveys> end_idx[n_site];

  // summary of whether species is known to be present at each site
  int<lower = 0, upper = 1> any_seen[n_site];

  // number of surveys at each site
  int<lower = 0> n_survey[n_site];
}
parameters {
  vector[m_psi] beta_psi;
  vector[m_p] beta_p;
}
transformed parameters {
  vector[total_surveys] logit_p = X_p * beta_p;
  vector[n_site] logit_psi = X_psi * beta_psi;
}
model {
  vector[n_site] log_psi = log_inv_logit(logit_psi);
  vector[n_site] log1m_psi = log1m_inv_logit(logit_psi);

  beta_psi ~ normal(0, 1);
  beta_p ~ normal(0, 1);
  for (i in 1:n_site) {
    if (n_survey[i] > 0) {
      if (any_seen[i]) {
        // site is occupied
        target += log_psi[i]
                  + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] |
                                         logit_p[start_idx[i]:end_idx[i]]);
      } else {
        // site may or may not be occupied
        target += log_sum_exp(
          log_psi[i] + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] |
                                            logit_p[start_idx[i]:end_idx[i]]),
          log1m_psi[i]
        );
      }
    }
  }
}
	library(tidyverse)
	library(rstan)
	options(mc.cores = parallel::detectCores())
	rstan_options(auto_write = TRUE)

	# Simulate true occupancy states ------------------------------------------
	# define a design matrix for site-level occupancy
	n_site <- 200
	m_psi <- 3 # m_psi is the number of columns in the site level design matrix
	X_psi <- matrix(c(rep(1, n_site), rnorm((m_psi - 1) * n_site)),
	ncol = m_psi)
	stopifnot(ncol(X_psi) >= 1)

	# get occupancy probabilities
	beta_psi <- rnorm(m_psi)
	psi <- plogis(X_psi %*% beta_psi)
	z <- rbinom(n_site, size = 1, prob = psi)


	# Simulate imperfect detection/nondetection data --------------------------

	# determine number of surveys per site
	n_survey <- rpois(n_site, lambda = 4)
	total_surveys <- sum(n_survey)

	# define a survey-level design matrix for detection probabilities
	m_p <- 2 # m_p is the number of survey-level covariates
	beta_p <- rnorm(m_p)
	X_p <- matrix(c(rep(1, total_surveys), rnorm((m_p - 1) * total_surveys)),
	ncol = m_p)
	stopifnot(ncol(X_p) >= 1)
	p <- plogis(X_p %*% beta_p)

	# simulate observations
	survey_df <- tibble(site = rep(1:n_site, n_survey)) %>%
	mutate(y = rbinom(n = total_surveys, size = 1, prob = z[site] * p))

	# get start and end indices to extract slices of y for each site
	start_idx <- rep(0, n_site)
	end_idx <- rep(0, n_site)
	for (i in 1:n_site) {
	if (n_survey[i] > 0) {
	site_indices <- which(survey_df$site == i)
	start_idx[i] <- site_indices[1]
	end_idx[i] <- site_indices[n_survey[i]]
	}
	}

	# create vector of whether any positive observations were seen at each site
	any_seen <- rep(0, n_site)
	for (i in 1:n_site) {
	if (n_survey[i] > 0) {
	any_seen[i] <- max(survey_df$y[start_idx[i]:end_idx[i]])
	}
	}


	# Bundle data for Stan ----------------------------------------------------
	stan_d <- list(n_site = n_site,
	m_psi = m_psi,
	X_psi = X_psi,
	total_surveys = total_surveys,
	m_p = m_p,
	X_p = X_p,
	site = survey_df$site,
	y = survey_df$y,
	start_idx = start_idx,
	end_idx = end_idx,
	any_seen = any_seen,
	n_survey = n_survey)



	# Fit model ---------------------------------------------------------------

	m_init <- stan_model('bernoulli-occupancy.stan')
	m_fit <- sampling(m_init, data = stan_d, pars = c('beta_psi', 'beta_p'))

	# compare fits to true values
	m_fit
	beta_psi
	beta_p
	data {
	// site-level occupancy covariates
	int<lower = 1> n_site;
	int<lower = 1> m_psi;
	matrix[n_site, m_psi] X_psi;

	// survey-level detection covariates
	int<lower = 1> total_surveys;
	int<lower = 1> m_p;
	matrix[total_surveys, m_p] X_p;

	// survey level information
	int<lower = 1, upper = n_site> site[total_surveys];
	int<lower = 0, upper = 1> y[total_surveys];
	int<lower = 0, upper = total_surveys> start_idx[n_site];
	int<lower = 0, upper = total_surveys> end_idx[n_site];

	// summary of whether species is known to be present at each site
	int<lower = 0, upper = 1> any_seen[n_site];

	// number of surveys at each site
	int<lower = 0> n_survey[n_site];
	}
	parameters {
	vector[m_psi] beta_psi;
	vector[m_p] beta_p;
	}
	transformed parameters {
	vector[total_surveys] logit_p = X_p * beta_p;
	vector[n_site] logit_psi = X_psi * beta_psi;
	}
	model {
	vector[n_site] log_psi = log_inv_logit(logit_psi);
	vector[n_site] log1m_psi = log1m_inv_logit(logit_psi);

	beta_psi ~ normal(0, 1);
	beta_p ~ normal(0, 1);
	for (i in 1:n_site) {
	if (n_survey[i] > 0) {
	if (any_seen[i]) {
	// site is occupied
	target += log_psi[i]
	+ bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] \|
	logit_p[start_idx[i]:end_idx[i]]);
	} else {
	// site may or may not be occupied
	target += log_sum_exp(
	log_psi[i] + bernoulli_logit_lpmf(y[start_idx[i]:end_idx[i]] \|
	logit_p[start_idx[i]:end_idx[i]]),
	log1m_psi[i]
	);
	}
	}
	}
	}