halflearned/ranger_covariate_bug.R

## ranger_covariate_bug.R
library(tidyverse)

test_fun <- function(n=200, p=20, num.trees=NULL, mtry=NULL) {

  # Creating regressors: x1, ..., xp
  X <- as.data.frame(matrix(rnorm(n*p), n, p))
  colnames(X) <- paste("x", seq_len(p), sep="")

  # Let x1 be the only relevant covariate
  Y <- X[,1]

  order_A <- colnames(X)       # Regular column order; x1 first
  order_B <- rev(colnames(X))  # Reverse column order; x1 last

  fmla_A <- as.formula(str_c("Y ~", str_c(order_A, collapse=" + ")))
  fmla_B <- as.formula(str_c("Y ~", str_c(order_B, collapse=" + ")))

  # Train same forest twice
  set.seed(1234)
  rforest_A <- ranger::ranger(
    formula=fmla_A,
    data=cbind(X, Y),
    num.trees=num.trees,
    mtry=mtry)

  set.seed(1234)
  rforest_B <- ranger::ranger(
    formula=fmla_B,
    data=cbind(X, Y),
    num.trees=num.trees,
    mtry=mtry)

  # Reset the seed
  rm(.Random.seed, envir=globalenv())

  error_A <- rforest_A$prediction.error
  error_B <- rforest_B$prediction.error

  return(tibble(n=n, p=p, num.trees=num.trees, mtry=mtry,
                error_A=error_A, error_B=error_B,
                diff=error_A-error_B, ratio=error_A/error_B))
}


# Helper function to repeat the experiment n_sim times and take the average of the results
iterate <- function(num.trees, mtry, n, p, n_sim=5000) {
  print("Running with")
  print(c(n, p, num.trees, mtry))
  purrr::rerun(test_fun(num.trees=num.trees, mtry=mtry, n=n, p=p), .n=n_sim) %>%
    bind_rows()
}

# Table with different parameters
# (Note we are filtering out parameter combinations with mtry > p)
params <- expand.grid(
  num.trees=c(20),
  mtry=seq(20),
  n=c(100),
  p=c(20)
) %>% filter(mtry <= p)


output <- purrr::pmap(params, iterate) %>%
          bind_rows()

avg_output <- output %>% group_by(mtry) %>%
              summarize_all(mean)

# Let's see if error_A - error_B is significantly different from zero
table <- lm(diff ~ 0 + factor(mtry), data=output) %>% summary %>% coef %>% round(., 4)
print(table)
	library(tidyverse)

	test_fun <- function(n=200, p=20, num.trees=NULL, mtry=NULL) {

	# Creating regressors: x1, ..., xp
	X <- as.data.frame(matrix(rnorm(n*p), n, p))
	colnames(X) <- paste("x", seq_len(p), sep="")

	# Let x1 be the only relevant covariate
	Y <- X[,1]

	order_A <- colnames(X) # Regular column order; x1 first
	order_B <- rev(colnames(X)) # Reverse column order; x1 last

	fmla_A <- as.formula(str_c("Y ~", str_c(order_A, collapse=" + ")))
	fmla_B <- as.formula(str_c("Y ~", str_c(order_B, collapse=" + ")))

	# Train same forest twice
	set.seed(1234)
	rforest_A <- ranger::ranger(
	formula=fmla_A,
	data=cbind(X, Y),
	num.trees=num.trees,
	mtry=mtry)

	set.seed(1234)
	rforest_B <- ranger::ranger(
	formula=fmla_B,
	data=cbind(X, Y),
	num.trees=num.trees,
	mtry=mtry)

	# Reset the seed
	rm(.Random.seed, envir=globalenv())

	error_A <- rforest_A$prediction.error
	error_B <- rforest_B$prediction.error

	return(tibble(n=n, p=p, num.trees=num.trees, mtry=mtry,
	error_A=error_A, error_B=error_B,
	diff=error_A-error_B, ratio=error_A/error_B))
	}



	# Helper function to repeat the experiment n_sim times and take the average of the results
	iterate <- function(num.trees, mtry, n, p, n_sim=5000) {
	print("Running with")
	print(c(n, p, num.trees, mtry))
	purrr::rerun(test_fun(num.trees=num.trees, mtry=mtry, n=n, p=p), .n=n_sim) %>%
	bind_rows()
	}

	# Table with different parameters
	# (Note we are filtering out parameter combinations with mtry > p)
	params <- expand.grid(
	num.trees=c(20),
	mtry=seq(20),
	n=c(100),
	p=c(20)
	) %>% filter(mtry <= p)


	output <- purrr::pmap(params, iterate) %>%
	bind_rows()

	avg_output <- output %>% group_by(mtry) %>%
	summarize_all(mean)

	# Let's see if error_A - error_B is significantly different from zero
	table <- lm(diff ~ 0 + factor(mtry), data=output) %>% summary %>% coef %>% round(., 4)
	print(table)