skgrange/linking_pmf_sources_to_op_with_multiple_linear_regression_models.R

## linking_pmf_sources_to_op_with_multiple_linear_regression_models.R
# Define the helper functions ---------------------------
# https://stats.stackexchange.com/questions/83826/is-a-weighted-r2-in-robust-linear-model-meaningful-for-goodness-of-fit-analys/87085
r2w <- function(model, observed){
  SSe <- sum(model$w * (model$resid) ^ 2)
  observed <- model$resid+model$fitted
  SSt <- sum((model$w * observed - mean(model$w * observed)) ^ 2)
  value <- 1 - SSe / SSt;
  return(value)
}


# The function which does the pmf-source to OP linkage
model_op <- function(df, variables = NA) {

  if (is.na(variables[1])) {
    variables <- c(
      "sulfate_rich", "nitrate_rich", "road_traffic",
      "primary_biogenic", "wood_combustion", "aged_sea_salt", "mineral_dust",
      "secondary_biogenic"
    )
  }

  # Drop the missing variables and get names
  names <- df %>%
    threadr::drop_na_columns() %>%
    names()

  # Drop variables if they do not exist
  variables <- intersect(variables, names)

  # Build formula
  formula <- stringr::str_c(
    "value ~ ",
    stringr::str_c(variables, collapse = " + ")
  ) %>%
    as.formula()

  # Model
  list_model <- MASS::rlm(
    formula, weights = error, data = df, method = "M", maxit = 200
  )

  return(list_model)

}


# Package set-up and checking ---------------------------
# Check if the packages are installed, if not, install them
if (!"threadr" %in% rownames(installed.packages())) {
  # Install helper package
  install.packages("devtools")
  # Install threadr from GitHub
  remotes::install_github("skgrange/threadr")
}

# Install MASS if needed
if (!"MASS" %in% rownames(installed.packages())) {
  install.packages("MASS")
}

# Install broom if needed
if (!"broom" %in% rownames(installed.packages())) {
  install.packages("broom")
}

# Load the helper package
library(threadr)


# Load an example data set which has been simulated ---------------------------
# Load data and group data for iteration, this is a nested tibble
# Within the nested set, `value` is the oxidative potential measurement and `error`
# is the measurement error of OP
data_nest_sampled_sets_simulated <- devtools::source_gist(
  "https://gist.github.com/skgrange/d8322063d574d8bd4ffe418036c9e557"
)$value %>%
  rowwise(field_campaign,
          sampled_set,
          site,
          site_name,
          particulate_fraction,
          variable)

# Print the observations
data_nest_sampled_sets_simulated

# Keep modelling reproducible
set.seed(123)


# Do the modelling ---------------------------
data_nest_models <- data_nest_sampled_sets_simulated %>%
  mutate(model = list(model_op(resampled_set)),
         model_tidy = list(broom::tidy(model)),
         model_glance = list(broom::glance(model)),
         r_squared = r2w(model, resampled_set$value))

# Print return
data_nest_models

# Summarise the models' terms
data_nest_models %>%
  summarise(model_tidy,
            .groups = "drop")

# Continue with the analysis....
	# Define the helper functions ---------------------------
	# https://stats.stackexchange.com/questions/83826/is-a-weighted-r2-in-robust-linear-model-meaningful-for-goodness-of-fit-analys/87085
	r2w <- function(model, observed){
	SSe <- sum(model$w * (model$resid) ^ 2)
	observed <- model$resid+model$fitted
	SSt <- sum((model$w * observed - mean(model$w * observed)) ^ 2)
	value <- 1 - SSe / SSt;
	return(value)
	}


	# The function which does the pmf-source to OP linkage
	model_op <- function(df, variables = NA) {

	if (is.na(variables[1])) {
	variables <- c(
	"sulfate_rich", "nitrate_rich", "road_traffic",
	"primary_biogenic", "wood_combustion", "aged_sea_salt", "mineral_dust",
	"secondary_biogenic"
	)
	}

	# Drop the missing variables and get names
	names <- df %>%
	threadr::drop_na_columns() %>%
	names()

	# Drop variables if they do not exist
	variables <- intersect(variables, names)

	# Build formula
	formula <- stringr::str_c(
	"value ~ ",
	stringr::str_c(variables, collapse = " + ")
	) %>%
	as.formula()

	# Model
	list_model <- MASS::rlm(
	formula, weights = error, data = df, method = "M", maxit = 200
	)

	return(list_model)

	}


	# Package set-up and checking ---------------------------
	# Check if the packages are installed, if not, install them
	if (!"threadr" %in% rownames(installed.packages())) {
	# Install helper package
	install.packages("devtools")
	# Install threadr from GitHub
	remotes::install_github("skgrange/threadr")
	}

	# Install MASS if needed
	if (!"MASS" %in% rownames(installed.packages())) {
	install.packages("MASS")
	}

	# Install broom if needed
	if (!"broom" %in% rownames(installed.packages())) {
	install.packages("broom")
	}

	# Load the helper package
	library(threadr)


	# Load an example data set which has been simulated ---------------------------
	# Load data and group data for iteration, this is a nested tibble
	# Within the nested set, `value` is the oxidative potential measurement and `error`
	# is the measurement error of OP
	data_nest_sampled_sets_simulated <- devtools::source_gist(
	"https://gist.github.com/skgrange/d8322063d574d8bd4ffe418036c9e557"
	)$value %>%
	rowwise(field_campaign,
	sampled_set,
	site,
	site_name,
	particulate_fraction,
	variable)

	# Print the observations
	data_nest_sampled_sets_simulated

	# Keep modelling reproducible
	set.seed(123)


	# Do the modelling ---------------------------
	data_nest_models <- data_nest_sampled_sets_simulated %>%
	mutate(model = list(model_op(resampled_set)),
	model_tidy = list(broom::tidy(model)),
	model_glance = list(broom::glance(model)),
	r_squared = r2w(model, resampled_set$value))

	# Print return
	data_nest_models

	# Summarise the models' terms
	data_nest_models %>%
	summarise(model_tidy,
	.groups = "drop")

	# Continue with the analysis....