Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save skgrange/60923587d3a39fc9dd440d053b3b7388 to your computer and use it in GitHub Desktop.
Save skgrange/60923587d3a39fc9dd440d053b3b7388 to your computer and use it in GitHub Desktop.
Example of training multiple linear regression (MLR) models to explain/predict oxidative potential (OP) by particulate matter (PM) sources as identified by positive matrix factorisation (PMF) using simulated observations
# Define the helper functions ---------------------------
# https://stats.stackexchange.com/questions/83826/is-a-weighted-r2-in-robust-linear-model-meaningful-for-goodness-of-fit-analys/87085
r2w <- function(model, observed){
SSe <- sum(model$w * (model$resid) ^ 2)
observed <- model$resid+model$fitted
SSt <- sum((model$w * observed - mean(model$w * observed)) ^ 2)
value <- 1 - SSe / SSt;
return(value)
}
# The function which does the pmf-source to OP linkage
model_op <- function(df, variables = NA) {
if (is.na(variables[1])) {
variables <- c(
"sulfate_rich", "nitrate_rich", "road_traffic",
"primary_biogenic", "wood_combustion", "aged_sea_salt", "mineral_dust",
"secondary_biogenic"
)
}
# Drop the missing variables and get names
names <- df %>%
threadr::drop_na_columns() %>%
names()
# Drop variables if they do not exist
variables <- intersect(variables, names)
# Build formula
formula <- stringr::str_c(
"value ~ ",
stringr::str_c(variables, collapse = " + ")
) %>%
as.formula()
# Model
list_model <- MASS::rlm(
formula, weights = error, data = df, method = "M", maxit = 200
)
return(list_model)
}
# Package set-up and checking ---------------------------
# Check if the packages are installed, if not, install them
if (!"threadr" %in% rownames(installed.packages())) {
# Install helper package
install.packages("devtools")
# Install threadr from GitHub
remotes::install_github("skgrange/threadr")
}
# Install MASS if needed
if (!"MASS" %in% rownames(installed.packages())) {
install.packages("MASS")
}
# Install broom if needed
if (!"broom" %in% rownames(installed.packages())) {
install.packages("broom")
}
# Load the helper package
library(threadr)
# Load an example data set which has been simulated ---------------------------
# Load data and group data for iteration, this is a nested tibble
# Within the nested set, `value` is the oxidative potential measurement and `error`
# is the measurement error of OP
data_nest_sampled_sets_simulated <- devtools::source_gist(
"https://gist.github.com/skgrange/d8322063d574d8bd4ffe418036c9e557"
)$value %>%
rowwise(field_campaign,
sampled_set,
site,
site_name,
particulate_fraction,
variable)
# Print the observations
data_nest_sampled_sets_simulated
# Keep modelling reproducible
set.seed(123)
# Do the modelling ---------------------------
data_nest_models <- data_nest_sampled_sets_simulated %>%
mutate(model = list(model_op(resampled_set)),
model_tidy = list(broom::tidy(model)),
model_glance = list(broom::glance(model)),
r_squared = r2w(model, resampled_set$value))
# Print return
data_nest_models
# Summarise the models' terms
data_nest_models %>%
summarise(model_tidy,
.groups = "drop")
# Continue with the analysis....
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment