Created
November 22, 2021 10:12
-
-
Save skgrange/60923587d3a39fc9dd440d053b3b7388 to your computer and use it in GitHub Desktop.
Example of training multiple linear regression (MLR) models to explain/predict oxidative potential (OP) by particulate matter (PM) sources as identified by positive matrix factorisation (PMF) using simulated observations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Define the helper functions --------------------------- | |
# https://stats.stackexchange.com/questions/83826/is-a-weighted-r2-in-robust-linear-model-meaningful-for-goodness-of-fit-analys/87085 | |
r2w <- function(model, observed){ | |
SSe <- sum(model$w * (model$resid) ^ 2) | |
observed <- model$resid+model$fitted | |
SSt <- sum((model$w * observed - mean(model$w * observed)) ^ 2) | |
value <- 1 - SSe / SSt; | |
return(value) | |
} | |
# The function which does the pmf-source to OP linkage | |
model_op <- function(df, variables = NA) { | |
if (is.na(variables[1])) { | |
variables <- c( | |
"sulfate_rich", "nitrate_rich", "road_traffic", | |
"primary_biogenic", "wood_combustion", "aged_sea_salt", "mineral_dust", | |
"secondary_biogenic" | |
) | |
} | |
# Drop the missing variables and get names | |
names <- df %>% | |
threadr::drop_na_columns() %>% | |
names() | |
# Drop variables if they do not exist | |
variables <- intersect(variables, names) | |
# Build formula | |
formula <- stringr::str_c( | |
"value ~ ", | |
stringr::str_c(variables, collapse = " + ") | |
) %>% | |
as.formula() | |
# Model | |
list_model <- MASS::rlm( | |
formula, weights = error, data = df, method = "M", maxit = 200 | |
) | |
return(list_model) | |
} | |
# Package set-up and checking --------------------------- | |
# Check if the packages are installed, if not, install them | |
if (!"threadr" %in% rownames(installed.packages())) { | |
# Install helper package | |
install.packages("devtools") | |
# Install threadr from GitHub | |
remotes::install_github("skgrange/threadr") | |
} | |
# Install MASS if needed | |
if (!"MASS" %in% rownames(installed.packages())) { | |
install.packages("MASS") | |
} | |
# Install broom if needed | |
if (!"broom" %in% rownames(installed.packages())) { | |
install.packages("broom") | |
} | |
# Load the helper package | |
library(threadr) | |
# Load an example data set which has been simulated --------------------------- | |
# Load data and group data for iteration, this is a nested tibble | |
# Within the nested set, `value` is the oxidative potential measurement and `error` | |
# is the measurement error of OP | |
data_nest_sampled_sets_simulated <- devtools::source_gist( | |
"https://gist.github.com/skgrange/d8322063d574d8bd4ffe418036c9e557" | |
)$value %>% | |
rowwise(field_campaign, | |
sampled_set, | |
site, | |
site_name, | |
particulate_fraction, | |
variable) | |
# Print the observations | |
data_nest_sampled_sets_simulated | |
# Keep modelling reproducible | |
set.seed(123) | |
# Do the modelling --------------------------- | |
data_nest_models <- data_nest_sampled_sets_simulated %>% | |
mutate(model = list(model_op(resampled_set)), | |
model_tidy = list(broom::tidy(model)), | |
model_glance = list(broom::glance(model)), | |
r_squared = r2w(model, resampled_set$value)) | |
# Print return | |
data_nest_models | |
# Summarise the models' terms | |
data_nest_models %>% | |
summarise(model_tidy, | |
.groups = "drop") | |
# Continue with the analysis.... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment