Skip to content

Instantly share code, notes, and snippets.

@topepo
Last active March 25, 2021 17:45
Show Gist options
  • Save topepo/5420abecd4f5795b103669a7a29522e7 to your computer and use it in GitHub Desktop.
Save topepo/5420abecd4f5795b103669a7a29522e7 to your computer and use it in GitHub Desktop.
A small simulation to see how often PCA loading and score signs flip form data perturbations
suppressPackageStartupMessages(library(tidymodels))
library(furrr)
plan(multisession, workers = 20)
data(cells)
set.seed(1)
split <- initial_split(cells, prop = .95)
cell_train <- training(split)
cell_test <- testing(split)
full_data_rec <-
recipe(class ~ ., data = cell_train) %>%
step_normalize(all_numeric()) %>%
step_pca(all_numeric(), num_comp = 50, id = "pca") %>%
prep()
old_scores <-
bake(full_data_rec, new_data = cell_test, starts_with("PC"), composition = "matrix") %>%
sign()
old_loadings <-
full_data_rec$steps[[2]]$res$rotation %>%
vctrs::vec_slice(1) %>%
sign()
changed_sign <- function(x) {
set.seed(x)
new_pct <- runif(1, min = .9, max = .99)
reduced_data_cec <-
recipe(class ~ ., data = cell_train %>% sample_frac(new_pct)) %>%
step_normalize(all_numeric()) %>%
step_pca(all_numeric(), num_comp = 50, id = "pca") %>%
prep()
new_scores <-
bake(reduced_data_cec, new_data = cell_test, starts_with("PC"), composition = "matrix") %>%
sign()
new_loadings <-
reduced_data_cec$steps[[2]]$res$rotation %>%
vctrs::vec_slice(1) %>%
sign()
percent_change_loadings <- mean(old_loadings != new_loadings)
percent_change_scores <- mean(map2_lgl(old_scores, new_scores, ~ all(.x != .y)))
tibble(percent_change_loadings = percent_change_loadings,
percent_change_scores = percent_change_scores,
version = as.character(packageVersion("recipes")))
}
changed <- future_map_dfr(1:10000, changed_sign, .options = furrr_options(seed = 123))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment