# Created January 27, 2021 02:51
# Save nikdata/ce4c72550f184e6bc20d6c98475e5a5d to your computer and use it in GitHub Desktop.
# Using {reticulate}
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Setup ----
# Packages attached for the whole script. Other packages (purrr, tidyr,
# rsample, recipes) are called through `pkg::` and are not attached here.
# NOTE(review): {extrafont} is presumably attached so the "Bahnschrift" font
# family used by the plot themes can be resolved - confirm the font is
# registered on the target machine.
library(reticulate)
library(palmerpenguins)
library(dplyr)
library(ggplot2)
library(extrafont)

# Working copy of the penguins data set shipped with {palmerpenguins}.
df_pgs <- palmerpenguins::penguins
dplyr::glimpse(df_pgs)
# number missing
# One row per column of df_pgs: `variable` holds the column name and `value`
# the number of NA entries in that column.
df_pgs %>%
  summarise(across(everything(), function(x) sum(is.na(x)))) %>%
  tidyr::pivot_longer(cols = everything(), names_to = "variable")

# number of unique values
# Same layout as above, with `value` holding the count of distinct values
# (n_distinct() counts NA as a value of its own).
df_pgs %>%
  summarise(across(everything(), n_distinct)) %>%
  tidyr::pivot_longer(cols = everything(), names_to = "variable")
# get count by group
# count() groups, tallies and returns an ungrouped tibble with the tally in
# column `n`, so no grouping is left behind on the result.
df_pgs %>%
  count(species)

# species count by island
df_pgs %>%
  count(island, species)
# plotting

# Dark theme shared by the exploratory plots below.
#
# All text elements use the "Bahnschrift" font family; the plot background is
# dark grey with thin grey grid lines and the legend is drawn inside the
# panel.
#
# @param legend_position Numeric vector of length 2 handed to
#   `theme(legend.position = )`, i.e. the legend's relative x/y position
#   inside the plot.
# @return A ggplot2 theme object that can be added to a plot with `+`.
#
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines and `legend.position.inside` for numeric positions; the original
# arguments are kept so the script still runs on the ggplot2 version it was
# written against - confirm before modernising.
theme_pgs_dark <- function(legend_position) {
  font_family <- "Bahnschrift"
  grid_line <- element_line(color = "grey30", size = 0.2)

  theme(
    plot.title = element_text(family = font_family, color = "grey100"),
    plot.caption = element_text(family = font_family, color = "grey60"),
    axis.title = element_text(family = font_family, color = "grey100"),
    axis.text.x = element_text(family = font_family, color = "grey70"),
    axis.text.y = element_text(family = font_family, color = "grey70"),
    plot.background = element_rect(fill = "grey10"),
    panel.background = element_blank(),
    panel.grid.major = grid_line,
    panel.grid.minor = grid_line,
    legend.background = element_rect(fill = "grey20"),
    legend.key = element_blank(),
    legend.title = element_text(family = font_family, color = "grey80"),
    legend.text = element_text(family = font_family, color = "grey90"),
    legend.position = legend_position
  )
}

# Scatter plot of bill depth against bill length, coloured by species.
# Observations outside the axis limits are dropped by the scales.
df_pgs %>%
  ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(aes(color = species)) +
  scale_x_continuous(limits = c(30, 60)) +
  scale_y_continuous(limits = c(12, 22), breaks = seq(12, 22, 2)) +
  labs(
    x = "Bill Length (mm)",
    y = "Bill Depth (mm)",
    color = "Species",
    title = "Bill Depth vs Length",
    caption = "Source: {palmerpenguins} package"
  ) +
  theme_pgs_dark(legend_position = c(0.9, 0.2))

# Box plots of body mass by sex, one box per species within each sex.
df_pgs %>%
  ggplot(aes(x = sex, y = body_mass_g)) +
  geom_boxplot(aes(fill = species), color = "steelblue") +
  labs(
    x = "Sex",
    y = "Body Mass (grams)",
    title = "Body Mass of Species by Sex",
    caption = "Source: {palmerpenguins} package"
  ) +
  theme_pgs_dark(legend_position = c(0.90, 0.9))
# values are already encoded as factors
# want to use recipes and SKLEARN for modeling

# split into training/test
# Fixed seed so the random 75% / 25% split is reproducible between runs.
set.seed(1337)
pg_split <- rsample::initial_split(df_pgs, prop = 0.75)
pg_train <- rsample::training(pg_split)
pg_test <- rsample::testing(pg_split)
# define recipe
# Preprocessing pipeline, estimated on the training split only:
#   1. copy the species labels to `tmp_species` before they are encoded
#   2. mean-impute the four continuous body measurements
#   3. turn missing `sex` into an explicit "unknown" level
#   4. one-hot encode `sex` and `island`, keeping the original columns
#   5. integer-encode the response `species`
base_recipe <- recipes::recipe(species ~ ., data = pg_train) %>%
  recipes::step_mutate(tmp_species = species) %>%
  # step_impute_mean() is the current name of the former step_meanimpute()
  recipes::step_impute_mean(
    bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g,
    id = "impute_missing_continuous"
  ) %>%
  recipes::step_factor2string(sex) %>%
  recipes::step_mutate(sex = ifelse(is.na(sex), "unknown", sex)) %>%
  recipes::step_string2factor(
    sex,
    levels = c("male", "female", "unknown")
  ) %>%
  # `keep_original_cols` replaces the deprecated `preserve` argument. The
  # `levels` argument of step_dummy() is filled in by prep() from the training
  # data, so it is no longer passed by hand.
  recipes::step_dummy(sex, one_hot = TRUE, keep_original_cols = TRUE) %>%
  recipes::step_integer(
    species,
    strict = TRUE,
    id = "label encode response variable"
  ) %>%
  recipes::step_dummy(
    island,
    id = "sparse encoding of islad variable",
    one_hot = TRUE,
    keep_original_cols = TRUE
  ) %>%
  recipes::prep(training = pg_train)

base_recipe
summary(base_recipe)
# juice the recipe
# juice() returns the processed training data stored in the prepped recipe;
# bake() applies the same trained steps to the held-out test split.
cln_train <- recipes::juice(base_recipe)
cln_test <- recipes::bake(base_recipe, new_data = pg_test)

# imbalance?
# Class sizes in the processed training data, showing the original label
# (`tmp_species`) next to its integer encoding (`species`).
cln_train %>%
  count(tmp_species, species)

cln_train %>%
  count(island, tmp_species, species)

glimpse(cln_train)
glimpse(cln_test)
# import sklearn & smote
# Python modules accessed through reticulate; each handle exposes the
# module's classes and functions via `$`.
pysmote_over <- reticulate::import("imblearn.over_sampling")
pysmote_under <- reticulate::import("imblearn.under_sampling")
pysmote_pipe <- reticulate::import("imblearn.pipeline")
sklearn_modelselection <- reticulate::import("sklearn.model_selection")
sklearn_ensemble <- reticulate::import("sklearn.ensemble")
sklearn_metrics <- reticulate::import("sklearn.metrics")
# rebalance the dataset
cln_train %>%
  count(species, tmp_species)

# Target 100 rows per class: SMOTE over-samples Chinstrap and Gentoo, then
# random under-sampling trims Adelie. R's set.seed() does not seed Python's
# random number generators, so `random_state` is set explicitly to make the
# resampling reproducible.
over <- pysmote_over$SMOTE(
  sampling_strategy = reticulate::dict("Chinstrap" = 100L, "Gentoo" = 100L),
  random_state = 1337L
)
under <- pysmote_under$RandomUnderSampler(
  sampling_strategy = reticulate::dict("Adelie" = 100L),
  random_state = 1337L
)

# Pipeline steps are (name, estimator) pairs; build them as real Python
# tuples rather than relying on how c() combines a string and a Python object.
steps <- list(
  reticulate::tuple("o", over),
  reticulate::tuple("u", under)
)
pipeline <- pysmote_pipe$Pipeline(steps = steps)

# The original factor columns `island` and `sex` are dropped from X (their
# one-hot columns stay); the class labels are taken from `tmp_species` so they
# match the keys of the sampling strategies above.
df_rebal <- pipeline$fit_resample(
  X = cln_train %>% select(-tmp_species, -island, -sex),
  y = cln_train %>% select(tmp_species)
)

# fit_resample() returns the resampled predictors first, then the labels.
nb_pred <- as_tibble(df_rebal[[1]])
nb_resp <- as_tibble(df_rebal[[2]])
cln_bal <- bind_cols(nb_resp, nb_pred)

cln_bal %>%
  count(tmp_species, species)
# define the hypertuning parameter grid
# Every combination of these values is evaluated by the grid search below.
param_grid <- reticulate::dict(
  "bootstrap" = list(TRUE),
  "max_depth" = seq(1L, 4L, 1L),
  "max_features" = seq(2L, 6L, 1L),
  "min_samples_split" = seq(2L, 5L, 1L),
  "n_estimators" = c(250L, 500L)
)

# `random_state` makes the forest reproducible; R's set.seed() has no effect
# on the Python side.
rf_mdl <- sklearn_ensemble$RandomForestClassifier(random_state = 1337L)

# 5-fold cross-validated grid search scored on accuracy, using all cores
# (n_jobs = -1); refit = TRUE refits the best combination on the full data.
grid_search <- sklearn_modelselection$GridSearchCV(
  estimator = rf_mdl,
  param_grid = param_grid,
  cv = 5L,
  n_jobs = -1L,
  verbose = 2L,
  scoring = "accuracy",
  refit = TRUE
)

# Fit on the rebalanced data; both encodings of the response are removed from
# the predictors.
grid_search$fit(
  X = cln_bal %>% select(-tmp_species, -species),
  y = cln_bal$species
)

grid_search$best_params_
best_grid <- grid_search$best_estimator_
# make train/test predictions
# Drop the original factor columns and both encodings of the response so only
# predictor columns are handed to the fitted model.
x_train <- cln_train %>% select(-island, -tmp_species, -sex, -species)
x_test <- cln_test %>% select(-island, -tmp_species, -sex, -species)

ypred_train <- best_grid$predict(x_train)
ypred_test <- best_grid$predict(x_test)

# sklearn metrics
# Accuracy and weighted F1 on the (un-resampled) training split ...
sklearn_metrics$accuracy_score(y_true = cln_train$species, y_pred = ypred_train)
sklearn_metrics$f1_score(cln_train$species, ypred_train, average = "weighted")

# ... and on the held-out test split.
sklearn_metrics$accuracy_score(y_true = cln_test$species, y_pred = ypred_test)
sklearn_metrics$f1_score(cln_test$species, ypred_test, average = "weighted")
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment