Skip to content

Instantly share code, notes, and snippets.

View keithmcnulty's full-sized avatar

Keith McNulty keithmcnulty

View GitHub Profile
@keithmcnulty
keithmcnulty / python_functions.py
Created April 12, 2021 12:55
Functions for running k-fold cross-validated XGBoost on an arbitrary dataset
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
# split data into train and test sets
def split_data(df: pd.DataFrame, parameters: dict) -> dict:
library(patchwork)

# scatter of bill depth (y) against bill length (x) over the whole
# penguins dataset, titled "All Species"
p_all <- scatter_fn(
  df = penguins,
  col1 = bill_length_mm,
  col2 = bill_depth_mm,
  title = "All Species"
)
# pull the first three entries of the plot column of penguin_scatters and
# bind them to the names p_1, p_2 and p_3 in the current environment
for (i in seq_len(3)) {
  assign(
    x = paste("p", i, sep = "_"),
    value = penguin_scatters$plot[i][[1]]
  )
}
#' Generic helper for a simple ggplot2 scatter plot
#'
#' Every ggplot2 function, including `aes()`, is namespace-qualified and the
#' data frame is passed to `ggplot2::ggplot()` directly, so the helper does
#' not depend on ggplot2 or a pipe operator being attached by the caller.
#'
#' @param df Data frame passed to `ggplot2::ggplot()`.
#' @param col1 Unquoted column of `df` mapped to the x axis.
#' @param col2 Unquoted column of `df` mapped to the y axis.
#' @param title Plot title passed to `ggplot2::labs()`.
#' @return The ggplot object made of a point layer, a `geom_smooth()` layer
#'   (with its default settings) and the title.
scatter_fn <- function(df, col1, col2, title) {
  ggplot2::ggplot(df, ggplot2::aes(x = {{ col1 }}, y = {{ col2 }})) +
    ggplot2::geom_point() +
    ggplot2::geom_smooth() +
    ggplot2::labs(title = title)
}
# run function across species and store plots in a list column
# NOTE(review): only the nesting-by-species step is visible here; the step
# that builds the plots is not shown -- confirm against the full script
dplyr::nest_by(penguins, species)
# A tibble: 3 x 2
# Rowwise: species
species data
<fct> <list<tbl_df[,7]>>
1 Adelie [152 × 7]
2 Chinstrap [68 × 7]
3 Gentoo [124 × 7]
# group by species, collapse each group's rows into a nested list column,
# then make the result rowwise
dplyr::rowwise(
  tidyr::nest(
    dplyr::group_by(penguins, species)
  )
)
# A tibble: 3 x 2
# Rowwise: species
species data
<fct> <list>
1 Adelie <tibble [152 × 7]>
library(broom)

# within each species, regress body mass on flipper length, bill length and
# bill depth, and keep the broom::glance() model statistics as columns
penguin_models <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  broom::glance(
    lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm)
  )
)
penguin_models
# A tibble: 3 x 13
species r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
# within each species, regress body mass on flipper length, bill length and
# bill depth, and keep the fitted lm objects in a list column named model
penguin_models <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  model = list(
    lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm)
  )
)
penguin_models
# A tibble: 3 x 2
species model
<fct> <list>
1 Adelie <lm>
# per-species mean and sd (NAs removed) of every column ending in "mm";
# the functions are named so the names can be used in the output columns
penguin_stats <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  across(
    .cols = ends_with("mm"),
    .fns = list(
      mean = function(x) mean(x, na.rm = TRUE),
      sd = function(x) sd(x, na.rm = TRUE)
    ),
    # output name = column name with underscores and the mm unit stripped,
    # followed by the function name, e.g. billlength_mean
    .names = "{gsub('_|_mm', '', col)}_{fn}"
  )
)
penguin_stats
# A tibble: 3 x 7
species billlength_mean billlength_sd billdepth_mean billdepth_sd flipperlength_mean flipperlength_sd
# per-species mean and sd (NAs removed) of every column ending in "mm";
# the functions are unnamed, so output columns get positional suffixes
# (_1 for the mean, _2 for the sd)
penguin_stats <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  across(
    .cols = ends_with("mm"),
    .fns = list(
      function(x) mean(x, na.rm = TRUE),
      function(x) sd(x, na.rm = TRUE)
    )
  )
)
penguin_stats
# A tibble: 3 x 7
species bill_length_mm_1 bill_length_mm_2 bill_depth_mm_1 bill_depth_mm_2 flipper_length_mm_1 flipper_length_mm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# spread penguins_long back to wide form: one new column per combination of
# part, measure and unit (joined with "_"), filled from the value column
penguins_wide <- tidyr::pivot_wider(
  penguins_long,
  names_from = c("part", "measure", "unit"),
  values_from = "value",
  names_sep = "_"
)
penguins_wide
# A tibble: 344 x 9
# Groups: species, island, sex, year [35]
species island sex year penguinid bill_length_mm bill_depth_mm flipper_length_mm body_mass_g