This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import glob | |
import opendatasets as od | |
# Kaggle URL of the nyt-comments dataset
dataset = 'https://www.kaggle.com/datasets/aashita/nyt-comments/'
# Fetch the dataset through opendatasets (the download is about 480 MB)
od.download(dataset)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from scipy import stats | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.model_selection import RandomizedSearchCV, KFold | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import classification_report | |
from xgboost import XGBClassifier | |
# split data into train and test sets
def split_data(df: pd.DataFrame, parameters: dict) -> dict: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(patchwork)

# Scatter plot of bill length against bill depth for the entire dataset.
p_all <- scatter_fn(penguins, bill_length_mm, bill_depth_mm, "All Species")

# Pull each stored plot out of the `plot` list column of penguin_scatters and
# bind it to p_1, p_2, ... in the current environment.
# seq_along() follows the actual number of stored plots instead of assuming
# exactly three, and does nothing when the column is empty.
for (i in seq_along(penguin_scatters$plot)) {
  assign(paste0("p_", i), penguin_scatters$plot[[i]])
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Generic helper for a simple ggplot2 scatter plot
#'
#' Draws the points of `col1` against `col2` and overlays a smoothed trend
#' layer using `ggplot2::geom_smooth()` with its default settings.
#'
#' @param df Data frame holding the columns to plot.
#' @param col1 Unquoted column name mapped to the x axis.
#' @param col2 Unquoted column name mapped to the y axis.
#' @param title Plot title passed to `ggplot2::labs()`.
#' @return The ggplot object built from the layers above.
scatter_fn <- function(df, col1, col2, title) {
  # aes() is namespace-qualified like every other ggplot2 call here, so the
  # function no longer depends on ggplot2 (or a pipe operator) being attached.
  # The {{ }} embrace forwards the caller's bare column names into aes().
  ggplot2::ggplot(df, ggplot2::aes(x = {{ col1 }}, y = {{ col2 }})) +
    ggplot2::geom_point() +
    ggplot2::geom_smooth() +
    ggplot2::labs(title = title)
}
# run function across species and store plots in a list column |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One row per species, with that species' rows nested in the `data` column.
nest_by(penguins, species)
# A tibble: 3 x 2 | |
# Rowwise: species | |
species data | |
<fct> <list<tbl_df[,7]>> | |
1 Adelie [152 × 7] | |
2 Chinstrap [68 × 7] | |
3 Gentoo [124 × 7] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Same steps written as nested calls: group by species, nest each group's
# rows into a list column, then switch the result to row-wise operation.
dplyr::rowwise(
  tidyr::nest(
    dplyr::group_by(penguins, species)
  )
)
# A tibble: 3 x 2 | |
# Rowwise: species | |
species data | |
<fct> <list> | |
1 Adelie <tibble [152 × 7]> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(broom)

# Fit one linear model of body mass per species and keep only the model
# statistics returned by glance(). The formula stays inline so that lm()
# resolves its variables against the columns of the current group.
penguin_models <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  broom::glance(
    lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm)
  )
)

penguin_models
# A tibble: 3 x 13 | |
species r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit one linear model of body mass per species and store the fitted model
# objects themselves in a list column named `model`.
penguin_models <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  model = list(
    lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm)
  )
)

penguin_models
# A tibble: 3 x 2 | |
species model | |
<fct> <list> | |
1 Adelie <lm> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-species mean and standard deviation of every column whose name ends
# in "mm", with tidy output column names.
penguin_stats <- penguins %>%
  dplyr::group_by(species) %>%
  dplyr::summarise(dplyr::across(
    dplyr::ends_with("mm"),
    # named list: each name becomes the {.fn} part of the output column name
    list(mean = ~ mean(.x, na.rm = TRUE), sd = ~ sd(.x, na.rm = TRUE)),
    # {.col}/{.fn} are the documented glue placeholders. The pattern tries the
    # "_mm" suffix before a lone "_", so the result does not rely on how the
    # regex engine orders alternatives: bill_length_mm -> billlength_mean
    .names = "{gsub('_mm$|_', '', .col)}_{.fn}"
  ))
penguin_stats
# A tibble: 3 x 7 | |
species billlength_mean billlength_sd billdepth_mean billdepth_sd flipperlength_mean flipperlength_sd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-species mean and sd of every column ending in "mm". The functions are
# left unnamed on purpose, so the output columns get the positional suffixes
# _1 (mean) and _2 (sd).
penguin_stats <- penguins %>%
  dplyr::group_by(species) %>%
  dplyr::summarise(
    across(
      ends_with("mm"),
      list(
        function(x) mean(x, na.rm = TRUE),
        function(x) sd(x, na.rm = TRUE)
      )
    )
  )
penguin_stats
# A tibble: 3 x 7 | |
species bill_length_mm_1 bill_length_mm_2 bill_depth_mm_1 bill_depth_mm_2 flipper_length_mm_1 flipper_length_mm_2 | |
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> |
NewerOlder