Keith McNulty keithmcnulty

## load_data.py
import pandas as pd
import os
import glob
import opendatasets as od

# Kaggle URL of the dataset to fetch (the "nyt-comments" dataset)
dataset = 'https://www.kaggle.com/datasets/aashita/nyt-comments/'

# Download the dataset (about 480 MB) with opendatasets
od.download(dataset)

## python_functions.py
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# split data into train and test sets
def split_data(df: pd.DataFrame, parameters: dict) -> dict:

## tentidy10-2.R
library(patchwork)

# generate scatter for entire dataset
p_all <- scatter_fn(penguins, bill_length_mm, bill_depth_mm, "All Species")

# Pull each species' scatter plot out of the penguin_scatters list column and
# bind it to p_1, p_2, ... in the current environment. The loop bound comes
# from the column itself rather than a hard-coded 1:3, so it also copes with
# an empty column or a different number of species.
for (i in seq_along(penguin_scatters$plot)) {
  assign(paste("p", i, sep = "_"),
         penguin_scatters$plot[[i]])
}

## tentidy10-1.R
# Generic helper for a simple ggplot2 scatter plot with a smoothed trend line.
#
# Args:
#   df:    data frame holding the columns to plot.
#   col1:  unquoted column name mapped to the x axis.
#   col2:  unquoted column name mapped to the y axis.
#   title: plot title.
#
# Returns: the ggplot object built from the layers below.
scatter_fn <- function(df, col1, col2, title) {
  # Every ggplot2 function, including aes(), is namespace-qualified and the
  # data frame is passed directly, so the helper does not depend on ggplot2
  # or a pipe operator being attached by the caller.
  ggplot2::ggplot(df, ggplot2::aes(x = {{ col1 }}, y = {{ col2 }})) +
    ggplot2::geom_point() +
    ggplot2::geom_smooth() +
    ggplot2::labs(title = title)
}

# run function across species and store plots in a list column

## tentidy9-2.R
# Nest the data by species: one row per species, with the remaining
# columns gathered into a `data` list column
nest_by(penguins, species)

# A tibble: 3 x 2
# Rowwise:  species
  species                 data
  <fct>     <list<tbl_df[,7]>>
1 Adelie             [152 × 7]
2 Chinstrap           [68 × 7]
3 Gentoo             [124 × 7]

## tentidy9-1.R
# Group by species, nest each group's rows into a list column,
# then switch the result to row-wise operation
dplyr::rowwise(
  tidyr::nest(
    dplyr::group_by(penguins, species)
  )
)

# A tibble: 3 x 2
# Rowwise:  species
  species   data
  <fct>     <list>
1 Adelie    <tibble [152 × 7]>

## tentidy8-2.R
library(broom)

# Fit one linear model of body mass per species and keep only the
# model statistics returned by broom::glance()
penguin_models <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  broom::glance(
    lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm)
  )
)

penguin_models

# A tibble: 3 x 13
  species   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC  deviance df.residual  nobs

## tentidy8-1.R
# Fit one linear model of body mass per species and store each fitted
# model object in a `model` list column
penguin_models <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  model = list(
    lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm)
  )
)

penguin_models

# A tibble: 3 x 2
  species   model
  <fct>     <list>
1 Adelie    <lm>

## tentidy7.R
# Per-species mean and standard deviation of every column ending in "mm",
# with compact output names such as billlength_mean / billlength_sd.
# across() and ends_with() are namespace-qualified like the other dplyr
# calls, and .names uses the documented {.col} / {.fn} glue placeholders.
penguin_stats <- penguins %>%
  dplyr::group_by(species) %>%
  dplyr::summarise(dplyr::across(dplyr::ends_with("mm"),
                                 list(mean = ~mean(.x, na.rm = TRUE), sd = ~sd(.x, na.rm = TRUE)), # name summary functions
                                 .names = "{gsub('_|_mm', '', .col)}_{.fn}")) # structure for summarised column names

penguin_stats

# A tibble: 3 x 7
  species   billlength_mean billlength_sd billdepth_mean billdepth_sd flipperlength_mean flipperlength_sd

## tentidy6.R
# Per-species summary of every column ending in "mm". The summary functions
# are unnamed, so the output columns get the numeric suffixes _1 and _2.
penguin_stats <- dplyr::summarise(
  dplyr::group_by(penguins, species),
  across(
    ends_with("mm"),                                      # any column ending in mm
    list(~mean(.x, na.rm = TRUE), ~sd(.x, na.rm = TRUE))  # mean first, then sd
  )
)

penguin_stats

# A tibble: 3 x 7
  species   bill_length_mm_1 bill_length_mm_2 bill_depth_mm_1 bill_depth_mm_2 flipper_length_mm_1 flipper_length_mm_2
  <fct>                <dbl>            <dbl>           <dbl>           <dbl>               <dbl>               <dbl>
	import pandas as pd
	import os
	import glob
	import opendatasets as od

	# dataset URL
	dataset = 'https://www.kaggle.com/datasets/aashita/nyt-comments/'

	# Using opendatasets let's download the data sets (480 MB)
	od.download(dataset)
	import pandas as pd
	from scipy import stats
	from sklearn.preprocessing import StandardScaler
	from sklearn.model_selection import RandomizedSearchCV, KFold
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import classification_report
	from xgboost import XGBClassifier

	# split data into train and test sets
	def split_data(df: pd.DataFrame, parameters: dict) -> dict:
	library(patchwork)

	# generate scatter for entire dataset
	p_all <- scatter_fn(penguins, bill_length_mm, bill_depth_mm, "All Species")

	# get species scatters from penguin_scatters dataframe
	for (i in 1:3) {
	assign(paste("p", i, sep = "_"),
	penguin_scatters$plot[i][[1]])
	}
	# generic function for generating a simple scatter plot in ggplot2
	scatter_fn <- function(df, col1, col2, title) {
	df %>%
	ggplot2::ggplot(aes(x = {{col1}}, y = {{col2}})) +
	ggplot2::geom_point() +
	ggplot2::geom_smooth() +
	ggplot2::labs(title = title)
	}

	# run function across species and store plots in a list column
	penguins %>%
	nest_by(species)

	# A tibble: 3 x 2
	# Rowwise: species
	species data
	<fct> <list<tbl_df[,7]>>
	1 Adelie [152 × 7]
	2 Chinstrap [68 × 7]
	3 Gentoo [124 × 7]
	penguins %>%
	dplyr::group_by(species) %>%
	tidyr::nest() %>%
	dplyr::rowwise()

	# A tibble: 3 x 2
	# Rowwise: species
	species data
	<fct> <list>
	1 Adelie <tibble [152 × 7]>
	library(broom)

	penguin_models <- penguins %>%
	dplyr::group_by(species) %>%
	dplyr::summarise(broom::glance(lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm))) # summarise model stats

	penguin_models

	# A tibble: 3 x 13
	species r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
	penguin_models <- penguins %>%
	dplyr::group_by(species) %>%
	dplyr::summarise(model = list(lm(body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm))) # store models in a list column

	penguin_models

	# A tibble: 3 x 2
	species model
	<fct> <list>
	1 Adelie <lm>
	# Per-species mean and standard deviation of every column ending in "mm".
	# The pattern passed to gsub() is '_|_mm': the backslash that had crept in
	# before the pipe is an unrecognized escape in an R string and stopped the
	# snippet from parsing.
	penguin_stats <- penguins %>%
	  dplyr::group_by(species) %>%
	  dplyr::summarise(across(ends_with("mm"),
	                          list(mean = ~mean(.x, na.rm = TRUE), sd = ~sd(.x, na.rm = TRUE)), # name summary functions
	                          .names = "{gsub('_|_mm', '', col)}_{fn}")) # structure for summarised column names

	penguin_stats

	# A tibble: 3 x 7
	species billlength_mean billlength_sd billdepth_mean billdepth_sd flipperlength_mean flipperlength_sd