Skip to content

Instantly share code, notes, and snippets.

@MokeEire
Last active December 15, 2020 19:30
Show Gist options
  • Save MokeEire/cb5b8768e544d8f8b22718c245eb75d2 to your computer and use it in GitHub Desktop.
Save MokeEire/cb5b8768e544d8f8b22718c245eb75d2 to your computer and use it in GitHub Desktop.
Build a codebook from a dataframe
library(tidyverse)
library(lubridate)
# The function below was written primarily with the aim of producing a codebook which can be exported to Excel.
# If you are trying to get a quick understanding of your data, str(), glimpse(), or summary() will all likely better serve you
# If you use this and make any improvements, please let me know.
# There's lots of room for this to be improved.
# - Check for presence of each data type, and perform processing accordingly
# - Check if the remaining df is empty after summarising each data type
# I wrote it without any safeguards because I know what I use it for but it could definitely use some more careful argument checking
build_codebook = function(df, factor_levels, description_df){
# This function takes a dataframe as an input,
# partitions the data into factors, numerics, characters, and dates,
# and summarises the data for each group.
#
# Arguments
#
# @df: the data frame to summarise
# @factor_levels: the max number of levels that can be in a factor
# @description_df: a two column dataframe containing the column names in df and a corresponding description of the column name
#
# This summary differs for each group:
# Factors
# - A count of distinct values for each column
# - List distinct values, and their frequency for each column
# - Missingness
# Numerics
# - Summary stats: min, mean, median, max
# - Distinct values
# - Missingness
# Characters
# - A count of distinct values
# - Missingness
# Dates
# - Summary stats: min, mean, median, max
# - Distinct values
# - Missingness
# A dataframe to track remaining columns
# After each variable type, we will remove the variables from this dataframe
# This means that we don't double count character variables as both character and factor
remaining = df
################################
## Factor vars
################################
# Select variables which have at most the number given by factor_levels
factor_vars = df %>%
select(where(~n_distinct(., na.rm=T) < factor_levels))
# Summarise factor variables
factor_summary = factor_vars %>%
# Convert all variables to character in order to pivot their values into the same column
mutate(across(.fns = as.character)) %>%
# Pivot data, one row is a column-value pair
pivot_longer(cols = everything(), names_to = "column", values_to = "value") %>%
# 1. Count obs, missingness for each column
group_by(column) %>%
mutate(n = n(),
missing = sum(is.na(value))) %>%
ungroup() %>%
# 2. Get frequency of each distinct value
count(column, value, n, missing, name = "value_count") %>%
group_by(column) %>%
# Ensure count is a dbl, calculate missingness % and number of distinct values
mutate(min = NA_character_,
mean = NA_character_,
median = NA_character_,
max = NA_character_,
value_count = as.double(value_count),
missing_pct = scales::percent(missing/n),
distinct_values = n_distinct(value, na.rm=T),
type = "factor") %>% # Assign variable type
ungroup() %>%
# Now remove missing values (we have already counted them)
filter(!is.na(value) | distinct_values == 0)
# Remove factor vars from the remaining df
remaining = remaining %>%
select(-colnames(factor_vars))
################################
## Numeric vars
################################
if(
map(remaining, class) %>%
some(~(.=="numeric"))
){
# Select numeric variables
numeric_vars = remaining %>%
select(where(is.numeric))
numeric_summary = numeric_vars %>%
# Summarise values: min, mean, median, max, missingness, and n distinct
summarise(
n = n(),
across(.fns = list(
min = ~round(min(., na.rm=T), 3),
mean = ~round(mean(., na.rm=T), 3),
median = ~round(median(., na.rm=T), 3),
max = ~round(max(., na.rm=T), 3),
missing = ~sum(is.na(.)),
distinct_values = ~n_distinct(., na.rm = T)
)
)
) %>%
# Pivot to have a row per column-measure
pivot_longer(-n, names_to = "col", values_to = "val") %>%
# Separate column-measure concatenation
separate(col = col, into = c("column", "measure"), sep = "_(?=(min|mean|median|max|missing|missing_pct|distinct_values))") %>%
# Pivot measures to their own columns
pivot_wider(id_cols = c(column, n), names_from = measure, values_from = val) %>%
mutate(type = "numeric",
# Calculate missingness percent
missing_pct = scales::percent(missing/n, accuracy = .01)) %>%
# Convert numeric measures to character to allow min/max of dates
mutate(across(.cols = matches("^(min|mean|median|max)$"), .fns = as.character))
# Remove numerics from remaining df
remaining = remaining %>%
select(-colnames(numeric_vars))
}
################################
## Character vars
################################
if(
map(remaining, class) %>%
some(~(.=="character"))
){
# Select character vars
character_vars = remaining %>%
select(where(is.character))
character_summary = character_vars %>%
summarise(
n = n(),
across(.cols = -n,
.fns = list(
missing = ~sum(is.na(.)),
distinct_values = ~n_distinct(., na.rm = T)
)
)
) %>%
# Pivot to have a row per column-measure
pivot_longer(-n, names_to = "col", values_to = "val") %>%
# Separate column-measure concatenation
separate(col = col, into = c("column", "measure"), sep = "_(?=(missing|missing_pct|distinct_values))") %>%
# Pivot measures to their own columns
pivot_wider(id_cols = c(column, n), names_from = measure, values_from = val) %>%
mutate(min = NA_character_,
mean = NA_character_,
median = NA_character_,
max = NA_character_,
type = "character",
# Calculate missingness percent
missing_pct = scales::percent(missing/n, accuracy = .01))
# Remove characters from remaining df
remaining = remaining %>%
select(-colnames(character_vars))
}
################################
## Date vars
################################
if(
map(remaining, class) %>%
some(~(.=="date"))
){
# Select date vars
date_vars = remaining %>%
select(where(is.POSIXct)) %>%
mutate(across(
.fns = ~date(.)
))
date_summary = date_vars %>%
# Summarise values: min, mean, median, max, missingness, and n distinct
summarise(
n = n(),
across(.cols = -n,
.fns = list(
min = ~min(., na.rm=T),
mean = ~mean(., na.rm=T),
median = ~median(., na.rm=T),
max = ~max(., na.rm=T),
missing = ~sum(is.na(.)),
distinct_values = ~n_distinct(., na.rm = T)
)
)
) %>%
# Convert all variables to character in order to pivot their values into the same column
mutate(across(.cols = -n, .fns = as.character)) %>%
# Pivot to have a row per column-measure
pivot_longer(-n, names_to = "col", values_to = "val") %>%
# Separate column-measure concatenation
separate(col = col, into = c("column", "measure"), sep = "_(?=(min|mean|median|max|missing|missing_pct|distinct_values))") %>%
# Pivot to have a row per column
pivot_wider(id_cols = c(column, n), names_from = measure, values_from = val) %>%
mutate(type = "date",
across(.cols = c(missing, distinct_values), .fns = as.numeric),
# Calculate missingness percent
missing_pct = scales::percent(missing/n, accuracy = .01))
# Remove dates from remaining df
remaining = remaining %>%
select(-colnames(date_vars))
}
########################################
## Join each summary together
########################################
codebook = list("factor_summary",
"numeric_summary",
"character_summary",
"date_summary") %>%
# Keep the dataframe objects which exist
keep(exists, envir = rlang::env_parent()) %>%
# Evaluate them by their name and bind them into a dataframe
map_dfr(~eval(sym(.))) %>%
# Remove n
filter(column != "n") %>%
select(column, type, value, value_count, distinct_values, missing, missing_pct, min, mean, median, max) %>%
arrange(column, value)
# Include a description
if(!missing(description_df)){
codebook = left_join(codebook, description_df, by = "column") %>%
select(column, type, description, missing, missing_pct,
distinct_values, value, value_count,
min, mean, median, max) %>%
# Make sure descriptive stats are not characters
mutate(across(.cols = c(min, mean, median, max), .fns = ~suppressWarnings(as.numeric(.)))) %>%
arrange(column, value)
}
return(codebook)
}
# > build_codebook(mtcars, factor_levels = 4)
# A tibble: 17 x 11
# column type value value_count distinct_values missing missing_pct min mean median max
# <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
# 1 am factor 0 19 2 0 0.00% NA NA NA NA
# 2 am factor 1 13 2 0 0.00% NA NA NA NA
# 3 carb numeric NA NA 6 0 0.00% 1 2.812 2 8
# 4 cyl factor 4 11 3 0 0.00% NA NA NA NA
# 5 cyl factor 6 7 3 0 0.00% NA NA NA NA
# 6 cyl factor 8 14 3 0 0.00% NA NA NA NA
# 7 disp numeric NA NA 27 0 0.00% 71.1 230.722 196.3 472
# 8 drat numeric NA NA 22 0 0.00% 2.76 3.597 3.695 4.93
# 9 gear factor 3 15 3 0 0.00% NA NA NA NA
# 10 gear factor 4 12 3 0 0.00% NA NA NA NA
# 11 gear factor 5 5 3 0 0.00% NA NA NA NA
# 12 hp numeric NA NA 22 0 0.00% 52 146.688 123 335
# 13 mpg numeric NA NA 25 0 0.00% 10.4 20.091 19.2 33.9
# 14 qsec numeric NA NA 30 0 0.00% 14.5 17.849 17.71 22.9
# 15 vs factor 0 18 2 0 0.00% NA NA NA NA
# 16 vs factor 1 14 2 0 0.00% NA NA NA NA
# 17 wt numeric NA NA 29 0 0.00% 1.513 3.217 3.325 5.424
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment