Skip to content

Instantly share code, notes, and snippets.

@expersso
Last active September 12, 2018 15:11
Show Gist options
  • Save expersso/b3da2c02edbd4a3c9c5c9c7d44ddff27 to your computer and use it in GitHub Desktop.
Save expersso/b3da2c02edbd4a3c9c5c9c7d44ddff27 to your computer and use it in GitHub Desktop.
Wrangle complicated spreadsheet into tidy dataframe
# Inspired by http://www.brodrigues.co/blog/2018-09-11-human_to_machine/
library(tidyverse)
library(readxl)
# Extract all data from one sheet into tidy dataframe
get_sheet <- function(sheet, path = "time-use.xlsx") {
ranges <- list(
col = "C3:BS5",
row = "A6:B58",
data = "C6:BS58"
)
# Read the three different ranges into a list
x <- imap(ranges, ~read_excel(path, sheet, ., FALSE))
# Extract column names
x$col <- x$col %>%
discard(~all(is.na(.))) %>%
t() %>%
as.data.frame() %>%
set_names(c("population", "subgroup", "unit")) %>%
fill(population, subgroup, unit)
# Extract row names
x$row <- x$row %>%
unite(row, 1:2, sep = "_") %>%
pull()
# Extract actual data
x$data <- x$data %>%
discard(~all(is.na(.))) %>%
t()
# Combine into tibble
cbind(x$col, x$data) %>%
set_names(c(names(x$col), x$row)) %>%
gather(id, value, -c(1:3)) %>%
separate(id, c("activity_id", "activity"), sep = "_") %>%
tbl_df()
}
# Read data from all relevant sheets into one dataframe
df_full <- excel_sheets("time-use.xlsx")[7:10] %>%
set_names() %>%
imap_dfr(~get_sheet(.), .id = "day")
# Extract a subset of the data to match original blog post
df_sub <- df_full %>%
filter(subgroup == "Total") %>%
select(day, population, activity, time = value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment