Skip to content

Instantly share code, notes, and snippets.

@nacnudus
Created September 15, 2018 10:08
Show Gist options
  • Save nacnudus/4880ccde1867041a6f85341406556d26 to your computer and use it in GitHub Desktop.
Save nacnudus/4880ccde1867041a6f85341406556d26 to your computer and use it in GitHub Desktop.
Tidy a spreadsheet of the Luxembourg Time Use Survey with experimental unpivotr branch
# Inspired by http://www.brodrigues.co/blog/2018-09-11-human_to_machine/
# https://twitter.com/brodriguesco/status/1039604517287931904
# "You can find the data I will use here. Click on the “Time use” folder and you can download the workbook."
# http://statistiques.public.lu/stat/ReportFolders/ReportFolder.aspx?IF_Language=eng&MainTheme=3&FldrName=1&RFPath=14306
# This time using experimental unpivotr code to allow custom filtering of header cells, rather than having to reposition them.
# https://github.com/nacnudus/unpivotr/commit/0961ec3c3e17b34755f0fce94db7f5bf380d43ce
library(tidyverse)
library(tidyxl)
library(unpivotr)
library(lubridate)
path <- "./download.xlsx"
formats <- xlsx_formats(path)
cells <-
path %>%
xlsx_cells() %>%
# Drop French and Index sheets
dplyr::filter(str_detect(sheet, "day$")) %>%
# Clean character values
mutate(character = str_trim(character)) %>%
# Drop empty cells
dplyr::filter(data_type != "blank",
!(data_type == "character" && character == "")) %>%
# Drop total rows
dplyr::filter(row <= 58L) %>%
# Fix time values expressed as dates rather than character
mutate(character = if_else(data_type == "date", "00:00", character),
data_type = if_else(data_type == "date", "character", data_type))
is_bold <- function(cells, formats) {
formats$local$font$bold[cells$local_format_id]
}
# Tidy every sheet
tidy_sheet <- function(cells) {
series <- dplyr::filter(cells, row == 1L, col == 1L)$character
cells %>%
dplyr::filter(row >= 2L) %>%
behead("WNW", "activity_category_id",
header_filter = is_bold, .args = list(formats = formats)) %>%
behead("W", "activity_subcategory_id") %>%
behead("WNW", "activity_category",
header_filter = is_bold, .args = list(formats = formats)) %>%
behead("W", "activity_subcategory") %>%
behead("NNW", "grouping") %>%
behead("NNW", "group") %>%
behead("NNW", "metric") %>%
behead("N", "unit") %>%
select(-row, -col, -local_format_id) %>%
spatter(unit) %>% # like tidyr::spread() by handles mixed data types
mutate(Time = as.integer(as.duration(lubridate::hm(Time))))
}
tidy_data <-
cells %>%
select(sheet, row, col, data_type, character, numeric, local_format_id) %>%
nest(-sheet) %>%
mutate(data = map(data, tidy_sheet)) %>%
unnest()
tidy_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment