nacnudus/luxembourg-time-use-unpivotr-experimental.R

## luxembourg-time-use-unpivotr-experimental.R
# Inspired by http://www.brodrigues.co/blog/2018-09-11-human_to_machine/
# https://twitter.com/brodriguesco/status/1039604517287931904
# "You can find the data I will use here. Click on the “Time use” folder and you can download the workbook."
# http://statistiques.public.lu/stat/ReportFolders/ReportFolder.aspx?IF_Language=eng&MainTheme=3&FldrName=1&RFPath=14306

# This time using experimental unpivotr code to allow custom filtering of header cells, rather than having to reposition them.
# https://github.com/nacnudus/unpivotr/commit/0961ec3c3e17b34755f0fce94db7f5bf380d43ce

library(tidyverse)
library(tidyxl)
library(unpivotr)
library(lubridate)

path <- "./download.xlsx"
formats <- xlsx_formats(path)

cells <-
  path %>%
  xlsx_cells() %>%
  # Drop French and Index sheets
  dplyr::filter(str_detect(sheet, "day$")) %>%
  # Clean character values
  mutate(character = str_trim(character)) %>%
  # Drop empty cells
  dplyr::filter(data_type != "blank",
                !(data_type == "character" && character == "")) %>%
  # Drop total rows
  dplyr::filter(row <= 58L) %>%
  # Fix time values expressed as dates rather than character
  mutate(character = if_else(data_type == "date", "00:00", character),
         data_type = if_else(data_type == "date", "character", data_type))

is_bold <- function(cells, formats) {
  formats$local$font$bold[cells$local_format_id]
}

# Tidy every sheet
tidy_sheet <- function(cells) {
  series <- dplyr::filter(cells, row == 1L, col == 1L)$character
  cells %>%
    dplyr::filter(row >= 2L) %>%
    behead("WNW", "activity_category_id",
           header_filter = is_bold, .args = list(formats = formats)) %>%
    behead("W", "activity_subcategory_id") %>%
    behead("WNW", "activity_category",
           header_filter = is_bold, .args = list(formats = formats)) %>%
    behead("W", "activity_subcategory") %>%
    behead("NNW", "grouping") %>%
    behead("NNW", "group") %>%
    behead("NNW", "metric") %>%
    behead("N", "unit") %>%
    select(-row, -col, -local_format_id) %>%
    spatter(unit) %>% # like tidyr::spread() by handles mixed data types
    mutate(Time = as.integer(as.duration(lubridate::hm(Time))))
}

tidy_data <-
  cells %>%
  select(sheet, row, col, data_type, character, numeric, local_format_id) %>%
  nest(-sheet) %>%
  mutate(data = map(data, tidy_sheet)) %>%
  unnest()
tidy_data
	# Inspired by http://www.brodrigues.co/blog/2018-09-11-human_to_machine/
	# https://twitter.com/brodriguesco/status/1039604517287931904
	# "You can find the data I will use here. Click on the “Time use” folder and you can download the workbook."
	# http://statistiques.public.lu/stat/ReportFolders/ReportFolder.aspx?IF_Language=eng&MainTheme=3&FldrName=1&RFPath=14306

	# This time using experimental unpivotr code to allow custom filtering of header cells, rather than having to reposition them.
	# https://github.com/nacnudus/unpivotr/commit/0961ec3c3e17b34755f0fce94db7f5bf380d43ce

	library(tidyverse)
	library(tidyxl)
	library(unpivotr)
	library(lubridate)

	path <- "./download.xlsx"
	formats <- xlsx_formats(path)

	cells <-
	path %>%
	xlsx_cells() %>%
	# Drop French and Index sheets
	dplyr::filter(str_detect(sheet, "day$")) %>%
	# Clean character values
	mutate(character = str_trim(character)) %>%
	# Drop empty cells
	dplyr::filter(data_type != "blank",
	!(data_type == "character" && character == "")) %>%
	# Drop total rows
	dplyr::filter(row <= 58L) %>%
	# Fix time values expressed as dates rather than character
	mutate(character = if_else(data_type == "date", "00:00", character),
	data_type = if_else(data_type == "date", "character", data_type))

	is_bold <- function(cells, formats) {
	formats$local$font$bold[cells$local_format_id]
	}

	# Tidy every sheet
	tidy_sheet <- function(cells) {
	series <- dplyr::filter(cells, row == 1L, col == 1L)$character
	cells %>%
	dplyr::filter(row >= 2L) %>%
	behead("WNW", "activity_category_id",
	header_filter = is_bold, .args = list(formats = formats)) %>%
	behead("W", "activity_subcategory_id") %>%
	behead("WNW", "activity_category",
	header_filter = is_bold, .args = list(formats = formats)) %>%
	behead("W", "activity_subcategory") %>%
	behead("NNW", "grouping") %>%
	behead("NNW", "group") %>%
	behead("NNW", "metric") %>%
	behead("N", "unit") %>%
	select(-row, -col, -local_format_id) %>%
	spatter(unit) %>% # like tidyr::spread() by handles mixed data types
	mutate(Time = as.integer(as.duration(lubridate::hm(Time))))
	}

	tidy_data <-
	cells %>%
	select(sheet, row, col, data_type, character, numeric, local_format_id) %>%
	nest(-sheet) %>%
	mutate(data = map(data, tidy_sheet)) %>%
	unnest()
	tidy_data