A R function to deal with datasets with a double header
library(tidyverse) | |
library(ggplot2) | |
library(janitor) | |
library(readxl) | |
filePath <- "data/4247.xlsx" | |
csv <- read_xlsx(filePath, skip = 6) | |
# This function fills empty cells with previous values to the right and then combine them with the row above | |
combineHeaders <- function(data){ | |
# Store the row | |
row <- colnames(data) | |
# Get unique values, the ones will be repeated | |
types <- row[is.na(as.numeric(gsub("\\.", "", row)))] | |
# Create an index that will be incremented across the vector 'types' | |
z <- 1 | |
# Store the first as the default one | |
type <- types[z] | |
# Iterate through colnames | |
for(i in 1:length(row)){ | |
variable <- row[i] | |
# Assume the work starts at the second column | |
if (i!=1) { | |
# combine with the previous row | |
row[i] <- paste(type, data[1, ][i], sep = "_") | |
# If current value is equal to the second one from unique values it updates the default value | |
if (z + 1 <= length(types) & variable == types[z + 1]) { | |
z <- z + 1 | |
type <- types[z] | |
row[i] <- paste(type, data[1, ][i], sep = "_") | |
} | |
} | |
} | |
return(row) | |
} | |
# Assign the new headers from combineHeaders | |
names(csv) <- combineHeaders(csv) | |
cleaned <- csv %>% | |
remove_empty('cols') %>% | |
remove_empty('rows') %>% | |
slice(3:19) %>% | |
rename( | |
ccaa = 1 | |
) %>% | |
mutate( | |
id = substr(ccaa, 0, 2), | |
ccaa = substring(ccaa, 3) | |
) %>% | |
select(id, everything(), -matches("Menores de 25 años"), -matches("25 y más años")) | |
tidied_data <- gather(cleaned, group, value, 3:length(colnames(cleaned))) %>% | |
separate(group, c('group', 'quarter'), sep = '_') %>% | |
mutate( | |
value = as.numeric(value), | |
date = as.Date(paste(substring(quarter, 1, 4), as.integer(substring(quarter, 6, 7)) * 3, 1, sep = "-")) | |
) | |
ggplot(data=tidied_data, aes(x = date, y = value)) + | |
geom_line(aes(color = group)) + | |
scale_color_manual(values = c("#966bff", "#FF6AD5", "#ffde8b", "#20de8b")) + # Color scale by vapeplot | |
facet_wrap( ~ ccaa, ncol = 4) + | |
ggtitle("Unemployment rates by different age groups, genre and autonomous community") + | |
theme_minimal() + | |
theme(axis.title.x=element_blank(), | |
axis.text.x=element_blank(), | |
axis.ticks.x=element_blank(), | |
axis.title.y=element_blank(), | |
axis.text.y=element_blank(), | |
axis.ticks.y=element_blank()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment