Last active
August 8, 2019 17:57
-
-
Save LuisSevillano/42ee0de0695ec504b97152da5f971240 to your computer and use it in GitHub Desktop.
A R function to deal with datasets with a double header
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(ggplot2) | |
library(janitor) | |
library(readxl) | |
filePath <- "data/4247.xlsx" | |
csv <- read_xlsx(filePath, skip = 6) | |
# This function fills empty cells with previous values to the right and then combine them with the row above | |
combineHeaders <- function(data){ | |
# Store the row | |
row <- colnames(data) | |
# Get unique values, the ones will be repeated | |
types <- row[is.na(as.numeric(gsub("\\.", "", row)))] | |
# Create an index that will be incremented across the vector 'types' | |
z <- 1 | |
# Store the first as the default one | |
type <- types[z] | |
# Iterate through colnames | |
for(i in 1:length(row)){ | |
variable <- row[i] | |
# Assume the work starts at the second column | |
if (i!=1) { | |
# combine with the previous row | |
row[i] <- paste(type, data[1, ][i], sep = "_") | |
# If current value is equal to the second one from unique values it updates the default value | |
if (z + 1 <= length(types) & variable == types[z + 1]) { | |
z <- z + 1 | |
type <- types[z] | |
row[i] <- paste(type, data[1, ][i], sep = "_") | |
} | |
} | |
} | |
return(row) | |
} | |
# Assign the new headers from combineHeaders | |
names(csv) <- combineHeaders(csv) | |
cleaned <- csv %>% | |
remove_empty('cols') %>% | |
remove_empty('rows') %>% | |
slice(3:19) %>% | |
rename( | |
ccaa = 1 | |
) %>% | |
mutate( | |
id = substr(ccaa, 0, 2), | |
ccaa = substring(ccaa, 3) | |
) %>% | |
select(id, everything(), -matches("Menores de 25 años"), -matches("25 y más años")) | |
tidied_data <- gather(cleaned, group, value, 3:length(colnames(cleaned))) %>% | |
separate(group, c('group', 'quarter'), sep = '_') %>% | |
mutate( | |
value = as.numeric(value), | |
date = as.Date(paste(substring(quarter, 1, 4), as.integer(substring(quarter, 6, 7)) * 3, 1, sep = "-")) | |
) | |
ggplot(data=tidied_data, aes(x = date, y = value)) + | |
geom_line(aes(color = group)) + | |
scale_color_manual(values = c("#966bff", "#FF6AD5", "#ffde8b", "#20de8b")) + # Color scale by vapeplot | |
facet_wrap( ~ ccaa, ncol = 4) + | |
ggtitle("Unemployment rates by different age groups, genre and autonomous community") + | |
theme_minimal() + | |
theme(axis.title.x=element_blank(), | |
axis.text.x=element_blank(), | |
axis.ticks.x=element_blank(), | |
axis.title.y=element_blank(), | |
axis.text.y=element_blank(), | |
axis.ticks.y=element_blank()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment