Skip to content

Instantly share code, notes, and snippets.

@LuisSevillano
Last active August 8, 2019 17:57
Show Gist options
  • Save LuisSevillano/42ee0de0695ec504b97152da5f971240 to your computer and use it in GitHub Desktop.
Save LuisSevillano/42ee0de0695ec504b97152da5f971240 to your computer and use it in GitHub Desktop.
A R function to deal with datasets with a double header
library(tidyverse)
library(ggplot2)
library(janitor)
library(readxl)
filePath <- "data/4247.xlsx"
csv <- read_xlsx(filePath, skip = 6)
# This function fills empty cells with previous values to the right and then combine them with the row above
combineHeaders <- function(data){
# Store the row
row <- colnames(data)
# Get unique values, the ones will be repeated
types <- row[is.na(as.numeric(gsub("\\.", "", row)))]
# Create an index that will be incremented across the vector 'types'
z <- 1
# Store the first as the default one
type <- types[z]
# Iterate through colnames
for(i in 1:length(row)){
variable <- row[i]
# Assume the work starts at the second column
if (i!=1) {
# combine with the previous row
row[i] <- paste(type, data[1, ][i], sep = "_")
# If current value is equal to the second one from unique values it updates the default value
if (z + 1 <= length(types) & variable == types[z + 1]) {
z <- z + 1
type <- types[z]
row[i] <- paste(type, data[1, ][i], sep = "_")
}
}
}
return(row)
}
# Assign the new headers from combineHeaders
names(csv) <- combineHeaders(csv)
cleaned <- csv %>%
remove_empty('cols') %>%
remove_empty('rows') %>%
slice(3:19) %>%
rename(
ccaa = 1
) %>%
mutate(
id = substr(ccaa, 0, 2),
ccaa = substring(ccaa, 3)
) %>%
select(id, everything(), -matches("Menores de 25 años"), -matches("25 y más años"))
tidied_data <- gather(cleaned, group, value, 3:length(colnames(cleaned))) %>%
separate(group, c('group', 'quarter'), sep = '_') %>%
mutate(
value = as.numeric(value),
date = as.Date(paste(substring(quarter, 1, 4), as.integer(substring(quarter, 6, 7)) * 3, 1, sep = "-"))
)
ggplot(data=tidied_data, aes(x = date, y = value)) +
geom_line(aes(color = group)) +
scale_color_manual(values = c("#966bff", "#FF6AD5", "#ffde8b", "#20de8b")) + # Color scale by vapeplot
facet_wrap( ~ ccaa, ncol = 4) +
ggtitle("Unemployment rates by different age groups, genre and autonomous community") +
theme_minimal() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment