LuisSevillano/combineHeaders.R

## combineHeaders.R
library(tidyverse)
library(ggplot2)
library(janitor)
library(readxl)

filePath <- "data/4247.xlsx"

csv <- read_xlsx(filePath, skip = 6)

# This function fills empty cells with previous values to the right and then combine them with the row above
combineHeaders <- function(data){

  # Store the row
  row <- colnames(data)

  # Get unique values, the ones will be repeated
  types <- row[is.na(as.numeric(gsub("\\.", "", row)))]

  # Create an index that will be incremented across the vector 'types'
  z <- 1

  # Store the first as the default one
  type <- types[z]

  # Iterate through colnames
  for(i in 1:length(row)){
    variable <- row[i]

    # Assume the work starts at the second column
    if (i!=1) {
      # combine with the previous row
      row[i] <- paste(type, data[1, ][i], sep = "_")

      # If current value is equal to the second one from unique values it updates the default value
      if (z + 1 <= length(types) & variable == types[z + 1]) {
        z <- z + 1
        type <- types[z]
        row[i] <- paste(type, data[1, ][i], sep = "_")
      }
    }

  }
  return(row)
}


# Assign the new headers from combineHeaders
names(csv) <- combineHeaders(csv)

cleaned <- csv %>%
  remove_empty('cols') %>%
  remove_empty('rows') %>%
  slice(3:19) %>%
  rename(
    ccaa = 1
  ) %>%
  mutate(
    id = substr(ccaa, 0, 2),
    ccaa = substring(ccaa, 3)
  ) %>%
  select(id, everything(), -matches("Menores de 25 años"), -matches("25 y más años"))

tidied_data <- gather(cleaned, group, value, 3:length(colnames(cleaned))) %>%
  separate(group, c('group', 'quarter'), sep = '_') %>%
  mutate(
    value = as.numeric(value),
    date = as.Date(paste(substring(quarter, 1, 4), as.integer(substring(quarter, 6, 7)) * 3, 1, sep = "-"))
  )

ggplot(data=tidied_data, aes(x = date, y = value)) +
  geom_line(aes(color = group)) +
  scale_color_manual(values = c("#966bff", "#FF6AD5", "#ffde8b", "#20de8b")) + # Color scale by vapeplot
  facet_wrap( ~ ccaa, ncol = 4) +
  ggtitle("Unemployment rates by different age groups, genre and autonomous community") +
  theme_minimal() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

## www.ine.es_jaxiT3_Datos.htm_t=4247.png

      
    Raw
  

              www.ine.es_jaxiT3_Datos.htm_t=4247.png
	library(tidyverse)
	library(ggplot2)
	library(janitor)
	library(readxl)

	filePath <- "data/4247.xlsx"

	csv <- read_xlsx(filePath, skip = 6)

	# This function fills empty cells with previous values to the right and then combine them with the row above
	combineHeaders <- function(data){

	# Store the row
	row <- colnames(data)

	# Get unique values, the ones will be repeated
	types <- row[is.na(as.numeric(gsub("\\.", "", row)))]

	# Create an index that will be incremented across the vector 'types'
	z <- 1

	# Store the first as the default one
	type <- types[z]

	# Iterate through colnames
	for(i in 1:length(row)){
	variable <- row[i]

	# Assume the work starts at the second column
	if (i!=1) {
	# combine with the previous row
	row[i] <- paste(type, data[1, ][i], sep = "_")

	# If current value is equal to the second one from unique values it updates the default value
	if (z + 1 <= length(types) & variable == types[z + 1]) {
	z <- z + 1
	type <- types[z]
	row[i] <- paste(type, data[1, ][i], sep = "_")
	}
	}

	}
	return(row)
	}


	# Assign the new headers from combineHeaders
	names(csv) <- combineHeaders(csv)

	cleaned <- csv %>%
	remove_empty('cols') %>%
	remove_empty('rows') %>%
	slice(3:19) %>%
	rename(
	ccaa = 1
	) %>%
	mutate(
	id = substr(ccaa, 0, 2),
	ccaa = substring(ccaa, 3)
	) %>%
	select(id, everything(), -matches("Menores de 25 años"), -matches("25 y más años"))

	tidied_data <- gather(cleaned, group, value, 3:length(colnames(cleaned))) %>%
	separate(group, c('group', 'quarter'), sep = '_') %>%
	mutate(
	value = as.numeric(value),
	date = as.Date(paste(substring(quarter, 1, 4), as.integer(substring(quarter, 6, 7)) * 3, 1, sep = "-"))
	)

	ggplot(data=tidied_data, aes(x = date, y = value)) +
	geom_line(aes(color = group)) +
	scale_color_manual(values = c("#966bff", "#FF6AD5", "#ffde8b", "#20de8b")) + # Color scale by vapeplot
	facet_wrap( ~ ccaa, ncol = 4) +
	ggtitle("Unemployment rates by different age groups, genre and autonomous community") +
	theme_minimal() +
	theme(axis.title.x=element_blank(),
	axis.text.x=element_blank(),
	axis.ticks.x=element_blank(),
	axis.title.y=element_blank(),
	axis.text.y=element_blank(),
	axis.ticks.y=element_blank())