Tadge-Analytics/Tableau Certification Directory -pdf to csv

## Tableau Certification Directory -pdf to csv
library(tidyverse)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

countries <- c("AE", "AF", "AN", "AO", "AR", "AS", "AT", "AU", "BB", "BD",
               "BE", "BF", "BH", "BM", "BN", "BR", "BY", "CA", "CH", "CL", "CN",
               "CO", "CR", "CZ", "DE", "DK", "DZ", "EC", "EE", "EG", "ES", "ET",
               "FI", "FO", "FR", "GA", "GB", "GE", "GH", "GI", "GR", "GT", "HK",
               "HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM",
               "JO", "JP", "KE", "KH", "KR", "KW", "KZ", "LA", "LB", "LK", "LT",
               "LU", "LV", "MA", "MN", "MO", "MT", "MU", "MX", "MY", "NG", "NI",
               "NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PR", "PS",
               "PT", "PY", "QA", "RE", "RO", "RS", "RU", "SA", "SE", "SG", "SI",
               "SK", "SN", "SV", "TH", "TN", "TR", "TT", "TW", "UA", "UG", "US",
               "UY", "VE", "VG", "VI", "VN", "WS", "ZA", "ZM", "ZW")


import <- read_csv("new raw data as text.txt", col_types = "c", col_names = F) %>%
  filter(X1 != "Ab.",
         X1 != ".") %>%
  mutate(index = row_number())


# split into parts by spaces
processed <- import %>%
  mutate(value = str_split(X1, pattern = " ")) %>%
  unnest(value) %>% # unpivot

  # set column name to rank of item
  # find the dates, by a dash
  mutate(dates = if_else(str_detect(value, "-") | value == "None", value, NULL)) %>%
  separate(dates, "-", into = "start_number", extra = "drop", remove = F) %>%
  mutate(start_number = as.numeric(start_number)) %>%
  mutate(dates = case_when(value == "None" ~ value,
                           !is.na(start_number) ~ dates)) %>%
  select(-start_number) %>%

  # if the value is a complete number... it's the person_id
  mutate(person_id = if_else(!is.na(as.numeric(value)), value, NULL)) %>%

  # if the value is one of the following two letter country abbreviations, we're good
  mutate(country = if_else(value %in% countries, value, NULL)) %>%

  # if the word is "Tableau" it's the start of the accreditation title (we'll come back to this)
  mutate(is_tableau = case_when(value == "Tableau" ~ "certification",
                                !is.na(dates)|!is.na(person_id)|!is.na(country) ~ "data")) %>%
  fill(is_tableau) %>%
  mutate(certification = if_else(is_tableau == "certification", value, NULL)) %>%
  select(-is_tableau) %>%
  mutate(person_name = if_else(is.na(dates) & is.na(person_id) & is.na(country) & is.na(certification), value, NULL))


# take away dates, person names and accreditation names...
# these have multiple parts and need to be concatenated together

# dates
dates <- processed %>%
  select(index, dates) %>%
  filter(!is.na(dates)) %>%
  group_by(index) %>%
  mutate(row_number = row_number()) %>%
  spread(row_number, dates) %>%
  rename(achieved_date = 2,
         expiry_date = 3)


# person names
person_names <- processed %>%
  select(index, person_name) %>%
  filter(!is.na(person_name)) %>%
  group_by(index) %>%
  mutate(row_number = row_number()) %>%
  spread(row_number, person_name)

person_names <- person_names %>%
  select(index) %>%
  bind_cols(
    person_names %>%
      ungroup() %>%
      select(-index) %>%
      apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
      as.data.frame() %>%
      rename(Full_name = 1))


# accreditations
certificates <- processed %>%
  select(index, certification) %>%
  filter(!is.na(certification)) %>%
  group_by(index) %>%
  mutate(row_number = row_number()) %>%
  spread(row_number, certification)


certificates <- certificates %>%
  select(index) %>%
  bind_cols(
    certificates %>%
      ungroup() %>%
      select(-index) %>%
      apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
      as.data.frame() %>%
      rename(certificate_name = 1))


person_id <- processed %>%
  select(index, person_id) %>%
  filter(!is.na(person_id))


country <- processed %>%
  select(index, country) %>%
  filter(!is.na(country))


# let's piece all the data together
completed <- import %>%
  left_join(person_id , by = "index") %>%
  left_join(person_names, by = "index") %>%
  left_join(country , by = "index") %>%
  left_join(certificates, by = "index") %>%
  left_join(dates, by = "index") %>%
  fill(person_id, Full_name, country) %>%
  filter(!is.na(certificate_name)) %>%
  select(-X1, -index)


completed %>%
  write_csv("Certifications in Excel.csv", na = "")
	library(tidyverse)
	setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

	countries <- c("AE", "AF", "AN", "AO", "AR", "AS", "AT", "AU", "BB", "BD",
	"BE", "BF", "BH", "BM", "BN", "BR", "BY", "CA", "CH", "CL", "CN",
	"CO", "CR", "CZ", "DE", "DK", "DZ", "EC", "EE", "EG", "ES", "ET",
	"FI", "FO", "FR", "GA", "GB", "GE", "GH", "GI", "GR", "GT", "HK",
	"HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM",
	"JO", "JP", "KE", "KH", "KR", "KW", "KZ", "LA", "LB", "LK", "LT",
	"LU", "LV", "MA", "MN", "MO", "MT", "MU", "MX", "MY", "NG", "NI",
	"NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PR", "PS",
	"PT", "PY", "QA", "RE", "RO", "RS", "RU", "SA", "SE", "SG", "SI",
	"SK", "SN", "SV", "TH", "TN", "TR", "TT", "TW", "UA", "UG", "US",
	"UY", "VE", "VG", "VI", "VN", "WS", "ZA", "ZM", "ZW")


	import <- read_csv("new raw data as text.txt", col_types = "c", col_names = F) %>%
	filter(X1 != "Ab.",
	X1 != ".") %>%
	mutate(index = row_number())


	# split into parts by spaces
	processed <- import %>%
	mutate(value = str_split(X1, pattern = " ")) %>%
	unnest(value) %>% # unpivot

	# set column name to rank of item
	# find the dates, by a dash
	mutate(dates = if_else(str_detect(value, "-") \| value == "None", value, NULL)) %>%
	separate(dates, "-", into = "start_number", extra = "drop", remove = F) %>%
	mutate(start_number = as.numeric(start_number)) %>%
	mutate(dates = case_when(value == "None" ~ value,
	!is.na(start_number) ~ dates)) %>%
	select(-start_number) %>%

	# if the value is a complete number... it's the person_id
	mutate(person_id = if_else(!is.na(as.numeric(value)), value, NULL)) %>%

	# if the value is one of the following two letter country abbreviations, we're good
	mutate(country = if_else(value %in% countries, value, NULL)) %>%

	# if the word is "Tableau" it's the start of the accreditation title (we'll come back to this)
	mutate(is_tableau = case_when(value == "Tableau" ~ "certification",
	!is.na(dates)\|!is.na(person_id)\|!is.na(country) ~ "data")) %>%
	fill(is_tableau) %>%
	mutate(certification = if_else(is_tableau == "certification", value, NULL)) %>%
	select(-is_tableau) %>%
	mutate(person_name = if_else(is.na(dates) & is.na(person_id) & is.na(country) & is.na(certification), value, NULL))


	# take away dates, person names and accreditation names...
	# these have multiple parts and need to be concatenated together

	# dates
	dates <- processed %>%
	select(index, dates) %>%
	filter(!is.na(dates)) %>%
	group_by(index) %>%
	mutate(row_number = row_number()) %>%
	spread(row_number, dates) %>%
	rename(achieved_date = 2,
	expiry_date = 3)


	# person names
	person_names <- processed %>%
	select(index, person_name) %>%
	filter(!is.na(person_name)) %>%
	group_by(index) %>%
	mutate(row_number = row_number()) %>%
	spread(row_number, person_name)

	person_names <- person_names %>%
	select(index) %>%
	bind_cols(
	person_names %>%
	ungroup() %>%
	select(-index) %>%
	apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
	as.data.frame() %>%
	rename(Full_name = 1))


	# accreditations
	certificates <- processed %>%
	select(index, certification) %>%
	filter(!is.na(certification)) %>%
	group_by(index) %>%
	mutate(row_number = row_number()) %>%
	spread(row_number, certification)


	certificates <- certificates %>%
	select(index) %>%
	bind_cols(
	certificates %>%
	ungroup() %>%
	select(-index) %>%
	apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
	as.data.frame() %>%
	rename(certificate_name = 1))


	person_id <- processed %>%
	select(index, person_id) %>%
	filter(!is.na(person_id))


	country <- processed %>%
	select(index, country) %>%
	filter(!is.na(country))



	# let's piece all the data together
	completed <- import %>%
	left_join(person_id , by = "index") %>%
	left_join(person_names, by = "index") %>%
	left_join(country , by = "index") %>%
	left_join(certificates, by = "index") %>%
	left_join(dates, by = "index") %>%
	fill(person_id, Full_name, country) %>%
	filter(!is.na(certificate_name)) %>%
	select(-X1, -index)


	completed %>%
	write_csv("Certifications in Excel.csv", na = "")