Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Tadge-Analytics/445261cfe0fc9a21165525b3ecf3172f to your computer and use it in GitHub Desktop.
Save Tadge-Analytics/445261cfe0fc9a21165525b3ecf3172f to your computer and use it in GitHub Desktop.
library(tidyverse)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
countries <- c("AE", "AF", "AN", "AO", "AR", "AS", "AT", "AU", "BB", "BD",
"BE", "BF", "BH", "BM", "BN", "BR", "BY", "CA", "CH", "CL", "CN",
"CO", "CR", "CZ", "DE", "DK", "DZ", "EC", "EE", "EG", "ES", "ET",
"FI", "FO", "FR", "GA", "GB", "GE", "GH", "GI", "GR", "GT", "HK",
"HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM",
"JO", "JP", "KE", "KH", "KR", "KW", "KZ", "LA", "LB", "LK", "LT",
"LU", "LV", "MA", "MN", "MO", "MT", "MU", "MX", "MY", "NG", "NI",
"NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PR", "PS",
"PT", "PY", "QA", "RE", "RO", "RS", "RU", "SA", "SE", "SG", "SI",
"SK", "SN", "SV", "TH", "TN", "TR", "TT", "TW", "UA", "UG", "US",
"UY", "VE", "VG", "VI", "VN", "WS", "ZA", "ZM", "ZW")
import <- read_csv("new raw data as text.txt", col_types = "c", col_names = F) %>%
filter(X1 != "Ab.",
X1 != ".") %>%
mutate(index = row_number())
# split into parts by spaces
processed <- import %>%
mutate(value = str_split(X1, pattern = " ")) %>%
unnest(value) %>% # unpivot
# set column name to rank of item
# find the dates, by a dash
mutate(dates = if_else(str_detect(value, "-") | value == "None", value, NULL)) %>%
separate(dates, "-", into = "start_number", extra = "drop", remove = F) %>%
mutate(start_number = as.numeric(start_number)) %>%
mutate(dates = case_when(value == "None" ~ value,
!is.na(start_number) ~ dates)) %>%
select(-start_number) %>%
# if the value is a complete number... it's the person_id
mutate(person_id = if_else(!is.na(as.numeric(value)), value, NULL)) %>%
# if the value is one of the following two letter country abbreviations, we're good
mutate(country = if_else(value %in% countries, value, NULL)) %>%
# if the word is "Tableau" it's the start of the accreditation title (we'll come back to this)
mutate(is_tableau = case_when(value == "Tableau" ~ "certification",
!is.na(dates)|!is.na(person_id)|!is.na(country) ~ "data")) %>%
fill(is_tableau) %>%
mutate(certification = if_else(is_tableau == "certification", value, NULL)) %>%
select(-is_tableau) %>%
mutate(person_name = if_else(is.na(dates) & is.na(person_id) & is.na(country) & is.na(certification), value, NULL))
# take away dates, person names and accreditation names...
# these have multiple parts and need to be concatenated together
# dates
dates <- processed %>%
select(index, dates) %>%
filter(!is.na(dates)) %>%
group_by(index) %>%
mutate(row_number = row_number()) %>%
spread(row_number, dates) %>%
rename(achieved_date = 2,
expiry_date = 3)
# person names
person_names <- processed %>%
select(index, person_name) %>%
filter(!is.na(person_name)) %>%
group_by(index) %>%
mutate(row_number = row_number()) %>%
spread(row_number, person_name)
person_names <- person_names %>%
select(index) %>%
bind_cols(
person_names %>%
ungroup() %>%
select(-index) %>%
apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
as.data.frame() %>%
rename(Full_name = 1))
# accreditations
certificates <- processed %>%
select(index, certification) %>%
filter(!is.na(certification)) %>%
group_by(index) %>%
mutate(row_number = row_number()) %>%
spread(row_number, certification)
certificates <- certificates %>%
select(index) %>%
bind_cols(
certificates %>%
ungroup() %>%
select(-index) %>%
apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
as.data.frame() %>%
rename(certificate_name = 1))
person_id <- processed %>%
select(index, person_id) %>%
filter(!is.na(person_id))
country <- processed %>%
select(index, country) %>%
filter(!is.na(country))
# let's piece all the data together
completed <- import %>%
left_join(person_id , by = "index") %>%
left_join(person_names, by = "index") %>%
left_join(country , by = "index") %>%
left_join(certificates, by = "index") %>%
left_join(dates, by = "index") %>%
fill(person_id, Full_name, country) %>%
filter(!is.na(certificate_name)) %>%
select(-X1, -index)
completed %>%
write_csv("Certifications in Excel.csv", na = "")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment