Last active
December 10, 2019 22:50
-
-
Save nhatley/4ef9029cfea16552134f17f8e1f21a18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## challenge from: https://twitter.com/brookLYNevery1/status/1204208310158794753 | |
library(tidyverse) | |
starwars_garbage_data1 <- data.frame( | |
stringsAsFactors = FALSE, | |
v1 = c( | |
"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678", | |
"123598423", "238952395", "6232048034", "3036308047", | |
"Species:", "Character Name", "Darth Vader", "Person-film ID", | |
"7731900678", "2072161410", "5775513390", "5583683275", | |
"Species:", "Character Name", "Yoda", "Person-film ID", | |
"6562229951", "9476002994", "958505842", "6754035462", "2138742347", | |
"Species:" | |
), | |
v2 = c( | |
NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine", | |
"Tattooine", "Tattooine", "Droid", NA, NA, "Homeworld", NA, NA, NA, | |
NA, "Human", NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", | |
"Tattooine", "Tattooine", "Yoda's Species" | |
), | |
v3 = c( | |
"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace", | |
"Revenge of the Sith", "Return of the Jedi", | |
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Yellow", "Film", | |
"Revenge of the Sith", "Return of the Jedi", | |
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Brown", "Film", | |
"Attack of the Clones", "The Phantom Menace", | |
"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", "" | |
), | |
v4 = c( | |
"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000", | |
"400000", "500000", "", "Height", "202", "Arbitrary Number", | |
"100000", "150000", "350000", "500000", "", "Height", "66", | |
"Arbitrary Number", "160000", "160000", "200000", "200000", "200000", | |
"" | |
) | |
) | |
#NH attempt | |
## use these extra columns as flags for | |
## a) columns to copy over and fill | |
## b) columns to flag for removal later | |
starwars_clean_step1 = starwars_garbage_data1 %>% | |
mutate(paradata1 = v1 == "Character Name", | |
paradata2 = lag(v1) == "Character Name", | |
paradata3 = lead(v1) == "Character Name", | |
## my lag/lead approach fails for the last column | |
paradata4 = row_number() == max(row_number()), | |
## copying over starts here | |
## if our paradata cols are false set to NA then use fill() | |
character_name = ifelse(paradata2, v1, NA), | |
eye_color = ifelse(paradata2, v3, NA), | |
height = ifelse(paradata2, v4, NA), | |
species = ifelse(paradata3 | paradata4, v2, NA) | |
) %>% | |
fill(c("character_name", "eye_color", "height"), .direction = 'down') %>% | |
fill("species", .direction = 'up') | |
starwars_col_names = c( | |
starwars_clean_step1 %>% select(starts_with("v")) %>% slice(3) %>% gather(col, name) %>% pull(name), | |
starwars_clean_step1 %>% select(-starts_with("v"), -starts_with("paradata")) %>% names) | |
starwars_clean = starwars_clean_step1 %>% | |
filter_at(vars(starts_with("paradata")), all_vars(. == FALSE)) %>% | |
select(-starts_with("paradata")) %>% | |
filter_at(vars(starts_with('v')), any_vars(!(. %in% starwars_col_names))) %>% | |
set_names(starwars_col_names) %>% | |
janitor::clean_names(case = 'snake') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment