Skip to content

Instantly share code, notes, and snippets.

@nhatley
Last active December 10, 2019 22:50
Show Gist options
  • Save nhatley/4ef9029cfea16552134f17f8e1f21a18 to your computer and use it in GitHub Desktop.
Save nhatley/4ef9029cfea16552134f17f8e1f21a18 to your computer and use it in GitHub Desktop.
## challenge from: https://twitter.com/brookLYNevery1/status/1204208310158794753
library(tidyverse)
starwars_garbage_data1 <- data.frame(
stringsAsFactors = FALSE,
v1 = c(
"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
"123598423", "238952395", "6232048034", "3036308047",
"Species:", "Character Name", "Darth Vader", "Person-film ID",
"7731900678", "2072161410", "5775513390", "5583683275",
"Species:", "Character Name", "Yoda", "Person-film ID",
"6562229951", "9476002994", "958505842", "6754035462", "2138742347",
"Species:"
),
v2 = c(
NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
"Tattooine", "Tattooine", "Droid", NA, NA, "Homeworld", NA, NA, NA,
NA, "Human", NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
"Tattooine", "Tattooine", "Yoda's Species"
),
v3 = c(
"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
"Revenge of the Sith", "Return of the Jedi",
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Yellow", "Film",
"Revenge of the Sith", "Return of the Jedi",
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Brown", "Film",
"Attack of the Clones", "The Phantom Menace",
"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", ""
),
v4 = c(
"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
"400000", "500000", "", "Height", "202", "Arbitrary Number",
"100000", "150000", "350000", "500000", "", "Height", "66",
"Arbitrary Number", "160000", "160000", "200000", "200000", "200000",
""
)
)
#NH attempt
## use these extra columns as flags for
## a) columns to copy over and fill
## b) columns to flag for removal later
starwars_clean_step1 = starwars_garbage_data1 %>%
mutate(paradata1 = v1 == "Character Name",
paradata2 = lag(v1) == "Character Name",
paradata3 = lead(v1) == "Character Name",
## my lag/lead approach fails for the last column
paradata4 = row_number() == max(row_number()),
## copying over starts here
## if our paradata cols are false set to NA then use fill()
character_name = ifelse(paradata2, v1, NA),
eye_color = ifelse(paradata2, v3, NA),
height = ifelse(paradata2, v4, NA),
species = ifelse(paradata3 | paradata4, v2, NA)
) %>%
fill(c("character_name", "eye_color", "height"), .direction = 'down') %>%
fill("species", .direction = 'up')
starwars_col_names = c(
starwars_clean_step1 %>% select(starts_with("v")) %>% slice(3) %>% gather(col, name) %>% pull(name),
starwars_clean_step1 %>% select(-starts_with("v"), -starts_with("paradata")) %>% names)
starwars_clean = starwars_clean_step1 %>%
filter_at(vars(starts_with("paradata")), all_vars(. == FALSE)) %>%
select(-starts_with("paradata")) %>%
filter_at(vars(starts_with('v')), any_vars(!(. %in% starwars_col_names))) %>%
set_names(starwars_col_names) %>%
janitor::clean_names(case = 'snake')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment