Skip to content

Instantly share code, notes, and snippets.

@brooke-watson
Last active December 10, 2019 19:51
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 5 You must be signed in to fork a gist
  • Save brooke-watson/ccf3d1b1f4449ab55a72f7835a52e599 to your computer and use it in GitHub Desktop.
Save brooke-watson/ccf3d1b1f4449ab55a72f7835a52e599 to your computer and use it in GitHub Desktop.
cursed_data_challenge
# ---------------------------------------
# untidy data
# ---------------------------------------
# this dataset is a sample of the kind of data that might appear in the wild,
# particularly when dealing with government data,
# particularly when trying to convert an output table or individual report
# back into a raw data format that can be analyzed.
# in these test datasets, discrete observations are spread out across multiple rows.
# starwars_garbage_data1 reads in a small sample dataset in which headers
# repeat across multiple rows, some columns are not included in some rows,
# and some (but not all!) variable headers are in the *same row* as their values.
# starwars_garbage_data2 is the same data without repeating headers.
# The structure of the data repeats consistently.
# what is the most effective way to convert either of these datasets (ideally both,
# but there could be two separate solutions) into a tidy dataframe that
# includes all information?
# i currently have a hacky solution, but i want something *nice* and *replicable*.
# the unique identifier in this data is the person-film ID, which is made up.
# the arbitrary number column is also made up, but goes with the person-film ID.
starwars_garbage_data1 <- data.frame(
stringsAsFactors = FALSE,
v1 = c(
"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
"123598423", "238952395", "6232048034", "3036308047",
"Species:", "Character Name", "Darth Vader", "Person-film ID",
"7731900678", "2072161410", "5775513390", "5583683275",
"Species:", "Character Name", "Yoda", "Person-film ID",
"6562229951", "9476002994", "958505842", "6754035462", "2138742347",
"Species:"
),
v2 = c(
NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
"Tattooine", "Tattooine", "Droid", NA, NA, "Homeworld", NA, NA, NA,
NA, "Human", NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
"Tattooine", "Tattooine", "Yoda's Species"
),
v3 = c(
"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
"Revenge of the Sith", "Return of the Jedi",
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Yellow", "Film",
"Revenge of the Sith", "Return of the Jedi",
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Brown", "Film",
"Attack of the Clones", "The Phantom Menace",
"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", ""
),
v4 = c(
"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
"400000", "500000", "", "Height", "202", "Arbitrary Number",
"100000", "150000", "350000", "500000", "", "Height", "66",
"Arbitrary Number", "160000", "160000", "200000", "200000", "200000",
""
)
)
starwars_garbage_data2 <- data.frame(
stringsAsFactors = FALSE,
v1 = c(
"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
"123598423", "238952395", "6232048034", "3036308047",
"Species:", "Darth Vader", "Person-film ID", "7731900678",
"2072161410", "5775513390", "5583683275", "Species:", "Yoda",
"Person-film ID", "6562229951", "9476002994", "958505842",
"6754035462", "2138742347", "Species:"
),
v2 = c(
NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
"Tattooine", "Tattooine", "Droid", NA, "Homeworld", NA, NA, NA, NA,
"Human", NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
"Tattooine", "Tattooine", "Yoda's Species"
),
v3 = c(
"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
"Revenge of the Sith", "Return of the Jedi",
"The Empire Strikes Back", "A New Hope", "", "Yellow", "Film",
"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back",
"A New Hope", "", "Brown", "Film", "Attack of the Clones",
"The Phantom Menace", "Revenge of the Sith", "Return of the Jedi",
"The Empire Strikes Back", ""
),
v4 = c(
"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
"400000", "500000", "", "202", "Arbitrary Number", "100000",
"150000", "350000", "500000", "", "66", "Arbitrary Number",
"160000", "160000", "200000", "200000", "200000", ""
)
)
# here's my messy attempt, using starwars_garbage_data1
library(tidyverse)
library(janitor)
fix_header <- function(df, n_lines = 1) {
newcolnames <- df[n_lines, ]
names(df) <- newcolnames
newdf <- df[-c(1:n_lines), ]
newdf <- janitor::clean_names(newdf)
return(newdf)
}
tst <- starwars_garbage_data1 %>%
set_names(nm = letters[1:4]) %>%
mutate(group = ifelse(a == "Character Name", 1:1000, NA)) %>%
fill(group) %>%
group_by(group) %>%
nest() %>%
mutate(film_data = map(data, fix_header, 3)) %>%
mutate(person_data = map(data, ~ {
.x %>%
slice(1:2) %>%
fix_header() %>%
remove_empty("cols")
})) %>%
unnest_wider(person_data) %>%
select(-data) %>%
unnest(film_data)
# then I have to break the pipe i think unfortunately.
ints <- which(tst$person_film_id == "Species:")
tst$species[ints] <- tst$homeworld[ints]
# final data cleaning
final <- tst %>%
fill(species, .direction = "up") %>%
filter(person_film_id != "Species:") %>%
select(-group)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment