-
-
Save nacnudus/9407659f9a33f2eae60f5c7d43433d41 to your computer and use it in GitHub Desktop.
cursed_data_challenge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --------------------------------------- | |
# untidy data | |
# --------------------------------------- | |
# this dataset is a sample of the kind of data that might appear in the wild, | |
# particularly when dealing with government data, | |
# particularly when trying to convert an output table or individual report | |
# back into a raw data format that can be analyzed. | |
# in these test datasets, discrete observations are spread out across multiple rows. | |
# starwars_garbage_data1 reads in a small sample dataset in which headers | |
# repeat across multiple rows, some columns are not included in some rows, | |
# and some (but not all!) variable headers are in the *same row* as their values. | |
# starwars_garbage_data2 is the same data without repeating headers. | |
# The structure of the data repeats consistently. | |
# what is the most effective way to convert either of these datasets (ideally both, | |
# but there could be two separate solutions) into a tidy dataframe that | |
# includes all information? | |
# i currently have a hacky solution, but i want something *nice* and *replicable*. | |
# the unique identifier in this data is the person-film ID, which is made up. | |
# the arbitrary number column is also made up, but goes with the person-film ID. | |
starwars_garbage_data1 <- data.frame( | |
stringsAsFactors = FALSE, | |
v1 = c( | |
"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678", | |
"123598423", "238952395", "6232048034", "3036308047", | |
"Species:", "Character Name", "Darth Vader", "Person-film ID", | |
"7731900678", "2072161410", "5775513390", "5583683275", | |
"Species:", "Character Name", "Yoda", "Person-film ID", | |
"6562229951", "9476002994", "958505842", "6754035462", "2138742347", | |
"Species:" | |
), | |
v2 = c( | |
NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine", | |
"Tattooine", "Tattooine", "Droid", NA, NA, "Homeworld", NA, NA, NA, | |
NA, "Human", NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", | |
"Tattooine", "Tattooine", "Yoda's Species" | |
), | |
v3 = c( | |
"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace", | |
"Revenge of the Sith", "Return of the Jedi", | |
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Yellow", "Film", | |
"Revenge of the Sith", "Return of the Jedi", | |
"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Brown", "Film", | |
"Attack of the Clones", "The Phantom Menace", | |
"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", "" | |
), | |
v4 = c( | |
"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000", | |
"400000", "500000", "", "Height", "202", "Arbitrary Number", | |
"100000", "150000", "350000", "500000", "", "Height", "66", | |
"Arbitrary Number", "160000", "160000", "200000", "200000", "200000", | |
"" | |
) | |
) | |
starwars_garbage_data2 <- data.frame( | |
stringsAsFactors = FALSE, | |
v1 = c( | |
"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678", | |
"123598423", "238952395", "6232048034", "3036308047", | |
"Species:", "Darth Vader", "Person-film ID", "7731900678", | |
"2072161410", "5775513390", "5583683275", "Species:", "Yoda", | |
"Person-film ID", "6562229951", "9476002994", "958505842", | |
"6754035462", "2138742347", "Species:" | |
), | |
v2 = c( | |
NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine", | |
"Tattooine", "Tattooine", "Droid", NA, "Homeworld", NA, NA, NA, NA, | |
"Human", NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", | |
"Tattooine", "Tattooine", "Yoda's Species" | |
), | |
v3 = c( | |
"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace", | |
"Revenge of the Sith", "Return of the Jedi", | |
"The Empire Strikes Back", "A New Hope", "", "Yellow", "Film", | |
"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", | |
"A New Hope", "", "Brown", "Film", "Attack of the Clones", | |
"The Phantom Menace", "Revenge of the Sith", "Return of the Jedi", | |
"The Empire Strikes Back", "" | |
), | |
v4 = c( | |
"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000", | |
"400000", "500000", "", "202", "Arbitrary Number", "100000", | |
"150000", "350000", "500000", "", "66", "Arbitrary Number", | |
"160000", "160000", "200000", "200000", "200000", "" | |
) | |
) | |
# here's my messy attempt, using starwars_garbage_data1 | |
library(tidyverse) | |
library(janitor) | |
fix_header <- function(df, n_lines = 1) { | |
newcolnames <- df[n_lines, ] | |
names(df) <- newcolnames | |
newdf <- df[-c(1:n_lines), ] | |
newdf <- janitor::clean_names(newdf) | |
return(newdf) | |
} | |
tst <- starwars_garbage_data1 %>% | |
set_names(nm = letters[1:4]) %>% | |
mutate(group = ifelse(a == "Character Name", 1:1000, NA)) %>% | |
fill(group) %>% | |
group_by(group) %>% | |
nest() %>% | |
mutate(film_data = map(data, fix_header, 3)) %>% | |
mutate(person_data = map(data, ~ { | |
.x %>% | |
slice(1:2) %>% | |
fix_header() %>% | |
remove_empty("cols") | |
})) %>% | |
unnest_wider(person_data) %>% | |
select(-data) %>% | |
unnest(film_data) | |
# then I have to break the pipe i think unfortunately. | |
ints <- which(tst$person_film_id == "Species:") | |
tst$species[ints] <- tst$homeworld[ints] | |
# final data cleaning | |
final <- tst %>% | |
fill(species, .direction = "up") %>% | |
filter(person_film_id != "Species:") %>% | |
select(-group) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment