nacnudus/cursed_starwars_data.R

## cursed_starwars_data.R
# ---------------------------------------
# untidy data
# ---------------------------------------

# this dataset is a sample of the kind of data that might appear in the wild,
# particularly when dealing with government data,
# particularly when trying to convert an output table or individual report
# back into a raw data format that can be analyzed.

# in these test datasets, discrete observations are spread out across multiple rows.
# starwars_garbage_data1 reads in a small sample dataset in which headers
# repeat across multiple rows, some columns are not included in some rows,
# and some (but not all!) variable headers are in the *same row* as their values.

# starwars_garbage_data2 is the same data without repeating headers.
# The structure of the data repeats consistently.

# what is the most effective way to convert either of these datasets (ideally both,
# but there could be two separate solutions) into a tidy dataframe that
# includes all information?
# i currently have a hacky solution, but i want something *nice* and *replicable*.
# the unique identifier in this data is the person-film ID, which is made up.
# the arbitrary number column is also made up, but goes with the person-film ID.


starwars_garbage_data1 <- data.frame(
  stringsAsFactors = FALSE,
  v1 = c(
    "Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
    "123598423", "238952395", "6232048034", "3036308047",
    "Species:", "Character Name", "Darth Vader", "Person-film ID",
    "7731900678", "2072161410", "5775513390", "5583683275",
    "Species:", "Character Name", "Yoda", "Person-film ID",
    "6562229951", "9476002994", "958505842", "6754035462", "2138742347",
    "Species:"
  ),
  v2 = c(
    NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
    "Tattooine", "Tattooine", "Droid", NA, NA, "Homeworld", NA, NA, NA,
    NA, "Human", NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
    "Tattooine", "Tattooine", "Yoda's Species"
  ),
  v3 = c(
    "Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
    "Revenge of the Sith", "Return of the Jedi",
    "The Empire Strikes Back", "A New Hope", "", "Eye Color", "Yellow", "Film",
    "Revenge of the Sith", "Return of the Jedi",
    "The Empire Strikes Back", "A New Hope", "", "Eye Color", "Brown", "Film",
    "Attack of the Clones", "The Phantom Menace",
    "Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", ""
  ),
  v4 = c(
    "Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
    "400000", "500000", "", "Height", "202", "Arbitrary Number",
    "100000", "150000", "350000", "500000", "", "Height", "66",
    "Arbitrary Number", "160000", "160000", "200000", "200000", "200000",
    ""
  )
)


starwars_garbage_data2 <- data.frame(
  stringsAsFactors = FALSE,
  v1 = c(
    "Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
    "123598423", "238952395", "6232048034", "3036308047",
    "Species:", "Darth Vader", "Person-film ID", "7731900678",
    "2072161410", "5775513390", "5583683275", "Species:", "Yoda",
    "Person-film ID", "6562229951", "9476002994", "958505842",
    "6754035462", "2138742347", "Species:"
  ),
  v2 = c(
    NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
    "Tattooine", "Tattooine", "Droid", NA, "Homeworld", NA, NA, NA, NA,
    "Human", NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
    "Tattooine", "Tattooine", "Yoda's Species"
  ),
  v3 = c(
    "Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
    "Revenge of the Sith", "Return of the Jedi",
    "The Empire Strikes Back", "A New Hope", "", "Yellow", "Film",
    "Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back",
    "A New Hope", "", "Brown", "Film", "Attack of the Clones",
    "The Phantom Menace", "Revenge of the Sith", "Return of the Jedi",
    "The Empire Strikes Back", ""
  ),
  v4 = c(
    "Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
    "400000", "500000", "", "202", "Arbitrary Number", "100000",
    "150000", "350000", "500000", "", "66", "Arbitrary Number",
    "160000", "160000", "200000", "200000", "200000", ""
  )
)


# here's my messy attempt, using starwars_garbage_data1

library(tidyverse)
library(janitor)

fix_header <- function(df, n_lines = 1) {
  newcolnames <- df[n_lines, ]
  names(df) <- newcolnames
  newdf <- df[-c(1:n_lines), ]
  newdf <- janitor::clean_names(newdf)
  return(newdf)
}

tst <- starwars_garbage_data1 %>%
  set_names(nm = letters[1:4]) %>%
  mutate(group = ifelse(a == "Character Name", 1:1000, NA)) %>%
  fill(group) %>%
  group_by(group) %>%
  nest() %>%
  mutate(film_data = map(data, fix_header, 3)) %>%
  mutate(person_data = map(data, ~ {
    .x %>%
      slice(1:2) %>%
      fix_header() %>%
      remove_empty("cols")
  })) %>%
  unnest_wider(person_data) %>%
  select(-data) %>%
  unnest(film_data)

# then I have to break the pipe i think unfortunately.
ints <- which(tst$person_film_id == "Species:")
tst$species[ints] <- tst$homeworld[ints]

# final data cleaning
final <- tst %>%
  fill(species, .direction = "up") %>%
  filter(person_film_id != "Species:") %>%
  select(-group)
	# ---------------------------------------
	# untidy data
	# ---------------------------------------

	# this dataset is a sample of the kind of data that might appear in the wild,
	# particularly when dealing with government data,
	# particularly when trying to convert an output table or individual report
	# back into a raw data format that can be analyzed.

	# in these test datasets, discrete observations are spread out across multiple rows.
	# starwars_garbage_data1 reads in a small sample dataset in which headers
	# repeat across multiple rows, some columns are not included in some rows,
	# and some (but not all!) variable headers are in the same row as their values.

	# starwars_garbage_data2 is the same data without repeating headers.
	# The structure of the data repeats consistently.

	# what is the most effective way to convert either of these datasets (ideally both,
	# but there could be two separate solutions) into a tidy dataframe that
	# includes all information?
	# i currently have a hacky solution, but i want something nice and replicable.
	# the unique identifier in this data is the person-film ID, which is made up.
	# the arbitrary number column is also made up, but goes with the person-film ID.


	starwars_garbage_data1 <- data.frame(
	stringsAsFactors = FALSE,
	v1 = c(
	"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
	"123598423", "238952395", "6232048034", "3036308047",
	"Species:", "Character Name", "Darth Vader", "Person-film ID",
	"7731900678", "2072161410", "5775513390", "5583683275",
	"Species:", "Character Name", "Yoda", "Person-film ID",
	"6562229951", "9476002994", "958505842", "6754035462", "2138742347",
	"Species:"
	),
	v2 = c(
	NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
	"Tattooine", "Tattooine", "Droid", NA, NA, "Homeworld", NA, NA, NA,
	NA, "Human", NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
	"Tattooine", "Tattooine", "Yoda's Species"
	),
	v3 = c(
	"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
	"Revenge of the Sith", "Return of the Jedi",
	"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Yellow", "Film",
	"Revenge of the Sith", "Return of the Jedi",
	"The Empire Strikes Back", "A New Hope", "", "Eye Color", "Brown", "Film",
	"Attack of the Clones", "The Phantom Menace",
	"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back", ""
	),
	v4 = c(
	"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
	"400000", "500000", "", "Height", "202", "Arbitrary Number",
	"100000", "150000", "350000", "500000", "", "Height", "66",
	"Arbitrary Number", "160000", "160000", "200000", "200000", "200000",
	""
	)
	)


	starwars_garbage_data2 <- data.frame(
	stringsAsFactors = FALSE,
	v1 = c(
	"Character Name", "C-3PO", "Person-film ID", "2218529825", "7731900678",
	"123598423", "238952395", "6232048034", "3036308047",
	"Species:", "Darth Vader", "Person-film ID", "7731900678",
	"2072161410", "5775513390", "5583683275", "Species:", "Yoda",
	"Person-film ID", "6562229951", "9476002994", "958505842",
	"6754035462", "2138742347", "Species:"
	),
	v2 = c(
	NA, NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine", "Tattooine",
	"Tattooine", "Tattooine", "Droid", NA, "Homeworld", NA, NA, NA, NA,
	"Human", NA, "Homeworld", "Tattooine", "Tattooine", "Tattooine",
	"Tattooine", "Tattooine", "Yoda's Species"
	),
	v3 = c(
	"Eye Color", "Yellow", "Film", "Attack of the Clones", "The Phantom Menace",
	"Revenge of the Sith", "Return of the Jedi",
	"The Empire Strikes Back", "A New Hope", "", "Yellow", "Film",
	"Revenge of the Sith", "Return of the Jedi", "The Empire Strikes Back",
	"A New Hope", "", "Brown", "Film", "Attack of the Clones",
	"The Phantom Menace", "Revenge of the Sith", "Return of the Jedi",
	"The Empire Strikes Back", ""
	),
	v4 = c(
	"Height", "167", "Arbitrary Number", "150000", "300000", "300000", "300000",
	"400000", "500000", "", "202", "Arbitrary Number", "100000",
	"150000", "350000", "500000", "", "66", "Arbitrary Number",
	"160000", "160000", "200000", "200000", "200000", ""
	)
	)



	# here's my messy attempt, using starwars_garbage_data1

	library(tidyverse)
	library(janitor)

	fix_header <- function(df, n_lines = 1) {
	newcolnames <- df[n_lines, ]
	names(df) <- newcolnames
	newdf <- df[-c(1:n_lines), ]
	newdf <- janitor::clean_names(newdf)
	return(newdf)
	}

	tst <- starwars_garbage_data1 %>%
	set_names(nm = letters[1:4]) %>%
	mutate(group = ifelse(a == "Character Name", 1:1000, NA)) %>%
	fill(group) %>%
	group_by(group) %>%
	nest() %>%
	mutate(film_data = map(data, fix_header, 3)) %>%
	mutate(person_data = map(data, ~ {
	.x %>%
	slice(1:2) %>%
	fix_header() %>%
	remove_empty("cols")
	})) %>%
	unnest_wider(person_data) %>%
	select(-data) %>%
	unnest(film_data)

	# then I have to break the pipe i think unfortunately.
	ints <- which(tst$person_film_id == "Species:")
	tst$species[ints] <- tst$homeworld[ints]

	# final data cleaning
	final <- tst %>%
	fill(species, .direction = "up") %>%
	filter(person_film_id != "Species:") %>%
	select(-group)