hadley/separate-examples.R

## separate-examples.R
library(tidyverse)

# https://twitter.com/buddyherms/status/1576966150680121344 --------------

# PROs: at, by, and regexp examples
# CONs: quite simple

vt_census <- tidycensus::get_decennial(
  geography = "block",
  state = "VT",
  county = "Washington",
  variables = "P1_001N",
  year = 2020
)
vt_census |>
  separate_wider_position(
    GEOID,
    widths = c(state = 2, county = 3, tract = 6, block = 4)
  )

vt_census |>
  separate_wider_delim(
    NAME,
    delim = ", ",
    names = c("block", "block_group", "tract", "county", "state")
  )

vt_census |>
  separate_wider_regex(
    NAME,
    patterns = c(
      "Block ", block = "\\d+", ", ",
      "Block Group ", block_group = "\\d+", ", ",
      "Census Tract ", tract = "\\d+.\\d+", ", ",
      county = "[^,]+", ", ",
      state = ".*"
    )
  )


# https://twitter.com/iandgow/status/1577198761612189696 ------------------
# https://iangow.github.io/far_book/web-data.html?q=Extract#extracting-data-from-messy-formats

library(pdftools)
library(lubridate)

# Could almost use width, but different set of widths on each page
# (because of pdf_text()'s table algorithm)
url <- "https://site.warrington.ufl.edu/ritter/files/money-left-on-the-table.pdf"
lines <- readr::read_lines(pdftools::pdf_text(url)[-1])

money <- tibble(line = lines)

regex <- c(
  amount_left_on_table = "\\S+", "\\s+",
  company = ".+", "\\s+",
  ipo_date = "[0-9]{6}", "\\s+",
  offer_price = "\\S+", "\\s+",
  first_close_price = "\\S+", "\\s+",
  shares_offered = "\\S+", "\\s+",
  ticker = "\\S+"
)
money %>%
  filter(str_detect(line, "^\\$")) %>%
  separate_wider_regex(line, regex) %>%
  mutate(
    amount_left_on_table = parse_number(amount_left_on_table),
    company = str_trim(company),
    ipo_date = lubridate::ymd(ipo_date),
    offer_price = parse_number(offer_price),
    first_close_price = parse_number(first_close_price),
    shares_offered = parse_number(shares_offered),
  ) %>%
  arrange(ipo_date) %>%
  relocate(amount_left_on_table, .after = shares_offered) %>%
  relocate(ticker, .before = 1)

# https://twitter.com/randyboyes/status/1576933550393171968 ---------------
# PROs: wider + longer, simple problem
# CONs: inauthentic, licensing???

# https://adventofcode.com/2020/day/7
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt")
advent7 <- tibble(line = lines)

advent7 %>%
  filter(!str_detect(lines, "no other bags")) %>%
  separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>%
  separate_by_longer(bags, ", ") %>%
  separate_by_wider(bags, sep = " ", names = c("number", "property", "colour", NA))

# https://adventofcode.com/2020/day/14
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_14.txt")
advent14 <- tibble(line = lines)

advent14 %>%
  separate_regex_wider(line, c(var = "\\w+", address = "(?:\\[\\d+\\])?", " = ", value = ".+")) %>%
  mutate(address = parse_number(address))

# https://twitter.com/Natttersley/status/1577026577350946817 --------------

# PRO: simple, built-in dataset, shows align_short
# CON: have to talk about rownames

mtcars %>%
  as_tibble(rownames = "rn") %>%
  separate_regex_wider(rn,
    c(manufacturer = "\\w+", " ", model = ".*?"),
    align_short = "start"
  )

# https://twitter.com/josephjefe/status/1576931962366763008 ---------------

particip <- nflreadr::load_participation()
particip %>%
  as_tibble() %>%
  select(nflverse_game_id, play_id, offense_formation, offense_personnel) %>%
  filter(!is.na(offense_formation)) %>%
  separate_longer_delim(offense_personnel, regex(", ?")) %>%
  separate_wider_delim(offense_personnel, " ", names = c("n", "position"))


# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------

all_drinks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/all_drinks.csv')

all_drinks %>%
  select(idDrink, contains("Ingredient"), contains("Measure")) %>%
  pivot_longer(
    -idDrink,
    names_pattern = "str(Ingredient|Measure)(\\d+)",
    names_to = c(".value", "i"),
    values_drop_na = TRUE,
  ) %>%
  separate_by_wider(Measure, " ", c("Quanitity", "Unit"), align_long = "merge")


# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------

raw_df <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-23/raw_anime.csv")

raw_df %>%
  select(title_synonyms) %>%
  separate_regex_wider(title_synonyms, c("[", contents = ".*", "]")) %>%
  separate_by_longer(contents, ", ")

raw_df %>%
  pull(aired) %>%
  str_replace_all("None", "null") %>%
  str_replace_all("'", '"') %>%
  map_if(is.na, \(x) "null") %>%
  map(jsonlite::fromJSON) %>%
  head() %>%
  str()


# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------

# if want to do teams, will need https://github.com/tidyverse/rvest/issues/361

library(rvest)

html <- read_html("https://db.ipc-services.org/sdms/hira/web/competition/code/PG2016/sport/AR")
html %>%
  html_node(".table-responsive-lg") %>%
  html_table() %>%
  rename_with(\(x) str_to_lower(str_replace(x, fixed(" Medallist(s)"), ""))) %>%
  pivot_longer(names_to = "medal", values_to = "athlete", cols = -event) %>%
  separate_regex_wider(event,
    patterns = c(
      gender = "Men's|Women's|Mixed",
      " ",
      team = "Team|Individual",
      " ",
      event = ".*"
    )
  ) %>%
  filter(team == "Individual") %>%
  separate_regex_wider(athlete,
    patterns = c(
      last_name = "[A-Z]+",
      " ",
      first_name = "[A-Za-z ]+",
      " \\(",
      country = "[A-Z]+",
      "\\)"
    ))

## separate-group-meeting-notes.R
library(tidyverse)

# https://github.com/tidyverse/tidyups/blob/main/002-tidyr-stringr.md
# For now: no paired stringr functions
# Big change to problems reporting

# Add warning when debug mode on:
# Warning: align_short="debug" adding debug columns x_ok, x_pieces, x_remainder


# separate_*_wider --------------------------------------------------------

# * separate_at_wider()     = separate()
# * separate_by_wider()     = separate()
# * separate_regex_wider()  = extract()

vt_census <- tidycensus::get_decennial(
  geography = "block",
  state = "VT",
  county = "Washington",
  variables = "P1_001N",
  year = 2020
)
vt_census

vt_census |>
  separate_at_wider(
    GEOID,
    widths = c(state = 2, county = 3, tract = 6, block = 4)
  )

# by -> delim, by -> sep
# at -> width, separate_loc_wider()
# separate_by_wider / separate_by_longer
# which order of components?
# should sep be delim?
# separate_wider_delim / separate_longer_delim

vt_census |>
  separate_by_wider(
    NAME,
    sep = ", ",
    names = c("block", "block_group", "tract", "county", "state")
  ) |>
  mutate(
    block = block %>% parse_number(),
    block_group = block_group %>% parse_number(),
    tract = tract %>% parse_number()
  )

vt_census |>
  separate_regex_wider(
    NAME,
    patterns = c(
      "Block ", block = "\\d+", ", ",
      "Block Group ", block_group = "\\d+", ", ",
      "Census Tract ", tract = "\\d+.\\d+", ", ",
      county = "[^,]+", ", ",
      state = ".*"
    )
  )


# separate_*_longer -------------------------------------------------------

# * separate_by_longer()    = separate_rows()
# * separate_at_longer()

# https://adventofcode.com/2020/day/7
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt")
advent7 <- tibble(line = lines)

advent7 %>%
  filter(!str_detect(lines, "no other bags")) %>%
  separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>%
  separate_by_longer(bags, ", ") %>%
  separate_by_wider(bags, sep = " ", names = c("number", "adjective", "colour", NA)) %>%
  separate_by_wider(container, " ", names = c("adjective", "colour"), names_sep = "_")


# Debugging ---------------------------------------------------------------

df <- tibble(
  x = c("a", "a-b", "a-b-c")
)

df %>% separate(x, c("x", "y"))


# Now errors if there's a problem
df %>% separate_by_wider(x, sep = "-", names = c("x", "y"))
df %>% separate_by_wider(x, sep = "-", names = c("x"))


# Debug with:
probs <- df %>%
  separate_by_wider(
    x,
    sep = "-",
    names = c("a", "b"),
    align_short = "debug",
    align_long = "debug"
  )
probs
probs %>% filter(!x_ok)

df %>%
  separate_by_wider(
    x,
    sep = "-",
    names = c("x", "y"),
    align_short = "end",
    align_long = "merge"
  )

# New feature:
df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "start")
df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "end")

# Really handy for separate_regex_wider
vt_census |>
  separate_regex_wider(
    NAME,
    patterns = c(
      "Block ", block = "\\d+", ", ",
      "Block Group ", block_group = "\\d+", ", ",
      "Census Tract ", tract = "\\d+.\\d+", ", ",
      county = "[^,]+", ", ",
      state = ".*"
    ),
    align_short = "debug"
  )

%>%
  select(block:NAME_remainder) %>%
  select(NAME_remainder)


df <- tibble(x = c("b-d", "e-f", "e"))
df %>% separate_by_wider(
  x, "-",
  names = c("x", "y"),
  align_long = "debug",
  align_short = "start"
)


github_repos <- c(
	"metacran/crandb",
	"jeroenooms/curl@v0.9.3",
  "jimhester/covr#47",
	"hadley/dplyr@*release",
  "mangothecat/remotes@550a3c7d3f9e1493a2ba",
  "/$&@R64&3"
)
owner_rx   <- "(?:(?<owner>[^/]+)/)?"
repo_rx    <- "(?<repo>[^/@#]+)"
subdir_rx  <- "(?:/(?<subdir>[^@#]*[^@#/]))?"
ref_rx     <- "(?:@(?<ref>[^*].*))"
pull_rx    <- "(?:#(?<pull>[0-9]+))"
release_rx <- "(?:@(?<release>[*]release))"
subtype_rx <- sprintf("(?:%s|%s|%s)?", ref_rx, pull_rx, release_rx)
github_rx  <- sprintf(
	"^(?:%s%s%s%s|(?<catchall>.*))$",
    owner_rx, repo_rx, subdir_rx, subtype_rx
)
re_match(text = github_repos, pattern = github_rx)
	library(tidyverse)

	# https://twitter.com/buddyherms/status/1576966150680121344 --------------

	# PROs: at, by, and regexp examples
	# CONs: quite simple

	vt_census <- tidycensus::get_decennial(
	geography = "block",
	state = "VT",
	county = "Washington",
	variables = "P1_001N",
	year = 2020
	)
	vt_census \|>
	separate_wider_position(
	GEOID,
	widths = c(state = 2, county = 3, tract = 6, block = 4)
	)

	vt_census \|>
	separate_wider_delim(
	NAME,
	delim = ", ",
	names = c("block", "block_group", "tract", "county", "state")
	)

	vt_census \|>
	separate_wider_regex(
	NAME,
	patterns = c(
	"Block ", block = "\\d+", ", ",
	"Block Group ", block_group = "\\d+", ", ",
	"Census Tract ", tract = "\\d+.\\d+", ", ",
	county = "[^,]+", ", ",
	state = ".*"
	)
	)


	# https://twitter.com/iandgow/status/1577198761612189696 ------------------
	# https://iangow.github.io/far_book/web-data.html?q=Extract#extracting-data-from-messy-formats

	library(pdftools)
	library(lubridate)

	# Could almost use width, but different set of widths on each page
	# (because of pdf_text()'s table algorithm)
	url <- "https://site.warrington.ufl.edu/ritter/files/money-left-on-the-table.pdf"
	lines <- readr::read_lines(pdftools::pdf_text(url)[-1])

	money <- tibble(line = lines)

	regex <- c(
	amount_left_on_table = "\\S+", "\\s+",
	company = ".+", "\\s+",
	ipo_date = "[0-9]{6}", "\\s+",
	offer_price = "\\S+", "\\s+",
	first_close_price = "\\S+", "\\s+",
	shares_offered = "\\S+", "\\s+",
	ticker = "\\S+"
	)
	money %>%
	filter(str_detect(line, "^\\$")) %>%
	separate_wider_regex(line, regex) %>%
	mutate(
	amount_left_on_table = parse_number(amount_left_on_table),
	company = str_trim(company),
	ipo_date = lubridate::ymd(ipo_date),
	offer_price = parse_number(offer_price),
	first_close_price = parse_number(first_close_price),
	shares_offered = parse_number(shares_offered),
	) %>%
	arrange(ipo_date) %>%
	relocate(amount_left_on_table, .after = shares_offered) %>%
	relocate(ticker, .before = 1)

	# https://twitter.com/randyboyes/status/1576933550393171968 ---------------
	# PROs: wider + longer, simple problem
	# CONs: inauthentic, licensing???

	# https://adventofcode.com/2020/day/7
	lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt")
	advent7 <- tibble(line = lines)

	advent7 %>%
	filter(!str_detect(lines, "no other bags")) %>%
	separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>%
	separate_by_longer(bags, ", ") %>%
	separate_by_wider(bags, sep = " ", names = c("number", "property", "colour", NA))

	# https://adventofcode.com/2020/day/14
	lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_14.txt")
	advent14 <- tibble(line = lines)

	advent14 %>%
	separate_regex_wider(line, c(var = "\\w+", address = "(?:\\[\\d+\\])?", " = ", value = ".+")) %>%
	mutate(address = parse_number(address))

	# https://twitter.com/Natttersley/status/1577026577350946817 --------------

	# PRO: simple, built-in dataset, shows align_short
	# CON: have to talk about rownames

	mtcars %>%
	as_tibble(rownames = "rn") %>%
	separate_regex_wider(rn,
	c(manufacturer = "\\w+", " ", model = ".*?"),
	align_short = "start"
	)

	# https://twitter.com/josephjefe/status/1576931962366763008 ---------------

	particip <- nflreadr::load_participation()
	particip %>%
	as_tibble() %>%
	select(nflverse_game_id, play_id, offense_formation, offense_personnel) %>%
	filter(!is.na(offense_formation)) %>%
	separate_longer_delim(offense_personnel, regex(", ?")) %>%
	separate_wider_delim(offense_personnel, " ", names = c("n", "position"))


	# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------

	all_drinks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/all_drinks.csv')

	all_drinks %>%
	select(idDrink, contains("Ingredient"), contains("Measure")) %>%
	pivot_longer(
	-idDrink,
	names_pattern = "str(Ingredient\|Measure)(\\d+)",
	names_to = c(".value", "i"),
	values_drop_na = TRUE,
	) %>%
	separate_by_wider(Measure, " ", c("Quanitity", "Unit"), align_long = "merge")


	# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------

	raw_df <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-23/raw_anime.csv")

	raw_df %>%
	select(title_synonyms) %>%
	separate_regex_wider(title_synonyms, c("[", contents = ".*", "]")) %>%
	separate_by_longer(contents, ", ")

	raw_df %>%
	pull(aired) %>%
	str_replace_all("None", "null") %>%
	str_replace_all("'", '"') %>%
	map_if(is.na, \(x) "null") %>%
	map(jsonlite::fromJSON) %>%
	head() %>%
	str()


	# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------

	# if want to do teams, will need https://github.com/tidyverse/rvest/issues/361

	library(rvest)

	html <- read_html("https://db.ipc-services.org/sdms/hira/web/competition/code/PG2016/sport/AR")
	html %>%
	html_node(".table-responsive-lg") %>%
	html_table() %>%
	rename_with(\(x) str_to_lower(str_replace(x, fixed(" Medallist(s)"), ""))) %>%
	pivot_longer(names_to = "medal", values_to = "athlete", cols = -event) %>%
	separate_regex_wider(event,
	patterns = c(
	gender = "Men's\|Women's\|Mixed",
	" ",
	team = "Team\|Individual",
	" ",
	event = ".*"
	)
	) %>%
	filter(team == "Individual") %>%
	separate_regex_wider(athlete,
	patterns = c(
	last_name = "[A-Z]+",
	" ",
	first_name = "[A-Za-z ]+",
	" \\(",
	country = "[A-Z]+",
	"\\)"
	))
	library(tidyverse)

	# https://github.com/tidyverse/tidyups/blob/main/002-tidyr-stringr.md
	# For now: no paired stringr functions
	# Big change to problems reporting

	# Add warning when debug mode on:
	# Warning: align_short="debug" adding debug columns x_ok, x_pieces, x_remainder



	# separate_*_wider --------------------------------------------------------

	# * separate_at_wider() = separate()
	# * separate_by_wider() = separate()
	# * separate_regex_wider() = extract()

	vt_census <- tidycensus::get_decennial(
	geography = "block",
	state = "VT",
	county = "Washington",
	variables = "P1_001N",
	year = 2020
	)
	vt_census

	vt_census \|>
	separate_at_wider(
	GEOID,
	widths = c(state = 2, county = 3, tract = 6, block = 4)
	)

	# by -> delim, by -> sep
	# at -> width, separate_loc_wider()
	# separate_by_wider / separate_by_longer
	# which order of components?
	# should sep be delim?
	# separate_wider_delim / separate_longer_delim

	vt_census \|>
	separate_by_wider(
	NAME,
	sep = ", ",
	names = c("block", "block_group", "tract", "county", "state")
	) \|>
	mutate(
	block = block %>% parse_number(),
	block_group = block_group %>% parse_number(),
	tract = tract %>% parse_number()
	)

	vt_census \|>
	separate_regex_wider(
	NAME,
	patterns = c(
	"Block ", block = "\\d+", ", ",
	"Block Group ", block_group = "\\d+", ", ",
	"Census Tract ", tract = "\\d+.\\d+", ", ",
	county = "[^,]+", ", ",
	state = ".*"
	)
	)


	# separate_*_longer -------------------------------------------------------

	# * separate_by_longer() = separate_rows()
	# * separate_at_longer()

	# https://adventofcode.com/2020/day/7
	lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt")
	advent7 <- tibble(line = lines)

	advent7 %>%
	filter(!str_detect(lines, "no other bags")) %>%
	separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>%
	separate_by_longer(bags, ", ") %>%
	separate_by_wider(bags, sep = " ", names = c("number", "adjective", "colour", NA)) %>%
	separate_by_wider(container, " ", names = c("adjective", "colour"), names_sep = "_")


	# Debugging ---------------------------------------------------------------

	df <- tibble(
	x = c("a", "a-b", "a-b-c")
	)

	df %>% separate(x, c("x", "y"))


	# Now errors if there's a problem
	df %>% separate_by_wider(x, sep = "-", names = c("x", "y"))
	df %>% separate_by_wider(x, sep = "-", names = c("x"))


	# Debug with:
	probs <- df %>%
	separate_by_wider(
	x,
	sep = "-",
	names = c("a", "b"),
	align_short = "debug",
	align_long = "debug"
	)
	probs
	probs %>% filter(!x_ok)

	df %>%
	separate_by_wider(
	x,
	sep = "-",
	names = c("x", "y"),
	align_short = "end",
	align_long = "merge"
	)

	# New feature:
	df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "start")
	df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "end")

	# Really handy for separate_regex_wider
	vt_census \|>
	separate_regex_wider(
	NAME,
	patterns = c(
	"Block ", block = "\\d+", ", ",
	"Block Group ", block_group = "\\d+", ", ",
	"Census Tract ", tract = "\\d+.\\d+", ", ",
	county = "[^,]+", ", ",
	state = ".*"
	),
	align_short = "debug"
	)

	%>%
	select(block:NAME_remainder) %>%
	select(NAME_remainder)


	df <- tibble(x = c("b-d", "e-f", "e"))
	df %>% separate_by_wider(
	x, "-",
	names = c("x", "y"),
	align_long = "debug",
	align_short = "start"
	)



	github_repos <- c(
	"metacran/crandb",
	"jeroenooms/curl@v0.9.3",
	"jimhester/covr#47",
	"hadley/dplyr@*release",
	"mangothecat/remotes@550a3c7d3f9e1493a2ba",
	"/$&@R64&3"
	)
	owner_rx <- "(?:(?<owner>[^/]+)/)?"
	repo_rx <- "(?<repo>[^/@#]+)"
	subdir_rx <- "(?:/(?<subdir>[^@#]*[^@#/]))?"
	ref_rx <- "(?:@(?<ref>[^].))"
	pull_rx <- "(?:#(?<pull>[0-9]+))"
	release_rx <- "(?:@(?<release>[*]release))"
	subtype_rx <- sprintf("(?:%s\|%s\|%s)?", ref_rx, pull_rx, release_rx)
	github_rx <- sprintf(
	"^(?:%s%s%s%s\|(?<catchall>.*))$",
	owner_rx, repo_rx, subdir_rx, subtype_rx
	)
	re_match(text = github_repos, pattern = github_rx)