Skip to content

Instantly share code, notes, and snippets.

@hadley
Created November 14, 2022 13:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hadley/61f0bcaf79c695eca46c06be55ffad69 to your computer and use it in GitHub Desktop.
Save hadley/61f0bcaf79c695eca46c06be55ffad69 to your computer and use it in GitHub Desktop.
library(tidyverse)
# https://twitter.com/buddyherms/status/1576966150680121344 --------------
# PROs: at, by, and regexp examples
# CONs: quite simple
vt_census <- tidycensus::get_decennial(
geography = "block",
state = "VT",
county = "Washington",
variables = "P1_001N",
year = 2020
)
vt_census |>
separate_wider_position(
GEOID,
widths = c(state = 2, county = 3, tract = 6, block = 4)
)
vt_census |>
separate_wider_delim(
NAME,
delim = ", ",
names = c("block", "block_group", "tract", "county", "state")
)
vt_census |>
separate_wider_regex(
NAME,
patterns = c(
"Block ", block = "\\d+", ", ",
"Block Group ", block_group = "\\d+", ", ",
"Census Tract ", tract = "\\d+.\\d+", ", ",
county = "[^,]+", ", ",
state = ".*"
)
)
# https://twitter.com/iandgow/status/1577198761612189696 ------------------
# https://iangow.github.io/far_book/web-data.html?q=Extract#extracting-data-from-messy-formats
library(pdftools)
library(lubridate)
# Could almost use width, but different set of widths on each page
# (because of pdf_text()'s table algorithm)
url <- "https://site.warrington.ufl.edu/ritter/files/money-left-on-the-table.pdf"
lines <- readr::read_lines(pdftools::pdf_text(url)[-1])
money <- tibble(line = lines)
regex <- c(
amount_left_on_table = "\\S+", "\\s+",
company = ".+", "\\s+",
ipo_date = "[0-9]{6}", "\\s+",
offer_price = "\\S+", "\\s+",
first_close_price = "\\S+", "\\s+",
shares_offered = "\\S+", "\\s+",
ticker = "\\S+"
)
money %>%
filter(str_detect(line, "^\\$")) %>%
separate_wider_regex(line, regex) %>%
mutate(
amount_left_on_table = parse_number(amount_left_on_table),
company = str_trim(company),
ipo_date = lubridate::ymd(ipo_date),
offer_price = parse_number(offer_price),
first_close_price = parse_number(first_close_price),
shares_offered = parse_number(shares_offered),
) %>%
arrange(ipo_date) %>%
relocate(amount_left_on_table, .after = shares_offered) %>%
relocate(ticker, .before = 1)
# https://twitter.com/randyboyes/status/1576933550393171968 ---------------
# PROs: wider + longer, simple problem
# CONs: inauthentic, licensing???
# https://adventofcode.com/2020/day/7
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt")
advent7 <- tibble(line = lines)
advent7 %>%
filter(!str_detect(lines, "no other bags")) %>%
separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>%
separate_by_longer(bags, ", ") %>%
separate_by_wider(bags, sep = " ", names = c("number", "property", "colour", NA))
# https://adventofcode.com/2020/day/14
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_14.txt")
advent14 <- tibble(line = lines)
advent14 %>%
separate_regex_wider(line, c(var = "\\w+", address = "(?:\\[\\d+\\])?", " = ", value = ".+")) %>%
mutate(address = parse_number(address))
# https://twitter.com/Natttersley/status/1577026577350946817 --------------
# PRO: simple, built-in dataset, shows align_short
# CON: have to talk about rownames
mtcars %>%
as_tibble(rownames = "rn") %>%
separate_regex_wider(rn,
c(manufacturer = "\\w+", " ", model = ".*?"),
align_short = "start"
)
# https://twitter.com/josephjefe/status/1576931962366763008 ---------------
particip <- nflreadr::load_participation()
particip %>%
as_tibble() %>%
select(nflverse_game_id, play_id, offense_formation, offense_personnel) %>%
filter(!is.na(offense_formation)) %>%
separate_longer_delim(offense_personnel, regex(", ?")) %>%
separate_wider_delim(offense_personnel, " ", names = c("n", "position"))
# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------
all_drinks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/all_drinks.csv')
all_drinks %>%
select(idDrink, contains("Ingredient"), contains("Measure")) %>%
pivot_longer(
-idDrink,
names_pattern = "str(Ingredient|Measure)(\\d+)",
names_to = c(".value", "i"),
values_drop_na = TRUE,
) %>%
separate_by_wider(Measure, " ", c("Quanitity", "Unit"), align_long = "merge")
# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------
raw_df <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-23/raw_anime.csv")
raw_df %>%
select(title_synonyms) %>%
separate_regex_wider(title_synonyms, c("[", contents = ".*", "]")) %>%
separate_by_longer(contents, ", ")
raw_df %>%
pull(aired) %>%
str_replace_all("None", "null") %>%
str_replace_all("'", '"') %>%
map_if(is.na, \(x) "null") %>%
map(jsonlite::fromJSON) %>%
head() %>%
str()
# https://github.com/rfordatascience/tidytuesday/blob/master/data/ --------
# if want to do teams, will need https://github.com/tidyverse/rvest/issues/361
library(rvest)
html <- read_html("https://db.ipc-services.org/sdms/hira/web/competition/code/PG2016/sport/AR")
html %>%
html_node(".table-responsive-lg") %>%
html_table() %>%
rename_with(\(x) str_to_lower(str_replace(x, fixed(" Medallist(s)"), ""))) %>%
pivot_longer(names_to = "medal", values_to = "athlete", cols = -event) %>%
separate_regex_wider(event,
patterns = c(
gender = "Men's|Women's|Mixed",
" ",
team = "Team|Individual",
" ",
event = ".*"
)
) %>%
filter(team == "Individual") %>%
separate_regex_wider(athlete,
patterns = c(
last_name = "[A-Z]+",
" ",
first_name = "[A-Za-z ]+",
" \\(",
country = "[A-Z]+",
"\\)"
))
library(tidyverse)
# https://github.com/tidyverse/tidyups/blob/main/002-tidyr-stringr.md
# For now: no paired stringr functions
# Big change to problems reporting
# Add warning when debug mode on:
# Warning: align_short="debug" adding debug columns x_ok, x_pieces, x_remainder
# separate_*_wider --------------------------------------------------------
# * separate_at_wider() = separate()
# * separate_by_wider() = separate()
# * separate_regex_wider() = extract()
vt_census <- tidycensus::get_decennial(
geography = "block",
state = "VT",
county = "Washington",
variables = "P1_001N",
year = 2020
)
vt_census
vt_census |>
separate_at_wider(
GEOID,
widths = c(state = 2, county = 3, tract = 6, block = 4)
)
# by -> delim, by -> sep
# at -> width, separate_loc_wider()
# separate_by_wider / separate_by_longer
# which order of components?
# should sep be delim?
# separate_wider_delim / separate_longer_delim
vt_census |>
separate_by_wider(
NAME,
sep = ", ",
names = c("block", "block_group", "tract", "county", "state")
) |>
mutate(
block = block %>% parse_number(),
block_group = block_group %>% parse_number(),
tract = tract %>% parse_number()
)
vt_census |>
separate_regex_wider(
NAME,
patterns = c(
"Block ", block = "\\d+", ", ",
"Block Group ", block_group = "\\d+", ", ",
"Census Tract ", tract = "\\d+.\\d+", ", ",
county = "[^,]+", ", ",
state = ".*"
)
)
# separate_*_longer -------------------------------------------------------
# * separate_by_longer() = separate_rows()
# * separate_at_longer()
# https://adventofcode.com/2020/day/7
lines <- read_lines("https://raw.githubusercontent.com/rdboyes/adventofcode2020/master/data/input_7.txt")
advent7 <- tibble(line = lines)
advent7 %>%
filter(!str_detect(lines, "no other bags")) %>%
separate_by_wider(line, sep = " bags contain ", names = c("container", "bags")) %>%
separate_by_longer(bags, ", ") %>%
separate_by_wider(bags, sep = " ", names = c("number", "adjective", "colour", NA)) %>%
separate_by_wider(container, " ", names = c("adjective", "colour"), names_sep = "_")
# Debugging ---------------------------------------------------------------
df <- tibble(
x = c("a", "a-b", "a-b-c")
)
df %>% separate(x, c("x", "y"))
# Now errors if there's a problem
df %>% separate_by_wider(x, sep = "-", names = c("x", "y"))
df %>% separate_by_wider(x, sep = "-", names = c("x"))
# Debug with:
probs <- df %>%
separate_by_wider(
x,
sep = "-",
names = c("a", "b"),
align_short = "debug",
align_long = "debug"
)
probs
probs %>% filter(!x_ok)
df %>%
separate_by_wider(
x,
sep = "-",
names = c("x", "y"),
align_short = "end",
align_long = "merge"
)
# New feature:
df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "start")
df %>% separate_by_wider(x, sep = "-", names_sep = "", align_short = "end")
# Really handy for separate_regex_wider
vt_census |>
separate_regex_wider(
NAME,
patterns = c(
"Block ", block = "\\d+", ", ",
"Block Group ", block_group = "\\d+", ", ",
"Census Tract ", tract = "\\d+.\\d+", ", ",
county = "[^,]+", ", ",
state = ".*"
),
align_short = "debug"
)
%>%
select(block:NAME_remainder) %>%
select(NAME_remainder)
df <- tibble(x = c("b-d", "e-f", "e"))
df %>% separate_by_wider(
x, "-",
names = c("x", "y"),
align_long = "debug",
align_short = "start"
)
github_repos <- c(
"metacran/crandb",
"jeroenooms/curl@v0.9.3",
"jimhester/covr#47",
"hadley/dplyr@*release",
"mangothecat/remotes@550a3c7d3f9e1493a2ba",
"/$&@R64&3"
)
owner_rx <- "(?:(?<owner>[^/]+)/)?"
repo_rx <- "(?<repo>[^/@#]+)"
subdir_rx <- "(?:/(?<subdir>[^@#]*[^@#/]))?"
ref_rx <- "(?:@(?<ref>[^*].*))"
pull_rx <- "(?:#(?<pull>[0-9]+))"
release_rx <- "(?:@(?<release>[*]release))"
subtype_rx <- sprintf("(?:%s|%s|%s)?", ref_rx, pull_rx, release_rx)
github_rx <- sprintf(
"^(?:%s%s%s%s|(?<catchall>.*))$",
owner_rx, repo_rx, subdir_rx, subtype_rx
)
re_match(text = github_repos, pattern = github_rx)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment