Skip to content

Instantly share code, notes, and snippets.

@tjmahr
Last active August 13, 2024 13:55
Show Gist options
  • Save tjmahr/f32ab08716d8eebdf34e88544bf8f755 to your computer and use it in GitHub Desktop.
Save tjmahr/f32ab08716d8eebdf34e88544bf8f755 to your computer and use it in GitHub Desktop.
# gonna write a script to read in tsvs, check for ones with the Date column
# formatted with "MM/DD/YYYY" and write them back out with "YYYY-MM-DD" formatting.
# doing this so that duckdb can run faster when it ingests all the tsvs into a
# database
repair_dates <- function(paths, date_col = "Date") {
vapply(
paths,
repair_dates_one,
character(1),
date_col = date_col,
USE.NAMES = FALSE
)
}
repair_dates_one <- function(path, date_col = "Date") {
lines <- readr::read_lines(path)
first_row <- read.delim(text = lines, nrows = 1)
need_to_fix <- first_row |>
getElement(date_col) |>
stringr::str_detect("/")
if (need_to_fix) {
# First pass:
# - Read literal data
# - Don't treat "" as NA
data <- I(lines) |>
readr::read_tsv(
col_types = list(.default = readr::col_character()),
show_col_types = FALSE,
progress = FALSE,
na = "NA"
)
# Create a col_character() for each all-blank column, otherwise they
# will be converted to logicals by col_guess()
cols_empty <- data |>
Filter(function(xs) all(xs == ""), x = _) |>
lapply(function(x) readr::col_character())
cols <- c(list(), cols_empty)
cols[[date_col]] <- readr::col_date(format = "%m/%d/%Y")
cols$.default <- readr::col_guess()
# Second pass:
# - Guess the column types
# - Treat all blank columns as character
# - Don't convert "" to NA
data <- readr::type_convert(data, cols, na = "NA")
if (nrow(readr::problems(data))) {
cli::cli_warn("Skipping repair in {.path {path}} (Parsing problem)")
} else {
cli::cli_inform("Repaired {.field {date_col}} column in {.path {path}}")
readr::write_tsv(data, path)
}
}
path
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment