-
-
Save tjmahr/f32ab08716d8eebdf34e88544bf8f755 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# gonna write a script to read in tsvs, check for ones with the Date column | |
# formatted with "MM/DD/YYYY" and write them back out with "YYYY-MM-DD" formatting. | |
# doing this so that duckdb can run faster when it ingests all the tsvs into a | |
# database | |
repair_dates <- function(paths, date_col = "Date") { | |
vapply( | |
paths, | |
repair_dates_one, | |
character(1), | |
date_col = date_col, | |
USE.NAMES = FALSE | |
) | |
} | |
repair_dates_one <- function(path, date_col = "Date") { | |
lines <- readr::read_lines(path) | |
first_row <- read.delim(text = lines, nrows = 1) | |
need_to_fix <- first_row |> | |
getElement(date_col) |> | |
stringr::str_detect("/") | |
if (need_to_fix) { | |
# First pass: | |
# - Read literal data | |
# - Don't treat "" as NA | |
data <- I(lines) |> | |
readr::read_tsv( | |
col_types = list(.default = readr::col_character()), | |
show_col_types = FALSE, | |
progress = FALSE, | |
na = "NA" | |
) | |
# Create a col_character() for each all-blank column, otherwise they | |
# will be converted to logicals by col_guess() | |
cols_empty <- data |> | |
Filter(function(xs) all(xs == ""), x = _) |> | |
lapply(function(x) readr::col_character()) | |
cols <- c(list(), cols_empty) | |
cols[[date_col]] <- readr::col_date(format = "%m/%d/%Y") | |
cols$.default <- readr::col_guess() | |
# Second pass: | |
# - Guess the column types | |
# - Treat all blank columns as character | |
# - Don't convert "" to NA | |
data <- readr::type_convert(data, cols, na = "NA") | |
if (nrow(readr::problems(data))) { | |
cli::cli_warn("Skipping repair in {.path {path}} (Parsing problem)") | |
} else { | |
cli::cli_inform("Repaired {.field {date_col}} column in {.path {path}}") | |
readr::write_tsv(data, path) | |
} | |
} | |
path | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment