library(tidyverse)
# Example input: several records run together with no separator between the
# end of one record's age and the start of the next record's "day".
txt <- "day 0 dis influenza host 1111 age 19.19day 0 dis influenza host 2222 age 5.55day 0 dis influenza host 333 age 11.11"
# age has to have two \\d+ as it abuts the next column, "day"
pattern <- "day (\\w+) dis (\\w+) host (\\w+) age (\\d+\\.\\d+)"
# Use str_match_all to extract all occurrences. Column 1 of the result is the
# full match; columns 2-5 are the four capture groups.
matches <- str_match_all(txt, pattern)[[1]]
# Name the capture-group columns *before* converting: calling as_tibble() on a
# matrix without column names raises a tibble 2.0.0 deprecation warning and
# falls back to placeholder names. drop = FALSE keeps the result a matrix even
# if the text contains only a single record.
captured <- matches[, 2:5, drop = FALSE]
colnames(captured) <- c("day", "dis", "host", "age")
matched_data <- as_tibble(captured)
# Option 1: round-trip the extracted text through a CSV file so that readr
# guesses each column's type for us.
temporary_file <- tempfile(fileext = ".csv")
matched_data |>
  write_csv(file = temporary_file)
# Skip the header row that write_csv() emitted and supply the column names
# explicitly instead.
read_csv(
  file = temporary_file,
  col_names = c("day", "dis", "host", "age"),
  skip = 1
)
#> Rows: 3 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (1): dis
#> dbl (3): day, host, age
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 3 × 4
#> day dis host age
#> <dbl> <chr> <dbl> <dbl>
#> 1 0 influenza 1111 19.2
#> 2 0 influenza 2222 5.55
#> 3 0 influenza 333 11.1
# Option 2: stay in memory -- label the columns, then convert the ones that
# hold numbers from character to numeric by hand.
names(matched_data) <- c("day", "dis", "host", "age")
tidied_data <- mutate(
  as_tibble(matched_data),
  across(c(day, host, age), as.numeric)
)
tidied_data
#> # A tibble: 3 × 4
#> day dis host age
#> <dbl> <chr> <dbl> <dbl>
#> 1 0 influenza 1111 19.2
#> 2 0 influenza 2222 5.55
#> 3 0 influenza 333 11.1
# As functions
# Parse run-together "day <x> dis <x> host <x> age <x.x>" records into a tibble.
#
# text: a string holding one or more records back to back. Only the matches
#       for the first element of `text` are used (note the `[[1]]` below).
# Returns a tibble with one row per record and the columns day, dis, host and
# age, all still as character; converting types is left to the caller.
inline_text_to_tbl <- function(text) {
  # age has to have two \\d+ as it abuts the next column, "day"
  pattern <- "day (\\w+) dis (\\w+) host (\\w+) age (\\d+\\.\\d+)"
  # Would be nice to parameterise this?
  # e.g.,
  # pattern <- "{col1} (\\w+) {col2}"
  # but it's hard to generalise to any number of columns and also the final
  # column is a bit tricky
  # Use str_match_all to extract all occurrences. Column 1 is the full match,
  # columns 2-5 are the capture groups.
  matches <- str_match_all(text, pattern)[[1]]
  # drop = FALSE is essential: with exactly one record, `matches[, 2:5]` would
  # collapse to a plain character vector and `colnames<-` would then error.
  matched_data <- matches[, 2:5, drop = FALSE]
  colnames(matched_data) <- c("day", "dis", "host", "age")
  as_tibble(matched_data)
}
# Quick check of the helper: at this stage every column is still character.
inline_text_to_tbl(txt)
#> # A tibble: 3 × 4
#> day dis host age
#> <chr> <chr> <chr> <chr>
#> 1 0 influenza 1111 19.19
#> 2 0 influenza 2222 5.55
#> 3 0 influenza 333 11.11
# Parse the inline records and let readr guess the column types by writing
# the character tibble to a temporary CSV file and reading it back.
#
# txt: string of run-together records, passed on to inline_text_to_tbl().
# Returns the tibble produced by read_csv(), with readr-guessed column types.
inline_text_to_tbl_via_readr <- function(txt) {
  matched_data <- inline_text_to_tbl(txt)
  temporary_file <- tempfile(fileext = ".csv")
  # Remove the temporary file when the function exits; otherwise every call
  # (and a benchmark makes many) leaves another file behind in tempdir().
  on.exit(unlink(temporary_file), add = TRUE)
  write_csv(matched_data, file = temporary_file)
  # lazy = FALSE makes sure the data is fully read into memory before the
  # file is deleted by the on.exit() handler above.
  read_csv(file = temporary_file, show_col_types = FALSE, lazy = FALSE)
}
# Parse the inline records and convert the numeric columns in memory,
# without going through a file.
#
# txt: string of run-together records, passed on to inline_text_to_tbl().
# Returns a tibble in which day, host and age have been run through
# as.numeric() and dis is left as character.
inline_text_to_tbl_via_tbl <- function(txt) {
  character_tbl <- as_tibble(inline_text_to_tbl(txt))
  mutate(character_tbl, across(c(day, host, age), as.numeric))
}
# Run each variant once and print both results for a visual comparison.
via_readr <- inline_text_to_tbl_via_readr(txt)
via_tbl <- inline_text_to_tbl_via_tbl(txt)
via_readr
#> # A tibble: 3 × 4
#> day dis host age
#> <dbl> <chr> <dbl> <dbl>
#> 1 0 influenza 1111 19.2
#> 2 0 influenza 2222 5.55
#> 3 0 influenza 333 11.1
via_tbl
#> # A tibble: 3 × 4
#> day dis host age
#> <dbl> <chr> <dbl> <dbl>
#> 1 0 influenza 1111 19.2
#> 2 0 influenza 2222 5.55
#> 3 0 influenza 333 11.1
# Time the two variants against each other on the same input.
bm1 <- bench::mark(
readr = inline_text_to_tbl_via_readr(txt),
tbl = inline_text_to_tbl_via_tbl(txt),
# NOTE(review): check = FALSE turns off bench's comparison of the two results;
# presumably needed because the readr result carries extra attributes (its
# column specification) and so is not identical to the other -- confirm.
check = FALSE
)
bm1
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 readr 1.95ms 2.08ms 285. 21.5KB 20.5
#> 2 tbl 961.2µs 999.54µs 753. 39.1KB 36.1
# Same results expressed relative to the best value in each column.
summary(bm1, relative = TRUE)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 readr 2.03 2.09 1 1 1
#> 2 tbl 1 1 2.65 1.82 1.76
plot(bm1)
# In the run recorded above the in-memory variant has about half the median
# time of the readr variant, but allocates more memory.
# inline_text_to_tbl_via_tbl is probably the better choice?
# Created on 2024-10-17 with reprex v2.1.1
# Session info
# Record the R version, platform details and package versions used for this run.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.4.1 Patched (2024-07-08 r86915)
#> os macOS Sonoma 14.5
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Australia/Hobart
#> date 2024-10-17
#> pandoc 3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> beeswarm 0.4.0 2021-06-01 [1] CRAN (R 4.4.0)
#> bench 1.1.3 2023-05-04 [1] CRAN (R 4.4.0)
#> bit 4.0.5 2022-11-15 [1] CRAN (R 4.4.0)
#> bit64 4.0.5 2020-08-30 [1] CRAN (R 4.4.0)
#> cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0)
#> colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.4.0)
#> crayon 1.5.3 2024-06-20 [1] CRAN (R 4.4.0)
#> curl 5.2.1 2024-03-01 [1] CRAN (R 4.4.0)
#> digest 0.6.36 2024-06-23 [1] CRAN (R 4.4.0)
#> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.0)
#> evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.4.0)
#> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.4.0)
#> farver 2.1.2 2024-05-13 [1] CRAN (R 4.4.0)
#> fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
#> forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.4.0)
#> fs 1.6.4.9000 2024-06-26 [1] Github (r-lib/fs@714990b)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.0)
#> ggbeeswarm 0.7.2 2023-04-29 [1] CRAN (R 4.4.0)
#> ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.4.0)
#> glue 1.7.0 2024-01-09 [1] CRAN (R 4.4.0)
#> gtable 0.3.5 2024-04-22 [1] CRAN (R 4.4.0)
#> highr 0.11 2024-05-26 [1] CRAN (R 4.4.0)
#> hms 1.1.3 2023-03-21 [1] CRAN (R 4.4.0)
#> htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
#> knitr 1.48 2024-07-07 [1] CRAN (R 4.4.0)
#> labeling 0.4.3 2023-08-29 [1] CRAN (R 4.4.0)
#> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0)
#> lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.4.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.0)
#> munsell 0.5.1 2024-04-01 [1] CRAN (R 4.4.0)
#> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.4.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
#> profmem 0.6.0 2020-12-13 [1] CRAN (R 4.4.0)
#> purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.4.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0)
#> readr * 2.1.5 2024-01-10 [1] CRAN (R 4.4.0)
#> reprex 2.1.1 2024-07-06 [1] CRAN (R 4.4.0)
#> rlang 1.1.4 2024-06-04 [1] CRAN (R 4.4.0)
#> rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.4.0)
#> rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.0)
#> scales 1.3.0 2023-11-28 [1] CRAN (R 4.4.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0)
#> stringi 1.8.4 2024-05-06 [1] CRAN (R 4.4.0)
#> stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.4.0)
#> tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.4.0)
#> tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.4.0)
#> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
#> tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.0)
#> timechange 0.3.0 2024-01-18 [1] CRAN (R 4.4.0)
#> tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.4.0)
#> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.4.0)
#> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.0)
#> vipor 0.4.7 2023-12-18 [1] CRAN (R 4.4.0)
#> vroom 1.6.5 2023-12-05 [1] CRAN (R 4.4.0)
#> withr 3.0.1 2024-07-31 [1] CRAN (R 4.4.0)
#> xfun 0.46 2024-07-18 [1] CRAN (R 4.4.0)
#> xml2 1.3.6 2023-12-04 [1] CRAN (R 4.4.0)
#> yaml 2.3.10 2024-07-26 [1] CRAN (R 4.4.0)
#>
#> [1] /Users/nick/Library/R/arm64/4.4/library
#> [2] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────