Skip to content

Instantly share code, notes, and snippets.

@njtierney
Created October 16, 2024 23:13
Show Gist options
  • Save njtierney/c3055c9fce2c8da2f1af25067033b58e to your computer and use it in GitHub Desktop.
Save njtierney/c3055c9fce2c8da2f1af25067033b58e to your computer and use it in GitHub Desktop.
library(tidyverse)
# Three records run together on one line, with no separator between them.
txt <- "day 0 dis influenza host 1111 age 19.19day 0 dis influenza host 2222 age 5.55day 0 dis influenza host 333 age 11.11"

# One capture group per field.
# age has to have two \\d+ as it abuts the next column, "day"
pattern <- "day (\\w+) dis (\\w+) host (\\w+) age (\\d+\\.\\d+)"

# Use str_match_all to extract all occurrences
matches <- str_match_all(txt, pattern)[[1]]

# Column 1 holds the full match; columns 2:5 hold the capture groups.
# `drop = FALSE` keeps a matrix even when only a single record matched.
matched_data <- matches[, 2:5, drop = FALSE]
# Name the columns before converting: as_tibble() warns when handed a matrix
# without unique column names and `.name_repair` is not given.
colnames(matched_data) <- c("day", "dis", "host", "age")
matched_data <- as_tibble(matched_data)

# Alternative: round-trip the data through a CSV file so that readr guesses
# the column types for us instead of us converting them by hand.
temporary_file <- tempfile(fileext = ".csv")
write_csv(matched_data, file = temporary_file)
# Skip the first line of the file and supply the column names directly.
read_csv(
  temporary_file,
  col_names = c("day", "dis", "host", "age"),
  skip = 1
)
#> Rows: 3 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (1): dis
#> dbl (3): day, host, age
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 3 × 4
#>     day dis        host   age
#>   <dbl> <chr>     <dbl> <dbl>
#> 1     0 influenza  1111 19.2 
#> 2     0 influenza  2222  5.55
#> 3     0 influenza   333 11.1

# Alternatively, stay in memory: name the columns, then convert the numeric
# ones explicitly and leave `dis` as character.
names(matched_data) <- c("day", "dis", "host", "age")
tidied_data <- mutate(
  as_tibble(matched_data),
  across(c(day, host, age), as.numeric)
)

tidied_data
#> # A tibble: 3 × 4
#>     day dis        host   age
#>   <dbl> <chr>     <dbl> <dbl>
#> 1     0 influenza  1111 19.2 
#> 2     0 influenza  2222  5.55
#> 3     0 influenza   333 11.1

# As functions
#' Parse run-together "day ... dis ... host ... age ..." records into a tibble
#'
#' @param text A string holding records of the form
#'   `"day <d> dis <name> host <id> age <x.y>"`, concatenated without any
#'   separator. Only the first element of `text` is parsed.
#' @return A tibble with character columns `day`, `dis`, `host` and `age`,
#'   one row per record found.
inline_text_to_tbl <- function(text) {
  # age has to have two \\d+ as it abuts the next column, "day"
  pattern <- "day (\\w+) dis (\\w+) host (\\w+) age (\\d+\\.\\d+)"
  # Would be nice to parameterise this?
  # e.g.,
  # pattern <- "{col1} (\\w+) {col2}"
  # but it's hard to generalise to any number of columns and also the final
  # column is a bit tricky

  # Use str_match_all to extract all occurrences
  matches <- str_match_all(text, pattern)[[1]]

  # Column 1 is the full match; columns 2:5 are the capture groups.
  # `drop = FALSE` is essential: with exactly one record the subset would
  # otherwise collapse to a plain character vector and `colnames<-` would fail.
  matched_data <- matches[, 2:5, drop = FALSE]

  colnames(matched_data) <- c("day", "dis", "host", "age")

  as_tibble(matched_data)
}

# Try the parser on the example string. Every column comes back as character
# (see the printed column types below); type conversion is a separate step.
inline_text_to_tbl(txt)
#> # A tibble: 3 × 4
#>   day   dis       host  age  
#>   <chr> <chr>     <chr> <chr>
#> 1 0     influenza 1111  19.19
#> 2 0     influenza 2222  5.55 
#> 3 0     influenza 333   11.11

#' Parse inline records and let readr guess the column types
#'
#' Writes the character tibble from `inline_text_to_tbl()` to a temporary CSV
#' file and reads it back with `read_csv()`, so the column types are guessed
#' by readr rather than converted by hand.
#'
#' @param txt Text passed on to `inline_text_to_tbl()`.
#' @return The tibble returned by `read_csv()`.
inline_text_to_tbl_via_readr <- function(txt) {
  matched_data <- inline_text_to_tbl(txt)
  temporary_file <- tempfile(fileext = ".csv")
  # Remove the scratch file when the function exits (even on error) so that
  # repeated calls, e.g. inside a benchmark, do not pile up temp files.
  # NOTE(review): relies on read_csv() having fully read the file before it
  # returns (readr's default `lazy = FALSE`) -- confirm for the readr in use.
  on.exit(unlink(temporary_file), add = TRUE)
  write_csv(matched_data, file = temporary_file)
  read_csv(file = temporary_file, show_col_types = FALSE)
}

#' Parse inline records and convert the numeric columns by hand
#'
#' @param txt Text passed on to `inline_text_to_tbl()`.
#' @return A tibble in which `day`, `host` and `age` have been run through
#'   `as.numeric()`; `dis` is left as returned by `inline_text_to_tbl()`.
inline_text_to_tbl_via_tbl <- function(txt) {
  parsed <- as_tibble(inline_text_to_tbl(txt))
  mutate(parsed, across(c(day, host, age), as.numeric))
}

# Run both type-conversion variants on the example string and print each
# result so the two can be compared side by side.
via_readr <- inline_text_to_tbl_via_readr(txt)
via_tbl <- inline_text_to_tbl_via_tbl(txt)

via_readr
#> # A tibble: 3 × 4
#>     day dis        host   age
#>   <dbl> <chr>     <dbl> <dbl>
#> 1     0 influenza  1111 19.2 
#> 2     0 influenza  2222  5.55
#> 3     0 influenza   333 11.1
via_tbl
#> # A tibble: 3 × 4
#>     day dis        host   age
#>   <dbl> <chr>     <dbl> <dbl>
#> 1     0 influenza  1111 19.2 
#> 2     0 influenza  2222  5.55
#> 3     0 influenza   333 11.1

# Benchmark the two variants against each other on the example string.
# `check = FALSE` turns off bench::mark()'s comparison of the results across
# expressions.
# NOTE(review): presumably needed because the read_csv() result carries extra
# attributes that the mutate() result lacks -- confirm.
bm1 <- bench::mark(
  readr = inline_text_to_tbl_via_readr(txt),
  tbl = inline_text_to_tbl_via_tbl(txt),
  check = FALSE
)

# Absolute timings and allocations.
bm1
#> # A tibble: 2 × 6
#>   expression      min    median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm>  <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 readr        1.95ms    2.08ms      285.    21.5KB     20.5
#> 2 tbl         961.2µs  999.54µs      753.    39.1KB     36.1
# The same results expressed relative to the smallest value in each column.
summary(bm1, relative = TRUE)
#> # A tibble: 2 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 readr       2.03   2.09      1         1        1   
#> 2 tbl         1      1         2.65      1.82     1.76
plot(bm1)

# inline_text_to_tbl_via_tbl is probably the better choice?

Created on 2024-10-17 with reprex v2.1.1

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.4.1 Patched (2024-07-08 r86915)
#>  os       macOS Sonoma 14.5
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       Australia/Hobart
#>  date     2024-10-17
#>  pandoc   3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package     * version    date (UTC) lib source
#>  beeswarm      0.4.0      2021-06-01 [1] CRAN (R 4.4.0)
#>  bench         1.1.3      2023-05-04 [1] CRAN (R 4.4.0)
#>  bit           4.0.5      2022-11-15 [1] CRAN (R 4.4.0)
#>  bit64         4.0.5      2020-08-30 [1] CRAN (R 4.4.0)
#>  cli           3.6.3      2024-06-21 [1] CRAN (R 4.4.0)
#>  colorspace    2.1-1      2024-07-26 [1] CRAN (R 4.4.0)
#>  crayon        1.5.3      2024-06-20 [1] CRAN (R 4.4.0)
#>  curl          5.2.1      2024-03-01 [1] CRAN (R 4.4.0)
#>  digest        0.6.36     2024-06-23 [1] CRAN (R 4.4.0)
#>  dplyr       * 1.1.4      2023-11-17 [1] CRAN (R 4.4.0)
#>  evaluate      0.24.0     2024-06-10 [1] CRAN (R 4.4.0)
#>  fansi         1.0.6      2023-12-08 [1] CRAN (R 4.4.0)
#>  farver        2.1.2      2024-05-13 [1] CRAN (R 4.4.0)
#>  fastmap       1.2.0      2024-05-15 [1] CRAN (R 4.4.0)
#>  forcats     * 1.0.0      2023-01-29 [1] CRAN (R 4.4.0)
#>  fs            1.6.4.9000 2024-06-26 [1] Github (r-lib/fs@714990b)
#>  generics      0.1.3      2022-07-05 [1] CRAN (R 4.4.0)
#>  ggbeeswarm    0.7.2      2023-04-29 [1] CRAN (R 4.4.0)
#>  ggplot2     * 3.5.1      2024-04-23 [1] CRAN (R 4.4.0)
#>  glue          1.7.0      2024-01-09 [1] CRAN (R 4.4.0)
#>  gtable        0.3.5      2024-04-22 [1] CRAN (R 4.4.0)
#>  highr         0.11       2024-05-26 [1] CRAN (R 4.4.0)
#>  hms           1.1.3      2023-03-21 [1] CRAN (R 4.4.0)
#>  htmltools     0.5.8.1    2024-04-04 [1] CRAN (R 4.4.0)
#>  knitr         1.48       2024-07-07 [1] CRAN (R 4.4.0)
#>  labeling      0.4.3      2023-08-29 [1] CRAN (R 4.4.0)
#>  lifecycle     1.0.4      2023-11-07 [1] CRAN (R 4.4.0)
#>  lubridate   * 1.9.3      2023-09-27 [1] CRAN (R 4.4.0)
#>  magrittr      2.0.3      2022-03-30 [1] CRAN (R 4.4.0)
#>  munsell       0.5.1      2024-04-01 [1] CRAN (R 4.4.0)
#>  pillar        1.9.0      2023-03-22 [1] CRAN (R 4.4.0)
#>  pkgconfig     2.0.3      2019-09-22 [1] CRAN (R 4.4.0)
#>  profmem       0.6.0      2020-12-13 [1] CRAN (R 4.4.0)
#>  purrr       * 1.0.2      2023-08-10 [1] CRAN (R 4.4.0)
#>  R6            2.5.1      2021-08-19 [1] CRAN (R 4.4.0)
#>  readr       * 2.1.5      2024-01-10 [1] CRAN (R 4.4.0)
#>  reprex        2.1.1      2024-07-06 [1] CRAN (R 4.4.0)
#>  rlang         1.1.4      2024-06-04 [1] CRAN (R 4.4.0)
#>  rmarkdown     2.27       2024-05-17 [1] CRAN (R 4.4.0)
#>  rstudioapi    0.16.0     2024-03-24 [1] CRAN (R 4.4.0)
#>  scales        1.3.0      2023-11-28 [1] CRAN (R 4.4.0)
#>  sessioninfo   1.2.2      2021-12-06 [1] CRAN (R 4.4.0)
#>  stringi       1.8.4      2024-05-06 [1] CRAN (R 4.4.0)
#>  stringr     * 1.5.1      2023-11-14 [1] CRAN (R 4.4.0)
#>  tibble      * 3.2.1      2023-03-20 [1] CRAN (R 4.4.0)
#>  tidyr       * 1.3.1      2024-01-24 [1] CRAN (R 4.4.0)
#>  tidyselect    1.2.1      2024-03-11 [1] CRAN (R 4.4.0)
#>  tidyverse   * 2.0.0      2023-02-22 [1] CRAN (R 4.4.0)
#>  timechange    0.3.0      2024-01-18 [1] CRAN (R 4.4.0)
#>  tzdb          0.4.0      2023-05-12 [1] CRAN (R 4.4.0)
#>  utf8          1.2.4      2023-10-22 [1] CRAN (R 4.4.0)
#>  vctrs         0.6.5      2023-12-01 [1] CRAN (R 4.4.0)
#>  vipor         0.4.7      2023-12-18 [1] CRAN (R 4.4.0)
#>  vroom         1.6.5      2023-12-05 [1] CRAN (R 4.4.0)
#>  withr         3.0.1      2024-07-31 [1] CRAN (R 4.4.0)
#>  xfun          0.46       2024-07-18 [1] CRAN (R 4.4.0)
#>  xml2          1.3.6      2023-12-04 [1] CRAN (R 4.4.0)
#>  yaml          2.3.10     2024-07-26 [1] CRAN (R 4.4.0)
#> 
#>  [1] /Users/nick/Library/R/arm64/4.4/library
#>  [2] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment