-
-
Save thibautjombart/d968e767bbca165de33cf7756aab12cb to your computer and use it in GitHub Desktop.
library(outbreaks) | |
library(tidyverse) | |
make_linelist <- function(x, date, interval = 1L, date_start = NULL, date_stop = NULL) { | |
## TODO: add tests on inputs | |
x <- tibble::as_tibble(x) | |
out <- dplyr::select(x, date, everything()) | |
dates <- pull(out, date) | |
if (is.null(date_start)) { | |
date_start <- min(dates, na.rm = TRUE) | |
} | |
if (is.null(date_stop)) { | |
date_stop <- max(dates, na.rm = TRUE) | |
} | |
x_info <- list( | |
date = names(out)[1], | |
interval = interval, | |
date_start = date_start, | |
date_stop = date_stop | |
) | |
## append class and add attributes | |
class(out) <- c("linelist", class(x)) | |
attr(out, "linelist_info") <- x_info | |
out | |
} | |
x <- make_linelist(ebola_sim_clean$linelist, "date_of_onset") | |
x | |
## some operations are okay preserving attributes | |
x %>% | |
select(1:10) %>% | |
attr("linelist_info") | |
x %>% | |
select(1:10) %>% | |
group_by(gender) %>% | |
attr("linelist_info") | |
x %>% | |
select(1:10) %>% | |
filter(date_of_onset < as.Date("2015-01-01")) %>% | |
attr("linelist_info") | |
## some are not | |
x %>% | |
select(1:10) %>% | |
group_by(gender) %>% | |
filter(date_of_onset < as.Date("2015-01-01")) %>% | |
attr("linelist_info") |
A way to work around this would be the implement wrappers around existing functions which systematically restore attributes. As they all seem to be generics this should be relatively easy. Here is an example:
## work around losing attributes
copy_linelist_info <- function(from, to) {
attr(to, "linelist_info") <- attr(from, "linelist_info")
to
}
declass_linelist <- function(x) {
class(x) <- setdiff(class(x), "linelist")
x
}
reclass_linelist <- function(x) {
class(x) <- c("linelist", class(x))
x
}
select.linelist <- function(.data, ...) {
out <- declass_linelist(.data)
out <- select(out, ...)
out <- copy_linelist_info(.data, out)
reclass_linelist(out)
}
group_by.linelist <- function(.data, ...) {
out <- declass_linelist(.data)
out <- group_by(out, ...)
out <- copy_linelist_info(.data, out)
reclass_linelist(out)
}
filter.linelist <- function(.data, ...) {
out <- declass_linelist(.data)
out <- filter(out, ...)
out <- copy_linelist_info(.data, out)
reclass_linelist(out)
}
## some are not
x %>%
select(1:10) %>%
group_by(gender) %>%
filter(date_of_onset < as.Date("2015-01-01")) %>%
attr("linelist_info")
#> $date
#> [1] "date_of_onset"
#>
#> $interval
#> [1] 1
#>
#> $date_start
#> [1] "2014-04-07"
#>
#> $date_stop
#> [1] "2015-04-30"
Created on 2020-06-17 by the reprex package (v0.3.0)
What about the following for the class creator?
library(rlang)
library(tibble)
library(outbreaks)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# a monotonically increasing function (i.e. will maintain ordering)
interval_function <- function(x, interval) {
# TODO write this based on existing package
x
}
# create incidence class
incidence <- function(..., date_var, interval = 1L, groups = NULL,
date_start = NULL, date_stop = NULL) {
# TODO checks and tests
# make tibble from input
dots <- list2(...)
tbl <- tibble(!!!dots)
# grouping via specified groups and interval
date_var <- ensym(date_var)
tbl <- mutate(tbl, .interval = interval_function(!!date_var, interval))
tbl <- group_by(tbl, !!!syms(groups))
tbl <- group_by(tbl, .interval, .add = TRUE)
tbl <- summarise(tbl, count = n(), .groups = "drop")
# TODO filtering / expanding dates
# create subclass of tibble
new_tibble(tbl,
date = as_string(date_var),
interval = interval,
groups = groups,
date_start = date_start,
date_stop = date_stop,
nrow = nrow(tbl),
class = "incidence")
}
# example
dat <- ebola_sim_clean$linelist
i1 <- incidence(dat,
date_var = date_of_onset,
interval = 1L,
groups = c("hospital", "gender"))
i1
#> # A tibble: 2,535 x 4
#> hospital gender .interval count
#> <fct> <fct> <date> <int>
#> 1 Connaught Hospital f 2014-05-03 1
#> 2 Connaught Hospital f 2014-05-07 2
#> 3 Connaught Hospital f 2014-05-18 1
#> 4 Connaught Hospital f 2014-05-19 1
#> 5 Connaught Hospital f 2014-05-22 1
#> 6 Connaught Hospital f 2014-05-24 1
#> 7 Connaught Hospital f 2014-05-25 1
#> 8 Connaught Hospital f 2014-05-30 1
#> 9 Connaught Hospital f 2014-06-03 1
#> 10 Connaught Hospital f 2014-06-11 1
#> # … with 2,525 more rows
str(i1)
#> tibble [2,535 × 4] (S3: incidence/tbl_df/tbl/data.frame)
#> $ hospital : Factor w/ 5 levels "Connaught Hospital",..: 1 1 1 1 1 1 1 1 1 1 ...
#> $ gender : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
#> $ .interval: Date[1:2535], format: "2014-05-03" "2014-05-07" ...
#> $ count : int [1:2535] 1 2 1 1 1 1 1 1 1 1 ...
#> - attr(*, "date")= chr "date_of_onset"
#> - attr(*, "interval")= int 1
#> - attr(*, "groups")= chr [1:2] "hospital" "gender"
Created on 2020-06-18 by the reprex package (v0.3.0)
I really like it! It is both close to the previous implementation in terms of interface, and adds some of the key features we need - most importantly stratification by > 1 factor. A small note on the interval (though I understand this is a proof of concept): here .interval
seems wrong (interval = 1L
should be taken as one day). Just to make sure we're on the same page:
- question / confirmation: here
.interval
is the left hand-side of the bins to count cases by (right?) - in practice, the argument
interval
inincidence()
should keep current ability to handle named time units (and thus non-constant intervals), e.g."1 month"
or"2 weeks"
or"quarter"
Yes and yes.
I think we're on the same page. I'll expand on my thinking on the interval function and how it can be used:
- In principle the interval function is any function applied to
date_var
that maintains the monotonic ordering of thedate_var
. - By keeping it abstract it makes it easier to apply different functions in future should we choose.
- Initially I'll just do a "cut, paste and tweak" to the current functionality in the incidence package to make it fit.
- the variable
.interval
is probably better nameddate_group
. By putting it as a variable in the tibble rather than an attribute it's easier to work with. - In theory we would like the interval function to dispatch on both it's arguments although in practice I will probably dispatch on one and switch on the other (I'm undecided on order of arguments at moment). This way it will match current functionality and deal with character vectors and integer/numeric.
Does this make sense / answer your questions?
Yes it makes total sense, and I think it is a nice way to do things. We may need to add:
interval_function <- function(dat_var, interval, date_start, date_stop) {
...
}
+1 to naming the output of that function date_group
inside incidence::incidence
, probably clearer like this.
Showing outputs:
Created on 2020-06-17 by the reprex package (v0.3.0)