Skip to content

Instantly share code, notes, and snippets.

@artemklevtsov
Created March 26, 2020 04:08
Show Gist options
  • Save artemklevtsov/9778fa136731bea3ea73437c6e25bc1d to your computer and use it in GitHub Desktop.
Save artemklevtsov/9778fa136731bea3ea73437c6e25bc1d to your computer and use it in GitHub Desktop.

I went looking for data.table functions that truncate dates.

Bug in round.IDate(x, "week")

The week rounding in round.IDate (IDateTime.R#81) looks off by one: the first week of each year has 6 days instead of 7. Fix: use yday(x) - 1L in place of yday(x).

> rle(unclass(round(as.IDate(0:21), "week")))
Run Length Encoding
  lengths: int [1:4] 6 7 7 2
  values : int [1:4] 0 7 14 21

Suggestions

Add trunc.IDate (possibly as a replacement for round.IDate). For the "weeks" unit, add an option that selects where a week starts: year, monday, or sunday.

Possible implementation:

# Truncate dates to the start of their week.
#
# x:     vector of dates (IDate; the "year" mode only needs something that
#        as.POSIXlt() accepts and that supports integer arithmetic).
# start: where weeks begin.
#   "year"   - weeks are consecutive 7-day blocks counted from 1 January.
#   "monday" - like lubridate::floor_date(x, "week", week_start = 1).
#   "sunday" - like lubridate::floor_date(x, "week", week_start = 0).
#
# The "year" mode returns an object of the same class as `x`; the other two
# modes always return an IDate.
trunc_week <- function(x, start = c("year", "monday", "sunday")) {
  start <- match.arg(start)

  if (start == "year") {
    lt <- as.POSIXlt(x)
    # yday is zero-based, so this is the number of days covered by the
    # complete 7-day blocks that precede `x` within its year.
    whole_weeks <- (lt$yday %/% 7L) * 7L
    return(x - lt$yday + whole_weeks)
  }

  # Day counts since 1970-01-01 (a Thursday): day 4 is the first Monday,
  # day 3 the first Sunday. Shift so that weekday lands on a multiple of 7,
  # floor to that multiple, then shift back.
  anchor <- if (start == "monday") 4L else 3L
  as.IDate(7 * ((unclass(x) - anchor) %/% 7) + anchor)
}

Improvements

Utility function to print bench results:

# Print a compact summary of a bench::press() / bench::mark() result.
#
# x:   benchmark result; it must contain the columns listed in `cols` below,
#      including the press parameter column `n`.
# ...: unused; kept so the signature matches other print methods.
#
# Returns a data.table with one row per benchmarked expression and an added
# `relative` column: the median time divided by the fastest median among the
# rows that share the same `n`, rounded to 3 digits.
print.bench = function(x, ...) {
  cols = c("expression", "n", "min", "median", "mem_alloc", "n_itr", "n_gc")
  r = as.data.table(x)[, ..cols]
  # deparse() may split a long call over several strings; collapse them so
  # every row gets exactly one character label.
  r[, expression := vapply(
    expression,
    function(e) paste(deparse(e), collapse = " "),
    character(1L)
  )]
  r[, relative := round(as.numeric(median / min(median)), 3), by = c("n")]
  r[]
}

Data to process.

# Fixed seed, set ahead of the sample() draws used by the benchmarks.
set.seed(42)
# Benchmark input: every calendar day from 1900-01-01 through 2020-03-01,
# stored as IDate. as.IDate() is provided by data.table, which must already
# be attached.
x = as.IDate(
  seq(from = as.Date("1900-01-01"), to = as.Date("2020-03-01"), by = "day")
)

Weeks

# Week truncation modelled on round.IDate(x, "week") (IDateTime.R#81), with
# the first-week correction already applied: subtracting 1L from yday(x)
# gives the first week of a year 7 days rather than 6.
trunc_week_old = function(x) {
  year_start = round(x, "year")
  weeks_elapsed = (yday(x) - 1L) %/% 7L
  year_start + 7L * weeks_elapsed
}

# Week truncation with weeks counted in 7-day blocks from 1 January, computed
# from a single as.POSIXlt() conversion.
trunc_week_new = function(x) {
  # Zero-based day of the year for every element of `x`.
  day_of_year = as.POSIXlt(x)$yday
  # Days covered by the complete 7-day blocks preceding each date.
  block_offset = 7L * (day_of_year %/% 7L)
  x - day_of_year + block_offset
}

# Time both week-truncation implementations over a grid of sample sizes.
r = bench::press(
  # Sample sizes to benchmark: 1e2 up to 1e6.
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    # Draw `n` dates from `x` with replacement, then time each function on
    # the same draw.
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_week_old(d), trunc_week_new(d))
  }
)

# Summarise the timings with the print.bench() helper.
print.bench(r)
#>            expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_week_old(d) 1e+02     225.62µs     232.11µs      146.73KB  1950     2    7.050
#>  2: trunc_week_new(d) 1e+02      30.57µs      32.92µs       40.53KB  9996     4    1.000
#>  3: trunc_week_old(d) 1e+03       1.39ms       1.47ms         213KB   318     2   13.605
#>  4: trunc_week_new(d) 1e+03     106.56µs     108.29µs        59.2KB  4151     4    1.000
#>  5: trunc_week_old(d) 1e+04      13.44ms      13.88ms        2.06MB    33     1   15.994
#>  6: trunc_week_new(d) 1e+04     859.88µs     867.72µs      586.55KB   534     5    1.000
#>  7: trunc_week_old(d) 1e+05      134.9ms     137.59ms        20.6MB     2     2   15.515
#>  8: trunc_week_new(d) 1e+05       8.39ms       8.87ms        5.72MB    41     9    1.000
#>  9: trunc_week_old(d) 1e+06        1.45s        1.45s         206MB     1     6   15.516
#> 10: trunc_week_new(d) 1e+06      88.27ms      93.29ms       57.22MB     6     6    1.000

Months

# Month truncation modelled on round.IDate (IDateTime.R#81): rebuild the date
# from its year and month with the day fixed to 1, then convert to IDate.
trunc_month_old = function(x) {
  first_of_month = ISOdate(year(x), month(x), 1L)
  as.IDate(first_of_month)
}
# Month truncation by arithmetic: step back by the day of the month, then
# forward one day so the result lands on the 1st.
trunc_month_new = function(x) {
  day_of_month = as.POSIXlt(x)$mday
  x - day_of_month + 1L
}

# Time both month-truncation implementations over a grid of sample sizes.
r = bench::press(
  # Sample sizes to benchmark: 1e2 up to 1e6.
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    # Draw `n` dates from `x` with replacement, then time each function on
    # the same draw.
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_month_old(d), trunc_month_new(d))
  }
)

# Summarise the timings with the print.bench() helper.
print.bench(r)
#>             expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                 <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_month_old(d) 1e+02     198.49µs     219.99µs        27.5KB  1947     2    7.332
#>  2: trunc_month_new(d) 1e+02      28.26µs         30µs        6.03KB  9996     4    1.000
#>  3: trunc_month_old(d) 1e+03       1.47ms        1.5ms      216.91KB   329     1   15.449
#>  4: trunc_month_new(d) 1e+03      94.05µs      96.94µs       55.25KB  4683     3    1.000
#>  5: trunc_month_old(d) 1e+04      13.93ms      14.39ms         2.1MB    35     0   17.223
#>  6: trunc_month_new(d) 1e+04     794.89µs     835.49µs      547.44KB   571     4    1.000
#>  7: trunc_month_old(d) 1e+05     139.84ms     141.22ms       20.98MB     3     1   17.648
#>  8: trunc_month_new(d) 1e+05       7.93ms          8ms        5.34MB    58     4    1.000
#>  9: trunc_month_old(d) 1e+06        1.42s        1.42s      209.81MB     1     4   17.221
#> 10: trunc_month_new(d) 1e+06      80.22ms      82.72ms       53.41MB     6     7    1.000

Quarters

# Quarter truncation modelled on round.IDate (IDateTime.R#82): map the
# quarter number to its first month, rebuild the date on day 1 of that month,
# then convert to IDate.
trunc_quarter_old = function(x) {
  # Quarters 1, 2, 3, 4 map to months 1, 4, 7, 10.
  first_month = 3L * (quarter(x) - 1L) + 1L
  as.IDate(ISOdate(year(x), first_month, 1L))
}
# Quarter truncation by editing the broken-down time in place: snap the
# month down to the first month of its quarter, set the day to 1, and
# convert the result to IDate.
trunc_quarter_new = function(x) {
  lt = as.POSIXlt(x)
  # `mon` is zero-based, so flooring to a multiple of 3 yields 0, 3, 6 or 9.
  quarter_first_mon = 3L * (lt$mon %/% 3L)
  lt$mon = quarter_first_mon
  lt$mday = 1L
  as.IDate(lt)
}

# Time both quarter-truncation implementations over a grid of sample sizes.
r = bench::press(
  # Sample sizes to benchmark: 1e2 up to 1e6.
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    # Draw `n` dates from `x` with replacement, then time each function on
    # the same draw.
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_quarter_old(d), trunc_quarter_new(d))
  }
)

# Summarise the timings with the print.bench() helper.
print.bench(r)
#>               expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                   <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_quarter_old(d) 1e+02        199µs     217.63µs       28.12KB  2213     2    6.289
#>  2: trunc_quarter_new(d) 1e+02      32.26µs      34.61µs       18.62KB  9995     5    1.000
#>  3: trunc_quarter_old(d) 1e+03       1.55ms       1.64ms      216.91KB   294     1    8.413
#>  4: trunc_quarter_new(d) 1e+03     177.79µs     194.43µs       98.64KB  2308     3    1.000
#>  5: trunc_quarter_old(d) 1e+04      15.15ms      15.46ms         2.1MB    31     1    8.443
#>  6: trunc_quarter_new(d) 1e+04       1.73ms       1.83ms      977.55KB   258     4    1.000
#>  7: trunc_quarter_old(d) 1e+05     153.45ms     153.83ms       20.98MB     2     2    8.313
#>  8: trunc_quarter_new(d) 1e+05      18.02ms       18.5ms        9.54MB    19     6    1.000
#>  9: trunc_quarter_old(d) 1e+06        1.59s        1.59s      209.81MB     1     5    6.672
#> 10: trunc_quarter_new(d) 1e+06     187.06ms     238.51ms       95.37MB     3     6    1.000

Years

# Year truncation modelled on round.IDate (IDateTime.R#83): rebuild the date
# as 1 January of the same year, then convert to IDate.
trunc_year_old = function(x) {
  first_of_year = ISOdate(year(x), 1L, 1L)
  as.IDate(first_of_year)
}
# Year truncation by arithmetic: subtract the zero-based day of the year so
# every date moves back to 1 January of its own year.
trunc_year_new = function(x) {
  days_into_year = as.POSIXlt(x)$yday
  x - days_into_year
}

# Time both year-truncation implementations over a grid of sample sizes.
r = bench::press(
  # Sample sizes to benchmark: 1e2 up to 1e6.
  n = c(100, 1000, 10000, 100000, 1000000),
  {
    # Draw `n` dates from `x` with replacement, then time each function on
    # the same draw.
    d = sample(x, size = n, replace = TRUE)
    bench::mark(trunc_year_old(d), trunc_year_new(d))
  }
)

# Summarise the timings with the print.bench() helper.
print.bench(r)
#>            expression     n          min       median     mem_alloc n_itr  n_gc relative
#>                <char> <num> <bench_time> <bench_time> <bench_bytes> <int> <num>    <num>
#>  1: trunc_year_old(d) 1e+02     178.84µs     184.22µs       17.56KB  2405     2    8.662
#>  2: trunc_year_new(d) 1e+02      19.81µs      21.27µs        5.59KB  9997     3    1.000
#>  3: trunc_year_old(d) 1e+03       1.26ms        1.3ms       161.7KB   375     0   14.864
#>  4: trunc_year_new(d) 1e+03      85.66µs      87.45µs        51.3KB  5431     3    1.000
#>  5: trunc_year_old(d) 1e+04      12.01ms      12.62ms        1.57MB    40     0   16.732
#>  6: trunc_year_new(d) 1e+04     743.28µs     754.02µs      508.33KB   578     3    1.000
#>  7: trunc_year_old(d) 1e+05     125.14ms     125.57ms       15.64MB     3     1   16.846
#>  8: trunc_year_new(d) 1e+05       7.38ms       7.45ms        4.96MB    64     3    1.000
#>  9: trunc_year_old(d) 1e+06        1.27s        1.27s       156.4MB     1     4   17.129
#> 10: trunc_year_new(d) 1e+06      73.92ms      74.31ms       49.59MB     7     3    1.000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment