Skip to content

Instantly share code, notes, and snippets.

@xiaodaigh
Last active January 28, 2018 03:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xiaodaigh/a7bd4cc8ae49d503ddb02d0ba0a724ad to your computer and use it in GitHub Desktop.
Save xiaodaigh/a7bd4cc8ae49d503ddb02d0ba0a724ad to your computer and use it in GitHub Desktop.
Counting unique elements in vector of T/F NA (missing)
# version 7 only
if VERSION >= v"0.7.0"
function bittwiddling2(A::Vector{Union{Bool,Missing}})
ptra = Ptr{UInt8}(pointer(A))
l = length(A)
# load the missing
nbools = 0
t = 0
for i = 0:l-1
new_bool = unsafe_load(ptra + l + i)
nbools += new_bool
t += unsafe_load(ptra + i) & new_bool
end
t, nbools - t, l - nbools
end
a = [missing, true, false]
@time A = rand(a, 2^31-1)
@time res = [@elapsed bittwiddling2(A) for i=1:5]
mean(res) # 0.958867
end
using Missings
function count_missing(A::Vector{Union{Bool,Missing}})
t, m = 0, 0
@inbounds for v in A
ismissing(v) && (m += 1; continue)
v && (t += 1)
end
return t, length(A) - t - m, m
end
a = [missing, true, false];
srand(1);
@time x = rand(a, 2^31-1);
using BenchmarkTools
jltimings = @benchmark count_missing(x)
x = nothing
gc()
using RCall
rtiming = R"""
memory.limit(2^31-1)
counttfna = function(a) {
t = sum(a, na.rm=T)
na = sum(is.na(a))
list(t, length(a)-t-na, na)
}
x = sample(c(T,F, NA), 2^31-1, r=T)
system.time(counttfna(x))
"""
@time rtiming2 = R"""
# memory.limit(2^31-1)
# library(bit)
# N<- 2^31-1
# v<-bit(N)
# n<-bit(N)
# v <- as.bit(sample(c(T, F ), N, r=T))
# n <- as.bit(sample(c(T, F, F), N, r=T))
# count <- function(v, n) {
# t <- sum(!n & v)
# na <- sum(n)
# list(t, length(v)-t-na, na)
# }
# system.time({print(count(v, n))})
library(matrixStats)
countbool2 = function(x) {
countx = sum2(x, na.rm=T)
countna = sum2(is.na(x))
list(true = countx, false = length(x) - countx - countna, na = countna)
}
st = system.time(res2 <- countbool2(x))
st
"""
jltiming = mean(jltimings.times)/1000/1000/1000
using StatPlots
using Plots
bar(
["Julia v0.6.2","Julia v0.7.0", "R", "R matrixStats"],
[jltiming, 0.958867, rtiming[3], rtiming2[3]],
title = "Count unique True/False/NA in a vector of length 2^31-1",
label = "seconds"
)
savefig("r vs julia boolean missing unique count.png")
function count_missing(A::Vector{Union{Bool,Missing}})
t, m = 0, 0
@inbounds for v in A
ismissing(v) && (m += 1; continue)
v && (t += 1)
end
return t, length(A) - t - m, m
end
a = [missing, true, false];
x = rand(a, 2^31-1);
function basic_bench(x)
[@elapsed count_missing(x) for i =1:3]
end
basic_bench(x)
library(data.table)
a = data.table(a = sample(c(T,F, NA), 1e9, replace = T))
counttfna = function(a) {
t = sum(a, na.rm=T)
na = sum(is.na(a))
list(t, length(a)-t-na, na)
}
system.time(a[,.N,a]) # 12s
system.time(counttfna(a$a)) #5s
@HenrikBengtsson
Copy link

HenrikBengtsson commented Jan 26, 2018

With matrixStats::count():

count_tfna <- function(a) {
  t = sum(a, na.rm = TRUE)
  na = sum(is.na(a))
  c(t, length(a) - t - na, na)
}

count <- matrixStats::count
count_tfna_ms <- function(a) {
  t = count(a, value = TRUE, na.rm = TRUE)
  na = count(a, value = NA, na.rm = FALSE)
  c(t, length(a) - t - na, na)
}
> x <- rep(c(TRUE, FALSE, NA), length.out = 1e9)

> system.time(count_tfna(x))
   user  system elapsed 
  2.244   2.012   4.257

> system.time(count_tfna_ms(x))
   user  system elapsed 
  1.664   0.000   1.665 

What's killing you in the first case is the is.na(a) call which allocates and populates another vector of length(a) which is then discarded as soon as it's done - the matrixStats::count() function avoids that:

> profmem::profmem(count_tfna(x))
Rprofmem memory profiling of:
count_tfna(x)

Memory allocations:
      bytes        calls
1     4e+09 count_tfna()
total 4e+09             

> profmem::profmem(count_tfna_2(x))
Rprofmem memory profiling of:
count_tfna_2(x)

Memory allocations:
      bytes calls
total     0      

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment