Last active August 29, 2015 13:57
split & relist

A comparison of the functions split and relist. I study how they can be used to translate the elements in a list of vectors.

Both functions are more than ten times faster than lapply.

relist seems to be a bit faster than split.

A function to "translate" lists of vectors is provided at the end, in the file translate_list_function.r.

See how split and relist work:

# ?split package:base ?relist package:utils

Data Simulation

# rm (list = ls ())


# Size of the simulated data set (ten million elements).
N <- 10^7
# Values to translate to: the integers 1..N.
dat <- 1:N
# One character key per value ("id1", "id2", ...), built from dat.
ids <- paste0("id", dat)
# Name each value by its key so that dat can be indexed by id.
names(dat) <- ids
# cls <- sample (LETTERS, size = N, replace = TRUE)
# Class labels: every upper-case letter paired with every lower-case letter
# (26 * 26 = 676 two-character labels such as "Aa" or "Ks"), sorted.
LTRS <- sort(as.vector(outer(LETTERS, letters, FUN = paste0)))
> [1] 676
cls <- sample(LTRS, size = N, replace = TRUE)
cbind(dat, ids, cls)[1:3, ]
>     dat ids   cls 
> id1 "1" "id1" "Ks"
> id2 "2" "id2" "Yx"
> id3 "3" "id3" "Sr"

Starting data:

list0 <- split(x = ids, f = cls)
translation <- dat

My Raw Data

lapply(list0[1:3], head)
> $Aa
> [1] "id2357" "id2426" "id2522" "id3452" "id3648" "id3905"
> $Ab
> [1] "id1231" "id1535" "id1668" "id2464" "id3731" "id4825"
> $Ac
> [1] "id1196" "id1315" "id2696" "id3172" "id3375" "id3879"
> id1 id2 id3 id4 id5 
>   1   2   3   4   5
> [1] 676
summary(sapply(list0, length))
>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
>   14300   14700   14800   14800   14900   15100

Naive conversion

Using lapply

    # Naive baseline: look up each group's ids in the named vector
    # `translation`, one group (one lapply iteration) at a time.
    fun <- function(x) {
      translation[x]
    }
    conv0 <- lapply(X = list0, FUN = fun)
>    user  system elapsed 
> 500.484   0.491 501.799
lapply(conv0[1:3], head)
> $Aa
> id2357 id2426 id2522 id3452 id3648 id3905 
>   2357   2426   2522   3452   3648   3905 
> $Ab
> id1231 id1535 id1668 id2464 id3731 id4825 
>   1231   1535   1668   2464   3731   4825 
> $Ac
> id1196 id1315 id2696 id3172 id3375 id3879 
>   1196   1315   2696   3172   3375   3879

Conversion using unlist split

    # Flatten, translate, regroup: one lookup over all ids instead of one
    # lookup per group.
    # Group sizes. vapply() always yields an integer vector, whereas sapply()
    # returns list() when list0 is empty.
    lon <- vapply(list0, length, integer(1))
    # Group label for every element of the flattened vector, in list0 order.
    fvec <- rep(names(list0), times = lon)
    vec0 <- unlist(list0)
    # Single vectorised lookup by name for all ids at once.
    vec1 <- translation[vec0]
    # Regroup. Explicit factor levels keep the groups in the same order as
    # list0 instead of the sorted order split() would use for a character f.
    conv1 <- split(x = vec1, f = factor(fvec, levels = names(list0)))
>    user  system elapsed 
>  18.943   0.235  19.212

the converted lists are identical

identical(conv1, conv0)
> [1] TRUE

Some remarks

> [1] "character"
> [1] "integer"
> [1] "character"
sum(lon) == length(vec0)
> [1] TRUE
sum(lon) == length(vec1)
> [1] TRUE
sum(lon) == length(fvec)
> [1] TRUE
cbind(vec0, vec1, fvec)[1:3, ]
>     vec0     vec1   fvec
> Aa1 "id2357" "2357" "Aa"
> Aa2 "id2426" "2426" "Aa"
> Aa3 "id2522" "2522" "Aa"
> [1] "names"
> [1] "names"

Conversion using relist

    # Flatten list0 as a relistable object; the flattened vector carries a
    # "skeleton" attribute that is used below to rebuild the list structure.
    vec3 <- unlist(as.relistable(list0))
    vec4 <- translation[vec3]
    conv2 <- relist(flesh = vec4, skeleton = attr(vec3, "skeleton"))
    ## conv3 <- relist(flesh = vec4, skeleton = vec3)  ## this does not work
    ## (the resulting list is wrongly formatted)
>    user  system elapsed 
>  14.289   0.164  14.478

the converted lists are not identical

identical(conv2, conv0)
> [1] FALSE

because the vectors inside conv2 do not keep their element names (the list itself is still named by class)

lapply(conv2[1:2], head)
> $Aa
> [1] 2357 2426 2522 3452 3648 3905
> $Ab
> [1] 1231 1535 1668 2464 3731 4825
lapply(conv0[1:2], head)
> $Aa
> id2357 id2426 id2522 id3452 id3648 id3905 
>   2357   2426   2522   3452   3648   3905 
> $Ab
> id1231 id1535 id1668 id2464 id3731 id4825 
>   1231   1535   1668   2464   3731   4825

but the values are the same

table(sapply(conv2, length) == sapply(conv0, length))
>  676
table(unlist(conv2) == unlist(conv0))
>     TRUE 
> 10000000


# for (i in 1:length (conv2)) {
for (i in 1:10) {
    print(table(conv0[[i]] == conv2[[i]]))
>  TRUE 
> 14997 
>  TRUE 
> 15081 
>  TRUE 
> 14678 
>  TRUE 
> 14887 
>  TRUE 
> 14707 
>  TRUE 
> 14890 
>  TRUE 
> 14883 
>  TRUE 
> 14677 
>  TRUE 
> 15067 
>  TRUE 
> 14678

Some remarks

table(vec3 == vec0)
>     TRUE 
> 10000000
table(vec4 == vec1)
>     TRUE 
> 10000000
#function to translate a list of vectors
translateList <- function (x, translation, method = "split") {#2014-04-09
##' method = "split" returns the original names translated
##' method = "relist" does not
if (method == "split") {
lon <- sapply (x, length)
fvec <- rep (names (x), times = lon)
vec0 <- unlist (x)
vec1 <- translation[vec0]
conv <- split (x = vec1, f = fvec)
} else {
vec3 <- unlist (as.relistable (x))
vec4 <- translation[vec3]
conv <- relist (flesh = vec4, skeleton = attributes (vec3)$skeleton)
return (conv)
