Skip to content

Instantly share code, notes, and snippets.

@dmontaner
Last active August 29, 2015 13:58
Show Gist options
  • Save dmontaner/10107905 to your computer and use it in GitHub Desktop.
Save dmontaner/10107905 to your computer and use it in GitHub Desktop.
split & relist

A comparison of the functions split and relist. I study how they can be used to translate the elements in a list of vectors.

Both functions are more than ten times faster than lapply.

relist seems to be a bit faster than split.


A function to "translate" lists of vectors is provided below, in the file translate_list_function.r


estudio_split_relist_1.r
2014-04-04 dmontaner@cipf.es
see how split and relist work

# ?split package:base ?relist package:utils

Data Simulation

# rm (list = ls ())

set.seed(20140404)

N <- 10^7
dat <- 1:N
ids <- paste0("id", dat)
names(dat) <- ids
# cls <- sample (LETTERS, size = N, replace = TRUE)
LTRS <- sort(apply(expand.grid(LETTERS, letters), 1, paste, collapse = ""))
length(LTRS)
> [1] 676
cls <- sample(LTRS, size = N, replace = TRUE)
cbind(dat, ids, cls)[1:3, ]
>     dat ids   cls 
> id1 "1" "id1" "Ks"
> id2 "2" "id2" "Yx"
> id3 "3" "id3" "Sr"

Starting data:

list0 <- split(x = ids, f = cls)
translation <- dat

My Raw Data

lapply(list0[1:3], head)
> $Aa
> [1] "id2357" "id2426" "id2522" "id3452" "id3648" "id3905"
> 
> $Ab
> [1] "id1231" "id1535" "id1668" "id2464" "id3731" "id4825"
> 
> $Ac
> [1] "id1196" "id1315" "id2696" "id3172" "id3375" "id3879"
translation[1:5]
> id1 id2 id3 id4 id5 
>   1   2   3   4   5
length(list0)
> [1] 676
summary(sapply(list0, length))
>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
>   14300   14700   14800   14800   14900   15100

Naive conversion

Using lapply

system.time({
    fun <- function(x) translation[x]
    conv0 <- lapply(list0, fun)
})
>    user  system elapsed 
> 500.484   0.491 501.799
lapply(conv0[1:3], head)
> $Aa
> id2357 id2426 id2522 id3452 id3648 id3905 
>   2357   2426   2522   3452   3648   3905 
> 
> $Ab
> id1231 id1535 id1668 id2464 id3731 id4825 
>   1231   1535   1668   2464   3731   4825 
> 
> $Ac
> id1196 id1315 id2696 id3172 id3375 id3879 
>   1196   1315   2696   3172   3375   3879

Conversion using unlist split

system.time({
    lon <- sapply(list0, length)
    fvec <- rep(names(list0), times = lon)
    vec0 <- unlist(list0)
    vec1 <- translation[vec0]
    conv1 <- split(x = vec1, f = fvec)
})
>    user  system elapsed 
>  18.943   0.235  19.212

the converted lists are identical

identical(conv1, conv0)
> [1] TRUE

Some remarks

class(vec0)
> [1] "character"
class(vec1)
> [1] "integer"
class(fvec)
> [1] "character"
sum(lon) == length(vec0)
> [1] TRUE
sum(lon) == length(vec1)
> [1] TRUE
sum(lon) == length(fvec)
> [1] TRUE
cbind(vec0, vec1, fvec)[1:3, ]
>     vec0     vec1   fvec
> Aa1 "id2357" "2357" "Aa"
> Aa2 "id2426" "2426" "Aa"
> Aa3 "id2522" "2522" "Aa"
names(attributes(vec0))
> [1] "names"
names(attributes(vec1))
> [1] "names"
names(attributes(fvec))
> NULL

Conversion using relist

system.time({
    vec3 <- unlist(as.relistable(list0))
    vec4 <- translation[vec3]
    conv2 <- relist(flesh = vec4, skeleton = attributes(vec3)$skeleton)
    ## conv3 <- relist (flesh = vec4, skeleton = vec3) ## this does not work (the
    ## list is wrongly formatted)
})
>    user  system elapsed 
>  14.289   0.164  14.478

the converted lists are not identical

identical(conv2, conv0)
> [1] FALSE

because the vectors stored in conv2 do not have names

lapply(conv2[1:2], head)
> $Aa
> [1] 2357 2426 2522 3452 3648 3905
> 
> $Ab
> [1] 1231 1535 1668 2464 3731 4825
lapply(conv0[1:2], head)
> $Aa
> id2357 id2426 id2522 id3452 id3648 id3905 
>   2357   2426   2522   3452   3648   3905 
> 
> $Ab
> id1231 id1535 id1668 id2464 id3731 id4825 
>   1231   1535   1668   2464   3731   4825

but the values are the same

table(sapply(conv2, length) == sapply(conv0, length))
> 
> TRUE 
>  676
table(unlist(conv2) == unlist(conv0))
> 
>     TRUE 
> 10000000

or

# for (i in 1:length (conv2)) {
for (i in 1:10) {
    print(table(conv0[[i]] == conv2[[i]]))
}
> 
>  TRUE 
> 14997 
> 
>  TRUE 
> 15081 
> 
>  TRUE 
> 14678 
> 
>  TRUE 
> 14887 
> 
>  TRUE 
> 14707 
> 
>  TRUE 
> 14890 
> 
>  TRUE 
> 14883 
> 
>  TRUE 
> 14677 
> 
>  TRUE 
> 15067 
> 
>  TRUE 
> 14678

Some remarks

table(vec3 == vec0)
> 
>     TRUE 
> 10000000
table(vec4 == vec1)
> 
>     TRUE 
> 10000000
##' estudio_split_relist_1.r
##' 2014-04-04 dmontaner@cipf.es
##' see how split and relist work
#?split package:base
#?relist package:utils
##' # Data Simulation
#rm (list = ls ())
# Fix the RNG seed so the sampled group labels below are reproducible.
set.seed (20140404)
# N is the number of simulated ids; dat holds the integers 1..N.
N <- 10^7
dat <- 1:N
# ids are the strings "id1", "id2", ...; they are set as the names of dat,
# so dat can be indexed by id to look up the matching integer.
ids <- paste0 ("id", dat)
names (dat) <- ids
#cls <- sample (LETTERS, size = N, replace = TRUE)
# LTRS: every upper-case/lower-case letter pair pasted together, sorted
# (26 * 26 = 676 group labels).
LTRS <- sort (apply (expand.grid (LETTERS, letters), 1, paste, collapse = ""))
length (LTRS)
# Draw one group label per id, sampling with replacement.
cls <- sample (LTRS, size = N, replace = TRUE)
# Preview the first three rows of the simulated data.
cbind (dat, ids, cls)[1:3,]
##' Starting data:
# list0: the ids grouped by their label (a named list of character vectors).
# translation: the named integer vector used as the id -> value lookup table.
list0 <- split (x = ids, f = cls)
translation <- dat
##' ## My Raw Data
lapply (list0[1:3], head)
translation[1:5]
length (list0)
# Summary of the number of ids per group.
summary (sapply (list0, length))
##' # Naive conversion
##' Using lapply
# Baseline: index the lookup vector once per list element and time the whole run.
system.time ({
# fun returns the entries of `translation` selected by the ids in x.
fun <- function (x) translation[x]
conv0 <- lapply (list0, fun)
})
# Preview the first values of the first three translated elements.
lapply (conv0[1:3], head)
##' # Conversion using unlist split
# Flatten the list, translate every id with a single indexing call,
# then rebuild the groups with split; the whole sequence is timed.
system.time ({
# lon: number of ids in each element of list0.
lon <- sapply (list0, length)
# fvec: each element name repeated once per id it contains, i.e. the
# group label of every entry of the flattened vector.
fvec <- rep (names (list0), times = lon)
# vec0: all ids in one vector; vec1: their translated values.
vec0 <- unlist (list0)
vec1 <- translation[vec0]
# Regroup the translated values by their group label.
conv1 <- split (x = vec1, f = fvec)
})
##' the converted lists are identical
identical (conv1, conv0)
##' Some remarks
# Inspect the classes of the intermediate vectors.
class (vec0)
class (vec1)
class (fvec)
# The three intermediate vectors must have one entry per id in list0.
sum (lon) == length (vec0)
sum (lon) == length (vec1)
sum (lon) == length (fvec)
cbind (vec0, vec1, fvec)[1:3,]
# List which attributes each intermediate vector carries.
names (attributes (vec0))
names (attributes (vec1))
names (attributes (fvec))
##' # Conversion using relist
# Flatten a relistable copy of the list, translate every id with a single
# indexing call, then rebuild the list from the stored skeleton; all timed.
system.time ({
# vec3: the flattened ids; the code below reads the original list structure
# back from its "skeleton" attribute.
vec3 <- unlist (as.relistable (list0))
vec4 <- translation[vec3]
conv2 <- relist (flesh = vec4, skeleton = attributes (vec3)$skeleton)
##conv3 <- relist (flesh = vec4, skeleton = vec3) ## this does not work (the list is wrongly formatted)
})
##' the converted lists are __not__ identical
identical (conv2, conv0)
##' because `conv2` does not have names
lapply (conv2[1:2], head)
lapply (conv0[1:2], head)
##' but the values are the same
# Compare element lengths, then all values at once.
table (sapply (conv2, length) == sapply (conv0, length))
table (unlist (conv2) == unlist (conv0))
##' or
# Element-by-element comparison; only the first 10 elements are checked here,
# the commented-out header below would loop over all of them.
#for (i in 1:length (conv2)) {
for (i in 1:10) {
print (table (conv0[[i]] == conv2[[i]]))
}
##' Some remarks
# The intermediate vectors of the relist approach match those of the split approach.
table (vec3 == vec0)
table (vec4 == vec1)
#translate_list_function.r
#2014-04-09 dmontaner@cipf.es
#function to translate a list of vectors
##' Translate the elements of a list of vectors through a lookup vector.
##'
##' All elements of `x` are flattened into one vector, translated with a single
##' indexing call (`translation[...]`) and then regrouped into a list. This
##' avoids indexing `translation` once per list element.
##'
##' @param x list of vectors; every value is used as an index into
##'   `translation` (typically character ids found in `names (translation)`).
##' @param translation vector used as the lookup table (typically named).
##' @param method either "split" (default) or "relist".
##'   method = "split" returns the original names translated (each returned
##'   vector keeps the names produced by `translation[...]`);
##'   method = "relist" does not (the returned vectors are unnamed).
##' @return a list with one element per element of `x`, in the same order and
##'   with the same list names, holding the translated values. An empty `x`
##'   is returned unchanged.
translateList <- function (x, translation, method = "split") {#2014-04-09 dmontaner@cipf.es
  ## Reject unknown methods instead of silently falling back to "relist".
  method <- match.arg (method, choices = c ("split", "relist"))

  ## Nothing to translate; also avoids setting attributes on a NULL vector
  ## in the "relist" branch.
  if (length (x) == 0L) {
    return (x)
  }

  if (method == "split") {
    lon <- vapply (x, length, integer (1), USE.NAMES = FALSE)
    ## Group by element POSITION rather than by element name: this keeps the
    ## input order, keeps empty elements, and works for unnamed or
    ## duplicated names (splitting by name sorts, drops and merges them).
    grp <- factor (rep (seq_along (x), times = lon), levels = seq_along (x))
    ## The names of the result come from `translation`, so the names that
    ## unlist would build for the flattened vector are not needed.
    vec0 <- unlist (x, use.names = FALSE)
    vec1 <- translation[vec0]
    conv <- split (x = vec1, f = grp)
    names (conv) <- names (x)
  } else {
    vec3 <- unlist (utils::as.relistable (x))
    vec4 <- translation[vec3]
    ## exact = TRUE: do not rely on partial matching of the attribute name.
    conv <- utils::relist (flesh = vec4,
                           skeleton = attr (vec3, "skeleton", exact = TRUE))
  }
  conv
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment