Skip to content

Instantly share code, notes, and snippets.

@EmilHvitfeldt
Created April 17, 2024 16:38
Show Gist options
  • Save EmilHvitfeldt/c811b8c379604a452737d07c34aa1514 to your computer and use it in GitHub Desktop.
Save EmilHvitfeldt/c811b8c379604a452737d07c34aa1514 to your computer and use it in GitHub Desktop.
Memory use of textrecipes (profiled with profmem) vs a sparse quanteda dfm
library(tidymodels)
library(textrecipes)
library(friends)

library(profmem)

# Profile the memory allocations made while a text recipe is prepped and
# baked; the resulting allocation log is stored in `p`.
p <- profmem({
# Tokenize `text`, then turn the tokens into term-frequency columns.
preped_rec <- recipe(season ~ text, data = friends) %>%
  step_tokenize(text) %>%
  step_tf(text) %>%
  prep()

# new_data = NULL bakes the data the recipe was prepped on.
# NOTE(review): the recorded run warns about a sparse->dense coercion
# allocating 8.7 GiB -- presumably raised by the prep/bake above; confirm
# which step triggers it.
term_freq <- bake(preped_rec, new_data = NULL)
})
#> Warning in asMethod(object): sparse->dense coercion: allocating vector of size
#> 8.7 GiB

# Total number of bytes allocated over the whole profile; entries with a
# missing byte count are ignored.
sum(p[["bytes"]], na.rm = TRUE)
#> [1] 32962639008
# Show only the top level of the profile object's structure.
str(p, max.level = 1)
#> Classes 'Rprofmem' and 'data.frame': 84837 obs. of  3 variables:
#>  $ what : chr  "alloc" "alloc" "alloc" "alloc" ...
#>  $ bytes: num  704 3424 3424 1072 264 ...
#>  $ trace:List of 84837
#>  - attr(*, "threshold")= int 0
#>  - attr(*, "expression")= language {  preped_rec <- recipe(season ~ text, data = friends) %>% step_tokenize(text) %>%; step_tf(text) %>% prep(); ter| __truncated__
#>   ..- attr(*, "srcref")=List of 3
#>   ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x136ca25a0> 
#>   ..- attr(*, "wholeSrcref")= 'srcref' int [1:8] 1 0 14 1 0 1 1 14
#>   .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x136ca25a0> 
#>  - attr(*, "value")= tibble [67,373 × 17,378] (S3: tbl_df/tbl/data.frame)
# Rows and columns of the baked term-frequency table.
dim(term_freq)
#> [1] 67373 17378

library(quanteda)
#> Package version: 3.3.1
#> Unicode version: 14.0
#> ICU version: 71.1
#> Parallel computing: 10 of 10 threads used.
#> See https://quanteda.io for tutorials and examples.
library(friends)
# Tokenize the same `text` column with quanteda and build its
# document-feature matrix (stored sparsely) for comparison.
sparse_mat <- dfm(tokens(friends[["text"]]))
dim(sparse_mat)
#> [1] 67373 19671
# In-memory size of the document-feature matrix.
lobstr::obj_size(sparse_mat)
#> 16.56 MB

Created on 2024-04-17 with reprex v2.1.0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment