Skip to content

Instantly share code, notes, and snippets.

@JosiahParry
Last active April 19, 2024 14:55
Show Gist options
  • Save JosiahParry/841daaf9a0a9deca6b60d13e2aaa044b to your computer and use it in GitHub Desktop.
Save JosiahParry/841daaf9a0a9deca6b60d13e2aaa044b to your computer and use it in GitHub Desktop.
fast UUID replacement
# Benchmark image: starts from the python:3.9-bookworm base image, installs R
# on top of it, and runs an R script that times uwu::impute_uuid().
FROM python:3.9-bookworm
# R release to install; interpolated into the .deb file name and the /opt/R
# install path used by the RUN step below.
ENV R_VERSION=4.3.3
# Install from Posit binaries
# https://docs.posit.co/resources/install-r/#verify-r-installation
# Steps: install gdebi and curl, download the R .deb, install it with gdebi,
# then symlink R and Rscript into /usr/local/bin.
RUN apt-get update && \
apt-get install -y gdebi-core curl && \
curl -O https://cdn.rstudio.com/r/debian-12/pkgs/r-${R_VERSION}_1_amd64.deb && \
gdebi -n r-${R_VERSION}_1_amd64.deb && \
ln -s /opt/R/${R_VERSION}/bin/R /usr/local/bin/R && \
ln -s /opt/R/${R_VERSION}/bin/Rscript /usr/local/bin/Rscript
# Install the uwu package from a tarball hosted on r-universe.
# NOTE(review): the URL path names jammy / R 4.3 while the base image is
# bookworm -- confirm this build is the intended one for this image.
RUN R -e "install.packages('https://josiahparry.r-universe.dev/bin/linux/jammy/4.3/src/contrib/uwu_0.0.0.9000.tar.gz')"
# Install the packages the benchmark script calls: collapse (na_insert) and
# lorem (ipsum_words) to create fake data, bench (mark) to time the run.
RUN R -e "install.packages(c('collapse', 'lorem', 'bench'), repos = 'https://p3m.dev/cran/__linux__/bookworm/latest')"
# Write the benchmark script to /tmp/script.R. The script builds a 10-column
# data frame of shuffled lorem-ipsum words, inserts NAs with prop = 0.1, and
# runs bench::mark() for one iteration over a loop that calls
# uwu::impute_uuid() on every column with the prefix 'NA_<column name>_'.
# The result of impute_uuid() is not assigned; only the call is timed.
RUN echo "\
words <- lorem::ipsum_words(1e6) |> strsplit(' ') |> unlist(); \
df <- as.data.frame(replicate(10, sample(words))); \
df <- collapse::na_insert(df, prop = 0.1); \
bench::mark( \
uwu = { \
df_names <- colnames(df); \
for (j in seq_along(df)) { \
uwu::impute_uuid(df[[j]], paste0('NA_', df_names[j], '_')); \
} \
}, \
iterations = 1 \
) \
" > /tmp/script.R
# Run the benchmark script when the container starts.
ENTRYPOINT ["Rscript", "/tmp/script.R"]
library(data.table)
library(collapse)
# Fake data: one string of 1e6 lorem-ipsum words, split on spaces into a
# character vector.
words <- unlist(strsplit(lorem::ipsum_words(1e6), split = " "))

# Ten independently shuffled copies of `words` become the columns of a data
# frame, after which NAs are inserted with prop = 0.1.
df <- replicate(10, sample(words)) |>
  as.data.frame() |>
  na_insert(prop = 0.1)
# create a data.table
# as.data.table() returns a copy, so `df` stays a plain data.frame. Using
# `dt <- df` followed by setDT(dt) would not copy: setDT() converts by
# reference, so `df` would be turned into a data.table as well.
dt <- as.data.table(df)
# based on
# https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table

#' Replace every `NA` in a data.table with a unique placeholder string
#'
#' Walks the columns of `DT` by position. In each column the missing entries
#' are overwritten with `"NA_<column name>_<uuid>"`, using one freshly
#' generated UUID per missing entry. Columns without missing values are left
#' untouched.
#'
#' @param DT A data.table. It is updated in place through `data.table::set()`,
#'   so the caller's object changes without reassignment.
#'   NOTE(review): the replacement values are character strings, so this
#'   presumably expects character columns -- confirm before using it on other
#'   column types.
#' @return `DT`, invisibly.
f_dowle3 <- function(DT) {
  dt_names <- colnames(DT)
  for (j in seq_len(ncol(DT))) {
    na_index <- which(is.na(DT[[j]]))
    # Nothing to fill in this column. Skipping also avoids handing paste0() a
    # zero-length UUID vector, for which it would still return one string.
    if (length(na_index) == 0L) {
      next
    }
    uuids <- uuid::UUIDgenerate(use.time = NA, n = length(na_index))
    to_fill_with <- paste0("NA_", dt_names[j], "_", uuids)
    data.table::set(DT, i = na_index, j = j, value = to_fill_with)
  }
  invisible(DT)
}
# Fill the NAs of `dt`. The result is not assigned: the call relies on set()
# inside f_dowle3() updating `dt` by reference.
f_dowle3(dt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment