JosiahParry/Dockerfile

## Dockerfile
FROM python:3.9-bookworm

# R release to install. It selects both the .deb that is downloaded and the
# /opt/R/<version> directory the symlinks below point into.
ENV R_VERSION=4.3.3

# Install from Posit binaries
# https://docs.posit.co/resources/install-r/#verify-r-installation
#
# Everything happens in a single layer so the apt lists and the downloaded
# .deb can be removed again instead of being baked into the image.
# `curl -f` turns an HTTP error (e.g. an R_VERSION that does not exist) into a
# failed build at the download step rather than handing an error page to gdebi.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl gdebi-core && \
    curl -fsSLO https://cdn.rstudio.com/r/debian-12/pkgs/r-${R_VERSION}_1_amd64.deb && \
    gdebi -n r-${R_VERSION}_1_amd64.deb && \
    rm -f r-${R_VERSION}_1_amd64.deb && \
    rm -rf /var/lib/apt/lists/* && \
    ln -s /opt/R/${R_VERSION}/bin/R /usr/local/bin/R && \
    ln -s /opt/R/${R_VERSION}/bin/Rscript /usr/local/bin/Rscript

# Install uwu from its r-universe build.
# install.packages() only emits a warning when an install fails, which leaves
# R's exit status at 0; the requireNamespace() check makes the build stop
# instead of producing an image without the package.
RUN R -e "install.packages('https://josiahparry.r-universe.dev/bin/linux/jammy/4.3/src/contrib/uwu_0.0.0.9000.tar.gz'); \
    stopifnot(requireNamespace('uwu', quietly = TRUE))"

# install packages to create fake data
# (same fail-fast check as above, applied to every requested package)
RUN R -e "pkgs <- c('collapse', 'lorem', 'bench'); \
    install.packages(pkgs, repos = 'https://p3m.dev/cran/__linux__/bookworm/latest'); \
    stopifnot(vapply(pkgs, requireNamespace, logical(1), quietly = TRUE))"


# Write the benchmark script into the image at /tmp/script.R.
# The trailing backslashes are Dockerfile line continuations, so the whole R
# program is handed to echo as one single line; that is why the R statements
# are separated with ';'.
# The script builds a 10-column data frame of shuffled lorem-ipsum words,
# inserts NAs with collapse::na_insert(prop = 0.1), and uses bench::mark() to
# time one iteration of calling uwu::impute_uuid() on every column with an
# 'NA_<column name>_' prefix.
RUN echo "\
    words <- lorem::ipsum_words(1e6) |> strsplit(' ') |> unlist(); \
    df <- as.data.frame(replicate(10, sample(words))); \
    df <- collapse::na_insert(df, prop = 0.1); \
    bench::mark( \
    uwu = { \
    df_names <- colnames(df); \
    for (j in seq_along(df)) { \
    uwu::impute_uuid(df[[j]], paste0('NA_', df_names[j], '_')); \
    } \
    }, \
    iterations = 1 \
    ) \
    " > /tmp/script.R

# Starting the container runs the benchmark script with Rscript.
ENTRYPOINT ["Rscript", "/tmp/script.R"]

## tweep.R
library(data.table)
library(collapse)


# Build the benchmark input: the output of lorem::ipsum_words(1e6) is split on
# spaces and flattened into one character vector of words.
words <- lorem::ipsum_words(1e6) |>
  strsplit(" ") |>
  unlist()

# Ten columns, each an independent shuffle of `words`, then NAs inserted with
# collapse::na_insert(prop = 0.1).
df <- as.data.frame(replicate(10, sample(words)))
df <- na_insert(df, prop = 0.1)

# create a data.table
# setDT() converts by reference, so a plain `dt <- df` would leave both names
# sharing the same columns and the in-place NA fill would overwrite df as well.
# Deep-copy first so df keeps its NAs.
dt <- copy(df)
setDT(dt)

# based on
# https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table
# Replace every NA in a data.table with a unique placeholder, by reference.
#
# For each column, the NA cells are overwritten with
# "NA_<column name>_<uuid>", using one freshly generated UUID per cell.
#
# DT: a data.table; it is modified in place through data.table::set().
# Returns DT invisibly, like other by-reference data.table helpers.
f_dowle3 <- function(DT) {
  dt_names <- colnames(DT)
  for (j in seq_len(ncol(DT))) {
    na_index <- which(is.na(DT[[j]]))

    # Nothing to fill in this column: skip the UUID call, and avoid paste0()
    # recycling a zero-length UUID vector into the single string "NA_<name>_".
    if (length(na_index) == 0L) next

    uuids <- uuid::UUIDgenerate(use.time = NA, n = length(na_index))
    to_fill_with <- paste0("NA_", dt_names[j], "_", uuids)
    set(DT, na_index, j, to_fill_with)
  }
  invisible(DT)
}

# Fill the NAs of dt in place; set() inside f_dowle3 updates the table by
# reference, so the result does not need to be assigned.
f_dowle3(dt)
	FROM python:3.9-bookworm

	# R release to install. It selects both the .deb that is downloaded and the
	# /opt/R/<version> directory the symlinks below point into.
	ENV R_VERSION=4.3.3

	# Install from Posit binaries
	# https://docs.posit.co/resources/install-r/#verify-r-installation
	#
	# Everything happens in a single layer so the apt lists and the downloaded
	# .deb can be removed again instead of being baked into the image.
	# `curl -f` turns an HTTP error (e.g. an R_VERSION that does not exist) into a
	# failed build at the download step rather than handing an error page to gdebi.
	RUN apt-get update && \
	    apt-get install -y --no-install-recommends curl gdebi-core && \
	    curl -fsSLO https://cdn.rstudio.com/r/debian-12/pkgs/r-${R_VERSION}_1_amd64.deb && \
	    gdebi -n r-${R_VERSION}_1_amd64.deb && \
	    rm -f r-${R_VERSION}_1_amd64.deb && \
	    rm -rf /var/lib/apt/lists/* && \
	    ln -s /opt/R/${R_VERSION}/bin/R /usr/local/bin/R && \
	    ln -s /opt/R/${R_VERSION}/bin/Rscript /usr/local/bin/Rscript

	# Install uwu from its r-universe build.
	# install.packages() only emits a warning when an install fails, which leaves
	# R's exit status at 0; the requireNamespace() check makes the build stop
	# instead of producing an image without the package.
	RUN R -e "install.packages('https://josiahparry.r-universe.dev/bin/linux/jammy/4.3/src/contrib/uwu_0.0.0.9000.tar.gz'); \
	    stopifnot(requireNamespace('uwu', quietly = TRUE))"

	# install packages to create fake data
	# (same fail-fast check as above, applied to every requested package)
	RUN R -e "pkgs <- c('collapse', 'lorem', 'bench'); \
	    install.packages(pkgs, repos = 'https://p3m.dev/cran/__linux__/bookworm/latest'); \
	    stopifnot(vapply(pkgs, requireNamespace, logical(1), quietly = TRUE))"


	# Write the benchmark script into the image at /tmp/script.R.
	# The trailing backslashes are Dockerfile line continuations, so the whole R
	# program is handed to echo as one single line; that is why the R statements
	# are separated with ';'.
	# The R native pipe must be written as a plain `|>`: a backslash in front of
	# the bar is kept inside the double-quoted string and ends up in script.R,
	# where it is a syntax error.
	RUN echo "\
	words <- lorem::ipsum_words(1e6) |> strsplit(' ') |> unlist(); \
	df <- as.data.frame(replicate(10, sample(words))); \
	df <- collapse::na_insert(df, prop = 0.1); \
	bench::mark( \
	uwu = { \
	df_names <- colnames(df); \
	for (j in seq_along(df)) { \
	uwu::impute_uuid(df[[j]], paste0('NA_', df_names[j], '_')); \
	} \
	}, \
	iterations = 1 \
	) \
	" > /tmp/script.R

	# Starting the container runs the benchmark script with Rscript.
	ENTRYPOINT ["Rscript", "/tmp/script.R"]
	library(data.table)
	library(collapse)


	# Build the benchmark input: the output of lorem::ipsum_words(1e6) is split on
	# spaces and flattened into one character vector of words.
	# (The pipe is the plain `|>` operator; an escaped `\|>` does not parse in R.)
	words <- lorem::ipsum_words(1e6) |>
	  strsplit(" ") |>
	  unlist()

	# Ten columns, each an independent shuffle of `words`, then NAs inserted with
	# collapse::na_insert(prop = 0.1).
	df <- as.data.frame(replicate(10, sample(words)))
	df <- na_insert(df, prop = 0.1)

	# create a data.table
	# setDT() converts by reference, so a plain `dt <- df` would leave both names
	# sharing the same columns and the in-place NA fill would overwrite df as well.
	# Deep-copy first so df keeps its NAs.
	dt <- copy(df)
	setDT(dt)

	# based on
	# https://stackoverflow.com/questions/7235657/fastest-way-to-replace-nas-in-a-large-data-table
	# Replace every NA in a data.table with a unique placeholder, by reference.
	#
	# For each column, the NA cells are overwritten with
	# "NA_<column name>_<uuid>", using one freshly generated UUID per cell.
	#
	# DT: a data.table; it is modified in place through data.table::set().
	# Returns DT invisibly, like other by-reference data.table helpers.
	f_dowle3 <- function(DT) {
	  dt_names <- colnames(DT)
	  for (j in seq_len(ncol(DT))) {
	    na_index <- which(is.na(DT[[j]]))

	    # Nothing to fill in this column: skip the UUID call, and avoid paste0()
	    # recycling a zero-length UUID vector into the single string "NA_<name>_".
	    if (length(na_index) == 0L) next

	    uuids <- uuid::UUIDgenerate(use.time = NA, n = length(na_index))
	    to_fill_with <- paste0("NA_", dt_names[j], "_", uuids)
	    set(DT, na_index, j, to_fill_with)
	  }
	  invisible(DT)
	}

	# Fill the NAs of dt in place; set() inside f_dowle3 updates the table by
	# reference, so the result does not need to be assigned.
	f_dowle3(dt)