Code snippets for the single-node Dask cluster on GCP blog post
# Create a VM that runs the Dask container image on boot
gcloud compute instances create-with-container dask-cluster-instance \
  --zone=us-central1-a \
  --machine-type=e2-highcpu-16 \
  --tags=http-server,https-server \
  --container-env=MEMORY_PER_WORKER=1,THREADS_PER_WORKER=1 \
  --container-image=registry.hub.docker.com/ianwhitestone/domi-dask:latest
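Once the container is up, the Dask scheduler listens on port 8786 and its dashboard on 8787. A minimal sketch for looking up the instance's external IP and pointing a client at it (this assumes a firewall rule exposing those ports and a local environment with dask.distributed installed):
# Look up the external IP of the new instance
EXTERNAL_IP=$(gcloud compute instances describe dask-cluster-instance \
  --zone=us-central1-a \
  --format='get(networkInterfaces[0].accessConfigs[0].natIP)')
# Quick connectivity check against the scheduler
python -c "from dask.distributed import Client; print(Client('tcp://${EXTERNAL_IP}:8786'))"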
#!/bin/bash
set -x
if [ "$NUM_WORKERS" ]; then
echo "NUM_WORKERS environment variable found. Setting number of workers to $NUM_WORKERS."
num_workers=$NUM_WORKERS
else
num_cores=$(poetry run python -c "import multiprocessing; print(multiprocessing.cpu_count())")
echo "NUM_WORKERS environment variable NOT found. Defaulting to $num_cores workers"
num_workers=$num_cores
fi
if [ "$SCHEDULER_MEMORY" ]; then
echo "SCHEDULER_MEMORY environment variable found. Setting scheduler memory to $SCHEDULER_MEMORY GB"
scheduler_memory=$SCHEDULER_MEMORY
else
echo "SCHEDULER_MEMORY environment variable NOT found. Defaulting to 2GB"
scheduler_memory=2
fi
if [ "$MEMORY_PER_WORKER" ]; then
echo "MEMORY_PER_WORKER environment variable found. Setting memory per work to $MEMORY_PER_WORKER."
memory_per_worker=$MEMORY_PER_WORKER
else
total_memory=$(poetry run python -c "from distributed.system import memory_limit; print (memory_limit()/1e9)")
echo "MEMORY_PER_WORKER environment variable NOT found. "\
"Defaulting to ($total_memory - $scheduler_memory)/$num_cores GB per worker"
available_worker_memory=$(echo "$total_memory - $scheduler_memory" | bc -l)
memory_per_worker=$(echo "$available_worker_memory/$num_cores" | bc -l)
echo "Memory per worker set to ${memory_per_worker}GB"
fi
if [ "$THREADS_PER_WORKER" ]; then
echo "THREADS_PER_WORKER environment variable found. Setting threads per work to $THREADS_PER_WORKER."
threads_per_worker=$THREADS_PER_WORKER
else
echo "THREADS_PER_WORKER environment variable NOT found. Defaulting to 1 thread per worker"
threads_per_worker=1
fi
# Start the dask scheduler & workers
echo "Starting dask-scheduler"
poetry run dask-scheduler > log.txt 2>&1 &

echo "Creating $num_workers dask workers"
for i in $(seq $num_workers)
do
    # Append so the workers don't clobber the scheduler's (or each other's) output
    poetry run dask-worker \
        --nthreads $threads_per_worker \
        --memory-limit "${memory_per_worker}GB" \
        127.0.0.1:8786 >> log.txt 2>&1 &
done

# Keep the container's main process alive and stream all logs
tail -f log.txt
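To test the entrypoint outside of GCP, the same environment variables can be passed to docker run directly; a rough sketch (the port mappings and values below are illustrative, not from the original post):
docker run --rm \
  -p 8786:8786 -p 8787:8787 \
  -e NUM_WORKERS=2 \
  -e MEMORY_PER_WORKER=1 \
  -e THREADS_PER_WORKER=1 \
  registry.hub.docker.com/ianwhitestone/domi-dask:latest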
FROM python:3.7.6-buster
# Set the working directory
RUN mkdir /opt/app
WORKDIR /opt/app
# Copy poetry files into docker image
COPY pyproject.toml .
COPY poetry.lock .
# Copy dask-entrypoint.sh into docker image & make the script executable
COPY docker/dask-entrypoint.sh /usr/local/bin/dask-entrypoint.sh
RUN chmod +x /usr/local/bin/dask-entrypoint.sh
# Linux updates & dependencies (single layer keeps the apt cache consistent)
RUN apt-get update -y && apt-get install -y libpq-dev pandoc
# Build poetry environment
RUN pip install poetry
RUN poetry config virtualenvs.in-project false
RUN poetry config virtualenvs.path ~/.virtualenvs
RUN poetry install --no-root
# dask-entrypoint.sh will start up the dask scheduler & workers
ENTRYPOINT ["/usr/local/bin/dask-entrypoint.sh"]
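For reference, the image used in the gcloud command above can be built and pushed to Docker Hub along these lines (the docker/Dockerfile path is inferred from the COPY line and may differ; on Docker Hub, ianwhitestone/domi-dask resolves to the registry.hub.docker.com reference used earlier):
docker build -f docker/Dockerfile -t ianwhitestone/domi-dask:latest .
docker push ianwhitestone/domi-dask:latest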