Last active
September 4, 2023 20:15
-
-
Save vsoch/53c6cc59f8e47fa42c979697909c1b67 to your computer and use it in GitHub Desktop.
MiniCluster with Rocky Linux + Intel MPI for Google Cloud Testing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Flux Operator MiniCluster: LAMMPS on Rocky Linux with Intel MPI, started in
# interactive mode for testing on Google Cloud.
apiVersion: flux-framework.org/v1alpha1
kind: MiniCluster
metadata:
  name: flux-sample
  namespace: flux-operator
spec:
  # Number of pods to create for MiniCluster
  # YOU NEED TO CUSTOMIZE size AND tasks FOR YOUR NODES: CPU = vCPU / 2
  # For example 176 vCPU is 88 CPU
  # This means 8 pods (1 pod per node)
  size: 8

  # 88 cores / pod * 8 pods
  tasks: 704

  # This starts the flux broker without a command (interactive)
  interactive: true

  logging:
    quiet: false
    strict: false
    zeromq: true

  flux:
    installRoot: /opt/view
    optionFlags: "-c 1 -o cpu-affinity=per-task"

  containers:
    - image: ghcr.io/rse-ops/lammps-intel-mpi-rocky:tag-8

      # No command is set here (interactive mode). Example commands to run:
      # lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
      # lmp -v x 64 -v y 16 -v z 16 -in in.reaxc.hns -nocite

      # working directory is important to set here, because it will be /
      # and by default we try to change permissions of what is under it
      workingDir: /opt/lammps

      # Mount Google Filestore to our cluster
      # see pvc.yaml for creating this first
      # https://cloud.google.com/filestore/docs/csi-driver#access
      # existingVolumes:
      #   data:
      #     path: /workflow
      #     claimName: data

      # Resource requests to ensure 1 pod assigned per node
      # These are purposefully lower - the actual value didn't work
      # (448 and 56), but this should still assign 1:1
      resources:
        requests:
          memory: "650G"
          cpu: "75"

      commands:
        # The workers need to come up after the broker and network - we are
        # hitting this issue when we get to this larger scale (and it is
        # compounded by the new networking issue with the service) that
        # appeared after we removed the certificate generation pod
        workerPre: sleep 60

        # The added step here is sourcing envars for the intel mpi.
        # asFlux is defined last so that it captures the PATH and PYTHONPATH
        # produced by the environment setup lines above it.
        pre: |
          . /etc/profile.d/z10_spack_environment.sh
          cd /opt/spack-environment
          . /opt/spack-environment/spack/share/spack/setup-env.sh
          . /opt/intel/mpi/latest/env/vars.sh
          spack env activate .
          cd /home/flux/examples/reaxff/HNS
          echo $PATH
          echo $PYTHONPATH
          asFlux="sudo -u flux -E PYTHONPATH=$PYTHONPATH -E PATH=$PATH -E RDMAV_FORK_SAFE=1"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment