build-fedora-rapids-master-vm.yaml

Run from a laptop or jump server with Ansible installed:
    ansible-playbook build-fedora-rapids-master-vm.yaml --tag deploy
    ansible-playbook build-fedora-rapids-master-vm.yaml --tag destroy

Playbook: build-fedora-rapids-master-vm.yaml
---
# Deploys/destroys the RAPIDS master VM by rendering
# fedora-rapids-master-vm.yaml once per (gpu-device, vm-name) pair.
- hosts: localhost
  connection: local
  gather_facts: false
  serial: 1
  vars:
    # Keys are GPU resource names (PLACE_HOLDER is swapped for the real
    # nvidia.com/<device> suffix when GPU passthrough is used); values carry
    # VM sizing plus the list of VM names to stamp out.
    virtual_machines:
      PLACE_HOLDER:
        memory: 16   # GiB, rendered as "{{ item.0.value.memory }}Gi"
        cpu: 8       # vCPU sockets, rendered via "{{ item.0.value.cpu }}"
        vm_names:
          - fedora-rapids-master-vm-000
  tasks:
    # `loop` with subelements yields [dict_item, vm_name] pairs, exposed to
    # the template as item.0 (gpu entry) and item.1 (vm name).
    # (`with_items` is the legacy spelling; `loop` is the current idiom.)
    - name: Build RAPIDS Master VM
      kubernetes.core.k8s:
        template:
          path: 'fedora-rapids-master-vm.yaml'
        state: present
      loop: "{{ virtual_machines | dict2items | subelements('value.vm_names') }}"
      tags: ['never', 'deploy']

    - name: Destroy RAPIDS Master VM
      kubernetes.core.k8s:
        template:
          path: 'fedora-rapids-master-vm.yaml'
        state: absent
      loop: "{{ virtual_machines | dict2items | subelements('value.vm_names') }}"
      tags: ['never', 'destroy']
Manifests:
fedora-rapids-master-vm.yaml
---
# Cloud-init configuration for the master VM, consumed via the
# cloudInitNoCloud secretRef in the VirtualMachine manifest below.
kind: Secret
apiVersion: v1
metadata:
  name: "{{ item.1 }}-cloudinit-secret"
  namespace: percap
stringData:
  userData: |
    #cloud-config
    user: dave
    # SECURITY: plaintext password committed to version control -- prefer
    # ssh_authorized_keys or an Ansible-vaulted variable.
    password: password
    chpasswd: { expire: false }
    ssh_pwauth: true
    timezone: "America/New_York"
    write_files:
      # Clones (or re-clones) the worker repos; run as ExecStartPre of
      # percap-worker.service.
      - path: /usr/local/bin/percap_clone.sh
        permissions: "0755"
        content: |
          #!/bin/bash
          # https://stackoverflow.com/a/46252335
          if [ ! -d /root/worker ] ; then
            git clone <<URL>>.git /root/worker
            git clone <<URL>>.git /root/worker/percap
          else
            rm -rf /root/worker
            git clone <<URL>>.git /root/worker
            git clone <<URL>>.git /root/worker/percap
          fi
      # NVIDIA MPS daemon; percap-worker.service Requires= this unit, which
      # is how it gets started (it has no [Install] section of its own).
      - path: /etc/systemd/system/percap-mps.service
        permissions: "0755"
        content: |
          # Systemd unit file for running the nvidia mps service
          # Use to view service logs:
          # journalctl -u percap-mps
          [Unit]
          Description=Percap MPS Service
          [Service]
          ExecStartPre=nvidia-persistenced --verbose
          ExecStartPre=nvidia-smi -c EXCLUSIVE_PROCESS
          ExecStart=nvidia-cuda-mps-control -f
          # ExecStartPost=/bin/bash -c "/bin/echo start_server -uid 0 | nvidia-cuda-mps-control"
          # systemd Exec lines are not run through a shell, so the pipe must
          # be wrapped in bash -c (a bare `echo quit | ...` would pass the
          # `|` as a literal argument to echo and never quit MPS).
          ExecStop=/bin/bash -c "/bin/echo quit | nvidia-cuda-mps-control"
          Restart=on-failure
          RestartSec=3
      - path: /etc/systemd/system/percap-worker.service
        permissions: "0755"
        content: |
          # Systemd unit file for running the worker.py service
          # Use to view service logs:
          # journalctl -u percap-worker
          # Use to view details/dependencies:
          # systemd-analyze critical-chain percap-worker.service
          # systemd-analyze critical-chain percap-worker.service percap-mps.service
          # systemctl list-dependencies percap-worker.service
          [Unit]
          Description=Percap Worker Service
          After=network-online.target
          Requires=network-online.target
          After=percap-mps.service
          Requires=percap-mps.service
          [Service]
          Environment=PYTHONUNBUFFERED=1
          ExecStartPre=/usr/local/bin/percap_clone.sh
          ExecStartPre=python3.10 -m pip install -U -r /root/worker/requirements.txt
          ExecStartPre=python3.10 -m pip install -U -r /root/worker/percap/requirements.txt
          ExecStart=python3.10 /root/worker/worker.py
          Restart=on-failure
          [Install]
          # NOTE(review): multi-user.target is the conventional WantedBy;
          # confirm network-online.target is intentional here.
          WantedBy=network-online.target
      - path: /etc/dnf/dnf.conf
        append: true
        content: |
          max_parallel_downloads=10
          fastestmirror=True
    runcmd:
      # (duplicate /etc/selinux/config argument removed -- it was listed twice)
      - sed -i 's/enforcing/disabled/g' /etc/selinux/config
      - grubby --args=ipv6.disable=1 --update-kernel=ALL
      - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora35/x86_64/cuda-fedora35.repo
      - dnf clean all
      - dnf -y module install nvidia-driver:latest-dkms
      - dnf -y install cuda python3.10 python3.10-devel git pciutils htop tree
      - python3.10 -m ensurepip
      - python3.10 -m pip install -U pip
      - python3.10 -m pip install -U wheel setuptools nvitop nvidia-ml-py
      - python3.10 -m pip install cudf-cu11 cuml-cu11 --extra-index-url=https://pypi.nvidia.com
      - systemctl enable /etc/systemd/system/percap-worker.service
    # Power off once provisioning completes so the disk can be cloned as a
    # worker template (see clone-master-disk-and-create-ds.yaml).
    power_state:
      timeout: 120
      mode: poweroff
---
# Alternative conda-based provisioning, kept for reference:
# runcmd:
#   - sed -i 's/enforcing/disabled/g' /etc/selinux/config
#   - grubby --args=ipv6.disable=1 --update-kernel=ALL
#   - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
#   - dnf clean all
#   - dnf -y module install nvidia-driver:latest-dkms
#   - dnf -y install conda cuda python3.10 git pciutils htop tree
#   - dnf -y update
#   - conda init
#   - conda create -n rapids-23.04 -c rapidsai -c conda-forge -c nvidia cudf=23.04 cuml=23.04 python=3.10 cudatoolkit=11.8 -y
#   - /root/.conda/envs/rapids-23.04/bin/python3.10 -m ensurepip
#   - /root/.conda/envs/rapids-23.04/bin/python3.10 -m pip install -U pip
#   - /root/.conda/envs/rapids-23.04/bin/python3.10 -m pip install -U wheel setuptools nvitop nvidia-ml-py
#   - systemctl enable /etc/systemd/system/percap-worker.service
# NOTE: The Driver/CUDA version and the repo pulled from above needs to match the latest RAPIDS offering
apiVersion: kubevirt.io/v1
kind: VirtualMachine
metadata:
  name: "{{ item.1 }}"
  namespace: percap
spec:
  dataVolumeTemplates:
    # Root disk cloned from the cluster's stock Fedora DataSource.
    - kind: DataVolume
      metadata:
        name: "{{ item.1 }}-pvc"
      spec:
        sourceRef:
          kind: DataSource
          name: fedora
          namespace: openshift-virtualization-os-images
        storage:
          storageClassName: ocs-storagecluster-ceph-rbd
          resources:
            requests:
              storage: 32Gi
  runStrategy: RerunOnFailure
  template:
    metadata:
      labels:
        # Selector target for the per-VM NodePort SSH Service below.
        externalService: "{{ item.1 }}-ssh-service"
    spec:
      domain:
        clock:
          timer: {}
          # Must be an IANA zone name: "America/New York" (with a space)
          # is not a valid zone -- underscore required.
          timezone: America/New_York
        cpu:
          sockets: {{ item.0.value.cpu }}
          cores: 1
          threads: 1
        devices:
          disks:
            - disk:
                bus: virtio
              name: rootdisk
            - disk:
                bus: virtio
              name: cloudinitdisk
          interfaces:
            - masquerade: {}
              model: virtio
              name: default
          # These two are devices-level fields, not interface fields.
          networkInterfaceMultiqueue: true
          rng: {}
        features:
          acpi:
            enabled: true
          smm:
            enabled: true
        firmware:
          bootloader:
            efi:
              secureBoot: false
        machine:
          type: pc-q35-rhel8.6.0
        resources:
          requests:
            memory: "{{ item.0.value.memory }}Gi"
      hostname: "{{ item.1 }}"
      networks:
        - name: default
          pod: {}
      terminationGracePeriodSeconds: 60
      volumes:
        - name: rootdisk
          dataVolume:
            name: "{{ item.1 }}-pvc"
        - name: cloudinitdisk
          cloudInitNoCloud:
            # Provisioning payload lives in the Secret rendered above.
            secretRef:
              name: "{{ item.1 }}-cloudinit-secret"
---
# NodePort Service exposing SSH for one VM. Ports derive from the VM name's
# three-digit suffix: vm-000 -> port 22000, nodePort 30000.
kind: Service
apiVersion: v1
metadata:
  name: "{{ item.1 }}-ssh-service"
  namespace: percap
spec:
  type: NodePort
  selector:
    externalService: "{{ item.1 }}-ssh-service"
  externalTrafficPolicy: Cluster
  internalTrafficPolicy: Cluster
  ipFamilyPolicy: SingleStack
  ipFamilies:
    - IPv4
  ports:
    - protocol: TCP
      port: {{ 22 ~ item.1[-3:] }}
      targetPort: 22
      nodePort: {{ 30 ~ item.1[-3:] }}
clone-master-disk-and-create-ds.yaml
# oc apply -f clone-master-disk-and-create-ds.yaml && oc get dv -n percap
---
# Clones the provisioned master VM's root disk into a template PVC...
apiVersion: cdi.kubevirt.io/v1beta1
kind: DataVolume
metadata:
  name: fedora-rapids-master-vm-000-template-pvc
  namespace: percap
spec:
  source:
    pvc:
      name: fedora-rapids-master-vm-000-pvc
      namespace: percap
  storage:
    storageClassName: ocs-storagecluster-ceph-rbd
    accessModes:
      - ReadWriteMany
    resources:
      requests:
        storage: 35Gi
---
# ...and publishes it as a DataSource that worker VMs clone from.
apiVersion: cdi.kubevirt.io/v1beta1
kind: DataSource
metadata:
  name: fedora-rapids-master-vm-000-template-ds
  namespace: percap
spec:
  source:
    pvc:
      name: fedora-rapids-master-vm-000-template-pvc
      namespace: percap
fedora-rapids-worker-vm.yaml
---
# Worker VM: cloned from the master template DataSource, with one GPU
# passed through via hostDevices.
apiVersion: kubevirt.io/v1
kind: VirtualMachine
metadata:
  name: "{{ item.1 }}"
  namespace: percap
  labels:
    app.kubernetes.io/part-of: percap-app
spec:
  dataVolumeTemplates:
    - kind: DataVolume
      metadata:
        name: "{{ item.1 }}-pvc"
      spec:
        sourceRef:
          kind: DataSource
          name: fedora-rapids-master-vm-000-template-ds
          namespace: percap
        storage:
          storageClassName: ocs-storagecluster-ceph-rbd
          resources:
            requests:
              storage: 35Gi
  runStrategy: RerunOnFailure
  template:
    metadata:
      labels:
        # Selector target for the per-VM NodePort SSH Service below.
        externalService: "{{ item.1 }}-ssh-service"
    spec:
      domain:
        clock:
          timer: {}
          # Must be an IANA zone name: "America/New York" (with a space)
          # is not a valid zone -- underscore required.
          timezone: America/New_York
        cpu:
          sockets: {{ item.0.value.cpu }}
          cores: 1
          threads: 1
        devices:
          hostDevices:
            # item.0.key is the GPU resource suffix from the playbook's
            # virtual_machines map (PLACE_HOLDER in the committed vars).
            - deviceName: "nvidia.com/{{ item.0.key }}"
              name: gpu1
          disks:
            - disk:
                bus: virtio
              name: rootdisk
            - disk:
                bus: virtio
              name: cloudinitdisk
          interfaces:
            - masquerade: {}
              model: virtio
              name: default
          # These two are devices-level fields, not interface fields.
          networkInterfaceMultiqueue: true
          rng: {}
        features:
          acpi:
            enabled: true
          smm:
            enabled: true
        firmware:
          bootloader:
            efi:
              secureBoot: false
        machine:
          type: pc-q35-rhel8.6.0
        resources:
          requests:
            memory: "{{ item.0.value.memory }}Gi"
      hostname: "{{ item.1 }}"
      networks:
        - name: default
          pod: {}
      terminationGracePeriodSeconds: 60
      volumes:
        - name: rootdisk
          dataVolume:
            name: "{{ item.1 }}-pvc"
        - name: cloudinitdisk
          cloudInitNoCloud:
            userData: |-
              #cloud-config
              # This needs to stay as it probably invokes hostname config in yaml above
              # and re-enabled password auth in sshd
              ssh_pwauth: true
              timezone: "America/New_York"
---
# NodePort Service exposing SSH for one worker VM. Ports derive from the VM
# name's three-digit suffix: vm-000 -> port 22000, nodePort 30000.
kind: Service
apiVersion: v1
metadata:
  name: "{{ item.1 }}-ssh-service"
  namespace: percap
spec:
  type: NodePort
  selector:
    externalService: "{{ item.1 }}-ssh-service"
  externalTrafficPolicy: Cluster
  internalTrafficPolicy: Cluster
  ipFamilyPolicy: SingleStack
  ipFamilies:
    - IPv4
  ports:
    - protocol: TCP
      port: {{ 22 ~ item.1[-3:] }}
      targetPort: 22
      nodePort: {{ 30 ~ item.1[-3:] }}