Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save singlecheeze/279c9db08c913040682db9fb26cfa565 to your computer and use it in GitHub Desktop.
Save singlecheeze/279c9db08c913040682db9fb26cfa565 to your computer and use it in GitHub Desktop.
OCP Virt VM Creation with Ansible

build-fedora-rapids-master-vm.yaml
Run from laptop or jump server with Ansible:

ansible-playbook build-fedora-rapids-master-vm.yaml --tags deploy
ansible-playbook build-fedora-rapids-master-vm.yaml --tags destroy

build-fedora-rapids-master-vm.yaml

---
# Creates or removes the RAPIDS master VM by rendering the
# fedora-rapids-master-vm.yaml template once per configured VM name.
# Both tasks carry the 'never' tag, so nothing runs unless the playbook is
# invoked with an explicit deploy or destroy tag.
- hosts: localhost
  connection: local
  gather_facts: false
  serial: 1
  vars:
    # Each entry is handed to the template as item.0 (key/value); every name
    # under vm_names becomes item.1. The template reads value.memory as a
    # Gi quantity and value.cpu as the socket count.
    virtual_machines:
      PLACE_HOLDER:
        memory: 16
        cpu: 8
        vm_names:
          - fedora-rapids-master-vm-000
  tasks:
    - name: Build RAPIDS Master VM
      kubernetes.core.k8s:
        template:
          path: 'fedora-rapids-master-vm.yaml'
        state: present
      # subelements yields one [dict entry, vm_name] pair per VM name.
      loop: "{{ virtual_machines | dict2items | subelements('value.vm_names') }}"
      loop_control:
        label: "{{ item.1 }}"
      tags: ['never', 'deploy']

    - name: Destroy RAPIDS Master VM
      kubernetes.core.k8s:
        template:
          path: 'fedora-rapids-master-vm.yaml'
        state: absent
      # Same pairs as the build task so the identical objects are removed.
      loop: "{{ virtual_machines | dict2items | subelements('value.vm_names') }}"
      loop_control:
        label: "{{ item.1 }}"
      tags: ['never', 'destroy']

Manifests:
fedora-rapids-master-vm.yaml

# cloud-init user data for the master VM, stored as a Secret and referenced
# by the VirtualMachine's cloudinitdisk volume. It writes a clone script and
# two systemd units, installs the NVIDIA driver/CUDA/RAPIDS stack, enables
# the worker service and finally powers the VM off.
kind: Secret
apiVersion: v1
metadata:
  name: "{{ item.1 }}-cloudinit-secret"
  namespace: percap
# NOTE(review): userData below carries a plaintext default user/password
# together with ssh_pwauth: true. Replace the credentials (or switch to SSH
# keys) before using this outside a lab environment.
stringData:
  userData: |
    #cloud-config
    user: dave
    password: password
    chpasswd: { expire: false }
    ssh_pwauth: true
    timezone: "America/New_York"
    write_files:
    - path: /usr/local/bin/percap_clone.sh
      permissions: "0755"
      content: |
        #!/bin/bash
        # https://stackoverflow.com/a/46252335
        # Always start from a fresh checkout: drop any previous clone
        # (a no-op when the directory does not exist), then clone again.
        rm -rf /root/worker
        git clone <<URL>>.git /root/worker
        git clone <<URL>>.git /root/worker/percap
    - path: /etc/systemd/system/percap-mps.service
      permissions: "0644"
      content: |
        # Systemd unit file for running the nvidia mps service
        # Use to view service logs:
        # journalctl -u percap-mps

        [Unit]
        Description=Percap MPS Service

        [Service]
        ExecStartPre=nvidia-persistenced --verbose
        ExecStartPre=nvidia-smi -c EXCLUSIVE_PROCESS

        ExecStart=nvidia-cuda-mps-control -f
        # ExecStartPost=/bin/bash -c "/bin/echo start_server -uid 0 | nvidia-cuda-mps-control"

        # systemd does not interpret shell pipes, so run the pipeline via sh
        ExecStop=/bin/sh -c 'echo quit | nvidia-cuda-mps-control'

        Restart=on-failure
        RestartSec=3
    - path: /etc/systemd/system/percap-worker.service
      permissions: "0644"
      content: |
        # Systemd unit file for running the worker.py service
        # Use to view service logs:
        # journalctl -u percap-worker
        # Use to view details/dependencies:
        # systemd-analyze critical-chain percap-worker.service
        # systemd-analyze critical-chain percap-worker.service percap-mps.service
        # systemctl list-dependencies percap-worker.service

        [Unit]
        Description=Percap Worker Service

        After=network-online.target
        Requires=network-online.target

        After=percap-mps.service
        Requires=percap-mps.service

        [Service]
        Environment=PYTHONUNBUFFERED=1
        ExecStartPre=/usr/local/bin/percap_clone.sh
        ExecStartPre=python3.10 -m pip install -U -r /root/worker/requirements.txt
        ExecStartPre=python3.10 -m pip install -U -r /root/worker/percap/requirements.txt
        ExecStart=python3.10 /root/worker/worker.py

        Restart=on-failure

        [Install]
        WantedBy=multi-user.target
    - path: /etc/dnf/dnf.conf
      append: true
      content: |
        max_parallel_downloads=10
        fastestmirror=True
    runcmd:
        - sed -i 's/enforcing/disabled/g' /etc/selinux/config
        - grubby --args=ipv6.disable=1 --update-kernel=ALL
        - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora35/x86_64/cuda-fedora35.repo
        - dnf clean all
        - dnf -y module install nvidia-driver:latest-dkms
        - dnf -y install cuda python3.10 python3.10-devel git pciutils htop tree
        - python3.10 -m ensurepip
        - python3.10 -m pip install -U pip
        - python3.10 -m pip install -U wheel setuptools nvitop nvidia-ml-py
        - python3.10 -m pip install cudf-cu11 cuml-cu11 --extra-index-url=https://pypi.nvidia.com
        - systemctl enable /etc/systemd/system/percap-worker.service
    power_state:
        timeout: 120
        mode: poweroff

---
#    runcmd:
#        - sed -i 's/enforcing/disabled/g' /etc/selinux/config /etc/selinux/config
#        - grubby --args=ipv6.disable=1 --update-kernel=ALL
#        - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
#        - dnf clean all
#        - dnf -y module install nvidia-driver:latest-dkms
#        - dnf -y install conda cuda python3.10 git pciutils htop tree
#        - dnf -y update
#        - conda init
#        - conda create -n rapids-23.04 -c rapidsai -c conda-forge -c nvidia cudf=23.04 cuml=23.04 python=3.10 cudatoolkit=11.8 -y
#        - /root/.conda/envs/rapids-23.04/bin/python3.10 -m ensurepip
#        - /root/.conda/envs/rapids-23.04/bin/python3.10 -m pip install -U pip
#        - /root/.conda/envs/rapids-23.04/bin/python3.10 -m pip install -U wheel setuptools nvitop nvidia-ml-py
#        - systemctl enable /etc/systemd/system/percap-worker.service

# NOTE: The driver/CUDA version and the repo pulled from above need to match the latest RAPIDS offering

# Master VM: boots from a DataVolume created from the stock "fedora"
# DataSource and is configured by the "<name>-cloudinit-secret" Secret.
# item.1 is the VM name; item.0.value supplies cpu and memory.
apiVersion: kubevirt.io/v1
kind: VirtualMachine
metadata:
  name: "{{ item.1 }}"
  namespace: percap
spec:
  dataVolumeTemplates:
    - kind: DataVolume
      metadata:
        name: "{{ item.1 }}-pvc"
      spec:
        sourceRef:
          kind: DataSource
          name: fedora
          namespace: openshift-virtualization-os-images
        storage:
          storageClassName: ocs-storagecluster-ceph-rbd
          resources:
            requests:
              storage: 32Gi
  runStrategy: RerunOnFailure
  template:
    metadata:
      labels:
        # Matched by the selector of the "<name>-ssh-service" Service.
        externalService: "{{ item.1 }}-ssh-service"
    spec:
      domain:
        clock:
          timer: {}
          # tz database name; must use an underscore, not a space.
          timezone: America/New_York
        cpu:
          # Rendered as a bare integer, so intentionally left unquoted.
          sockets: {{ item.0.value.cpu }}
          cores: 1
          threads: 1
        devices:
          disks:
            - disk:
                bus: virtio
              name: rootdisk
            - disk:
                bus: virtio
              name: cloudinitdisk
          interfaces:
            - masquerade: {}
              model: virtio
              name: default
          networkInterfaceMultiqueue: true
          rng: {}
        features:
          acpi:
            enabled: true
          smm:
            enabled: true
        firmware:
          bootloader:
            efi:
              secureBoot: false
        machine:
          type: pc-q35-rhel8.6.0
        resources:
          requests:
            memory: "{{ item.0.value.memory }}Gi"
      hostname: "{{ item.1 }}"
      networks:
        - name: default
          pod: {}
      terminationGracePeriodSeconds: 60
      volumes:
        - name: rootdisk
          dataVolume:
            name: "{{ item.1 }}-pvc"
        - name: cloudinitdisk
          cloudInitNoCloud:
            secretRef:
              name: "{{ item.1 }}-cloudinit-secret"

---

# NodePort Service that exposes SSH (guest port 22) of the VM whose pod
# carries the matching externalService label.
apiVersion: v1
kind: Service
metadata:
  namespace: percap
  name: "{{ item.1 }}-ssh-service"
spec:
  type: NodePort
  ipFamilyPolicy: SingleStack
  ipFamilies:
    - IPv4
  internalTrafficPolicy: Cluster
  externalTrafficPolicy: Cluster
  selector:
    externalService: "{{ item.1 }}-ssh-service"
  ports:
    - protocol: TCP
      targetPort: 22
      # Both port numbers are a prefix (22 / 30) concatenated with the last
      # three characters of item.1, e.g. a name ending in 000 gives
      # 22000 and 30000.
      port: {{ 22 ~ item.1[-3:] }}
      nodePort: {{ 30 ~ item.1[-3:] }}

clone-master-disk-and-create-ds.yaml

# oc apply -f clone-master-disk-and-create-ds.yaml && oc get dv -n percap

# Clones the master VM's root disk PVC into a new 35Gi ReadWriteMany volume
# in the same namespace.
kind: DataVolume
apiVersion: cdi.kubevirt.io/v1beta1
metadata:
  namespace: percap
  name: fedora-rapids-master-vm-000-template-pvc
spec:
  storage:
    resources:
      requests:
        storage: 35Gi
    accessModes:
      - ReadWriteMany
    storageClassName: ocs-storagecluster-ceph-rbd
  source:
    pvc:
      namespace: percap
      name: fedora-rapids-master-vm-000-pvc

---

# DataSource pointing at the cloned template PVC; the worker VM manifest
# references it by name through dataVolumeTemplates[].spec.sourceRef.
kind: DataSource
apiVersion: cdi.kubevirt.io/v1beta1
metadata:
  namespace: percap
  name: fedora-rapids-master-vm-000-template-ds
spec:
  source:
    pvc:
      namespace: percap
      name: fedora-rapids-master-vm-000-template-pvc

fedora-rapids-worker-vm.yaml

# Worker VM: root disk is cloned from the
# fedora-rapids-master-vm-000-template-ds DataSource and a host device named
# nvidia.com/<item.0.key> is attached as gpu1.
# item.1 is the VM name; item.0.value supplies cpu and memory.
apiVersion: kubevirt.io/v1
kind: VirtualMachine
metadata:
  name: "{{ item.1 }}"
  namespace: percap
  labels:
    app.kubernetes.io/part-of: percap-app
spec:
  dataVolumeTemplates:
    - kind: DataVolume
      metadata:
        name: "{{ item.1 }}-pvc"
      spec:
        sourceRef:
          kind: DataSource
          name: fedora-rapids-master-vm-000-template-ds
          namespace: percap
        storage:
          storageClassName: ocs-storagecluster-ceph-rbd
          resources:
            requests:
              storage: 35Gi
  runStrategy: RerunOnFailure
  template:
    metadata:
      labels:
        # Matched by the selector of the "<name>-ssh-service" Service.
        externalService: "{{ item.1 }}-ssh-service"
    spec:
      domain:
        clock:
          timer: {}
          # tz database name; must use an underscore, not a space.
          timezone: America/New_York
        cpu:
          # Rendered as a bare integer, so intentionally left unquoted.
          sockets: {{ item.0.value.cpu }}
          cores: 1
          threads: 1
        devices:
          hostDevices:
            - deviceName: "nvidia.com/{{ item.0.key }}"
              name: gpu1
          disks:
            - disk:
                bus: virtio
              name: rootdisk
            - disk:
                bus: virtio
              name: cloudinitdisk
          interfaces:
            - masquerade: {}
              model: virtio
              name: default
          networkInterfaceMultiqueue: true
          rng: {}
        features:
          acpi:
            enabled: true
          smm:
            enabled: true
        firmware:
          bootloader:
            efi:
              secureBoot: false
        machine:
          type: pc-q35-rhel8.6.0
        resources:
          requests:
            memory: "{{ item.0.value.memory }}Gi"
      hostname: "{{ item.1 }}"
      networks:
        - name: default
          pod: {}
      terminationGracePeriodSeconds: 60
      volumes:
        - name: rootdisk
          dataVolume:
            name: "{{ item.1 }}-pvc"
        - name: cloudinitdisk
          cloudInitNoCloud:
            userData: |-
              #cloud-config
              # This needs to stay as it probably invokes hostname config in yaml above
              # and re-enabled password auth in sshd
              ssh_pwauth: true
              timezone: "America/New_York"

---

# NodePort Service that exposes SSH (guest port 22) of the VM whose pod
# carries the matching externalService label.
apiVersion: v1
kind: Service
metadata:
  namespace: percap
  name: "{{ item.1 }}-ssh-service"
spec:
  type: NodePort
  ipFamilyPolicy: SingleStack
  ipFamilies:
    - IPv4
  internalTrafficPolicy: Cluster
  externalTrafficPolicy: Cluster
  selector:
    externalService: "{{ item.1 }}-ssh-service"
  ports:
    - protocol: TCP
      targetPort: 22
      # Both port numbers are a prefix (22 / 30) concatenated with the last
      # three characters of item.1.
      port: {{ 22 ~ item.1[-3:] }}
      nodePort: {{ 30 ~ item.1[-3:] }}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment