Skip to content

Instantly share code, notes, and snippets.

View sachin-netbook's full-sized avatar

sachin-netbook

View GitHub Profile
# Training image: PyTorch 1.9.0 / CUDA 10.2 development base plus the system
# libraries installed below and a copy of the yolov5 sources.
FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel

# Import the NVIDIA CUDA and machine-learning repository signing keys before
# the package index is refreshed. The option is `--fetch-keys` (double dash);
# a lone `-` followed by `fetch-keys` is not a valid apt-key invocation.
RUN apt-key adv --fetch-keys \
        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
    && apt-key adv --fetch-keys \
        https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# Refresh the index and install in the same layer so a cached `apt-get update`
# can never be paired with a newer package list. `-y` is required because the
# build has no TTY to answer the confirmation prompt; the apt lists are removed
# in the same layer so they do not end up in the image.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ffmpeg \
        libgl1 \
        libsm6 \
        libxext6 \
    && rm -rf /var/lib/apt/lists/*

# Optional: a dedicated working directory makes the root path easy to track
# when mounting volumes. WORKDIR creates the directory if it does not exist,
# so no separate `mkdir` step is needed.
# NOTE(review): the path is relative, so it resolves against the base image's
# working directory -- switch to an absolute path once that location is confirmed.
WORKDIR train
COPY yolov5 yolov5
---
# PersistentVolumeClaim requesting 60Gi from the "efs-sc" storage class in the
# "elastic-job" namespace. Nesting restored: with every key at column 0 the
# `metadata` and `spec` mappings were empty and their fields became top-level keys.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: efs-claim
  namespace: elastic-job
spec:
  storageClassName: efs-sc
  resources:
    requests:
      storage: 60Gi
---
# PersistentVolume "efs-pv": 60Gi filesystem volume that can be mounted
# read-write by many nodes. Nesting restored so that capacity, volumeMode and
# accessModes sit under `spec` instead of being parsed as top-level keys.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-pv
spec:
  capacity:
    storage: 60Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
---
# Kubeflow PyTorchJob "yolov5-training" with an elastic policy: c10d rendezvous
# backend, 1 to 4 replicas, and up to 100 restarts. Nesting restored so the
# elastic settings sit under `spec.elasticPolicy` rather than at the top level.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: yolov5-training
spec:
  elasticPolicy:
    rdzvBackend: c10d
    minReplicas: 1
    maxReplicas: 4
    maxRestarts: 100