Last active
December 5, 2022 09:39
-
-
Save pwq1989/58d03fc82eddb92cd36e2f87947f7fce to your computer and use it in GitHub Desktop.
yaml example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# jiansuan job example
#
# NOTE(review): the nesting below was reconstructed after the original
# indentation was lost. Placing resourceGroup/storageMount at container level
# and nodeSelector at pod-spec level is inferred from key order; confirm
# against the batch.jiansuan.tech/v1alpha1 Job schema.
apiVersion: batch.jiansuan.tech/v1alpha1
kind: Job
metadata:
  labels:
    jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705"
    jiansuan.tech/jiansuan.label.username: "peiwenqian"
    jiansuan.tech/jiansuan.label.group: "test-group"
    jiansuan.tech/jiansuan.label.workload: "job"
    jiansuan.tech/jiansuan.label.framework: "pytorch"
  name: jiansuan-job-db8kyn-1668675887705
  namespace: jiansuan-default-ns
spec:
  maxRetry: 3
  frameworkType: pytorch
  minAvailable: 1
  maxAvailable: 1
  scheduler:
    name: volcano
    queue: "vc-queue-1"
  tasks:
    - replicas: 1
      name: master
      template:
        metadata:
          annotations:
            jiansuan.tech/jiansuan.job.role-name: normal
          labels:
            # Same keys and values as the job-level labels above; the first
            # key previously read "iansuan.tech/..." (missing leading "j").
            jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705"
            jiansuan.tech/jiansuan.label.username: "peiwenqian"
            jiansuan.tech/jiansuan.label.group: "test-group"
            jiansuan.tech/jiansuan.label.workload: "job"
            jiansuan.tech/jiansuan.label.framework: "pytorch"
        spec:
          containers:
            - command:
                - /bin/bash
                - /nfs/common/jiansuan/scripts/current/offline-vj-pytorch-entrypoint.sh
                - xxxxx
              env:
                - name: JIANSUAN_USER
                  value: luban
                - name: JIANSUAN_JOB_CREATOR
                  value: peiwenqian
              image: image-url
              name: jiansuan-job-db8kyn-1668675887705-master
              # Presumably a named resource preset that is expanded into
              # concrete cpu/memory/gpu values at render time; verify.
              resourceGroup: A100_80x8
              storageMount:
                - name: nfs-common
                  type: hostPath
                  readOnly: true
                  mountPath: "/nfs/common"
                  sourceDescription: "file:///mnt_jiansuan/common"
                - name: user-nas
                  type: hostPath
                  mountPath: "/nfs/private"
                  sourceDescription: "file:///mnt_jiansuan/private/users/peiwenqian"
          nodeSelector:
            # Quoted so the value stays a string rather than a YAML boolean.
            jiansuan_gpu_a100: "true"
---
# rendered volcano job example
#
# NOTE(review): presumably the Volcano Job generated from the jiansuan Job of
# the same name earlier in this file; confirm with the renderer.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  labels:
    jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705"
    jiansuan.tech/jiansuan.label.username: "peiwenqian"
    jiansuan.tech/jiansuan.label.group: "test-group"
    jiansuan.tech/jiansuan.label.workload: "job"
    jiansuan.tech/jiansuan.label.framework: "pytorch"
  name: jiansuan-job-db8kyn-1668675887705
  namespace: jiansuan-default-ns
spec:
  maxRetry: 3
  minAvailable: 1
  plugins:
    env: []
    svc: []
  policies:
    # NOTE(review): "RetryJob" does not look like one of the documented
    # Volcano policy actions (e.g. RestartJob, AbortJob); confirm.
    - action: RetryJob
      event: PodEvicted
    - action: RestartJob
      event: PodFailed
  queue: vc-queue-1
  schedulerName: volcano
  tasks:
    - maxRetry: 3
      minAvailable: 1
      name: master
      policies:
        - action: CompleteJob
          event: TaskCompleted
      replicas: 1
      template:
        metadata:
          annotations:
            jiansuan.tech/jiansuan.job.role-name: normal
          labels:
            # Same keys and values as the job-level labels above; the first
            # key previously read "iansuan.tech/..." (missing leading "j").
            jiansuan.tech/jiansuan.label.name: "jiansuan-job-db8kyn-1668675887705"
            jiansuan.tech/jiansuan.label.username: "peiwenqian"
            jiansuan.tech/jiansuan.label.group: "test-group"
            jiansuan.tech/jiansuan.label.workload: "job"
            jiansuan.tech/jiansuan.label.framework: "pytorch"
        spec:
          containers:
            - command:
                - /bin/bash
                - /nfs/common/jiansuan/scripts/current/offline-vj-pytorch-entrypoint.sh
                - xxxxx
              env:
                # Env values must be strings, so numeric ones are quoted.
                - name: NVIDIA_DRIVER_CAPABILITIES
                  value: all
                - name: JIANSUAN_USER
                  value: luban
                - name: JIANSUAN_JOB_CREATOR
                  value: peiwenqian
                - name: JIANSUAN_RESOURCE_NUM_CPU
                  value: "96"
                - name: JIANSUAN_RESOURCE_NUM_GPU
                  value: "8"
                - name: JIANSUAN_RESOURCE_NUM_MEM
                  value: 180Gi
                - name: JIANSUAN_DISTRIBUTED_IDENTIFICATION
                  value: jiansuan-job-db8kyn-1668675887705
                - name: JIANSUAN_DISTRIBUTED_NODE_COUNT
                  value: "1"
                - name: JIANSUAN_DISTRIBUTED_TASK_ROLE
                  value: master
                - name: JIANSUAN_FRAMEWORK
                  value: pytorch
              image: image-url
              name: jiansuan-job-db8kyn-1668675887705-master
              resources:
                limits:
                  cpu: "96"
                  memory: 180Gi
                  nvidia.com/gpu: "8"
                requests:
                  cpu: "48"
                  memory: 90Gi
                  nvidia.com/gpu: "8"
              securityContext:
                # NOTE(review): grants every Linux capability even though
                # privileged is false; consider narrowing to what is needed.
                capabilities:
                  add:
                    - ALL
                privileged: false
              volumeMounts:
                # Each mount name matches an entry under `volumes` below.
                - mountPath: /dev/infiniband
                  name: infiniband
                - mountPath: /sys/fs/cgroup
                  name: cgroup
                - mountPath: /dev/shm
                  name: dev-shm
                - mountPath: /nfs/common
                  name: nfs-common
                  readOnly: true
                - mountPath: /nfs/private
                  name: user-nas
          nodeSelector:
            # Quoted so the value stays a string rather than a YAML boolean.
            jiansuan_gpu_a100: "true"
          # hostNetwork pods need ClusterFirstWithHostNet to keep resolving
          # names through cluster DNS.
          dnsPolicy: ClusterFirstWithHostNet
          hostNetwork: true
          imagePullSecrets:
            - name: jiansuan-registry-secret
          restartPolicy: Never
          volumes:
            - hostPath:
                path: /dev/infiniband
              name: infiniband
            - hostPath:
                path: /sys/fs/cgroup
              name: cgroup
            - hostPath:
                path: /mnt_jiansuan/common
              name: nfs-common
            - hostPath:
                path: /mnt_jiansuan/private/users/peiwenqian
              name: user-nas
            # Memory-backed emptyDir mounted at /dev/shm in the container.
            - emptyDir:
                medium: Memory
              name: dev-shm
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment