Skip to content

Instantly share code, notes, and snippets.

View Csinclair0's full-sized avatar

Colin Sinclair Csinclair0

  • San Francisco CA
View GitHub Profile
@Csinclair0
Csinclair0 / logs_nccl_test_fail.txt
Created November 9, 2022 19:10
nccl_tests_error
+ POD_NAME=nccl-tests-worker-1
+ shift
+ POD_NAME=nccl-tests-worker-0
+ shift
+ /opt/kube/kubectl exec nccl-tests-worker-1 -- /bin/sh -c ' OPAL_PREFIX= ; export OPAL_PREFIX; PATH=/opt/amazon/openmpi/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:${LD_LIBRARY_PATH:-} ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/opt/amazon/openmpi/lib:${DYLD_LIBRARY_PATH:-} ; export DYLD_LIBRARY_PATH ; /opt/amazon/openmpi/bin/orted -mca ess "env" -mca ess_base_jobid "390594560" -mca ess_base_vpid 2 -mca ess_base_num_procs "3" -mca orte_node_regex "nccl-tests-launcher,nccl-tests-worker-[1:0-1]@0(3)" -mca orte_hnp_uri "390594560.0;tcp://100.74.165.198:34707" --mca pml "^cm" -mca plm "rsh" --tree-spawn -mca routed "radix" -mca orte_parent_uri "390594560.0;tcp://100.74.165.198:34707" -mca plm_rsh_agent "/etc/mpi/kubexec.sh" -mca orte_default_hostfile "/etc/mpi/hostfile" -mca coll_hcoll_enable "0" -mca orte_tag_output "1" -mca hwloc_base_binding_policy "none" -mca rmaps_base_mapping_policy "slot" -mca rmap
apiVersion: kubeflow.org/v1alpha2
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  slotsPerWorker: 8
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1