Host OS: ensure file permissions comply with ssh requirements.
$ find /mnt/c/Users/$USER/Documents/.ssh -printf "%M %p\n"
drwx------ /mnt/c/Users/$USER/Documents/.ssh
-rw------- /mnt/c/Users/$USER/Documents/.ssh/authorized_keys
$ cat /sys/devices/virtual/dmi/id/product_name
trn1.32xlarge
$ cat /sys/devices/virtual/dmi/id/board_asset_tag
i-0000000000example
#!/bin/bash | |
################################################################################ | |
# NOTE for Slurm users: when Slurm is configured to enable cgroup, upon job | |
# completion Slurm will kill the mount-s3 process. This causes on-access error | |
# "transport not connected". | |
# | |
# [20240404] In practice, running this script under srun will: | |
# - not work on pcluster-3.9.0 (ProctrackType=proctrack/cgroup) | |
# - probably work on SageMaker HyperPod (ProctrackType=proctrack/linuxproc) |
# Install awsume https://awsu.me/
$ brew install awsume
# Edit ~/.aws/config, and create a profile:
$ vi ~/.aws/config
[hyperpod]
output = json
region = us-west-2
mfa_serial = arn:aws:iam:::mfa/
# https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables | |
export HF_HOME=/fsx/marcverd/hf_home | |
export HF_HUB_DISABLE_TELEMETRY=1 |
NOTE: also check additional changes in typing across major python versions. | |
- 312 pep669: new debugging/profiling API | |
- 312 pep684: C-API for per-interpreter GIL -> low-level features. | |
* Py-API expected for 313. | |
- 312 @override decorator for methods | |
- 312 pep701: more flexible f-string parsing | |
- 311 typing: Self type | |
- 311 exception groups, except* | |
- 311 pep678: enrich exceptions with notes |
https://developer.nvidia.com/deep-learning-performance-training-inference/training
# Additional args -- optional, on case-by-case basis
declare -a CONTAINER_ARGS=(
--gpus all
--ipc=host
--ulimit memlock=1
--ulimit stack=67108864
/**
 * One Checkov check to suppress, in the `{ id, comment }` shape stored under
 * a CloudFormation resource's `checkov.skip` metadata list.
 */
interface CheckovRule {
    /** Identifier of the Checkov check to skip. */
    id: string,
    /** Justification recorded next to the suppression. */
    comment: string,
}

/**
 * Suppresses the given Checkov checks on a construct by writing them to the
 * `checkov.skip` metadata of its default CloudFormation child resource.
 *
 * Rules already present under `checkov.skip` are kept and the new rules are
 * appended, so calling this function more than once on the same construct
 * accumulates suppressions instead of silently dropping the later ones.
 * All other metadata keys are left untouched.
 *
 * @param construct Construct whose `node.defaultChild` is the resource to annotate.
 * @param rules Checks to skip.
 * @throws Error if the construct has no default child, or the default child
 *         is not a `CfnResource` (there would be no `cfnOptions` to write to).
 */
function silence_checkov(construct: Construct, rules: CheckovRule[]) {
    const child = construct.node.defaultChild;
    if (!child || !cdk.CfnResource.isCfnResource(child)) {
        throw new Error(
            `silence_checkov: ${construct.node.path} has no CfnResource default child`
        );
    }

    // `cfnOptions.metadata` may be undefined on a fresh resource.
    const metadata = child.cfnOptions.metadata || {};
    const checkov = metadata.checkov || {};
    const existing: CheckovRule[] = checkov.skip || [];

    // Spread the old metadata FIRST so the merged `checkov` entry wins;
    // the reverse order would let a pre-existing `checkov` key discard `rules`.
    child.cfnOptions.metadata = {
        ...metadata,
        checkov: { ...checkov, skip: [...existing, ...rules] },
    };
}
#!/bin/bash | |
# Script preamble: print the working directory, then give each tunable a
# default value that the caller's environment can override.
# Debug/export switches (-a export all, -e exit on error, -x trace) are left disabled.
#set -aex | |
# Log where the script is running from.
echo "PWD = $(pwd)" | |
# `: "${VAR:=default}"` assigns the default only when VAR is unset or empty;
# the leading `:` no-op builtin is there so the expansion is evaluated without
# running its result as a command.
: "${SM_NUM_GPUS:=4}" | |
: "${MODEL_NAME:=gpt2}" | |
: "${OUTPUT_ROOT:=/mnt/scratch}" | |
# NOTE(review): "haha" looks like a throwaway placeholder job name — confirm.
: "${TRAINING_JOB_NAME:=haha}" |