Skip to content

Instantly share code, notes, and snippets.

@yuntan
Created September 6, 2021 04:21
Show Gist options
  • Save yuntan/654a936c0cebb4e02505fae2f4a350ef to your computer and use it in GitHub Desktop.
Save yuntan/654a936c0cebb4e02505fae2f4a350ef to your computer and use it in GitHub Desktop.
import os
import tensorflow as tf
# Script entry point: the body runs only when this file is executed directly
# (for example `python entrypoint.py`), not when it is imported as a module.
if __name__ == "__main__":
    # Subscript lookup raises KeyError when MY_ENV is not set in the process
    # environment; when it is set, its value is echoed to stdout.
    my_env = os.environ["MY_ENV"]
    print(my_env)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"source": [
"import sagemaker\r\n",
"sagemaker.__version__"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'2.59.1'"
]
},
"metadata": {},
"execution_count": 1
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"source": [
"role = \"XXX\""
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 4,
"source": [
"from sagemaker.tensorflow import TensorFlow\r\n",
"\r\n",
"environment = {\"MY_ENV\": \"42\"}\r\n",
"\r\n",
"estimator = TensorFlow(entry_point=\"entrypoint.py\",\r\n",
" instance_count=1,\r\n",
" instance_type=\"ml.c5.xlarge\",\r\n",
" py_version=\"py37\",\r\n",
" framework_version=\"2.4.1\",\r\n",
" role=role,\r\n",
" environment=environment)\r\n",
"\r\n",
"estimator.fit()"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"2021-09-06 03:07:32 Starting - Starting the training job...\n",
"2021-09-06 03:07:56 Starting - Launching requested ML instancesProfilerReport-1630897651: InProgress\n",
"......\n",
"2021-09-06 03:08:56 Starting - Preparing the instances for training......\n",
"2021-09-06 03:10:04 Downloading - Downloading input data\n",
"2021-09-06 03:10:04 Training - Downloading the training image...\n",
"2021-09-06 03:10:29 Uploading - Uploading generated training model\n",
"2021-09-06 03:10:29 Completed - Training job completed\n",
"\u001b[34m2021-09-06 03:10:20.859090: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:20.866083: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:20.952795: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23,448 sagemaker-training-toolkit INFO Imported framework sagemaker_tensorflow_container.training\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23,455 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23,809 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23,823 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23,836 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23,845 sagemaker-training-toolkit INFO Invoking user script\n",
"\u001b[0m\n",
"\u001b[34mTraining Env:\n",
"\u001b[0m\n",
"\u001b[34m{\n",
" \"additional_framework_parameters\": {},\n",
" \"channel_input_dirs\": {},\n",
" \"current_host\": \"algo-1\",\n",
" \"framework_module\": \"sagemaker_tensorflow_container.training:main\",\n",
" \"hosts\": [\n",
" \"algo-1\"\n",
" ],\n",
" \"hyperparameters\": {\n",
" \"model_dir\": \"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"\n",
" },\n",
" \"input_config_dir\": \"/opt/ml/input/config\",\n",
" \"input_data_config\": {},\n",
" \"input_dir\": \"/opt/ml/input\",\n",
" \"is_master\": true,\n",
" \"job_name\": \"tensorflow-training-2021-09-06-03-07-30-818\",\n",
" \"log_level\": 20,\n",
" \"master_hostname\": \"algo-1\",\n",
" \"model_dir\": \"/opt/ml/model\",\n",
" \"module_dir\": \"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/source/sourcedir.tar.gz\",\n",
" \"module_name\": \"entrypoint\",\n",
" \"network_interface_name\": \"eth0\",\n",
" \"num_cpus\": 4,\n",
" \"num_gpus\": 0,\n",
" \"output_data_dir\": \"/opt/ml/output/data\",\n",
" \"output_dir\": \"/opt/ml/output\",\n",
" \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n",
" \"resource_config\": {\n",
" \"current_host\": \"algo-1\",\n",
" \"hosts\": [\n",
" \"algo-1\"\n",
" ],\n",
" \"network_interface_name\": \"eth0\"\n",
" },\n",
" \"user_entry_point\": \"entrypoint.py\"\u001b[0m\n",
"\u001b[34m}\n",
"\u001b[0m\n",
"\u001b[34mEnvironment variables:\n",
"\u001b[0m\n",
"\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n",
"\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n",
"\u001b[34mSM_HPS={\"model_dir\":\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"}\u001b[0m\n",
"\u001b[34mSM_USER_ENTRY_POINT=entrypoint.py\u001b[0m\n",
"\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n",
"\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n",
"\u001b[34mSM_INPUT_DATA_CONFIG={}\u001b[0m\n",
"\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n",
"\u001b[34mSM_CHANNELS=[]\u001b[0m\n",
"\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n",
"\u001b[34mSM_MODULE_NAME=entrypoint\u001b[0m\n",
"\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n",
"\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main\u001b[0m\n",
"\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n",
"\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n",
"\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n",
"\u001b[34mSM_NUM_CPUS=4\u001b[0m\n",
"\u001b[34mSM_NUM_GPUS=0\u001b[0m\n",
"\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n",
"\u001b[34mSM_MODULE_DIR=s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/source/sourcedir.tar.gz\u001b[0m\n",
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_tensorflow_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"model_dir\":\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"tensorflow-training-2021-09-06-03-07-30-818\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/source/sourcedir.tar.gz\",\"module_name\":\"entrypoint\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"entrypoint.py\"}\u001b[0m\n",
"\u001b[34mSM_USER_ARGS=[\"--model_dir\",\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"]\u001b[0m\n",
"\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n",
"\u001b[34mSM_HP_MODEL_DIR=s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\u001b[0m\n",
"\u001b[34mPYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/local/lib/python37.zip:/usr/local/lib/python3.7:/usr/local/lib/python3.7/lib-dynload:/usr/local/lib/python3.7/site-packages\n",
"\u001b[0m\n",
"\u001b[34mInvoking script with the following command:\n",
"\u001b[0m\n",
"\u001b[34m/usr/local/bin/python3.7 entrypoint.py --model_dir s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\n",
"\n",
"\u001b[0m\n",
"\u001b[34m42\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23.970964: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23.971078: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:23.995316: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\n",
"\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:25,536 sagemaker_tensorflow_container.training WARNING No model artifact is saved under path /opt/ml/model. Your training job will not save any model files to S3.\u001b[0m\n",
"\u001b[34mFor details of how to construct your training script see:\u001b[0m\n",
"\u001b[34mhttps://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script\u001b[0m\n",
"\u001b[34m2021-09-06 03:10:25,536 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n",
"Training seconds: 37\n",
"Billable seconds: 37\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"source": [
"from sagemaker.tuner import HyperparameterTuner, CategoricalParameter\r\n",
"\r\n",
"hp_ranges = dict( # dummy\r\n",
" lr=CategoricalParameter([0.01, 0.001, 0.0001]),\r\n",
" batch_size=CategoricalParameter([64, 128, 256, 512, 1024]),\r\n",
")\r\n",
"metrics = [{\"Name\": \"mae\", \"Regex\": \"^MAE: ([0-9\\\\.]+)\"}] # dummy\r\n",
"\r\n",
"tuner = HyperparameterTuner(estimator,\r\n",
" objective_metric_name=\"mae\",\r\n",
" metric_definitions=metrics,\r\n",
" hyperparameter_ranges=hp_ranges,\r\n",
" max_jobs=1,\r\n",
" max_parallel_jobs=1)\r\n",
"\r\n",
"tuner.fit()"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
".......................................................*\n"
]
},
{
"output_type": "error",
"ename": "UnexpectedStatusException",
"evalue": "Error for HyperParameterTuning job tensorflow-training-210906-1215: Failed. Reason: All training jobs failed. Please take a look at the training jobs failures to get more details.",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_12412/1360231367.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 14\u001b[0m max_parallel_jobs=1)\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 16\u001b[1;33m \u001b[0mtuner\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\tuner.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, inputs, job_name, include_cls_metadata, estimator_kwargs, wait, **kwargs)\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 450\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 451\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlatest_tuning_job\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 452\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit_with_estimator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minclude_cls_metadata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\tuner.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1595\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1596\u001b[0m \u001b[1;34m\"\"\"Placeholder docstring.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1597\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for_tuning_job\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1598\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1599\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36mwait_for_tuning_job\u001b[1;34m(self, job, poll)\u001b[0m\n\u001b[0;32m 3151\u001b[0m \"\"\"\n\u001b[0;32m 3152\u001b[0m \u001b[0mdesc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_wait_until\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0m_tuning_job_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpoll\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3153\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"HyperParameterTuningJobStatus\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3154\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3155\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[1;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[0;32m 3229\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"FailureReason\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"(No reason provided)\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3230\u001b[0m \u001b[0mjob_type\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstatus_key_name\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"JobStatus\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\" job\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3231\u001b[1;33m raise exceptions.UnexpectedStatusException(\n\u001b[0m\u001b[0;32m 3232\u001b[0m message=\"Error for {job_type} {job_name}: {status}. Reason: {reason}\".format(\n\u001b[0;32m 3233\u001b[0m \u001b[0mjob_type\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mreason\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mUnexpectedStatusException\u001b[0m: Error for HyperParameterTuning job tensorflow-training-210906-1215: Failed. Reason: All training jobs failed. Please take a look at the training jobs failures to get more details."
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 17,
"source": [
"TensorFlow.attach(\"tensorflow-training-210906-1215-001-71d0517c\")"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"2021-09-06 03:18:57 Starting - Preparing the instances for training\n",
"2021-09-06 03:18:57 Downloading - Downloading input data\n",
"2021-09-06 03:18:57 Training - Training image download completed. Training in progress.\n",
"2021-09-06 03:18:57 Uploading - Uploading generated training model\n",
"2021-09-06 03:18:57 Failed - Training job failed\n"
]
},
{
"output_type": "error",
"ename": "UnexpectedStatusException",
"evalue": "Error for Training job tensorflow-training-210906-1215-001-71d0517c: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/usr/local/bin/python3.7 entrypoint.py --batch_size 1024 --lr 0.0001 --model_dir s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model/tensorflow-training-210906-1215-001-71d0517c/model\"\n2021-09-06 03:18:07.015279: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\n2021-09-06 03:18:07.015385: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\n2021-09-06 03:18:07.040815: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\nTraceback (most recent call last):\n File \"entrypoint.py\", line 7, in <module>\n print(os.environ[\"MY_ENV\"])\n File \"/usr/local/lib/python3.7/os.py\", line 681, in __getitem__\n raise KeyError(key) from None\nKeyError: 'MY_ENV'",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_12412/119782671.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mTensorFlow\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mattach\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"tensorflow-training-210906-1215-001-71d0517c\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\estimator.py\u001b[0m in \u001b[0;36mattach\u001b[1;34m(cls, training_job_name, sagemaker_session, model_channel_name)\u001b[0m\n\u001b[0;32m 2479\u001b[0m \u001b[0mtraining\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2480\u001b[0m \"\"\"\n\u001b[1;32m-> 2481\u001b[1;33m estimator = super(Framework, cls).attach(\n\u001b[0m\u001b[0;32m 2482\u001b[0m \u001b[0mtraining_job_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msagemaker_session\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel_channel_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2483\u001b[0m )\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\estimator.py\u001b[0m in \u001b[0;36mattach\u001b[1;34m(cls, training_job_name, sagemaker_session, model_channel_name)\u001b[0m\n\u001b[0;32m 834\u001b[0m )\n\u001b[0;32m 835\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_current_job_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 836\u001b[1;33m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"None\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 837\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 838\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\estimator.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, logs)\u001b[0m\n\u001b[0;32m 1631\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1632\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1633\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1634\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1635\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdescribe\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36mwait_for_job\u001b[1;34m(self, job, poll)\u001b[0m\n\u001b[0;32m 3083\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mlast_desc\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0m_train_done\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlast_desc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpoll\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3084\u001b[0m )\n\u001b[1;32m-> 3085\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"TrainingJobStatus\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3086\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3087\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[1;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[0;32m 3229\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"FailureReason\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"(No reason provided)\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3230\u001b[0m \u001b[0mjob_type\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstatus_key_name\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"JobStatus\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\" job\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3231\u001b[1;33m raise exceptions.UnexpectedStatusException(\n\u001b[0m\u001b[0;32m 3232\u001b[0m message=\"Error for {job_type} {job_name}: {status}. Reason: {reason}\".format(\n\u001b[0;32m 3233\u001b[0m \u001b[0mjob_type\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mreason\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mUnexpectedStatusException\u001b[0m: Error for Training job tensorflow-training-210906-1215-001-71d0517c: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/usr/local/bin/python3.7 entrypoint.py --batch_size 1024 --lr 0.0001 --model_dir s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model/tensorflow-training-210906-1215-001-71d0517c/model\"\n2021-09-06 03:18:07.015279: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\n2021-09-06 03:18:07.015385: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\n2021-09-06 03:18:07.040815: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\nTraceback (most recent call last):\n File \"entrypoint.py\", line 7, in <module>\n print(os.environ[\"MY_ENV\"])\n File \"/usr/local/lib/python3.7/os.py\", line 681, in __getitem__\n raise KeyError(key) from None\nKeyError: 'MY_ENV'"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"orig_nbformat": 4,
"language_info": {
"name": "python",
"version": "3.9.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.9.6 ('sagemaker': conda)"
},
"interpreter": {
"hash": "790539abed859dbef189e34492d04bceb94e5a35a76cb03288110a08470c9070"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment