Created
September 6, 2021 04:21
-
-
Save yuntan/654a936c0cebb4e02505fae2f4a350ef to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import tensorflow as tf | |
if __name__ == "__main__":
    # Look up MY_ENV with a plain subscript (not .get) so that a missing
    # variable raises KeyError instead of being silently ignored, then echo
    # the value to stdout.
    my_env_value = os.environ["MY_ENV"]
    print(my_env_value)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"source": [ | |
"import sagemaker\r\n", | |
"sagemaker.__version__" | |
], | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'2.59.1'" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 1 | |
} | |
], | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"source": [ | |
"role = \"XXX\"" | |
], | |
"outputs": [], | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"source": [ | |
"from sagemaker.tensorflow import TensorFlow\r\n", | |
"\r\n", | |
"environment = {\"MY_ENV\": \"42\"}\r\n", | |
"\r\n", | |
"estimator = TensorFlow(entry_point=\"entrypoint.py\",\r\n", | |
" instance_count=1,\r\n", | |
" instance_type=\"ml.c5.xlarge\",\r\n", | |
" py_version=\"py37\",\r\n", | |
" framework_version=\"2.4.1\",\r\n", | |
" role=role,\r\n", | |
" environment=environment)\r\n", | |
"\r\n", | |
"estimator.fit()" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"2021-09-06 03:07:32 Starting - Starting the training job...\n", | |
"2021-09-06 03:07:56 Starting - Launching requested ML instancesProfilerReport-1630897651: InProgress\n", | |
"......\n", | |
"2021-09-06 03:08:56 Starting - Preparing the instances for training......\n", | |
"2021-09-06 03:10:04 Downloading - Downloading input data\n", | |
"2021-09-06 03:10:04 Training - Downloading the training image...\n", | |
"2021-09-06 03:10:29 Uploading - Uploading generated training model\n", | |
"2021-09-06 03:10:29 Completed - Training job completed\n", | |
"\u001b[34m2021-09-06 03:10:20.859090: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:20.866083: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:20.952795: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23,448 sagemaker-training-toolkit INFO Imported framework sagemaker_tensorflow_container.training\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23,455 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23,809 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23,823 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23,836 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23,845 sagemaker-training-toolkit INFO Invoking user script\n", | |
"\u001b[0m\n", | |
"\u001b[34mTraining Env:\n", | |
"\u001b[0m\n", | |
"\u001b[34m{\n", | |
" \"additional_framework_parameters\": {},\n", | |
" \"channel_input_dirs\": {},\n", | |
" \"current_host\": \"algo-1\",\n", | |
" \"framework_module\": \"sagemaker_tensorflow_container.training:main\",\n", | |
" \"hosts\": [\n", | |
" \"algo-1\"\n", | |
" ],\n", | |
" \"hyperparameters\": {\n", | |
" \"model_dir\": \"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"\n", | |
" },\n", | |
" \"input_config_dir\": \"/opt/ml/input/config\",\n", | |
" \"input_data_config\": {},\n", | |
" \"input_dir\": \"/opt/ml/input\",\n", | |
" \"is_master\": true,\n", | |
" \"job_name\": \"tensorflow-training-2021-09-06-03-07-30-818\",\n", | |
" \"log_level\": 20,\n", | |
" \"master_hostname\": \"algo-1\",\n", | |
" \"model_dir\": \"/opt/ml/model\",\n", | |
" \"module_dir\": \"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/source/sourcedir.tar.gz\",\n", | |
" \"module_name\": \"entrypoint\",\n", | |
" \"network_interface_name\": \"eth0\",\n", | |
" \"num_cpus\": 4,\n", | |
" \"num_gpus\": 0,\n", | |
" \"output_data_dir\": \"/opt/ml/output/data\",\n", | |
" \"output_dir\": \"/opt/ml/output\",\n", | |
" \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", | |
" \"resource_config\": {\n", | |
" \"current_host\": \"algo-1\",\n", | |
" \"hosts\": [\n", | |
" \"algo-1\"\n", | |
" ],\n", | |
" \"network_interface_name\": \"eth0\"\n", | |
" },\n", | |
" \"user_entry_point\": \"entrypoint.py\"\u001b[0m\n", | |
"\u001b[34m}\n", | |
"\u001b[0m\n", | |
"\u001b[34mEnvironment variables:\n", | |
"\u001b[0m\n", | |
"\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", | |
"\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", | |
"\u001b[34mSM_HPS={\"model_dir\":\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"}\u001b[0m\n", | |
"\u001b[34mSM_USER_ENTRY_POINT=entrypoint.py\u001b[0m\n", | |
"\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", | |
"\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", | |
"\u001b[34mSM_INPUT_DATA_CONFIG={}\u001b[0m\n", | |
"\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", | |
"\u001b[34mSM_CHANNELS=[]\u001b[0m\n", | |
"\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", | |
"\u001b[34mSM_MODULE_NAME=entrypoint\u001b[0m\n", | |
"\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", | |
"\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_tensorflow_container.training:main\u001b[0m\n", | |
"\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", | |
"\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", | |
"\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", | |
"\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", | |
"\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", | |
"\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", | |
"\u001b[34mSM_MODULE_DIR=s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/source/sourcedir.tar.gz\u001b[0m\n", | |
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_tensorflow_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"model_dir\":\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"tensorflow-training-2021-09-06-03-07-30-818\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/source/sourcedir.tar.gz\",\"module_name\":\"entrypoint\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"entrypoint.py\"}\u001b[0m\n", | |
"\u001b[34mSM_USER_ARGS=[\"--model_dir\",\"s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\"]\u001b[0m\n", | |
"\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", | |
"\u001b[34mSM_HP_MODEL_DIR=s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\u001b[0m\n", | |
"\u001b[34mPYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/local/lib/python37.zip:/usr/local/lib/python3.7:/usr/local/lib/python3.7/lib-dynload:/usr/local/lib/python3.7/site-packages\n", | |
"\u001b[0m\n", | |
"\u001b[34mInvoking script with the following command:\n", | |
"\u001b[0m\n", | |
"\u001b[34m/usr/local/bin/python3.7 entrypoint.py --model_dir s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model\n", | |
"\n", | |
"\u001b[0m\n", | |
"\u001b[34m42\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23.970964: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23.971078: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:23.995316: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\n", | |
"\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:25,536 sagemaker_tensorflow_container.training WARNING No model artifact is saved under path /opt/ml/model. Your training job will not save any model files to S3.\u001b[0m\n", | |
"\u001b[34mFor details of how to construct your training script see:\u001b[0m\n", | |
"\u001b[34mhttps://sagemaker.readthedocs.io/en/stable/using_tf.html#adapting-your-local-tensorflow-script\u001b[0m\n", | |
"\u001b[34m2021-09-06 03:10:25,536 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", | |
"Training seconds: 37\n", | |
"Billable seconds: 37\n" | |
] | |
} | |
], | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"source": [ | |
"from sagemaker.tuner import HyperparameterTuner, CategoricalParameter\r\n", | |
"\r\n", | |
"hp_ranges = dict( # dummy\r\n", | |
" lr=CategoricalParameter([0.01, 0.001, 0.0001]),\r\n", | |
" batch_size=CategoricalParameter([64, 128, 256, 512, 1024]),\r\n", | |
")\r\n", | |
"metrics = [{\"Name\": \"mae\", \"Regex\": \"^MAE: ([0-9\\\\.]+)\"}] # dummy\r\n", | |
"\r\n", | |
"tuner = HyperparameterTuner(estimator,\r\n", | |
" objective_metric_name=\"mae\",\r\n", | |
" metric_definitions=metrics,\r\n", | |
" hyperparameter_ranges=hp_ranges,\r\n", | |
" max_jobs=1,\r\n", | |
" max_parallel_jobs=1)\r\n", | |
"\r\n", | |
"tuner.fit()" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
".......................................................*\n" | |
] | |
}, | |
{ | |
"output_type": "error", | |
"ename": "UnexpectedStatusException", | |
"evalue": "Error for HyperParameterTuning job tensorflow-training-210906-1215: Failed. Reason: All training jobs failed. Please take a look at the training jobs failures to get more details.", | |
"traceback": [ | |
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[1;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_12412/1360231367.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 14\u001b[0m max_parallel_jobs=1)\n\u001b[0;32m 15\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 16\u001b[1;33m \u001b[0mtuner\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\tuner.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, inputs, job_name, include_cls_metadata, estimator_kwargs, wait, **kwargs)\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 450\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 451\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlatest_tuning_job\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 452\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_fit_with_estimator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minclude_cls_metadata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\tuner.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1595\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1596\u001b[0m \u001b[1;34m\"\"\"Placeholder docstring.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1597\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for_tuning_job\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1598\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1599\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36mwait_for_tuning_job\u001b[1;34m(self, job, poll)\u001b[0m\n\u001b[0;32m 3151\u001b[0m \"\"\"\n\u001b[0;32m 3152\u001b[0m \u001b[0mdesc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_wait_until\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0m_tuning_job_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpoll\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3153\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"HyperParameterTuningJobStatus\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3154\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3155\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[1;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[0;32m 3229\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"FailureReason\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"(No reason provided)\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3230\u001b[0m \u001b[0mjob_type\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstatus_key_name\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"JobStatus\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\" job\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3231\u001b[1;33m raise exceptions.UnexpectedStatusException(\n\u001b[0m\u001b[0;32m 3232\u001b[0m message=\"Error for {job_type} {job_name}: {status}. Reason: {reason}\".format(\n\u001b[0;32m 3233\u001b[0m \u001b[0mjob_type\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mreason\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;31mUnexpectedStatusException\u001b[0m: Error for HyperParameterTuning job tensorflow-training-210906-1215: Failed. Reason: All training jobs failed. Please take a look at the training jobs failures to get more details." | |
] | |
} | |
], | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"source": [ | |
"TensorFlow.attach(\"tensorflow-training-210906-1215-001-71d0517c\")" | |
], | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\n", | |
"2021-09-06 03:18:57 Starting - Preparing the instances for training\n", | |
"2021-09-06 03:18:57 Downloading - Downloading input data\n", | |
"2021-09-06 03:18:57 Training - Training image download completed. Training in progress.\n", | |
"2021-09-06 03:18:57 Uploading - Uploading generated training model\n", | |
"2021-09-06 03:18:57 Failed - Training job failed\n" | |
] | |
}, | |
{ | |
"output_type": "error", | |
"ename": "UnexpectedStatusException", | |
"evalue": "Error for Training job tensorflow-training-210906-1215-001-71d0517c: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/usr/local/bin/python3.7 entrypoint.py --batch_size 1024 --lr 0.0001 --model_dir s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model/tensorflow-training-210906-1215-001-71d0517c/model\"\n2021-09-06 03:18:07.015279: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\n2021-09-06 03:18:07.015385: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\n2021-09-06 03:18:07.040815: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\nTraceback (most recent call last):\n File \"entrypoint.py\", line 7, in <module>\n print(os.environ[\"MY_ENV\"])\n File \"/usr/local/lib/python3.7/os.py\", line 681, in __getitem__\n raise KeyError(key) from None\nKeyError: 'MY_ENV'", | |
"traceback": [ | |
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[1;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_12412/119782671.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mTensorFlow\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mattach\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"tensorflow-training-210906-1215-001-71d0517c\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\estimator.py\u001b[0m in \u001b[0;36mattach\u001b[1;34m(cls, training_job_name, sagemaker_session, model_channel_name)\u001b[0m\n\u001b[0;32m 2479\u001b[0m \u001b[0mtraining\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2480\u001b[0m \"\"\"\n\u001b[1;32m-> 2481\u001b[1;33m estimator = super(Framework, cls).attach(\n\u001b[0m\u001b[0;32m 2482\u001b[0m \u001b[0mtraining_job_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msagemaker_session\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodel_channel_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2483\u001b[0m )\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\estimator.py\u001b[0m in \u001b[0;36mattach\u001b[1;34m(cls, training_job_name, sagemaker_session, model_channel_name)\u001b[0m\n\u001b[0;32m 834\u001b[0m )\n\u001b[0;32m 835\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_current_job_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 836\u001b[1;33m \u001b[0mestimator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"None\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 837\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mestimator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 838\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\estimator.py\u001b[0m in \u001b[0;36mwait\u001b[1;34m(self, logs)\u001b[0m\n\u001b[0;32m 1631\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1632\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1633\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1634\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1635\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdescribe\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36mwait_for_job\u001b[1;34m(self, job, poll)\u001b[0m\n\u001b[0;32m 3083\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mlast_desc\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0m_train_done\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlast_desc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpoll\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3084\u001b[0m )\n\u001b[1;32m-> 3085\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"TrainingJobStatus\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3086\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3087\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m~\\anaconda3\\envs\\sagemaker\\lib\\site-packages\\sagemaker\\session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[1;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[0;32m 3229\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdesc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"FailureReason\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"(No reason provided)\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3230\u001b[0m \u001b[0mjob_type\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstatus_key_name\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"JobStatus\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\" job\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3231\u001b[1;33m raise exceptions.UnexpectedStatusException(\n\u001b[0m\u001b[0;32m 3232\u001b[0m message=\"Error for {job_type} {job_name}: {status}. Reason: {reason}\".format(\n\u001b[0;32m 3233\u001b[0m \u001b[0mjob_type\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mjob_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mjob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mreason\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;31mUnexpectedStatusException\u001b[0m: Error for Training job tensorflow-training-210906-1215-001-71d0517c: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/usr/local/bin/python3.7 entrypoint.py --batch_size 1024 --lr 0.0001 --model_dir s3://sagemaker-ap-northeast-1-775362833254/tensorflow-training-2021-09-06-03-07-30-818/model/tensorflow-training-210906-1215-001-71d0517c/model\"\n2021-09-06 03:18:07.015279: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\n2021-09-06 03:18:07.015385: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.\n2021-09-06 03:18:07.040815: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.\nTraceback (most recent call last):\n File \"entrypoint.py\", line 7, in <module>\n print(os.environ[\"MY_ENV\"])\n File \"/usr/local/lib/python3.7/os.py\", line 681, in __getitem__\n raise KeyError(key) from None\nKeyError: 'MY_ENV'" | |
] | |
} | |
], | |
"metadata": {} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"source": [], | |
"outputs": [], | |
"metadata": {} | |
} | |
], | |
"metadata": { | |
"orig_nbformat": 4, | |
"language_info": { | |
"name": "python", | |
"version": "3.9.6", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3.9.6 ('sagemaker': conda)" | |
}, | |
"interpreter": { | |
"hash": "790539abed859dbef189e34492d04bceb94e5a35a76cb03288110a08470c9070" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment