{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "176aaebf",
"metadata": {},
"outputs": [],
"source": [
"# adapted from compiler example\n",
"# https://www.philschmid.de/huggingface-amazon-sagemaker-training-compiler\n",
"\n",
"#running from a sagemaker ml.m5.xlarge instance."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "85a28bd4",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"sagemaker>=2.70.0\" \"transformers==4.11.0\" --upgrade -q\n",
"# using older dataset due to incompatibility of sagemaker notebook & aws-cli with > s3fs and fsspec to >= 2021.10\n",
"!pip install \"datasets==1.13\" --upgrade -q"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9ee93ca0",
"metadata": {},
"outputs": [],
"source": [
"import sagemaker\n",
"assert sagemaker.__version__ >= \"2.70.0\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c4907d2a",
"metadata": {},
"outputs": [],
"source": [
"import sagemaker.huggingface"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3c4ffe74",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sagemaker role arn: arn:aws:iam::440284372476:role/SM_Role\n",
"sagemaker bucket: sagemaker-us-east-1-440284372476\n",
"sagemaker session region: us-east-1\n"
]
}
],
"source": [
"import sagemaker\n",
"\n",
"sess = sagemaker.Session()\n",
"# sagemaker session bucket -> used for uploading data, models and logs\n",
"# sagemaker will automatically create this bucket if it not exists\n",
"sagemaker_session_bucket=None\n",
"if sagemaker_session_bucket is None and sess is not None:\n",
" # set to default bucket if a bucket name is not given\n",
" sagemaker_session_bucket = sess.default_bucket()\n",
"\n",
"role = sagemaker.get_execution_role()\n",
"sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n",
"\n",
"print(f\"sagemaker role arn: {role}\")\n",
"print(f\"sagemaker bucket: {sess.default_bucket()}\")\n",
"print(f\"sagemaker session region: {sess.boto_region_name}\")"
]
},
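{
"cell_type": "code",
"execution_count": null,
"id": "a1f2e3d4",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (added sketch): get_execution_role() only resolves inside a SageMaker notebook or Studio.\n",
"# When running elsewhere, look the role ARN up via IAM instead, e.g. (role name taken from the\n",
"# output above):\n",
"#\n",
"# import boto3\n",
"# role = boto3.client('iam').get_role(RoleName='SM_Role')['Role']['Arn']"
]
},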
{
"cell_type": "code",
"execution_count": 6,
"id": "4d02bdd7",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"# tokenizer used in preprocessing\n",
"model_id = 'bert-base-uncased'\n",
"\n",
"# dataset used\n",
"dataset_name = 'emotion'\n",
"\n",
"# s3 key prefix for the data\n",
"s3_prefix = 'samples/datasets/emotion'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fee6a877",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/transformers/configuration_utils.py:337: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n",
" \"Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 \"\n",
"Using custom data configuration default\n",
"Reusing dataset emotion (/home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "552dacd27b444a02b22cc063dc3d35b1",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading cached processed dataset at /home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-ef70464735ecf316.arrow\n",
"Loading cached processed dataset at /home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-2a94bece2982eda9.arrow\n"
]
}
],
"source": [
"# download tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"\n",
"# tokenizer helper function\n",
"def tokenize(batch):\n",
" return tokenizer(batch['text'], padding='max_length', truncation=True)\n",
"\n",
"# load dataset\n",
"train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])\n",
"\n",
"# tokenize dataset\n",
"train_dataset = train_dataset.map(tokenize, batched=True)\n",
"test_dataset = test_dataset.map(tokenize, batched=True)\n",
"\n",
"# set format for pytorch\n",
"train_dataset = train_dataset.rename_column(\"label\", \"labels\")\n",
"train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
"test_dataset = test_dataset.rename_column(\"label\", \"labels\")\n",
"test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])"
]
},
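{
"cell_type": "code",
"execution_count": null,
"id": "b2e3f4a5",
"metadata": {},
"outputs": [],
"source": [
"# optional sanity check (added, not part of the original run): with padding='max_length' the\n",
"# bert-base-uncased tokenizer pads every example to its model_max_length of 512 tokens\n",
"print(train_dataset.column_names) # includes 'input_ids', 'attention_mask', 'labels'\n",
"print(train_dataset[0]['input_ids'].shape) # expected: torch.Size([512])"
]
},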
{
"cell_type": "code",
"execution_count": 8,
"id": "f964aaa1",
"metadata": {},
"outputs": [],
"source": [
"import botocore\n",
"from datasets.filesystems import S3FileSystem\n",
"\n",
"s3 = S3FileSystem() \n",
"\n",
"# save train_dataset to s3\n",
"training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'\n",
"train_dataset.save_to_disk(training_input_path, fs=s3)\n",
"\n",
"# save test_dataset to s3\n",
"test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'\n",
"test_dataset.save_to_disk(test_input_path, fs=s3)"
]
},
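{
"cell_type": "code",
"execution_count": null,
"id": "c3f4a5b6",
"metadata": {},
"outputs": [],
"source": [
"# for reference (sketch, assuming the getting-started train.py): inside the training container\n",
"# SageMaker mounts the two S3 channels locally (see SM_CHANNEL_TRAIN / SM_CHANNEL_TEST in the\n",
"# job log below), and the script reads them back with load_from_disk rather than from S3, e.g.:\n",
"#\n",
"# import os\n",
"# from datasets import load_from_disk\n",
"# train_dataset = load_from_disk(os.environ['SM_CHANNEL_TRAIN'])\n",
"# test_dataset = load_from_disk(os.environ['SM_CHANNEL_TEST'])"
]
},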
{
"cell_type": "code",
"execution_count": 9,
"id": "fff215be",
"metadata": {},
"outputs": [],
"source": [
"from sagemaker.huggingface import HuggingFace\n",
"\n",
"# hyperparameters, which are passed into the training job\n",
"hyperparameters={'epochs': 4, # number of training epochs\n",
" 'train_batch_size': 24, # batch size for training\n",
" 'eval_batch_size': 32, # batch size for evaluation\n",
" 'learning_rate': 3e-5, # learning rate used during training\n",
" 'model_id':model_id, # pre-trained model\n",
" 'fp16': True, # Whether to use 16-bit (mixed) precision training\n",
" }\n",
"\n",
"# job name for sagemaker training \n",
"job_name=f\"training-compiler-{hyperparameters['model_id']}-{dataset_name}\""
]
},
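{
"cell_type": "code",
"execution_count": null,
"id": "d4a5b6c7",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (added): the traceback further down shows the getting-started train.py calls\n",
"# AutoModelForSequenceClassification.from_pretrained(args.model_name), i.e. its argparse\n",
"# expects a 'model_name' hyperparameter. The 'model_id' key above is therefore ignored,\n",
"# args.model_name stays None, and that is what produces the 404 for huggingface.co/None.\n",
"# A corrected dict for that script would use the 'model_name' key:\n",
"hyperparameters_fixed = {'epochs': 4,\n",
" 'train_batch_size': 24,\n",
" 'eval_batch_size': 32,\n",
" 'learning_rate': 3e-5,\n",
" 'model_name': model_id, # key name expected by train.py\n",
" 'fp16': True,\n",
" }\n",
"# passing hyperparameters_fixed (instead of hyperparameters) to the estimator below avoids the failure"
]
},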
{
"cell_type": "code",
"execution_count": 10,
"id": "c52baa56",
"metadata": {},
"outputs": [],
"source": [
"git_config = {'repo': 'https://github.com/huggingface/notebooks.git'}\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "071bbb7f",
"metadata": {},
"outputs": [],
"source": [
"# create the Estimator\n",
"huggingface_estimator = HuggingFace(\n",
" entry_point = 'train.py', # fine-tuning script used in training jon\n",
" #source_dir = './scrips',\n",
" source_dir = './sagemaker/01_getting_started_pytorch/scripts', # directory where fine-tuning script is stored\n",
" git_config = git_config,\n",
" instance_type = 'ml.p3.2xlarge', # instances type used for the training job\n",
" instance_count = 1, # the number of instances used for training\n",
" base_job_name = job_name, # the name of the training job\n",
" role = role, # Iam role used in training job to access AWS ressources, e.g. S3\n",
" transformers_version = '4.11.0', # the transformers version used in the training job\n",
" pytorch_version = '1.9.0', # the pytorch_version version used in the training job\n",
" py_version = 'py38', # the python version used in the training job\n",
" hyperparameters = hyperparameters, # the hyperparameter used for running the training job\n",
" disable_profiler = True, # whether to disable the profiler during training used to gain maximum performance\n",
" debugger_hook_config = False, # whether to enable the debugger hook during training used to gain maximum performance\n",
")"
]
},
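{
"cell_type": "code",
"execution_count": null,
"id": "e5b6c7d8",
"metadata": {},
"outputs": [],
"source": [
"# NOTE (added sketch): despite the 'training-compiler' job name, no compiler_config is passed\n",
"# to the estimator above, so SageMaker Training Compiler is never actually enabled. The\n",
"# compiler example this notebook was adapted from additionally passes:\n",
"#\n",
"# from sagemaker.huggingface import TrainingCompilerConfig\n",
"# huggingface_estimator = HuggingFace(..., compiler_config=TrainingCompilerConfig())"
]
},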
{
"cell_type": "code",
"execution_count": 12,
"id": "449cc712",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2022-02-15 15:00:56 Starting - Starting the training job...\n",
"2022-02-15 15:00:58 Starting - Launching requested ML instances.........\n",
"2022-02-15 15:02:30 Starting - Preparing the instances for training......\n",
"2022-02-15 15:03:45 Downloading - Downloading input data...\n",
"2022-02-15 15:04:06 Training - Downloading the training image............................\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n",
"\u001b[34mbash: no job control in this shell\u001b[0m\n",
"\u001b[34m2022-02-15 15:08:57,310 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n",
"\u001b[34m2022-02-15 15:08:57,330 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n",
"\u001b[34m2022-02-15 15:09:00,355 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n",
"\u001b[34m2022-02-15 15:09:00,864 sagemaker-training-toolkit INFO Invoking user script\u001b[0m\n",
"\u001b[34mTraining Env:\u001b[0m\n",
"\u001b[34m{\n",
" \"additional_framework_parameters\": {},\n",
" \"channel_input_dirs\": {\n",
" \"test\": \"/opt/ml/input/data/test\",\n",
" \"train\": \"/opt/ml/input/data/train\"\n",
" },\n",
" \"current_host\": \"algo-1\",\n",
" \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n",
" \"hosts\": [\n",
" \"algo-1\"\n",
" ],\n",
" \"hyperparameters\": {\n",
" \"train_batch_size\": 24,\n",
" \"model_id\": \"bert-base-uncased\",\n",
" \"epochs\": 4,\n",
" \"learning_rate\": 3e-05,\n",
" \"eval_batch_size\": 32,\n",
" \"fp16\": true\n",
" },\n",
" \"input_config_dir\": \"/opt/ml/input/config\",\n",
" \"input_data_config\": {\n",
" \"test\": {\n",
" \"TrainingInputMode\": \"File\",\n",
" \"S3DistributionType\": \"FullyReplicated\",\n",
" \"RecordWrapperType\": \"None\"\n",
" },\n",
" \"train\": {\n",
" \"TrainingInputMode\": \"File\",\n",
" \"S3DistributionType\": \"FullyReplicated\",\n",
" \"RecordWrapperType\": \"None\"\n",
" }\n",
" },\n",
" \"input_dir\": \"/opt/ml/input\",\n",
" \"is_master\": true,\n",
" \"job_name\": \"training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281\",\n",
" \"log_level\": 20,\n",
" \"master_hostname\": \"algo-1\",\n",
" \"model_dir\": \"/opt/ml/model\",\n",
" \"module_dir\": \"s3://sagemaker-us-east-1-440284372476/training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281/source/sourcedir.tar.gz\",\n",
" \"module_name\": \"train\",\n",
" \"network_interface_name\": \"eth0\",\n",
" \"num_cpus\": 8,\n",
" \"num_gpus\": 1,\n",
" \"output_data_dir\": \"/opt/ml/output/data\",\n",
" \"output_dir\": \"/opt/ml/output\",\n",
" \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n",
" \"resource_config\": {\n",
" \"current_host\": \"algo-1\",\n",
" \"hosts\": [\n",
" \"algo-1\"\n",
" ],\n",
" \"network_interface_name\": \"eth0\"\n",
" },\n",
" \"user_entry_point\": \"train.py\"\u001b[0m\n",
"\u001b[34m}\u001b[0m\n",
"\u001b[34mEnvironment variables:\u001b[0m\n",
"\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n",
"\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n",
"\u001b[34mSM_HPS={\"epochs\":4,\"eval_batch_size\":32,\"fp16\":true,\"learning_rate\":3e-05,\"model_id\":\"bert-base-uncased\",\"train_batch_size\":24}\u001b[0m\n",
"\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n",
"\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n",
"\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n",
"\u001b[34mSM_INPUT_DATA_CONFIG={\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n",
"\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n",
"\u001b[34mSM_CHANNELS=[\"test\",\"train\"]\u001b[0m\n",
"\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n",
"\u001b[34mSM_MODULE_NAME=train\u001b[0m\n",
"\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n",
"\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n",
"\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n",
"\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n",
"\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n",
"\u001b[34mSM_NUM_CPUS=8\u001b[0m\n",
"\u001b[34mSM_NUM_GPUS=1\u001b[0m\n",
"\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n",
"\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-east-1-440284372476/training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281/source/sourcedir.tar.gz\u001b[0m\n",
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"epochs\":4,\"eval_batch_size\":32,\"fp16\":true,\"learning_rate\":3e-05,\"model_id\":\"bert-base-uncased\",\"train_batch_size\":24},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-east-1-440284372476/training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":8,\"num_gpus\":1,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n",
"\u001b[34mSM_USER_ARGS=[\"--epochs\",\"4\",\"--eval_batch_size\",\"32\",\"--fp16\",\"True\",\"--learning_rate\",\"3e-05\",\"--model_id\",\"bert-base-uncased\",\"--train_batch_size\",\"24\"]\u001b[0m\n",
"\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n",
"\u001b[34mSM_CHANNEL_TEST=/opt/ml/input/data/test\u001b[0m\n",
"\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n",
"\u001b[34mSM_HP_TRAIN_BATCH_SIZE=24\u001b[0m\n",
"\u001b[34mSM_HP_MODEL_ID=bert-base-uncased\u001b[0m\n",
"\u001b[34mSM_HP_EPOCHS=4\u001b[0m\n",
"\u001b[34mSM_HP_LEARNING_RATE=3e-05\u001b[0m\n",
"\u001b[34mSM_HP_EVAL_BATCH_SIZE=32\u001b[0m\n",
"\u001b[34mSM_HP_FP16=true\u001b[0m\n",
"\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python38.zip:/opt/conda/lib/python3.8:/opt/conda/lib/python3.8/lib-dynload:/opt/conda/lib/python3.8/site-packages\u001b[0m\n",
"\u001b[34mInvoking script with the following command:\u001b[0m\n",
"\u001b[34m/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\u001b[0m\n",
"\u001b[34m2022-02-15 15:09:05,309 - __main__ - INFO - loaded train_dataset length is: 16000\u001b[0m\n",
"\u001b[34m2022-02-15 15:09:05,309 - __main__ - INFO - loaded test_dataset length is: 2000\u001b[0m\n",
"\u001b[34m404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n",
"\u001b[34m404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n",
"\u001b[34mTraceback (most recent call last):\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n",
" resolved_config_file = cached_path(\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n",
" output_path = get_from_cache(\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n",
" r.raise_for_status()\n",
" File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n",
" raise HTTPError(http_error_msg, response=self)\u001b[0m\n",
"\u001b[34mrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n",
"\u001b[34mDuring handling of the above exception, another exception occurred:\u001b[0m\n",
"\u001b[34mTraceback (most recent call last):\n",
" File \"train.py\", line 57, in <module>\n",
" model = AutoModelForSequenceClassification.from_pretrained(args.model_name)\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py\", line 396, in from_pretrained\n",
" config, kwargs = AutoConfig.from_pretrained(\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/configuration_auto.py\", line 527, in from_pretrained\n",
" config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 570, in get_config_dict\n",
" raise EnvironmentError(msg)\u001b[0m\n",
"\u001b[34mOSError: Can't load config for 'None'. Make sure that:\u001b[0m\n",
"\u001b[34m- 'None' is a correct model identifier listed on 'https://huggingface.co/models'\u001b[0m\n",
"\u001b[34m- or 'None' is the correct path to a directory containing a config.json file\u001b[0m\n",
"\u001b[34m2022-02-15 15:09:06,046 sagemaker-training-toolkit ERROR ExecuteUserScriptError:\u001b[0m\n",
"\u001b[34mCommand \"/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\"\u001b[0m\n",
"\u001b[34m404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n",
"\u001b[34mTraceback (most recent call last):\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n",
" resolved_config_file = cached_path(\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n",
" output_path = get_from_cache(\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n",
" r.raise_for_status()\n",
" File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n",
" raise HTTPError(http_error_msg, response=self)\u001b[0m\n",
"\u001b[34mrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n",
"\u001b[34mDuring handling of the above exception, another exception occurred:\u001b[0m\n",
"\u001b[34mTraceback (most recent call last):\n",
" File \"train.py\", line 57, in <module>\n",
" model = AutoModelForSequenceClassification.from_pretrained(args.model_name)\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py\", line 396, in from_pretrained\n",
" config, kwargs = AutoConfig.from_pretrained(\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/configuration_auto.py\", line 527, in from_pretrained\n",
" config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)\n",
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 570, in get_config_dict\n",
" raise EnvironmentError(msg)\u001b[0m\n",
"\u001b[34mOSError: Can't load config for 'None'. Make sure that:\u001b[0m\n",
"\u001b[34m- 'None' is a correct model identifier listed on 'https://huggingface.co/models'\u001b[0m\n",
"\u001b[34m- or 'None' is the correct path to a directory containing a config.json file\u001b[0m\n",
"\n",
"2022-02-15 15:09:16 Uploading - Uploading generated training model\n",
"2022-02-15 15:09:16 Failed - Training job failed\n"
]
},
{
"ename": "UnexpectedStatusException",
"evalue": "Error for Training job training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\"\n404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\nTraceback (most recent call last):\n File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n resolved_config_file = cached_path(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n output_path = get_from_cache(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n r.raise_for_status()\n File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\nrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\n\nDuring ",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-12-f55f62c72c9b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m# starting the train job with our uploaded datasets as input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mhuggingface_estimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name, experiment_config)\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 953\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 954\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 955\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_compilation_job_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m 1938\u001b[0m \u001b[0;31m# If logs are requested, call logs_for_jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1939\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlogs\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"None\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1940\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1941\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1942\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mlogs_for_job\u001b[0;34m(self, job_name, wait, poll, log_type)\u001b[0m\n\u001b[1;32m 3737\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3738\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3739\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"TrainingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3740\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3741\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m 3290\u001b[0m ),\n\u001b[1;32m 3291\u001b[0m \u001b[0mallowed_statuses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Completed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Stopped\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3292\u001b[0;31m \u001b[0mactual_status\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3293\u001b[0m )\n\u001b[1;32m 3294\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Training job training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\"\n404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\nTraceback (most recent call last):\n File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n resolved_config_file = cached_path(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n output_path = get_from_cache(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n r.raise_for_status()\n File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\nrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\n\nDuring "
]
}
],
"source": [
"# define a data input dictonary with our uploaded s3 uris\n",
"data = {\n",
" 'train': training_input_path,\n",
" 'test': test_input_path\n",
"}\n",
"\n",
"# starting the train job with our uploaded datasets as input\n",
"huggingface_estimator.fit(data)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_pytorch_latest_p36",
"language": "python",
"name": "conda_pytorch_latest_p36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}