-
-
Save enpassanty/7f9c0b80875caff47be27d8cbe1591cc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "176aaebf", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# adapted from compiler example\n", | |
"# https://www.philschmid.de/huggingface-amazon-sagemaker-training-compiler\n", | |
"\n", | |
"#running from a sagemaker ml.m5.xlarge instance." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "85a28bd4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%pip install \"sagemaker>=2.70.0\" \"transformers==4.11.0\" --upgrade -q\n", | |
"# using older dataset due to incompatibility of sagemaker notebook & aws-cli with > s3fs and fsspec to >= 2021.10\n", | |
"# NOTE: %pip (not !pip) installs into the active kernel's environment\n", | |
"%pip install \"datasets==1.13\" --upgrade -q" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "9ee93ca0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sagemaker\n", | |
"\n", | |
"# Compare numeric version components; a plain string comparison is wrong once\n", | |
"# any component reaches two digits (e.g. \"2.100.0\" < \"2.70.0\" as strings).\n", | |
"assert tuple(int(p) for p in sagemaker.__version__.split(\".\")[:2]) >= (2, 70)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "c4907d2a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sagemaker.huggingface" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "3c4ffe74", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"sagemaker role arn: arn:aws:iam::440284372476:role/SM_Role\n", | |
"sagemaker bucket: sagemaker-us-east-1-440284372476\n", | |
"sagemaker session region: us-east-1\n" | |
] | |
} | |
], | |
"source": [ | |
"import sagemaker\n", | |
"\n", | |
"sess = sagemaker.Session()\n", | |
"# sagemaker session bucket -> used for uploading data, models and logs\n", | |
"# sagemaker will automatically create this bucket if it not exists\n", | |
"sagemaker_session_bucket=None\n", | |
"if sagemaker_session_bucket is None and sess is not None:\n", | |
" # set to default bucket if a bucket name is not given\n", | |
" sagemaker_session_bucket = sess.default_bucket()\n", | |
"\n", | |
"role = sagemaker.get_execution_role()\n", | |
"sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", | |
"\n", | |
"print(f\"sagemaker role arn: {role}\")\n", | |
"print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", | |
"print(f\"sagemaker session region: {sess.boto_region_name}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "4d02bdd7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from datasets import load_dataset\n", | |
"from transformers import AutoTokenizer\n", | |
"\n", | |
"# tokenizer used in preprocessing\n", | |
"model_id = 'bert-base-uncased'\n", | |
"\n", | |
"# dataset used\n", | |
"dataset_name = 'emotion'\n", | |
"\n", | |
"# s3 key prefix for the data\n", | |
"s3_prefix = 'samples/datasets/emotion'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "fee6a877", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/transformers/configuration_utils.py:337: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n", | |
" \"Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 \"\n", | |
"Using custom data configuration default\n", | |
"Reusing dataset emotion (/home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "552dacd27b444a02b22cc063dc3d35b1", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/2 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Loading cached processed dataset at /home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-ef70464735ecf316.arrow\n", | |
"Loading cached processed dataset at /home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-2a94bece2982eda9.arrow\n" | |
] | |
} | |
], | |
"source": [ | |
"# download tokenizer\n", | |
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n", | |
"\n", | |
"# tokenizer helper function\n", | |
"def tokenize(batch):\n", | |
" return tokenizer(batch['text'], padding='max_length', truncation=True)\n", | |
"\n", | |
"# load dataset\n", | |
"train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])\n", | |
"\n", | |
"# tokenize dataset\n", | |
"train_dataset = train_dataset.map(tokenize, batched=True)\n", | |
"test_dataset = test_dataset.map(tokenize, batched=True)\n", | |
"\n", | |
"# set format for pytorch\n", | |
"train_dataset = train_dataset.rename_column(\"label\", \"labels\")\n", | |
"train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])\n", | |
"test_dataset = test_dataset.rename_column(\"label\", \"labels\")\n", | |
"test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "f964aaa1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import botocore\n", | |
"from datasets.filesystems import S3FileSystem\n", | |
"\n", | |
"s3 = S3FileSystem() \n", | |
"\n", | |
"# save train_dataset to s3\n", | |
"training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'\n", | |
"train_dataset.save_to_disk(training_input_path, fs=s3)\n", | |
"\n", | |
"# save test_dataset to s3\n", | |
"test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'\n", | |
"test_dataset.save_to_disk(test_input_path, fs=s3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "fff215be", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sagemaker.huggingface import HuggingFace\n", | |
"\n", | |
"# hyperparameters, which are passed into the training job\n", | |
"# NOTE: the example train.py reads args.model_name, so the key MUST be\n", | |
"# 'model_name'. Passing 'model_id' leaves args.model_name == None and the\n", | |
"# job fails with \"Can't load config for 'None'\" (see the failed run below).\n", | |
"hyperparameters={'epochs': 4,             # number of training epochs\n", | |
"                 'train_batch_size': 24,  # batch size for training\n", | |
"                 'eval_batch_size': 32,   # batch size for evaluation\n", | |
"                 'learning_rate': 3e-5,   # learning rate used during training\n", | |
"                 'model_name': model_id,  # pre-trained model\n", | |
"                 'fp16': True,            # whether to use 16-bit (mixed) precision training\n", | |
"                 }\n", | |
"\n", | |
"# job name for sagemaker training\n", | |
"job_name=f\"training-compiler-{hyperparameters['model_name']}-{dataset_name}\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "c52baa56", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"git_config = {'repo': 'https://github.com/huggingface/notebooks.git'}\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "071bbb7f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# create the Estimator\n", | |
"huggingface_estimator = HuggingFace(\n", | |
"    entry_point = 'train.py', # fine-tuning script used in the training job\n", | |
"    #source_dir = './scripts',\n", | |
"    source_dir = './sagemaker/01_getting_started_pytorch/scripts', # directory (inside the git repo) where the fine-tuning script is stored\n", | |
"    git_config = git_config,\n", | |
"    instance_type = 'ml.p3.2xlarge', # instance type used for the training job\n", | |
"    instance_count = 1, # the number of instances used for training\n", | |
"    base_job_name = job_name, # the name of the training job\n", | |
"    role = role, # IAM role used in the training job to access AWS resources, e.g. S3\n", | |
"    transformers_version = '4.11.0', # the transformers version used in the training job\n", | |
"    pytorch_version = '1.9.0', # the pytorch version used in the training job\n", | |
"    py_version = 'py38', # the python version used in the training job\n", | |
"    hyperparameters = hyperparameters, # the hyperparameters used for running the training job\n", | |
"    disable_profiler = True, # whether to disable the profiler during training, used to gain maximum performance\n", | |
"    debugger_hook_config = False, # whether to enable the debugger hook during training, used to gain maximum performance\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "449cc712", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2022-02-15 15:00:56 Starting - Starting the training job...\n", | |
"2022-02-15 15:00:58 Starting - Launching requested ML instances.........\n", | |
"2022-02-15 15:02:30 Starting - Preparing the instances for training......\n", | |
"2022-02-15 15:03:45 Downloading - Downloading input data...\n", | |
"2022-02-15 15:04:06 Training - Downloading the training image............................\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", | |
"\u001b[34mbash: no job control in this shell\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:08:57,310 sagemaker-training-toolkit INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:08:57,330 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:09:00,355 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:09:00,864 sagemaker-training-toolkit INFO Invoking user script\u001b[0m\n", | |
"\u001b[34mTraining Env:\u001b[0m\n", | |
"\u001b[34m{\n", | |
" \"additional_framework_parameters\": {},\n", | |
" \"channel_input_dirs\": {\n", | |
" \"test\": \"/opt/ml/input/data/test\",\n", | |
" \"train\": \"/opt/ml/input/data/train\"\n", | |
" },\n", | |
" \"current_host\": \"algo-1\",\n", | |
" \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", | |
" \"hosts\": [\n", | |
" \"algo-1\"\n", | |
" ],\n", | |
" \"hyperparameters\": {\n", | |
" \"train_batch_size\": 24,\n", | |
" \"model_id\": \"bert-base-uncased\",\n", | |
" \"epochs\": 4,\n", | |
" \"learning_rate\": 3e-05,\n", | |
" \"eval_batch_size\": 32,\n", | |
" \"fp16\": true\n", | |
" },\n", | |
" \"input_config_dir\": \"/opt/ml/input/config\",\n", | |
" \"input_data_config\": {\n", | |
" \"test\": {\n", | |
" \"TrainingInputMode\": \"File\",\n", | |
" \"S3DistributionType\": \"FullyReplicated\",\n", | |
" \"RecordWrapperType\": \"None\"\n", | |
" },\n", | |
" \"train\": {\n", | |
" \"TrainingInputMode\": \"File\",\n", | |
" \"S3DistributionType\": \"FullyReplicated\",\n", | |
" \"RecordWrapperType\": \"None\"\n", | |
" }\n", | |
" },\n", | |
" \"input_dir\": \"/opt/ml/input\",\n", | |
" \"is_master\": true,\n", | |
" \"job_name\": \"training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281\",\n", | |
" \"log_level\": 20,\n", | |
" \"master_hostname\": \"algo-1\",\n", | |
" \"model_dir\": \"/opt/ml/model\",\n", | |
" \"module_dir\": \"s3://sagemaker-us-east-1-440284372476/training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281/source/sourcedir.tar.gz\",\n", | |
" \"module_name\": \"train\",\n", | |
" \"network_interface_name\": \"eth0\",\n", | |
" \"num_cpus\": 8,\n", | |
" \"num_gpus\": 1,\n", | |
" \"output_data_dir\": \"/opt/ml/output/data\",\n", | |
" \"output_dir\": \"/opt/ml/output\",\n", | |
" \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", | |
" \"resource_config\": {\n", | |
" \"current_host\": \"algo-1\",\n", | |
" \"hosts\": [\n", | |
" \"algo-1\"\n", | |
" ],\n", | |
" \"network_interface_name\": \"eth0\"\n", | |
" },\n", | |
" \"user_entry_point\": \"train.py\"\u001b[0m\n", | |
"\u001b[34m}\u001b[0m\n", | |
"\u001b[34mEnvironment variables:\u001b[0m\n", | |
"\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", | |
"\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", | |
"\u001b[34mSM_HPS={\"epochs\":4,\"eval_batch_size\":32,\"fp16\":true,\"learning_rate\":3e-05,\"model_id\":\"bert-base-uncased\",\"train_batch_size\":24}\u001b[0m\n", | |
"\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", | |
"\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", | |
"\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", | |
"\u001b[34mSM_INPUT_DATA_CONFIG={\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", | |
"\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", | |
"\u001b[34mSM_CHANNELS=[\"test\",\"train\"]\u001b[0m\n", | |
"\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", | |
"\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", | |
"\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", | |
"\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", | |
"\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", | |
"\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", | |
"\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", | |
"\u001b[34mSM_NUM_CPUS=8\u001b[0m\n", | |
"\u001b[34mSM_NUM_GPUS=1\u001b[0m\n", | |
"\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", | |
"\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-east-1-440284372476/training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281/source/sourcedir.tar.gz\u001b[0m\n", | |
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"epochs\":4,\"eval_batch_size\":32,\"fp16\":true,\"learning_rate\":3e-05,\"model_id\":\"bert-base-uncased\",\"train_batch_size\":24},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-east-1-440284372476/training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":8,\"num_gpus\":1,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", | |
"\u001b[34mSM_USER_ARGS=[\"--epochs\",\"4\",\"--eval_batch_size\",\"32\",\"--fp16\",\"True\",\"--learning_rate\",\"3e-05\",\"--model_id\",\"bert-base-uncased\",\"--train_batch_size\",\"24\"]\u001b[0m\n", | |
"\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", | |
"\u001b[34mSM_CHANNEL_TEST=/opt/ml/input/data/test\u001b[0m\n", | |
"\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", | |
"\u001b[34mSM_HP_TRAIN_BATCH_SIZE=24\u001b[0m\n", | |
"\u001b[34mSM_HP_MODEL_ID=bert-base-uncased\u001b[0m\n", | |
"\u001b[34mSM_HP_EPOCHS=4\u001b[0m\n", | |
"\u001b[34mSM_HP_LEARNING_RATE=3e-05\u001b[0m\n", | |
"\u001b[34mSM_HP_EVAL_BATCH_SIZE=32\u001b[0m\n", | |
"\u001b[34mSM_HP_FP16=true\u001b[0m\n", | |
"\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python38.zip:/opt/conda/lib/python3.8:/opt/conda/lib/python3.8/lib-dynload:/opt/conda/lib/python3.8/site-packages\u001b[0m\n", | |
"\u001b[34mInvoking script with the following command:\u001b[0m\n", | |
"\u001b[34m/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:09:05,309 - __main__ - INFO - loaded train_dataset length is: 16000\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:09:05,309 - __main__ - INFO - loaded test_dataset length is: 2000\u001b[0m\n", | |
"\u001b[34m404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n", | |
"\u001b[34m404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n", | |
"\u001b[34mTraceback (most recent call last):\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n", | |
" resolved_config_file = cached_path(\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n", | |
" output_path = get_from_cache(\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n", | |
" r.raise_for_status()\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n", | |
" raise HTTPError(http_error_msg, response=self)\u001b[0m\n", | |
"\u001b[34mrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n", | |
"\u001b[34mDuring handling of the above exception, another exception occurred:\u001b[0m\n", | |
"\u001b[34mTraceback (most recent call last):\n", | |
" File \"train.py\", line 57, in <module>\n", | |
" model = AutoModelForSequenceClassification.from_pretrained(args.model_name)\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py\", line 396, in from_pretrained\n", | |
" config, kwargs = AutoConfig.from_pretrained(\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/configuration_auto.py\", line 527, in from_pretrained\n", | |
" config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 570, in get_config_dict\n", | |
" raise EnvironmentError(msg)\u001b[0m\n", | |
"\u001b[34mOSError: Can't load config for 'None'. Make sure that:\u001b[0m\n", | |
"\u001b[34m- 'None' is a correct model identifier listed on 'https://huggingface.co/models'\u001b[0m\n", | |
"\u001b[34m- or 'None' is the correct path to a directory containing a config.json file\u001b[0m\n", | |
"\u001b[34m2022-02-15 15:09:06,046 sagemaker-training-toolkit ERROR ExecuteUserScriptError:\u001b[0m\n", | |
"\u001b[34mCommand \"/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\"\u001b[0m\n", | |
"\u001b[34m404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n", | |
"\u001b[34mTraceback (most recent call last):\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n", | |
" resolved_config_file = cached_path(\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n", | |
" output_path = get_from_cache(\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n", | |
" r.raise_for_status()\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n", | |
" raise HTTPError(http_error_msg, response=self)\u001b[0m\n", | |
"\u001b[34mrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\u001b[0m\n", | |
"\u001b[34mDuring handling of the above exception, another exception occurred:\u001b[0m\n", | |
"\u001b[34mTraceback (most recent call last):\n", | |
" File \"train.py\", line 57, in <module>\n", | |
" model = AutoModelForSequenceClassification.from_pretrained(args.model_name)\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py\", line 396, in from_pretrained\n", | |
" config, kwargs = AutoConfig.from_pretrained(\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/models/auto/configuration_auto.py\", line 527, in from_pretrained\n", | |
" config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)\n", | |
" File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 570, in get_config_dict\n", | |
" raise EnvironmentError(msg)\u001b[0m\n", | |
"\u001b[34mOSError: Can't load config for 'None'. Make sure that:\u001b[0m\n", | |
"\u001b[34m- 'None' is a correct model identifier listed on 'https://huggingface.co/models'\u001b[0m\n", | |
"\u001b[34m- or 'None' is the correct path to a directory containing a config.json file\u001b[0m\n", | |
"\n", | |
"2022-02-15 15:09:16 Uploading - Uploading generated training model\n", | |
"2022-02-15 15:09:16 Failed - Training job failed\n" | |
] | |
}, | |
{ | |
"ename": "UnexpectedStatusException", | |
"evalue": "Error for Training job training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\"\n404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\nTraceback (most recent call last):\n File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n resolved_config_file = cached_path(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n output_path = get_from_cache(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n r.raise_for_status()\n File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\nrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\n\nDuring ", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-12-f55f62c72c9b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m# starting the train job with our uploaded datasets as input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mhuggingface_estimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name, experiment_config)\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 953\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 954\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 955\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_compilation_job_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m 1938\u001b[0m \u001b[0;31m# If logs are requested, call logs_for_jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1939\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlogs\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"None\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1940\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1941\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1942\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mlogs_for_job\u001b[0;34m(self, job_name, wait, poll, log_type)\u001b[0m\n\u001b[1;32m 3737\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3738\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3739\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"TrainingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3740\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3741\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m 3290\u001b[0m ),\n\u001b[1;32m 3291\u001b[0m \u001b[0mallowed_statuses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Completed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Stopped\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3292\u001b[0;31m \u001b[0mactual_status\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3293\u001b[0m )\n\u001b[1;32m 3294\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Training job training-compiler-bert-base-uncased-emo-2022-02-15-15-00-55-281: Failed. Reason: AlgorithmError: ExecuteUserScriptError:\nCommand \"/opt/conda/bin/python3.8 train.py --epochs 4 --eval_batch_size 32 --fp16 True --learning_rate 3e-05 --model_id bert-base-uncased --train_batch_size 24\"\n404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\nTraceback (most recent call last):\n File \"/opt/conda/lib/python3.8/site-packages/transformers/configuration_utils.py\", line 546, in get_config_dict\n resolved_config_file = cached_path(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1402, in cached_path\n output_path = get_from_cache(\n File \"/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py\", line 1574, in get_from_cache\n r.raise_for_status()\n File \"/opt/conda/lib/python3.8/site-packages/requests/models.py\", line 943, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\nrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json\n\nDuring " | |
] | |
} | |
], | |
"source": [ | |
"# define a data input dictonary with our uploaded s3 uris\n", | |
"data = {\n", | |
" 'train': training_input_path,\n", | |
" 'test': test_input_path\n", | |
"}\n", | |
"\n", | |
"# starting the train job with our uploaded datasets as input\n", | |
"huggingface_estimator.fit(data)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "conda_pytorch_latest_p36", | |
"language": "python", | |
"name": "conda_pytorch_latest_p36" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment