Skip to content

Instantly share code, notes, and snippets.

@gorkemozkaya
Forked from ceshine/run_tf_glue.ipynb
Created December 26, 2019 19:15
Show Gist options
  • Save gorkemozkaya/c3ce3ea77d4ed78703ab6fb1844bc5e7 to your computer and use it in GitHub Desktop.
Save gorkemozkaya/c3ce3ea77d4ed78703ab6fb1844bc5e7 to your computer and use it in GitHub Desktop.
Train huggingface/transformers BERT model on Cloud TPU with TF 2.1 (nightly build)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Adapted from [transformers/examples/run_tf_glue.py](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import math\n",
"\n",
"import tensorflow as tf\n",
"import tensorflow_datasets\n",
"\n",
"from transformers import (\n",
" BertConfig,\n",
" BertTokenizer,\n",
" TFBertForSequenceClassification,\n",
" glue_convert_examples_to_features,\n",
" glue_processors\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up the TPU"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Name of the TPU to attach to. The resolver in the next cell is constructed\n",
"# without arguments, so it presumably reads the name from this variable\n",
"# (NOTE(review): confirm against the TPUClusterResolver docs).\n",
"os.environ.update(TPU_NAME=\"kaggle-tpu\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running on TPU ['10.166.101.2:8470']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Initializing the TPU system: kaggle-tpu\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:Initializing the TPU system: kaggle-tpu\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Clearing out eager caches\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:Clearing out eager caches\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Finished initializing TPU system.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:Finished initializing TPU system.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Found TPU system:\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:Found TPU system:\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Num TPU Cores: 8\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Num TPU Cores: 8\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Num TPU Workers: 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Num TPU Workers: 1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Num TPU Cores Per Worker: 8\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"REPLICAS: 8\n"
]
}
],
"source": [
"# Try to resolve a TPU. A ValueError from the resolver is treated as\n",
"# \"no TPU available\": `tpu` is then None and the default strategy is used.\n",
"try:\n",
"    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()\n",
"    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])\n",
"except ValueError:\n",
"    tpu = None\n",
"\n",
"if not tpu:\n",
"    # No TPU: fall back to whatever strategy TensorFlow currently has in scope.\n",
"    strategy = tf.distribute.get_strategy()\n",
"else:\n",
"    # Connect to the TPU cluster and initialise it before building the strategy.\n",
"    tf.config.experimental_connect_to_cluster(tpu)\n",
"    tf.tpu.experimental.initialize_tpu_system(tpu)\n",
"    strategy = tf.distribute.experimental.TPUStrategy(tpu)\n",
"\n",
"print(\"REPLICAS: \", strategy.num_replicas_in_sync)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2\n"
]
}
],
"source": [
"# Batch sizes depend on the hardware: 8 replicas is taken to mean a single TPU,\n",
"# which gets 16 training examples per replica; anything else uses a fixed 32.\n",
"if strategy.num_replicas_in_sync == 8:\n",
"    BATCH_SIZE = 16 * strategy.num_replicas_in_sync\n",
"    EVAL_BATCH_SIZE = BATCH_SIZE * 4\n",
"else:\n",
"    BATCH_SIZE = 32\n",
"    EVAL_BATCH_SIZE = BATCH_SIZE * 2\n",
"EPOCHS = 3  # identical for both hardware configurations\n",
"\n",
"# GLUE task name, spelled the way `glue_processors` keys it.\n",
"TASK = \"mrpc\"\n",
"# Hyphen-free spelling of the task name for the two tasks that need it;\n",
"# every other task name is used unchanged.\n",
"TFDS_TASK = {\"sst-2\": \"sst2\", \"sts-b\": \"stsb\"}.get(TASK, TASK)\n",
"\n",
"# Number of output labels for the task (1 is handled as regression further down).\n",
"num_labels = len(glue_processors[TASK]().get_labels())\n",
"print(num_labels)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reference: [tensorflow/models/official/nlp/bert](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# GCS prefix of the prepared GLUE data. The cells below read\n",
"# `{TASK}_meta_data`, `{TASK}_train.tf_record` and `{TASK}_eval.tf_record` from it.\n",
"GLUE_DIR = 'gs://cloud-tpu-checkpoints/bert/classification'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def single_file_dataset(input_file, name_to_features):\n",
"    \"\"\"Read TFRecord file(s) and decode every record with `decode_record`.\n",
"\n",
"    Args:\n",
"        input_file: a path string, or a list of paths, handed to\n",
"            `tf.data.TFRecordDataset`.\n",
"        name_to_features: feature spec forwarded to `decode_record`.\n",
"\n",
"    Returns:\n",
"        A `tf.data.Dataset` of decoded examples. When `input_file` is a single\n",
"        path (a string or a one-element list) auto-sharding is switched off so\n",
"        that the same file is sent to all workers.\n",
"    \"\"\"\n",
"    def _parse(serialized):\n",
"        return decode_record(serialized, name_to_features)\n",
"\n",
"    records = tf.data.TFRecordDataset(input_file).map(_parse)\n",
"\n",
"    has_several_files = not isinstance(input_file, str) and len(input_file) != 1\n",
"    if has_several_files:\n",
"        return records\n",
"\n",
"    # Single input file: disable auto-sharding.\n",
"    no_shard = tf.data.Options()\n",
"    no_shard.experimental_distribute.auto_shard_policy = (\n",
"        tf.data.experimental.AutoShardPolicy.OFF)\n",
"    return records.with_options(no_shard)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def decode_record(record, name_to_features):\n",
"    \"\"\"Parse one serialized example and narrow its int64 tensors to int32.\n",
"\n",
"    Args:\n",
"        record: a serialized example, as accepted by `tf.io.parse_single_example`.\n",
"        name_to_features: feature spec describing the fields to parse.\n",
"\n",
"    Returns:\n",
"        The dict produced by `tf.io.parse_single_example`, with every int64\n",
"        tensor replaced by an int32 cast of itself.\n",
"    \"\"\"\n",
"    parsed = tf.io.parse_single_example(record, name_to_features)\n",
"\n",
"    # tf.Example stores integers as int64, but the TPU only supports int32.\n",
"    for key, tensor in list(parsed.items()):\n",
"        if tensor.dtype == tf.int64:\n",
"            parsed[key] = tf.cast(tensor, tf.int32)\n",
"\n",
"    return parsed"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def create_classifier_dataset(file_path,\n",
"                              seq_length,\n",
"                              batch_size,\n",
"                              is_training=True,\n",
"                              input_pipeline_context=None):\n",
"    \"\"\"Build the batched (features, label) dataset for training or evaluation.\n",
"\n",
"    Args:\n",
"        file_path: TFRecord path(s), forwarded to `single_file_dataset`.\n",
"        seq_length: length of the `input_ids` / `input_mask` / `segment_ids`\n",
"            features stored in each record.\n",
"        batch_size: number of examples per emitted batch.\n",
"        is_training: when true the data is shuffled, repeated indefinitely and\n",
"            incomplete final batches are dropped.\n",
"        input_pipeline_context: optional object exposing `num_input_pipelines`\n",
"            and `input_pipeline_id`; used to shard the data when there is more\n",
"            than one input pipeline.\n",
"\n",
"    Returns:\n",
"        A `tf.data.Dataset` yielding `(x, y)` where `x` is a dict with keys\n",
"        `input_ids`, `attention_mask`, `token_type_ids` and `y` is `label_ids`.\n",
"    \"\"\"\n",
"    def _int_feature(shape):\n",
"        return tf.io.FixedLenFeature(shape, tf.int64)\n",
"\n",
"    feature_spec = {\n",
"        'input_ids': _int_feature([seq_length]),\n",
"        'input_mask': _int_feature([seq_length]),\n",
"        'segment_ids': _int_feature([seq_length]),\n",
"        'label_ids': _int_feature([]),\n",
"        'is_real_example': _int_feature([]),\n",
"    }\n",
"    examples = single_file_dataset(file_path, feature_spec)\n",
"\n",
"    # Shard by input pipeline (i.e. by host, not by core) when there are several.\n",
"    ctx = input_pipeline_context\n",
"    if ctx and ctx.num_input_pipelines > 1:\n",
"        examples = examples.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)\n",
"\n",
"    def _to_model_inputs(example):\n",
"        # Rename the stored fields to the keyword names used for the model inputs.\n",
"        features = {\n",
"            'input_ids': example['input_ids'],\n",
"            'attention_mask': example['input_mask'],\n",
"            'token_type_ids': example['segment_ids'],\n",
"        }\n",
"        return features, example['label_ids']\n",
"\n",
"    examples = examples.map(_to_model_inputs)\n",
"\n",
"    if is_training:\n",
"        examples = examples.shuffle(100).repeat()\n",
"\n",
"    batches = examples.batch(batch_size, drop_remainder=is_training)\n",
"    return batches.prefetch(1024)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def get_dataset_fn(input_file_pattern, max_seq_length, global_batch_size,\n",
"                   is_training):\n",
"    \"\"\"Return a callable that builds the classifier dataset on demand.\n",
"\n",
"    The returned function takes an optional input context `ctx`. With a context,\n",
"    the batch size is `ctx.get_per_replica_batch_size(global_batch_size)`;\n",
"    without one, `global_batch_size` is used as is.\n",
"    \"\"\"\n",
"\n",
"    def _dataset_fn(ctx=None):\n",
"        \"\"\"Build the dataset via `create_classifier_dataset`.\"\"\"\n",
"        if ctx:\n",
"            batch_size = ctx.get_per_replica_batch_size(global_batch_size)\n",
"        else:\n",
"            batch_size = global_batch_size\n",
"        return create_classifier_dataset(\n",
"            input_file_pattern,\n",
"            max_seq_length,\n",
"            batch_size,\n",
"            is_training=is_training,\n",
"            input_pipeline_context=ctx)\n",
"\n",
"    return _dataset_fn"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# The metadata file is JSON; it provides `max_seq_length` here and the dataset\n",
"# sizes used later to compute the number of steps.\n",
"with tf.io.gfile.GFile(f'{GLUE_DIR}/{TASK}_meta_data', 'rb') as reader:\n",
"    meta_bytes = reader.read()\n",
"input_meta_data = json.loads(meta_bytes.decode('utf-8'))\n",
"\n",
"max_seq_length = input_meta_data['max_seq_length']\n",
"\n",
"# Dataset factories; the datasets themselves are created when these are called.\n",
"train_input_fn = get_dataset_fn(\n",
"    input_file_pattern=f\"{GLUE_DIR}/{TASK}_train.tf_record\",\n",
"    max_seq_length=max_seq_length,\n",
"    global_batch_size=BATCH_SIZE,\n",
"    is_training=True,\n",
")\n",
"eval_input_fn = get_dataset_fn(\n",
"    input_file_pattern=f\"{GLUE_DIR}/{TASK}_eval.tf_record\",\n",
"    max_seq_length=max_seq_length,\n",
"    global_batch_size=EVAL_BATCH_SIZE,\n",
"    is_training=False,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Load the configuration, tokenizer and model from the pretrained checkpoint.\n",
"# `num_labels` sizes the output head (2+: classification, 1: regression).\n",
"config = BertConfig.from_pretrained(\"bert-base-cased\", num_labels=num_labels)\n",
"# NOTE(review): the tokenizer is loaded but not used by any later cell shown in\n",
"# this notebook; the TFRecords already contain token ids.\n",
"tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n",
"with strategy.scope():\n",
"    training_dataset = train_input_fn()\n",
"    evaluation_dataset = eval_input_fn()\n",
"\n",
"    model = TFBertForSequenceClassification.from_pretrained(\"bert-base-cased\", config=config)\n",
"    # Prepare training: compile the tf.keras model with optimizer, loss and metric.\n",
"    opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)\n",
"\n",
"    if num_labels == 1:\n",
"        # Regression: categorical accuracy over a single output unit carries no\n",
"        # information, so report an error metric alongside the MSE loss instead.\n",
"        loss = tf.keras.losses.MeanSquaredError()\n",
"        metric = tf.keras.metrics.MeanAbsoluteError(\"mae\")\n",
"    else:\n",
"        # Classification: the model outputs raw logits, hence from_logits=True.\n",
"        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
"        metric = tf.keras.metrics.SparseCategoricalAccuracy(\"accuracy\")\n",
"\n",
"    model.compile(optimizer=opt, loss=loss, metrics=[metric])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Training uses only full batches (the training dataset drops remainders), so\n",
"# round down; evaluation keeps the final partial batch, so round up.\n",
"train_data_size = input_meta_data['train_data_size']\n",
"steps_per_epoch = train_data_size // BATCH_SIZE\n",
"eval_steps = math.ceil(input_meta_data['eval_data_size'] / EVAL_BATCH_SIZE)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train for 28 steps, validate for 1 steps\n",
"Epoch 1/3\n",
"28/28 [==============================] - 80s 3s/step - loss: 0.6127 - accuracy: 0.6733 - val_loss: 0.5566 - val_accuracy: 0.7157\n",
"Epoch 2/3\n",
"28/28 [==============================] - 3s 110ms/step - loss: 0.5024 - accuracy: 0.7656 - val_loss: 0.4951 - val_accuracy: 0.7328\n",
"Epoch 3/3\n",
"28/28 [==============================] - 3s 103ms/step - loss: 0.3980 - accuracy: 0.8245 - val_loss: 0.5141 - val_accuracy: 0.7574\n"
]
}
],
"source": [
"# Train with tf.keras.Model.fit(), validating after every epoch. The training\n",
"# dataset repeats indefinitely, so the epoch length is given in steps.\n",
"history = model.fit(\n",
"    x=training_dataset,\n",
"    validation_data=evaluation_dataset,\n",
"    epochs=EPOCHS,\n",
"    steps_per_epoch=steps_per_epoch,\n",
"    validation_steps=eval_steps,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Save TF2 model\n",
"os.makedirs(\"./save/\", exist_ok=True)\n",
"model.save_pretrained(\"./save/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment