{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"\n",
"import time\n",
"import ctypes\n",
"import argparse\n",
"import numpy as np\n",
"import tensorrt as trt\n",
"import pycuda.driver as cuda\n",
"import pycuda.autoinit\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"SIZE = 32"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<CDLL 'libbert_plugins.so', handle 37fc880 at 0x7f392d247d10>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max_query_length = SIZE\n",
"# When splitting up a long document into chunks, how much stride to take between chunks.\n",
"doc_stride = SIZE\n",
"# The maximum total input sequence length after WordPiece tokenization.\n",
"# Sequences longer than this will be truncated, and sequences shorter\n",
"max_seq_length = SIZE\n",
"\n",
"# Import necessary plugins for BERT TensorRT\n",
"ctypes.CDLL(\"libnvinfer_plugin.so.6\", mode=ctypes.RTLD_GLOBAL)\n",
"ctypes.CDLL(\"libcommon.so\", mode=ctypes.RTLD_GLOBAL)\n",
"ctypes.CDLL(\"libbert_plugins.so\", mode=ctypes.RTLD_GLOBAL)"
]
},
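{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (an added sketch, not part of the original run): list\n",
"# the plugin creators now registered with TensorRT, to confirm that loading\n",
"# the shared libraries above made the BERT plugins visible. The exact creator\n",
"# names depend on how the plugin libraries were built.\n",
"registry = trt.get_plugin_registry()\n",
"print(sorted(c.name for c in registry.plugin_creator_list))"
]
},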
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"TRT_LOGGER = trt.Logger(trt.Logger.INFO)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"serialized_bert = open('./trt_bert/p40/{}.trt'.format(SIZE), 'rb').read()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"runtime = trt.Runtime(TRT_LOGGER)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"engine = runtime.deserialize_cuda_engine(serialized_bert)"
]
},
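{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not part of the original run): inspect the engine's bindings.\n",
"# This engine is expected to expose three int32 inputs (ids, segment ids,\n",
"# input mask) and one float32 output; dynamic dimensions print as -1 until\n",
"# the binding shapes are set on the execution context below.\n",
"for i in range(engine.num_bindings):\n",
" kind = 'input' if engine.binding_is_input(i) else 'output'\n",
" print(i, kind, engine.get_binding_name(i),\n",
"       engine.get_binding_dtype(i), engine.get_binding_shape(i))"
]
},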
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"context = engine.create_execution_context()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"BATCH_SIZE = 8"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"input_shape=(BATCH_SIZE, max_seq_length)\n",
"input_nbytes = trt.volume(input_shape) * trt.int32.itemsize"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)\n",
"# Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.\n",
"for binding in range(3):\n",
" context.set_binding_shape(binding, input_shape)\n",
"assert context.all_binding_shapes_specified"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)\n",
"d_output = cuda.mem_alloc(h_output.nbytes)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
]
}
],
"source": [
"import data_processing as dp\n",
"import tokenization"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /apdcephfs/private_andyfei/trt/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n",
"\n"
]
}
],
"source": [
"tokenizer = tokenization.FullTokenizer(vocab_file='./trt_bert/model/vocab.txt',\n",
" do_lower_case=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"doc_tokens = dp.convert_doc_tokens('hello world, nice to meet you' * 100)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def question_features(question):\n",
" return dp.convert_examples_to_features(doc_tokens, question, tokenizer,\n",
" max_seq_length, doc_stride, max_query_length)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"features = question_features('hello world')\n",
"\n",
"features['input_ids'] = np.reshape((np.stack([features['input_ids']] * BATCH_SIZE)), [-1])\n",
"features['input_mask'] = np.reshape((np.stack([features['input_mask']] * BATCH_SIZE)), [-1])\n",
"features['segment_ids'] = np.reshape((np.stack([features['segment_ids']] * BATCH_SIZE)), [-1])"
]
},
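{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sanity check (a sketch, not part of the original run): the device\n",
"# buffers were sized as BATCH_SIZE * max_seq_length int32 elements, so each\n",
"# replicated feature array must match that size and dtype before the\n",
"# asynchronous host-to-device copies in inference().\n",
"for key in ('input_ids', 'segment_ids', 'input_mask'):\n",
" assert features[key].size == BATCH_SIZE * max_seq_length, key\n",
" assert features[key].dtype == np.int32, key"
]
},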
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"stream = cuda.Stream() # Create a stream in which to copy inputs/outputs and run inference."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def inference(features):\n",
" #print(\"\\nRunning Inference...\")\n",
" \n",
" eval_start_time = time.time()\n",
"\n",
" # Copy inputs\n",
" cuda.memcpy_htod_async(d_inputs[0], features[\"input_ids\"], stream)\n",
" cuda.memcpy_htod_async(d_inputs[1], features[\"segment_ids\"], stream)\n",
" cuda.memcpy_htod_async(d_inputs[2], features[\"input_mask\"], stream)\n",
" # Run inference\n",
" context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n",
" # Transfer predictions back from GPU\n",
" cuda.memcpy_dtoh_async(h_output, d_output, stream)\n",
" # Synchronize the stream\n",
" stream.synchronize()\n",
"\n",
" eval_time_elapsed = time.time() - eval_start_time\n",
"\n",
" #print(\"------------------------\")\n",
" #print(\"Running inference in {:.3f} \".format(eval_time_elapsed))\n",
" #print(\"------------------------\")\n",
" \n",
" return h_output\n",
" \n",
" for index, batch in enumerate(h_output):\n",
" # Data Post-processing\n",
" start_logits = batch[:, 0]\n",
" end_logits = batch[:, 1]\n",
"\n",
" # Total number of n-best predictions to generate in the nbest_predictions.json output file\n",
" n_best_size = 20\n",
"\n",
" # The maximum length of an answer that can be generated. This is needed\n",
" # because the start and end predictions are not conditioned on one another\n",
" max_answer_length = 30\n",
"\n",
" prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,\n",
" start_logits, end_logits, n_best_size, max_answer_length)\n",
"\n",
" print(\"Processing output {:} in batch\".format(index))\n",
" print(\"Answer: '{}'\".format(prediction))\n",
" print(\"With probability: {:.3f}\".format(nbest_json[0]['probability'] * 100.0))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8\n",
"32\n"
]
}
],
"source": [
"print(BATCH_SIZE)\n",
"print(SIZE)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.07 ms, sys: 252 µs, total: 9.32 ms\n",
"Wall time: 8.73 ms\n"
]
},
{
"data": {
"text/plain": [
"(8, 32, 2, 1, 1)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"out = inference(features)\n",
"out.shape"
]
},
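{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not executed in the original run): the output shape\n",
"# (BATCH_SIZE, max_seq_length, 2, 1, 1) packs the SQuAD start and end logits\n",
"# along the third axis, matching the batch[:, 0] / batch[:, 1] indexing in\n",
"# print_predictions above; the trailing singleton axes can be squeezed away.\n",
"start_logits = out[:, :, 0, 0, 0]  # shape (BATCH_SIZE, max_seq_length)\n",
"end_logits = out[:, :, 1, 0, 0]\n",
"print(start_logits.shape, end_logits.shape)"
]
},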
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.34 ms ± 3.83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit inference(features)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}