{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"\n",
"import time\n",
"import ctypes\n",
"import argparse\n",
"import numpy as np\n",
"import tensorrt as trt\n",
"import pycuda.driver as cuda\n",
"import pycuda.autoinit\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"SIZE = 32"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<CDLL 'libbert_plugins.so', handle 37fc880 at 0x7f392d247d10>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max_query_length = SIZE\n",
"# When splitting up a long document into chunks, how much stride to take between chunks.\n",
"doc_stride = SIZE\n",
"# The maximum total input sequence length after WordPiece tokenization.\n",
"# Sequences longer than this will be truncated, and sequences shorter\n",
"max_seq_length = SIZE\n",
"\n",
"# Import necessary plugins for BERT TensorRT\n",
"ctypes.CDLL(\"libnvinfer_plugin.so.6\", mode=ctypes.RTLD_GLOBAL)\n",
"ctypes.CDLL(\"libcommon.so\", mode=ctypes.RTLD_GLOBAL)\n",
"ctypes.CDLL(\"libbert_plugins.so\", mode=ctypes.RTLD_GLOBAL)"
]
},
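{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (an added sketch, not part of the original run): list\n",
"# the plugin creators now registered with TensorRT, to confirm that loading\n",
"# the shared libraries above made the BERT plugins visible. The exact creator\n",
"# names depend on how the plugin libraries were built.\n",
"registry = trt.get_plugin_registry()\n",
"print(sorted(c.name for c in registry.plugin_creator_list))"
]
},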
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"TRT_LOGGER = trt.Logger(trt.Logger.INFO)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"serialized_bert = open('./trt_bert/p40/{}.trt'.format(SIZE), 'rb').read()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"runtime = trt.Runtime(TRT_LOGGER)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"engine = runtime.deserialize_cuda_engine(serialized_bert)"
]
},
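{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not part of the original run): inspect the engine's bindings.\n",
"# This engine is expected to expose three int32 inputs (ids, segment ids,\n",
"# input mask) and one float32 output; dynamic dimensions print as -1 until\n",
"# the binding shapes are set on the execution context below.\n",
"for i in range(engine.num_bindings):\n",
" kind = 'input' if engine.binding_is_input(i) else 'output'\n",
" print(i, kind, engine.get_binding_name(i),\n",
"       engine.get_binding_dtype(i), engine.get_binding_shape(i))"
]
},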
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"context = engine.create_execution_context()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"BATCH_SIZE = 8"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"input_shape=(BATCH_SIZE, max_seq_length)\n",
"input_nbytes = trt.volume(input_shape) * trt.int32.itemsize"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)\n",
"# Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.\n",
"for binding in range(3):\n",
" context.set_binding_shape(binding, input_shape)\n",
"assert context.all_binding_shapes_specified"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)\n",
"d_output = cuda.mem_alloc(h_output.nbytes)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
"/usr/local/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
" np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
]
}
],
"source": [
"import data_processing as dp\n",
"import tokenization"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /apdcephfs/private_andyfei/trt/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n",
"\n"
]
}
],
"source": [
"tokenizer = tokenization.FullTokenizer(vocab_file='./trt_bert/model/vocab.txt',\n",
" do_lower_case=True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"doc_tokens = dp.convert_doc_tokens('hello world, nice to meet you' * 100)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def question_features(question):\n",
" return dp.convert_examples_to_features(doc_tokens, question, tokenizer,\n",
" max_seq_length, doc_stride, max_query_length)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"features = question_features('hello world')\n",
"\n",
"features['input_ids'] = np.reshape((np.stack([features['input_ids']] * BATCH_SIZE)), [-1])\n",
"features['input_mask'] = np.reshape((np.stack([features['input_mask']] * BATCH_SIZE)), [-1])\n",
"features['segment_ids'] = np.reshape((np.stack([features['segment_ids']] * BATCH_SIZE)), [-1])"
]
},
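{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sanity check (a sketch, not part of the original run): the device\n",
"# buffers were sized as BATCH_SIZE * max_seq_length int32 elements, so each\n",
"# replicated feature array must match that size and dtype before the\n",
"# asynchronous host-to-device copies in inference().\n",
"for key in ('input_ids', 'segment_ids', 'input_mask'):\n",
" assert features[key].size == BATCH_SIZE * max_seq_length, key\n",
" assert features[key].dtype == np.int32, key"
]
},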
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"stream = cuda.Stream() # Create a stream in which to copy inputs/outputs and run inference."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def inference(features):\n",
" #print(\"\\nRunning Inference...\")\n",
" \n",
" eval_start_time = time.time()\n",
"\n",
" # Copy inputs\n",
" cuda.memcpy_htod_async(d_inputs[0], features[\"input_ids\"], stream)\n",
" cuda.memcpy_htod_async(d_inputs[1], features[\"segment_ids\"], stream)\n",
" cuda.memcpy_htod_async(d_inputs[2], features[\"input_mask\"], stream)\n",
" # Run inference\n",
" context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n",
" # Transfer predictions back from GPU\n",
" cuda.memcpy_dtoh_async(h_output, d_output, stream)\n",
" # Synchronize the stream\n",
" stream.synchronize()\n",
"\n",
" eval_time_elapsed = time.time() - eval_start_time\n",
"\n",
" #print(\"------------------------\")\n",
" #print(\"Running inference in {:.3f} \".format(eval_time_elapsed))\n",
" #print(\"------------------------\")\n",
" \n",
" return h_output\n",
" \n",
" for index, batch in enumerate(h_output):\n",
" # Data Post-processing\n",
" start_logits = batch[:, 0]\n",
" end_logits = batch[:, 1]\n",
"\n",
" # Total number of n-best predictions to generate in the nbest_predictions.json output file\n",
" n_best_size = 20\n",
"\n",
" # The maximum length of an answer that can be generated. This is needed\n",
" # because the start and end predictions are not conditioned on one another\n",
" max_answer_length = 30\n",
"\n",
" prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,\n",
" start_logits, end_logits, n_best_size, max_answer_length)\n",
"\n",
" print(\"Processing output {:} in batch\".format(index))\n",
" print(\"Answer: '{}'\".format(prediction))\n",
" print(\"With probability: {:.3f}\".format(nbest_json[0]['probability'] * 100.0))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8\n",
"32\n"
]
}
],
"source": [
"print(BATCH_SIZE)\n",
"print(SIZE)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.07 ms, sys: 252 µs, total: 9.32 ms\n",
"Wall time: 8.73 ms\n"
]
},
{
"data": {
"text/plain": [
"(8, 32, 2, 1, 1)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"out = inference(features)\n",
"out.shape"
]
},
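{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not executed in the original run): the output shape\n",
"# (BATCH_SIZE, max_seq_length, 2, 1, 1) packs the SQuAD start and end logits\n",
"# along the third axis, matching the batch[:, 0] / batch[:, 1] indexing in\n",
"# print_predictions above; the trailing singleton axes can be squeezed away.\n",
"start_logits = out[:, :, 0, 0, 0]  # shape (BATCH_SIZE, max_seq_length)\n",
"end_logits = out[:, :, 1, 0, 0]\n",
"print(start_logits.shape, end_logits.shape)"
]
},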
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.34 ms ± 3.83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit inference(features)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}