#include <cmath>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <gtest/gtest.h>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

// Returns a pseudo-random integer in [0, x).
#define Random(x) (rand() % (x))
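
// Overview: this test builds a small Fluid program that mimics INT8 inference
// with fake quantization ops (fake_quantize_range_abs_max -> mul ->
// fake_dequantize_max_abs -> mul), runs it on GPU, then builds a TensorRT
// network with the same weights and hand-set per-tensor dynamic ranges.
// Both outputs are printed so the two paths can be compared by hand.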
namespace paddle {
namespace inference {
namespace tensorrt {
// Fills `tensor` (which lives on `place`) with small random values in [0, 0.1).
void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
                     const platform::DeviceContext& ctx) {
  auto dims = tensor->dims();
  size_t num_elements = analysis::AccuDims(dims, dims.size());
  PADDLE_ENFORCE_GT(num_elements, 0);

  platform::CPUPlace cpu_place;
  framework::LoDTensor temp_tensor;
  temp_tensor.Resize(dims);
  auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
  for (size_t i = 0; i < num_elements; i++) {
    temp_data[i] = Random(100) / 1000.0;
  }
  TensorCopySync(temp_tensor, place, tensor);
}
// Fills `tensor` with random values in [-100, 100) and reports the max
// absolute value through `scale` (used later as the quantization scale).
void PrepareInt8Input(framework::LoDTensor* tensor, float* scale) {
  float max_r = 0;
  std::cout << "Prepare INPUT: " << std::endl;
  platform::CPUPlace cpu_place;
  auto dims = tensor->dims();
  size_t num_elements = analysis::AccuDims(dims, dims.size());
  PADDLE_ENFORCE_GT(num_elements, 0);
  auto* temp_data = tensor->mutable_data<float>(cpu_place);
  for (size_t i = 0; i < num_elements; i++) {
    float temp_d = Random(2000) / 10. - 100;
    if (std::abs(temp_d) > max_r) max_r = std::abs(temp_d);
    temp_data[i] = temp_d;
    std::cout << temp_d << std::endl;
  }
  *scale = max_r;
}
// Fills the second fc weight tensor with random float values in [-1, 3).
void PrepareParam(framework::LoDTensor* tensor) {
  std::cout << "Prepare FC param: " << std::endl;
  platform::CPUPlace cpu_place;
  auto dims = tensor->dims();
  size_t num_elements = analysis::AccuDims(dims, dims.size());
  PADDLE_ENFORCE_GT(num_elements, 0);
  auto* temp_data = tensor->mutable_data<float>(cpu_place);
  for (size_t i = 0; i < num_elements; i++) {
    float temp_d = Random(400) / 100. - 1.;
    temp_data[i] = temp_d;
    std::cout << temp_d << std::endl;
  }
}
// Transposes the (c x k) row-major `mul` weight `src` into a (k x c)
// row-major buffer `dst`, the layout used by TensorRT's addFullyConnected.
void PrepareFcParam(float* src, float* dst, int c, int k) {
  std::cout << "Prepare Fc Param: " << std::endl;
  for (int h = 0; h < k; h++) {
    for (int w = 0; w < c; w++) {
      dst[h * c + w] = src[w * k + h];
      std::cout << dst[h * c + w] << std::endl;
    }
  }
}
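
// The two helpers below quantize weights with a symmetric scheme:
//   scale  = max_i |w_i|
//   w_int8 = round(w_i / scale * 127)
// PrepareFcInt8Param quantizes the whole tensor with a single scale;
// PrepareInt8Param only randomizes the first output channel and then tiles
// those values (and their quantized counterparts) across the remaining
// output channels.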
void PrepareFcInt8Param(framework::LoDTensor* tensor,
                        framework::LoDTensor* tensor_int8, float* scale) {
  float max_r = 0;
  platform::CPUPlace cpu_place;
  auto dims = tensor->dims();
  size_t num_elements = analysis::AccuDims(dims, dims.size());
  PADDLE_ENFORCE_GT(num_elements, 0);
  auto* temp_data = tensor->mutable_data<float>(cpu_place);
  auto* temp_data_int8 = tensor_int8->mutable_data<float>(cpu_place);
  // Fill the float weights with random values in [-1, 3) and track the max
  // absolute value.
  for (size_t i = 0; i < num_elements; i++) {
    float temp_d = Random(400) / 100. - 1.;
    if (std::abs(temp_d) > max_r) max_r = std::abs(temp_d);
    temp_data[i] = temp_d;
    std::cout << temp_d << std::endl;
  }
  // Quantize to int8 values (stored as float).
  for (size_t i = 0; i < num_elements; i++) {
    temp_data_int8[i] = std::round(temp_data[i] / max_r * 127);
    std::cout << temp_data_int8[i] << std::endl;
  }
  *scale = max_r;
}
void PrepareInt8Param(framework::LoDTensor* tensor,
                      framework::LoDTensor* tensor_int8, float* scale) {
  std::cout << "Prepare PARAM: " << std::endl;
  float max_r = 0;
  platform::CPUPlace cpu_place;
  auto dims = tensor->dims();
  size_t num_elements = analysis::AccuDims(dims, dims.size());
  PADDLE_ENFORCE_GT(num_elements, 0);
  size_t c_out = dims[0];
  size_t channel_size = num_elements / c_out;
  auto* temp_data = tensor->mutable_data<float>(cpu_place);
  auto* temp_data_int8 = tensor_int8->mutable_data<float>(cpu_place);
  // Randomize the first output channel and track its max absolute value.
  for (size_t i = 0; i < channel_size; i++) {
    float temp_d = Random(400) / 100. - 1.;
    if (std::abs(temp_d) > max_r) max_r = std::abs(temp_d);
    temp_data[i] = temp_d;
    std::cout << temp_d << std::endl;
  }
  // Quantize the first channel to int8 values (stored as float).
  for (size_t i = 0; i < channel_size; i++) {
    temp_data_int8[i] = std::round(temp_data[i] / max_r * 127);
    std::cout << temp_data_int8[i] << std::endl;
  }
  // Tile the first channel (and its quantized copy) over the remaining
  // output channels.
  for (size_t i = channel_size; i < num_elements; i++) {
    temp_data[i] = temp_data[i % channel_size];
    temp_data_int8[i] = temp_data_int8[i % channel_size];
  }
  *scale = max_r;
}
// Creates a variable `name` in `scope` with the given shape and fills it with
// random data on the GPU.
void DeclVar(const std::string& name, const std::vector<int>& dim_vec,
             framework::Scope& scope) {
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
  auto* x = scope.Var(name);
  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
  x_tensor->Resize(framework::make_ddim(dim_vec));
  RandomizeTensor(x_tensor, place, ctx);
}

// Creates a variable `name` in `scope` and copies `tensor` into it on the GPU.
void DeclVar(const std::string& name, const framework::LoDTensor& tensor,
             framework::Scope& scope) {
  platform::CUDAPlace place;
  auto* x = scope.Var(name);
  auto dims = tensor.dims();
  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
  x_tensor->Resize(dims);
  TensorCopySync(tensor, place, x_tensor);
}
// Unused placeholder; the TensorRT network is built inline in PrepareFluid().
void PrepareTRT() {
}
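
// Builds and runs the Fluid reference program, then builds an equivalent
// TensorRT network, runs it, and prints both outputs:
//   Fluid:    quant_x -> fake_quantize_range_abs_max -> mul(conv_filter_int8)
//             -> fake_dequantize_max_abs -> mul(fc_weight) -> fc_out
//   TensorRT: conv_input -> FullyConnected(filter) -> FullyConnected(fc_weight)
//             with per-tensor dynamic ranges set so the layers can run in INT8.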
void PrepareFluid() {
  // Prepare the Fluid int8 reference program.
  platform::CUDAPlace cuda_place(0);
  platform::CUDADeviceContext cuda_ctx(cuda_place);
  framework::Executor executor(cuda_place);
  framework::Scope scope;
  framework::ProgramDesc program_desc;
  framework::BlockDesc* main_block =
      program_desc.MutableBlock(framework::kRootBlockIndex);

  // Random input of shape [1, 1, 1, 2]; quant_scale receives its max |value|.
  float quant_scale;
  platform::CPUPlace cpu_place;
  framework::LoDTensor temp_tensor;
  temp_tensor.Resize(framework::make_ddim({1, 1, 1, 2}));
  temp_tensor.mutable_data<float>(cpu_place);
  PrepareInt8Input(&temp_tensor, &quant_scale);

  // The quantization scale is handed to the fake_quantize op as a 1-element
  // tensor.
  framework::LoDTensor in_scale_tensor;
  in_scale_tensor.Resize(framework::make_ddim({1}));
  auto* temp_data = in_scale_tensor.mutable_data<float>(cpu_place);
  temp_data[0] = quant_scale;
  std::cout << "quant_scale: " << quant_scale << std::endl;
  // First fc weight: a 2x2 float matrix plus its int8-quantized copy.
  float weight_scale;
  framework::LoDTensor filter_tensor, filter_tensor_int8;
  filter_tensor.Resize(framework::make_ddim({2, 2}));
  filter_tensor.mutable_data<float>(cpu_place);
  filter_tensor_int8.Resize(framework::make_ddim({2, 2}));
  filter_tensor_int8.mutable_data<float>(cpu_place);
  PrepareFcInt8Param(&filter_tensor, &filter_tensor_int8, &weight_scale);

  // Second fc weight (applied after dequantization), kept in float.
  framework::LoDTensor fc_weight_tensor;
  fc_weight_tensor.Resize(framework::make_ddim({2, 2}));
  fc_weight_tensor.mutable_data<float>(cpu_place);
  PrepareParam(&fc_weight_tensor);

  // Declare all program variables in the scope.
  DeclVar("quant_x", temp_tensor, scope);
  DeclVar("quant_x_scale", in_scale_tensor, scope);
  DeclVar("quant_out", {1, 1, 1, 2}, scope);
  DeclVar("quant_out_scale", in_scale_tensor, scope);
  DeclVar("conv_filter", filter_tensor_int8, scope);
  DeclVar("conv_output", {1, 2}, scope);
  // DeclVar("dequant_x", {1, 2, 1, 1}, scope);
  DeclVar("dequant_out", {1, 2}, scope);
  DeclVar("fc_weight", fc_weight_tensor, scope);
  DeclVar("fc_out", {1, 2}, scope);
  // fake_quantize_range_abs_max (test mode): quant_out ~ round(quant_x / in_scale * 127).
  auto* fake_q_desc = main_block->AppendOp();
  fake_q_desc->SetType("fake_quantize_range_abs_max");
  fake_q_desc->SetInput("X", {"quant_x"});
  fake_q_desc->SetInput("InScale", {"quant_x_scale"});
  fake_q_desc->SetOutput("Out", {"quant_out"});
  fake_q_desc->SetOutput("OutScale", {"quant_out_scale"});
  int window_size = 10000;
  int bit_length = 8;
  bool is_test = true;
  fake_q_desc->SetAttr("window_size", window_size);
  fake_q_desc->SetAttr("bit_length", bit_length);
  fake_q_desc->SetAttr("is_test", is_test);

  // mul: conv_output = quant_out x conv_filter (both hold int8 values as floats).
  auto* desc = main_block->AppendOp();
  desc->SetType("mul");
  desc->SetInput("X", {"quant_out"});
  desc->SetInput("Y", {"conv_filter"});
  desc->SetOutput("Out", {"conv_output"});

  // fake_dequantize_max_abs: dequant_out = conv_output * Scale / max_range.
  // With Scale = quant_x_scale and max_range = 127 * 127 / weight_scale this
  // undoes the quantization of both the input and the weight.
  auto* fake_dq_desc = main_block->AppendOp();
  fake_dq_desc->SetType("fake_dequantize_max_abs");
  fake_dq_desc->SetInput("X", {"conv_output"});
  fake_dq_desc->SetInput("Scale", {"quant_x_scale"});
  fake_dq_desc->SetOutput("Out", {"dequant_out"});
  // need change here.
  float max_range = float(127 * 127) / weight_scale;
  std::cout << "max_range: " << max_range << std::endl;
  fake_dq_desc->SetAttr("max_range", max_range);

  // fc op: fc_out = dequant_out x fc_weight (plain float mul).
  auto* fc_desc = main_block->AppendOp();
  fc_desc->SetType("mul");
  fc_desc->SetInput("X", {"dequant_out"});
  fc_desc->SetInput("Y", {"fc_weight"});
  fc_desc->SetOutput("Out", {"fc_out"});

  // Run the program.
  auto ctx = executor.Prepare(program_desc, 0);
  executor.RunPreparedContext(ctx.get(), &scope, false, true, true);
// auto *var = scope.FindVar("quant_out"); | |
// auto *var = scope.FindVar("conv_filter"); | |
auto *var = scope.FindVar("fc_out"); | |
// auto *var = scope.FindVar("conv_output"); | |
// auto *var = scope.FindVar("dequant_out"); | |
auto out_tensor = var->GetMutable<framework::LoDTensor>(); | |
std::vector<float> fluid_out; | |
framework::TensorToVector(*out_tensor, cuda_ctx, &fluid_out); | |
std::cout << "fluid output: " << std::endl; | |
size_t fluid_out_size = framework::product(out_tensor->dims()); | |
for (int i = 0; i < fluid_out_size; i++) { | |
std::cout << fluid_out[i] << std::endl; | |
} | |
  // FOR TRT: build an equivalent network from the float weights and set
  // per-tensor dynamic ranges (the engine is constructed with int8 enabled).
  cudaStream_t stream;
  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0);
  TensorRTEngine engine(1, 1 << 20, true, nullptr, 0);
  engine.InitNetwork();
  engine.DeclareInput("conv_input", nvinfer1::DataType::kFLOAT,
                      nvinfer1::DimsCHW(1, 1, 2));

  // Transpose the first fc weight into the layout expected by addFullyConnected.
  framework::LoDTensor fc1_weight_temp;
  fc1_weight_temp.Resize(filter_tensor.dims());
  fc1_weight_temp.mutable_data<float>(cpu_place);
  auto* filter_data = filter_tensor.mutable_data<float>(cpu_place);
  auto* fc1_weight_temp_data = fc1_weight_temp.mutable_data<float>(cpu_place);
  PrepareFcParam(filter_data, fc1_weight_temp_data, 2, 2);
  std::cout << "filter tensor num: " << filter_tensor.numel() << std::endl;

  TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                static_cast<void*>(fc1_weight_temp_data),
                                static_cast<size_t>(filter_tensor.numel())};
  TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
  auto* X = engine.GetITensor("conv_input");
  // NOTE: the dynamic ranges below are hard-coded, presumably copied from one
  // run of the Fluid pass above.
  engine.SetTensorDynamicRange(X, 38.3);
  auto* layer =
      engine.network()->addFullyConnected(*X, 2, weight.get(), bias.get());
  engine.SetITensor("conv_output", layer->getOutput(0));
  auto* iconv_output = layer->getOutput(0);
  iconv_output->setName("conv_output");
  engine.SetTensorDynamicRange(iconv_output, 95.34311147622294);
  // For Fc: second fully connected layer, also fed a transposed copy of the
  // float weight.
  framework::LoDTensor fc_weight_temp;
  fc_weight_temp.Resize(fc_weight_tensor.dims());
  fc_weight_temp.mutable_data<float>(cpu_place);
  auto* fc_weight_data = fc_weight_tensor.mutable_data<float>(cpu_place);
  auto* fc_weight_temp_data = fc_weight_temp.mutable_data<float>(cpu_place);
  PrepareFcParam(fc_weight_data, fc_weight_temp_data, 2, 2);
  TensorRTEngine::Weight fc_weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(fc_weight_temp_data),
                                   static_cast<size_t>(fc_weight_temp.numel())};
  TensorRTEngine::Weight fc_bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
  auto* fc_layer = engine.network()->addFullyConnected(
      *iconv_output, 2, fc_weight.get(), fc_bias.get());
  engine.SetITensor("fc_output", fc_layer->getOutput(0));
  engine.SetTensorDynamicRange(fc_layer->getOutput(0), 10);
  engine.DeclareOutput("fc_output");
  engine.FreezeNetwork();

  // Bind the GPU input/output buffers and run the engine. The output is
  // written into the tensor that held the Fluid result, then copied back to
  // the host for printing.
  std::vector<void*> buffers(2);
  const int input_index = engine.engine()->getBindingIndex("conv_input");
  const int output_index = engine.engine()->getBindingIndex("fc_output");
  auto* input_var = scope.FindVar("quant_x");
  auto* input_tensor = input_var->GetMutable<framework::LoDTensor>();
  buffers[input_index] = static_cast<void*>(input_tensor->data<void>());
  buffers[output_index] =
      static_cast<void*>(out_tensor->mutable_data<float>(cuda_place));
  cudaStreamSynchronize(stream);
  std::cout << "start Executing..." << std::endl;
  engine.Execute(1, &buffers, stream);
  cudaStreamSynchronize(stream);

  std::vector<float> trt_out;
  framework::TensorToVector(*out_tensor, cuda_ctx, &trt_out);
  cudaStreamSynchronize(stream);
  std::cout << "trt output: " << std::endl;
  for (size_t i = 0; i < fluid_out_size; i++) {
    std::cout << trt_out[i] << std::endl;
  }
}
TEST(fake_int8, normal) {
  PrepareFluid();
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(mul);
USE_OP(conv2d);
USE_OP(fake_quantize_range_abs_max);
USE_OP(fake_dequantize_max_abs);