Created April 18, 2020 00:45
-
-
Save emfomenk/1c3efe3c0d5102d8fc0987ef8633d931 to your computer and use it in GitHub Desktop.
ref gpu sum patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
commit 7cb078d0f5b27957074695a7fe92f0103fb713b7 | |
Author: Fomenko, Evarist M <evarist.m.fomenko@intel.com> | |
Date: Sat Apr 18 00:42:00 2020 +0000 | |
gpu: ocl: sum: fix accuracy loss for non-f32 sum | |
diff --git a/src/gpu/ocl/ref_sum.hpp b/src/gpu/ocl/ref_sum.hpp | |
index ed927d338..451f58348 100644 | |
--- a/src/gpu/ocl/ref_sum.hpp | |
+++ b/src/gpu/ocl/ref_sum.hpp | |
@@ -51,7 +51,7 @@ struct ref_sum_t : public gpu_primitive_t { | |
for (int i = 0; i < n_; ++i) { | |
auto r_impls = engine->get_reorder_implementation_list( | |
- src_md(i), dst_md()); | |
+ src_md(i), dst_acc_md()); | |
for (auto r = r_impls; *r; ++r) { | |
primitive_attr_t r_attr; | |
r_attr.set_scratchpad_mode(scratchpad_mode::user); | |
@@ -60,15 +60,35 @@ struct ref_sum_t : public gpu_primitive_t { | |
reorder_pd_t *r_pd; | |
if ((*r)(&r_pd, engine, &r_attr, engine, src_md(i), engine, | |
- dst_md()) | |
+ dst_acc_md()) | |
== status::success) { | |
reorder_pds_.emplace_back(r_pd); | |
break; | |
} | |
} | |
} | |
- ok = utils::everyone_is(reorder_pds_.size(), scales_.size()); | |
- return ok ? status::success : status::unimplemented; | |
+ | |
+ if (need_output_reorder()) { | |
+ auto r_impls = engine->get_reorder_implementation_list( | |
+ dst_acc_md(), dst_md()); | |
+ for (auto r = r_impls; *r; ++r) { | |
+ primitive_attr_t r_attr; | |
+ r_attr.set_scratchpad_mode(scratchpad_mode::user); | |
+ reorder_pd_t *r_pd = nullptr; | |
+ if ((*r)(&r_pd, engine, &r_attr, engine, dst_acc_md(), | |
+ engine, dst_md()) | |
+ == status::success) { | |
+ reorder_pds_.emplace_back(r_pd); | |
+ break; | |
+ } | |
+ } | |
+ } | |
+ | |
+ ok = reorder_pds_.size() == (size_t)n_ + need_output_reorder(); | |
+ if (!ok) return status::unimplemented; | |
+ | |
+ init_scratchpad(); | |
+ return status::success; | |
} | |
void clone_reorder_pds(const pd_t &rhs) { | |
@@ -81,10 +101,15 @@ struct ref_sum_t : public gpu_primitive_t { | |
private: | |
void init_scratchpad() { | |
+ using namespace memory_tracking::names; | |
auto scratchpad = scratchpad_registry().registrar(); | |
+ if (need_output_reorder()) { | |
+ const memory_desc_wrapper dst_acc_d(dst_acc_md()); | |
+ scratchpad.book(key_sum_reduction, dst_acc_d.size()); | |
+ } | |
+ | |
for (size_t i = 0; i < reorder_pds_.size(); i++) { | |
- scratchpad.book( | |
- memory_tracking::names::key_nested_multiple + (int)i, | |
+ scratchpad.book(key_nested_multiple + (int)i, | |
reorder_pds_[i]->scratchpad_registry().size()); | |
} | |
} | |
@@ -93,7 +118,7 @@ struct ref_sum_t : public gpu_primitive_t { | |
ref_sum_t(const pd_t *apd) : gpu_primitive_t(apd) {} | |
status_t init(engine_t *engine) override { | |
- const int n = pd()->n_inputs(); | |
+ const int n = pd()->n_inputs() + pd()->need_output_reorder(); | |
reorders_.resize(n); | |
for (int i = 0; i < n; ++i) { | |
pd()->reorder_pds_[i]->create_primitive(reorders_[i], engine); | |
@@ -105,10 +130,23 @@ struct ref_sum_t : public gpu_primitive_t { | |
virtual status_t execute(const exec_ctx_t &ctx) const override { | |
using namespace memory_tracking::names; | |
const auto n = pd()->n_inputs(); | |
+ exec_args_t r_args; | |
+ | |
+ std::unique_ptr<memory_t> p_temp_dst_acc; | |
+ if (pd()->need_output_reorder()) { | |
+ auto scratchpad = ctx.get_scratchpad_grantor().get_memory_storage( | |
+ key_sum_reduction); | |
+ p_temp_dst_acc.reset(new memory_t(ctx.stream()->engine(), | |
+ pd()->dst_acc_md(), memory_flags_t::use_runtime_ptr, | |
+ scratchpad->data_handle())); | |
+ } | |
+ | |
+ auto dst = ctx.args().at(DNNL_ARG_DST); | |
+ memory_arg_t dst_acc = {p_temp_dst_acc.get(), false}; | |
+ | |
for (int i = 0; i < n; ++i) { | |
- exec_args_t r_args; | |
r_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i); | |
- r_args[DNNL_ARG_DST] = ctx.args().at(DNNL_ARG_DST); | |
+ r_args[DNNL_ARG_DST] = pd()->need_output_reorder() ? dst_acc : dst; | |
exec_ctx_t r_ctx(ctx, std::move(r_args)); | |
nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]); | |
@@ -116,6 +154,18 @@ struct ref_sum_t : public gpu_primitive_t { | |
reorders_[i]->execute(r_ctx); | |
ctx.stream()->wait(); | |
} | |
+ | |
+ if (pd()->need_output_reorder()) { | |
+ dst_acc = {p_temp_dst_acc.get(), true}; | |
+ r_args[DNNL_ARG_SRC] = dst_acc; | |
+ r_args[DNNL_ARG_DST] = dst; | |
+ exec_ctx_t r_ctx(ctx, std::move(r_args)); | |
+ | |
+ nested_scratchpad_t ns(ctx, key_nested_multiple + n, reorders_[n]); | |
+ r_ctx.set_scratchpad_grantor(ns.grantor()); | |
+ reorders_[n]->execute(r_ctx); | |
+ } | |
+ | |
return status::success; | |
} | |
diff --git a/tests/benchdnn/inputs/sum/test_sum_gpu b/tests/benchdnn/inputs/sum/test_sum_gpu | |
index dc2b09047..862ca07db 100644 | |
--- a/tests/benchdnn/inputs/sum/test_sum_gpu | |
+++ b/tests/benchdnn/inputs/sum/test_sum_gpu | |
@@ -1,9 +1,9 @@ | |
# f32 | |
--reset | |
---ddt=f32,bf16 | |
+--ddt=f32,bf16,s8 | |
--dtag=undef,abx | |
---sdt=f32:f32,bf16:bf16 | |
+--sdt=f32:f32,bf16:bf16,s8:s8 | |
--stag=abx:abx 3x3x16x4 4x4x2x16 5x5x1x15 2x8x3x10 1x9x4x7 1x16x5x11 2x15x6x3 | |
--sdt=f32:f32:f32,bf16:bf16:bf16 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.