Skip to content

Instantly share code, notes, and snippets.

@emfomenk
Created April 18, 2020 00:45
Show Gist options
  • Save emfomenk/1c3efe3c0d5102d8fc0987ef8633d931 to your computer and use it in GitHub Desktop.
ref gpu sum patch
commit 7cb078d0f5b27957074695a7fe92f0103fb713b7
Author: Fomenko, Evarist M <evarist.m.fomenko@intel.com>
Date: Sat Apr 18 00:42:00 2020 +0000
gpu: ocl: sum: fix accuracy loss for non-f32 sum
diff --git a/src/gpu/ocl/ref_sum.hpp b/src/gpu/ocl/ref_sum.hpp
index ed927d338..451f58348 100644
--- a/src/gpu/ocl/ref_sum.hpp
+++ b/src/gpu/ocl/ref_sum.hpp
@@ -51,7 +51,7 @@ struct ref_sum_t : public gpu_primitive_t {
for (int i = 0; i < n_; ++i) {
auto r_impls = engine->get_reorder_implementation_list(
- src_md(i), dst_md());
+ src_md(i), dst_acc_md());
for (auto r = r_impls; *r; ++r) {
primitive_attr_t r_attr;
r_attr.set_scratchpad_mode(scratchpad_mode::user);
@@ -60,15 +60,35 @@ struct ref_sum_t : public gpu_primitive_t {
reorder_pd_t *r_pd;
if ((*r)(&r_pd, engine, &r_attr, engine, src_md(i), engine,
- dst_md())
+ dst_acc_md())
== status::success) {
reorder_pds_.emplace_back(r_pd);
break;
}
}
}
- ok = utils::everyone_is(reorder_pds_.size(), scales_.size());
- return ok ? status::success : status::unimplemented;
+
+ if (need_output_reorder()) {
+ auto r_impls = engine->get_reorder_implementation_list(
+ dst_acc_md(), dst_md());
+ for (auto r = r_impls; *r; ++r) {
+ primitive_attr_t r_attr;
+ r_attr.set_scratchpad_mode(scratchpad_mode::user);
+ reorder_pd_t *r_pd = nullptr;
+ if ((*r)(&r_pd, engine, &r_attr, engine, dst_acc_md(),
+ engine, dst_md())
+ == status::success) {
+ reorder_pds_.emplace_back(r_pd);
+ break;
+ }
+ }
+ }
+
+ ok = reorder_pds_.size() == (size_t)n_ + need_output_reorder();
+ if (!ok) return status::unimplemented;
+
+ init_scratchpad();
+ return status::success;
}
void clone_reorder_pds(const pd_t &rhs) {
@@ -81,10 +101,15 @@ struct ref_sum_t : public gpu_primitive_t {
private:
void init_scratchpad() {
+ using namespace memory_tracking::names;
auto scratchpad = scratchpad_registry().registrar();
+ if (need_output_reorder()) {
+ const memory_desc_wrapper dst_acc_d(dst_acc_md());
+ scratchpad.book(key_sum_reduction, dst_acc_d.size());
+ }
+
for (size_t i = 0; i < reorder_pds_.size(); i++) {
- scratchpad.book(
- memory_tracking::names::key_nested_multiple + (int)i,
+ scratchpad.book(key_nested_multiple + (int)i,
reorder_pds_[i]->scratchpad_registry().size());
}
}
@@ -93,7 +118,7 @@ struct ref_sum_t : public gpu_primitive_t {
ref_sum_t(const pd_t *apd) : gpu_primitive_t(apd) {}
status_t init(engine_t *engine) override {
- const int n = pd()->n_inputs();
+ const int n = pd()->n_inputs() + pd()->need_output_reorder();
reorders_.resize(n);
for (int i = 0; i < n; ++i) {
pd()->reorder_pds_[i]->create_primitive(reorders_[i], engine);
@@ -105,10 +130,23 @@ struct ref_sum_t : public gpu_primitive_t {
virtual status_t execute(const exec_ctx_t &ctx) const override {
using namespace memory_tracking::names;
const auto n = pd()->n_inputs();
+ exec_args_t r_args;
+
+ std::unique_ptr<memory_t> p_temp_dst_acc;
+ if (pd()->need_output_reorder()) {
+ auto scratchpad = ctx.get_scratchpad_grantor().get_memory_storage(
+ key_sum_reduction);
+ p_temp_dst_acc.reset(new memory_t(ctx.stream()->engine(),
+ pd()->dst_acc_md(), memory_flags_t::use_runtime_ptr,
+ scratchpad->data_handle()));
+ }
+
+ auto dst = ctx.args().at(DNNL_ARG_DST);
+ memory_arg_t dst_acc = {p_temp_dst_acc.get(), false};
+
for (int i = 0; i < n; ++i) {
- exec_args_t r_args;
r_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i);
- r_args[DNNL_ARG_DST] = ctx.args().at(DNNL_ARG_DST);
+ r_args[DNNL_ARG_DST] = pd()->need_output_reorder() ? dst_acc : dst;
exec_ctx_t r_ctx(ctx, std::move(r_args));
nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]);
@@ -116,6 +154,18 @@ struct ref_sum_t : public gpu_primitive_t {
reorders_[i]->execute(r_ctx);
ctx.stream()->wait();
}
+
+ if (pd()->need_output_reorder()) {
+ dst_acc = {p_temp_dst_acc.get(), true};
+ r_args[DNNL_ARG_SRC] = dst_acc;
+ r_args[DNNL_ARG_DST] = dst;
+ exec_ctx_t r_ctx(ctx, std::move(r_args));
+
+ nested_scratchpad_t ns(ctx, key_nested_multiple + n, reorders_[n]);
+ r_ctx.set_scratchpad_grantor(ns.grantor());
+ reorders_[n]->execute(r_ctx);
+ }
+
return status::success;
}
diff --git a/tests/benchdnn/inputs/sum/test_sum_gpu b/tests/benchdnn/inputs/sum/test_sum_gpu
index dc2b09047..862ca07db 100644
--- a/tests/benchdnn/inputs/sum/test_sum_gpu
+++ b/tests/benchdnn/inputs/sum/test_sum_gpu
@@ -1,9 +1,9 @@
# f32
--reset
---ddt=f32,bf16
+--ddt=f32,bf16,s8
--dtag=undef,abx
---sdt=f32:f32,bf16:bf16
+--sdt=f32:f32,bf16:bf16,s8:s8
--stag=abx:abx 3x3x16x4 4x4x2x16 5x5x1x15 2x8x3x10 1x9x4x7 1x16x5x11 2x15x6x3
--sdt=f32:f32:f32,bf16:bf16:bf16
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment