// Assumes:
// - input is (N, C, H, W)
// - gradOutput is (N, C, goH, goW)
// - gradWeight is (C, 1, kH, kW) --> (C, kH, kW)
// - goH = H - kH + 1 and goW = W - kW + 1 (valid convolution)
// Naive loop: no striding, padding, or dilation handled.
// These three loops would be parallelized such that each (ch, gw_h_offset,
// gw_w_offset) triple is computed by a single block (see the CUDA sketch
// after the loop).
for (int ch = 0; ch < C; ++ch) {
  for (int gw_h_offset = 0; gw_h_offset < kH; ++gw_h_offset) {
    for (int gw_w_offset = 0; gw_w_offset < kW; ++gw_w_offset) {
      // Here is where the individual threads would compute their values; we
      // will need to be smart about the memory layout and the ordering of
      // these loops, but that is not handled here.
      float grad = 0.0f;
      for (int bsz = 0; bsz < N; ++bsz) {
        for (int go_h_offset = 0; go_h_offset < goH; ++go_h_offset) {
          for (int go_w_offset = 0; go_w_offset < goW; ++go_w_offset) {
            grad += input[bsz][ch][go_h_offset + gw_h_offset][go_w_offset + gw_w_offset] *
                    gradOutput[bsz][ch][go_h_offset][go_w_offset];
          }
        }
      }
      // At this point we would do a blockwise reduction, but in the naive
      // loop we don't have to.
      gradWeight[ch][gw_h_offset][gw_w_offset] = grad;
    }
  }
}
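
// Below is a minimal CUDA sketch of the parallelization described in the
// comments above: one block per (ch, gw_h_offset, gw_w_offset) triple, with
// the threads of that block splitting the (N x goH x goW) reduction and
// combining their partial sums through a shared-memory tree reduction.
// Assumptions (not from the original gist): contiguous NCHW tensors passed
// as flat float pointers, a power-of-two block size, and the hypothetical
// kernel name depthwiseConvGradWeightKernel.

#include <cuda_runtime.h>

__global__ void depthwiseConvGradWeightKernel(
    const float* input,      // (N, C, H, W), contiguous
    const float* gradOutput, // (N, C, goH, goW), contiguous
    float* gradWeight,       // (C, kH, kW), contiguous
    int N, int C, int H, int W, int goH, int goW, int kH, int kW) {
  // Recover (ch, gw_h_offset, gw_w_offset) from the block index; the grid
  // is launched with C * kH * kW blocks, one per gradWeight element.
  int ch = blockIdx.x / (kH * kW);
  int gw_h_offset = (blockIdx.x / kW) % kH;
  int gw_w_offset = blockIdx.x % kW;

  // Each thread strides over a disjoint subset of the (N, goH, goW)
  // reduction domain and accumulates a private partial sum.
  float grad = 0.0f;
  int reductionSize = N * goH * goW;
  for (int i = threadIdx.x; i < reductionSize; i += blockDim.x) {
    int bsz = i / (goH * goW);
    int go_h_offset = (i / goW) % goH;
    int go_w_offset = i % goW;
    int inIdx = ((bsz * C + ch) * H + go_h_offset + gw_h_offset) * W +
                (go_w_offset + gw_w_offset);
    int goIdx = ((bsz * C + ch) * goH + go_h_offset) * goW + go_w_offset;
    grad += input[inIdx] * gradOutput[goIdx];
  }

  // Blockwise tree reduction in shared memory (assumes blockDim.x is a
  // power of two); thread 0 ends up holding the full sum.
  extern __shared__ float partial[];
  partial[threadIdx.x] = grad;
  __syncthreads();
  for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (threadIdx.x < stride) {
      partial[threadIdx.x] += partial[threadIdx.x + stride];
    }
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    gradWeight[(ch * kH + gw_h_offset) * kW + gw_w_offset] = partial[0];
  }
}

// Hypothetical launch: one block per gradWeight element, dynamic shared
// memory sized to one float per thread.
//   int threads = 256;
//   depthwiseConvGradWeightKernel<<<C * kH * kW, threads, threads * sizeof(float)>>>(
//       d_input, d_gradOutput, d_gradWeight, N, C, H, W, goH, goW, kH, kW);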