Column01/make.log Secret

## make.log
  >> Using a default computing expression for testing.
  >> Using c-rocm_win64 default device properties.

[LL-IR]
input0 = input("input0", [1024, 512], dtype="float32"); input1 = input("input1", [1024, 512], dtype="float32");
output0 = output(shape=[1024, 512], func=lambda N, M: (input0[N, M] + input1[N, M]), dtype="float32", tag="", name="output0", final_output=True);

  >> Backend = c-rocm_win64, Python PID = 66, Task = lang.generic;

// ---------------------------------------------------------------------------
// GLOBALS: input0:float32[1024, 512], input1:float32[1024, 512] -> output0:float32[1024, 512]
// BACKEND: c-rocm_win64 (default)
// CONFIG: null
// COMPUTE_V1: - einstein_v2("output0[N, M] = input0[N, M] + input1[N, M]", input_dict={"input0": {"dtype": "float32", "shape": [1024, 512]}, "input1": {"dtype": "float32", "shape": [1024, 512]}})


// ---------------------------------------------------------------------------
// LOCAL: template_op_kernel0 -- input0:float32[1024, 512], input1:float32[1024, 512] -> output0:float32[1024, 512]

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>

// Helper macros emitted ahead of the generated kernel. The include guard keeps
// them from being redefined if this preamble appears more than once in a
// translation unit. template_op_kernel0 below does not reference any of them.
#ifndef __ROCM_COMMON_MACRO__
#define __ROCM_COMMON_MACRO__

// Component accessors: select the .x / .y / .z / .w member of `v`.
#define __ITEM_0_OF__(v) (v).x
#define __ITEM_1_OF__(v) (v).y
#define __ITEM_2_OF__(v) (v).z
#define __ITEM_3_OF__(v) (v).w

// Store helpers. Slot 0 reinterprets (in + idi) and (out + ido) as `t*` and
// copies a single `t` from the former to the latter; the offsets are applied
// in units of the pointers' own element type before the cast.
// Slots 1-3 expand to nothing.
#define __STORE_ITEM_0__(t, out, ido, in, idi) *(t*)(out + ido) = *(t*)(in + idi)
#define __STORE_ITEM_1__(t, out, ido, in, idi)
#define __STORE_ITEM_2__(t, out, ido, in, idi)
#define __STORE_ITEM_3__(t, out, ido, in, idi)

#endif


// Element-wise addition kernel: each invocation writes
//   output0[i] = input0[i] + input1[i]
// for the single flat offset i = blockIdx.x * 512 + blockIdx.y.
// There is no bounds guard, so the launch grid must match the extents recorded
// in the [thread_extent] notes below (blockIdx.x = 1024, blockIdx.y = 512).
// NOTE(review): __launch_bounds__(1) plus the threadIdx extents of 1 suggest
// one thread per block; threadIdx is never read here - confirm with the launcher.
extern "C" __global__ __launch_bounds__(1) void template_op_kernel0(float* __restrict__ input0, float* __restrict__ input1, float* __restrict__ output0) {
  // [thread_extent] blockIdx.x = 1024
  // [thread_extent] threadIdx.x = 1
  // [thread_extent] blockIdx.y = 512
  // [thread_extent] threadIdx.y = 1
  // The same offset (blockIdx.x * 512 + blockIdx.y) indexes all three buffers.
  output0[(((((int)blockIdx.x) * 512) + ((int)blockIdx.y)))] = (input0[(((((int)blockIdx.x) * 512) + ((int)blockIdx.y)))] + input1[(((((int)blockIdx.x) * 512) + ((int)blockIdx.y)))]);
}

// ---------------------------------------------------------------------------

[EvalAgent] Evaluating Modules..


[EvalAgent] Results = {}

[Antares] Incorrect compute kernel from evaluator.
	>> Using a default computing expression for testing.
	>> Using c-rocm_win64 default device properties.

	[LL-IR]
	input0 = input("input0", [1024, 512], dtype="float32"); input1 = input("input1", [1024, 512], dtype="float32");
	output0 = output(shape=[1024, 512], func=lambda N, M: (input0[N, M] + input1[N, M]), dtype="float32", tag="", name="output0", final_output=True);

	>> Backend = c-rocm_win64, Python PID = 66, Task = lang.generic;

	// ---------------------------------------------------------------------------
	// GLOBALS: input0:float32[1024, 512], input1:float32[1024, 512] -> output0:float32[1024, 512]
	// BACKEND: c-rocm_win64 (default)
	// CONFIG: null
	// COMPUTE_V1: - einstein_v2("output0[N, M] = input0[N, M] + input1[N, M]", input_dict={"input0": {"dtype": "float32", "shape": [1024, 512]}, "input1": {"dtype": "float32", "shape": [1024, 512]}})


	// ---------------------------------------------------------------------------
	// LOCAL: template_op_kernel0 -- input0:float32[1024, 512], input1:float32[1024, 512] -> output0:float32[1024, 512]

	#include <hip/hip_runtime.h>
	#include <hip/hip_fp16.h>

	// Helper macros emitted ahead of the generated kernel (echoed copy of the
	// preamble). The include guard prevents redefinition if the preamble is
	// seen more than once in a translation unit.
	#ifndef __ROCM_COMMON_MACRO__
	#define __ROCM_COMMON_MACRO__

	// Component accessors: select the .x / .y / .z / .w member of `v`.
	#define __ITEM_0_OF__(v) (v).x
	#define __ITEM_1_OF__(v) (v).y
	#define __ITEM_2_OF__(v) (v).z
	#define __ITEM_3_OF__(v) (v).w

	// Store helpers. Slot 0 reinterprets (in + idi) and (out + ido) as `t*`
	// and copies a single `t` from the former to the latter. The pointer
	// casts and dereferences are required: without the `*` characters the
	// expansion would assign to the result of a value cast, which is not a
	// store at all. Slots 1-3 expand to nothing.
	#define __STORE_ITEM_0__(t, out, ido, in, idi) *(t*)(out + ido) = *(t*)(in + idi)
	#define __STORE_ITEM_1__(t, out, ido, in, idi)
	#define __STORE_ITEM_2__(t, out, ido, in, idi)
	#define __STORE_ITEM_3__(t, out, ido, in, idi)

	#endif


	// Element-wise addition kernel (echoed copy): each invocation writes
	//   output0[i] = input0[i] + input1[i]
	// for the single flat offset i = blockIdx.x * 512 + blockIdx.y.
	// There is no bounds guard, so the launch grid must match the extents
	// recorded in the [thread_extent] notes below (blockIdx.x = 1024,
	// blockIdx.y = 512).
	// NOTE(review): __launch_bounds__(1) plus the threadIdx extents of 1
	// suggest one thread per block; threadIdx is never read here - confirm.
	extern "C" __global__ __launch_bounds__(1) void template_op_kernel0(float* __restrict__ input0, float* __restrict__ input1, float* __restrict__ output0) {
	// [thread_extent] blockIdx.x = 1024
	// [thread_extent] threadIdx.x = 1
	// [thread_extent] blockIdx.y = 512
	// [thread_extent] threadIdx.y = 1
	// The same offset (blockIdx.x * 512 + blockIdx.y) indexes all three buffers.
	output0[(((((int)blockIdx.x) * 512) + ((int)blockIdx.y)))] = (input0[(((((int)blockIdx.x) * 512) + ((int)blockIdx.y)))] + input1[(((((int)blockIdx.x) * 512) + ((int)blockIdx.y)))]);
	}

	// ---------------------------------------------------------------------------

	[EvalAgent] Evaluating Modules..


	[EvalAgent] Results = {}

	[Antares] Incorrect compute kernel from evaluator.