Skip to content

Instantly share code, notes, and snippets.

@masahi
Created December 7, 2020 07:59
Show Gist options
  • Save masahi/005cc165f1c3bd5179baee6bdb49e674 to your computer and use it in GitHub Desktop.
Save masahi/005cc165f1c3bd5179baee6bdb49e674 to your computer and use it in GitHub Desktop.
Computational DAG:
data = PLACEHOLDER [1, 2048]
weight = PLACEHOLDER [1000, 2048]
T_dense(i, j) += (data[i, k]*weight[j, k])
bias = PLACEHOLDER [1000]
T_add(i, j) = (T_dense[i, j] + bias[j])
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Generate Sketches #s: 2
Sample Initial Population #s: 65 fail_ct: 6079 Time elapsed: 1.12
GA Iter: 0 Max score: 0.9951 Min score: 0.0049 #Pop: 65 #M+: 0 #M-: 0
GA Iter: 4 Max score: 1.0000 Min score: 0.9801 #Pop: 128 #M+: 1393 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.19
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
Get devices for measurement successfully!
........................************************==================================================
No: 1 GFLOPS: 0.04 / 0.04 results: MeasureResult(cost:[0.1110], error_no:0, all_cost:3.68, Tstamp:1607326965.16)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,10)
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,100)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for j.3 (0,100)
T_dense = ...
for j.3 (0,100)
T_add = ...
==================================================
No: 2 GFLOPS: 0.80 / 0.80 results: MeasureResult(cost:[0.0052], error_no:0, all_cost:2.01, Tstamp:1607326965.86)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,1000)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,1000)
data.shared = ...
for k.1 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 3 GFLOPS: 2.30 / 2.30 results: MeasureResult(cost:[0.0018], error_no:0, all_cost:2.50, Tstamp:1607326967.04)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 4 GFLOPS: 0.78 / 2.30 results: MeasureResult(cost:[0.0052], error_no:0, all_cost:2.03, Tstamp:1607326967.77)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,1000)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,1000)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,1000)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 5 GFLOPS: 3.96 / 3.96 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:2.44, Tstamp:1607326968.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.2 (0,32)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 6 GFLOPS: 1.07 / 3.96 results: MeasureResult(cost:[0.0038], error_no:0, all_cost:5.64, Tstamp:1607326970.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,50)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,256)
threadIdx.x ax0@ax1@.0.1 (0,50)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,50)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,32)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 7 GFLOPS: 3.35 / 3.96 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:2.46, Tstamp:1607326971.31)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 8 GFLOPS: 0.62 / 3.96 results: MeasureResult(cost:[0.0066], error_no:0, all_cost:2.00, Tstamp:1607326972.02)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for j.3 (0,2)
for k.2 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 9 GFLOPS: 1.24 / 3.96 results: MeasureResult(cost:[0.0033], error_no:0, all_cost:2.00, Tstamp:1607326972.73)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,25)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,5)
T_dense auto_unroll: 16
for k.0 (0,8)
for ax0@ax1@.0.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,5)
weight.shared = ...
for ax0@ax1@.0.0 (0,52)
threadIdx.x ax0@ax1@.0.1 (0,5)
data.shared = ...
for k.1 (0,32)
for j.3 (0,2)
for k.2 (0,8)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 10 GFLOPS: 0.62 / 3.96 results: MeasureResult(cost:[0.0066], error_no:0, all_cost:1.99, Tstamp:1607326973.43)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,500)
for k.0 (0,1024)
threadIdx.x ax0@ax1@.0.1 (0,500)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 11 GFLOPS: 0.96 / 3.96 results: MeasureResult(cost:[0.0043], error_no:0, all_cost:2.52, Tstamp:1607326974.58)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,40)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
for j.4 (0,10)
T_dense = ...
for j.3 (0,10)
T_add = ...
==================================================
No: 12 GFLOPS: 4.63 / 4.63 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:2.48, Tstamp:1607326975.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,500)
T_dense auto_unroll: 64
for j ((blockIdx.x*2),2)
for k.0 (0,1024)
threadIdx.x k.1 (0,2)
T_dense = ...
threadIdx.x j.1 (0,2)
T_add = ...
==================================================
No: 13 GFLOPS: 4.31 / 4.63 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.31, Tstamp:1607326976.91)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for j.3 (0,2)
for k.2 (0,8)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 14 GFLOPS: 0.42 / 4.63 results: MeasureResult(cost:[0.0099], error_no:0, all_cost:0.90, Tstamp:1607326977.66)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for j.3 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 15 GFLOPS: 2.60 / 4.63 results: MeasureResult(cost:[0.0016], error_no:0, all_cost:1.65, Tstamp:1607326978.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.2 (0,32)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 16 GFLOPS: 0.43 / 4.63 results: MeasureResult(cost:[0.0096], error_no:0, all_cost:0.98, Tstamp:1607326979.67)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,25)
T_dense auto_unroll: 1024
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,20)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,25)
data.shared = ...
for j.3 (0,5)
for j.4 (0,2)
T_dense = ...
for j.3 (0,10)
T_add = ...
==================================================
No: 17 GFLOPS: 3.42 / 4.63 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.32, Tstamp:1607326980.79)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,50)
threadIdx.x i.2@j.2@ (0,10)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for k.1 (0,8)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 18 GFLOPS: 0.91 / 4.63 results: MeasureResult(cost:[0.0045], error_no:0, all_cost:1.39, Tstamp:1607326982.01)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 19 GFLOPS: 3.42 / 4.63 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.60, Tstamp:1607326983.17)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,63)
T_dense auto_unroll: 16
for j ((blockIdx.x*16),16)
for k.0 (0,128)
threadIdx.x k.1 (0,16)
T_dense = ...
threadIdx.x j.1 (0,16)
T_add = ...
==================================================
No: 20 GFLOPS: 1.18 / 4.63 results: MeasureResult(cost:[0.0035], error_no:0, all_cost:1.37, Tstamp:1607326984.35)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
====================
........................************************==============================
No: 21 GFLOPS: 0.99 / 4.63 results: MeasureResult(cost:[0.0041], error_no:0, all_cost:1.49, Tstamp:1607326985.57)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,16)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 22 GFLOPS: 5.90 / 5.90 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.31, Tstamp:1607326986.72)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 23 GFLOPS: 0.44 / 5.90 results: MeasureResult(cost:[0.0093], error_no:0, all_cost:1.17, Tstamp:1607326987.44)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,80)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for j.3 (0,2)
for k.2 (0,8)
for j.4 (0,5)
T_dense = ...
for j.3 (0,10)
T_add = ...
==================================================
No: 24 GFLOPS: 0.62 / 5.90 results: MeasureResult(cost:[0.0066], error_no:0, all_cost:0.89, Tstamp:1607326988.18)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 25 GFLOPS: 1.26 / 5.90 results: MeasureResult(cost:[0.0033], error_no:0, all_cost:1.38, Tstamp:1607326990.73)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,500)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,500)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 26 GFLOPS: 1.40 / 5.90 results: MeasureResult(cost:[0.0029], error_no:0, all_cost:0.82, Tstamp:1607326991.40)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,1000)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,1000)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 27 GFLOPS: 3.78 / 5.90 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.76, Tstamp:1607326992.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,16)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 28 GFLOPS: 4.19 / 5.90 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.39, Tstamp:1607326993.75)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,32)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,64)
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 29 GFLOPS: 0.77 / 5.90 results: MeasureResult(cost:[0.0053], error_no:0, all_cost:0.87, Tstamp:1607326994.45)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,200)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,5)
threadIdx.x ax0@ax1@.0.1 (0,200)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,2)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 30 GFLOPS: 5.57 / 5.90 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.32, Tstamp:1607326995.61)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 31 GFLOPS: 0.79 / 5.90 results: MeasureResult(cost:[0.0052], error_no:0, all_cost:1.33, Tstamp:1607326996.78)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,500)
T_dense auto_unroll: 512
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,500)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 32 GFLOPS: 3.20 / 5.90 results: MeasureResult(cost:[0.0013], error_no:0, all_cost:1.29, Tstamp:1607326997.92)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,512)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 33 GFLOPS: 0.93 / 5.90 results: MeasureResult(cost:[0.0044], error_no:0, all_cost:1.43, Tstamp:1607326999.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 34 GFLOPS: 2.94 / 5.90 results: MeasureResult(cost:[0.0014], error_no:0, all_cost:1.32, Tstamp:1607327000.34)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 35 GFLOPS: 5.86 / 5.90 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.30, Tstamp:1607327001.48)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 36 GFLOPS: 0.77 / 5.90 results: MeasureResult(cost:[0.0053], error_no:0, all_cost:0.96, Tstamp:1607327002.27)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,500)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,500)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
data.shared = ...
for k.2 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 37 GFLOPS: 3.80 / 5.90 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.40, Tstamp:1607327003.39)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,125)
T_dense auto_unroll: 16
for j ((blockIdx.x*8),8)
for k.0 (0,256)
threadIdx.x k.1 (0,8)
T_dense = ...
threadIdx.x j.1 (0,8)
T_add = ...
==================================================
No: 38 GFLOPS: 0.08 / 5.90 results: MeasureResult(cost:[0.0494], error_no:0, all_cost:1.31, Tstamp:1607327004.31)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,10)
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,50)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for j.3 (0,25)
T_dense = ...
for j.3 (0,25)
T_add = ...
==================================================
No: 39 GFLOPS: 2.83 / 5.90 results: MeasureResult(cost:[0.0014], error_no:0, all_cost:1.42, Tstamp:1607327005.53)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 40 GFLOPS: 5.16 / 5.90 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.29, Tstamp:1607327006.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
threadIdx.x i.2@j.2@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,5)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,5)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 41 GFLOPS: 2.07 / 5.90 results: MeasureResult(cost:[0.0020], error_no:0, all_cost:1.41, Tstamp:1607327007.83)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,8)
for j.3 (0,2)
for k.2 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 42 GFLOPS: 5.68 / 5.90 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.39, Tstamp:1607327009.04)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,4)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 43 GFLOPS: 0.42 / 5.90 results: MeasureResult(cost:[0.0097], error_no:0, all_cost:0.91, Tstamp:1607327009.79)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,500)
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,500)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 44 GFLOPS: 2.05 / 5.90 results: MeasureResult(cost:[0.0020], error_no:0, all_cost:1.31, Tstamp:1607327010.92)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 45 GFLOPS: 0.79 / 5.90 results: MeasureResult(cost:[0.0052], error_no:0, all_cost:0.98, Tstamp:1607327011.73)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 46 GFLOPS: 0.07 / 5.90 results: MeasureResult(cost:[0.0555], error_no:0, all_cost:1.29, Tstamp:1607327012.65)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,10)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,200)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for j.3 (0,100)
for k.2 (0,2)
T_dense = ...
for j.3 (0,100)
T_add = ...
==================================================
No: 47 GFLOPS: 0.37 / 5.90 results: MeasureResult(cost:[0.0111], error_no:0, all_cost:2.16, Tstamp:1607327013.81)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,25)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,640)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,25)
data.shared = ...
for j.3 (0,10)
for k.2 (0,32)
for j.4 (0,2)
T_dense = ...
for j.3 (0,20)
T_add = ...
=============================================
.............****************=====
No: 48 GFLOPS: 0.53 / 5.90 results: MeasureResult(cost:[0.0077], error_no:0, all_cost:1.02, Tstamp:1607327014.61)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for j.3 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 49 GFLOPS: 4.20 / 5.90 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.41, Tstamp:1607327017.52)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,32)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,16)
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 50 GFLOPS: 0.54 / 5.90 results: MeasureResult(cost:[0.0076], error_no:0, all_cost:1.28, Tstamp:1607327018.24)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,80)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.2 (0,8)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 51 GFLOPS: 3.72 / 5.90 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.45, Tstamp:1607327019.40)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 64
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,4)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 52 GFLOPS: 0.90 / 5.90 results: MeasureResult(cost:[0.0046], error_no:0, all_cost:1.01, Tstamp:1607327020.17)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,80)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 53 GFLOPS: 0.06 / 5.90 results: MeasureResult(cost:[0.0706], error_no:0, all_cost:1.51, Tstamp:1607327021.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,64)
for ax0@ax1@.0.0 (0,4000)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,4)
for j.3 (0,125)
for k.2 (0,8)
T_dense = ...
for j.3 (0,125)
T_add = ...
==================================================
No: 54 GFLOPS: 0.60 / 5.90 results: MeasureResult(cost:[0.0069], error_no:0, all_cost:1.03, Tstamp:1607327021.95)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for j.4 (0,4)
T_dense = ...
for j.3 (0,8)
T_add = ...
==================================================
No: 55 GFLOPS: 0.21 / 5.90 results: MeasureResult(cost:[0.0197], error_no:0, all_cost:1.53, Tstamp:1607327022.70)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,25)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,320)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,25)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
for j.4 (0,20)
T_dense = ...
for j.3 (0,20)
T_add = ...
==================================================
No: 56 GFLOPS: 0.21 / 5.90 results: MeasureResult(cost:[0.0195], error_no:0, all_cost:1.54, Tstamp:1607327023.45)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,25)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,320)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,25)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
for j.4 (0,20)
T_dense = ...
for j.3 (0,20)
T_add = ...
==================================================
No: 57 GFLOPS: 0.77 / 5.90 results: MeasureResult(cost:[0.0053], error_no:0, all_cost:2.03, Tstamp:1607327024.69)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,10)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,40)
threadIdx.x ax0@ax1@.0.1 (0,10)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for k.1 (0,2)
for j.3 (0,10)
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,20)
T_add = ...
==================================================
No: 58 GFLOPS: 5.38 / 5.90 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.99, Tstamp:1607327025.81)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,100)
T_dense auto_unroll: 1024
for j ((blockIdx.x*10),10)
for k.0 (0,205)
threadIdx.x k.1 (0,10)
T_dense = ...
threadIdx.x j.1 (0,10)
T_add = ...
==================================================
No: 59 GFLOPS: 4.16 / 5.90 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.42, Tstamp:1607327026.94)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,32)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,16)
for j.3 (0,2)
for k.2 (0,4)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 60 GFLOPS: 3.17 / 5.90 results: MeasureResult(cost:[0.0013], error_no:0, all_cost:1.49, Tstamp:1607327028.20)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for j.3 (0,2)
for k.2 (0,8)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 61 GFLOPS: 0.65 / 5.90 results: MeasureResult(cost:[0.0063], error_no:0, all_cost:2.26, Tstamp:1607327028.94)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,160)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for j.3 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 62 GFLOPS: 0.41 / 5.90 results: MeasureResult(cost:[0.0099], error_no:0, all_cost:1.05, Tstamp:1607327029.65)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,160)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,4)
for j.3 (0,2)
for k.2 (0,4)
for j.4 (0,5)
T_dense = ...
for j.3 (0,10)
T_add = ...
==================================================
No: 63 GFLOPS: 4.01 / 5.90 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.35, Tstamp:1607327030.86)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 64 GFLOPS: 0.46 / 5.90 results: MeasureResult(cost:[0.0089], error_no:0, all_cost:1.08, Tstamp:1607327031.58)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
for j.4 (0,4)
T_dense = ...
for j.3 (0,4)
T_add = ...
Time elapsed for measurement: 73.24 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.36 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 58 fail_ct: 6086 Time elapsed: 1.04
GA Iter: 0 Max score: 0.9718 Min score: 0.0057 #Pop: 57 #M+: 0 #M-: 0
GA Iter: 4 Max score: 1.0000 Min score: 0.9795 #Pop: 128 #M+: 1409 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.11
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
......................************************==================================================
No: 65 GFLOPS: 3.41 / 5.90 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.38, Tstamp:1607327039.85)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 64
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 66 GFLOPS: 0.27 / 5.90 results: MeasureResult(cost:[0.0152], error_no:0, all_cost:2.47, Tstamp:1607327040.70)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,160)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
for j.4 (0,10)
T_dense = ...
for j.3 (0,10)
T_add = ...
==================================================
No: 67 GFLOPS: 2.58 / 5.90 results: MeasureResult(cost:[0.0016], error_no:0, all_cost:1.44, Tstamp:1607327041.84)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,8)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 68 GFLOPS: 0.39 / 5.90 results: MeasureResult(cost:[0.0105], error_no:0, all_cost:1.87, Tstamp:1607327042.55)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,8)
for j.3 (0,2)
for k.2 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 69 GFLOPS: 6.44 / 6.44 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.38, Tstamp:1607327043.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,10)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 70 GFLOPS: 1.05 / 6.44 results: MeasureResult(cost:[0.0039], error_no:0, all_cost:1.52, Tstamp:1607327044.97)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,8)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 71 GFLOPS: 0.63 / 6.44 results: MeasureResult(cost:[0.0065], error_no:0, all_cost:0.90, Tstamp:1607327045.69)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 72 GFLOPS: 1.14 / 6.44 results: MeasureResult(cost:[0.0036], error_no:0, all_cost:1.36, Tstamp:1607327046.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 73 GFLOPS: 2.98 / 6.44 results: MeasureResult(cost:[0.0014], error_no:0, all_cost:0.84, Tstamp:1607327047.57)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 74 GFLOPS: 0.48 / 6.44 results: MeasureResult(cost:[0.0085], error_no:0, all_cost:1.19, Tstamp:1607327048.27)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,8)
for j.3 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 75 GFLOPS: 2.24 / 6.44 results: MeasureResult(cost:[0.0018], error_no:0, all_cost:2.51, Tstamp:1607327049.52)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,50)
threadIdx.x i.2@j.2@ (0,10)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
for ax0@ax1@.0.0 (0,7)
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for k.1 (0,64)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 76 GFLOPS: 0.05 / 6.44 results: MeasureResult(cost:[0.0863], error_no:0, all_cost:1.86, Tstamp:1607327050.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,5)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,800)
threadIdx.x ax0@ax1@.0.1 (0,5)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,5)
data.shared = ...
for k.1 (0,8)
for j.3 (0,2)
for j.4 (0,25)
T_dense = ...
for j.3 (0,50)
T_add = ...
==================================================
No: 77 GFLOPS: 0.29 / 6.44 results: MeasureResult(cost:[0.0141], error_no:0, all_cost:1.10, Tstamp:1607327051.36)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,50)
T_dense auto_unroll: 64
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,40)
threadIdx.x ax0@ax1@.0.1 (0,50)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,50)
data.shared = ...
for k.1 (0,2)
for j.4 (0,20)
T_dense = ...
for j.3 (0,20)
T_add = ...
==================================================
No: 78 GFLOPS: 3.15 / 6.44 results: MeasureResult(cost:[0.0013], error_no:0, all_cost:1.51, Tstamp:1607327052.58)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 79 GFLOPS: 2.53 / 6.44 results: MeasureResult(cost:[0.0016], error_no:0, all_cost:1.33, Tstamp:1607327053.75)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,512)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 80 GFLOPS: 3.59 / 6.44 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.77, Tstamp:1607327054.93)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,32)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,64)
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 81 GFLOPS: 0.87 / 6.44 results: MeasureResult(cost:[0.0047], error_no:0, all_cost:1.04, Tstamp:1607327055.67)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,250)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,2)
for j.3 (0,4)
for k.2 (0,4)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 82 GFLOPS: 2.61 / 6.44 results: MeasureResult(cost:[0.0016], error_no:0, all_cost:1.63, Tstamp:1607327056.82)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,16)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 83 GFLOPS: 0.74 / 6.44 results: MeasureResult(cost:[0.0055], error_no:0, all_cost:1.34, Tstamp:1607327058.01)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 84 GFLOPS: 2.34 / 6.44 results: MeasureResult(cost:[0.0018], error_no:0, all_cost:1.00, Tstamp:1607327058.82)
....................************************
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 85 GFLOPS: 0.77 / 6.44 results: MeasureResult(cost:[0.0053], error_no:0, all_cost:1.33, Tstamp:1607327060.00)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 86 GFLOPS: 4.91 / 6.44 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.35, Tstamp:1607327061.17)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,250)
for j ((blockIdx.x*4),4)
for k.0 (0,512)
threadIdx.x k.1 (0,4)
T_dense = ...
threadIdx.x j.1 (0,4)
T_add = ...
==================================================
No: 87 GFLOPS: 5.86 / 6.44 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.33, Tstamp:1607327062.35)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 88 GFLOPS: 0.80 / 6.44 results: MeasureResult(cost:[0.0051], error_no:0, all_cost:0.87, Tstamp:1607327063.08)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,500)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,500)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
data.shared = ...
for k.2 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 89 GFLOPS: 1.77 / 6.44 results: MeasureResult(cost:[0.0023], error_no:0, all_cost:1.29, Tstamp:1607327065.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 90 GFLOPS: 0.53 / 6.44 results: MeasureResult(cost:[0.0077], error_no:0, all_cost:1.10, Tstamp:1607327066.69)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,512)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for j.3 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 91 GFLOPS: 0.59 / 6.44 results: MeasureResult(cost:[0.0069], error_no:0, all_cost:0.89, Tstamp:1607327067.37)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,1024)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for j.4 (0,4)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 92 GFLOPS: 0.11 / 6.44 results: MeasureResult(cost:[0.0363], error_no:0, all_cost:1.21, Tstamp:1607327068.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,10)
T_dense auto_unroll: 64
for k.0 (0,64)
for ax0@ax1@.0.0 (0,1600)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,32)
for j.3 (0,25)
T_dense = ...
for j.3 (0,25)
T_add = ...
==================================================
No: 93 GFLOPS: 2.80 / 6.44 results: MeasureResult(cost:[0.0015], error_no:0, all_cost:1.39, Tstamp:1607327069.40)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 94 GFLOPS: 0.09 / 6.44 results: MeasureResult(cost:[0.0477], error_no:0, all_cost:1.38, Tstamp:1607327070.30)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,25)
T_dense auto_unroll: 16
for k.0 (0,512)
for ax0@ax1@.0.0 (0,80)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,25)
data.shared = ...
for j.3 (0,10)
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,20)
T_add = ...
==================================================
No: 95 GFLOPS: 0.76 / 6.44 results: MeasureResult(cost:[0.0054], error_no:0, all_cost:0.84, Tstamp:1607327070.97)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,5)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for j.3 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 96 GFLOPS: 4.64 / 6.44 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.36, Tstamp:1607327072.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,500)
T_dense auto_unroll: 512
for j ((blockIdx.x*2),2)
for k.0 (0,1024)
threadIdx.x k.1 (0,2)
T_dense = ...
threadIdx.x j.1 (0,2)
T_add = ...
==================================================
No: 97 GFLOPS: 3.21 / 6.44 results: MeasureResult(cost:[0.0013], error_no:0, all_cost:1.91, Tstamp:1607327073.33)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,50)
T_dense auto_unroll: 16
for j ((blockIdx.x*20),20)
for k.0 (0,103)
threadIdx.x k.1 (0,20)
T_dense = ...
threadIdx.x j.1 (0,20)
T_add = ...
==================================================
No: 98 GFLOPS: 1.08 / 6.44 results: MeasureResult(cost:[0.0038], error_no:0, all_cost:0.98, Tstamp:1607327074.01)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,500)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,500)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 99 GFLOPS: 1.37 / 6.44 results: MeasureResult(cost:[0.0030], error_no:0, all_cost:1.42, Tstamp:1607327075.22)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,10)
threadIdx.x i.2@j.2@ (0,20)
T_dense auto_unroll: 16
for k.0 (0,512)
for ax0@ax1@.0.0 (0,20)
threadIdx.x ax0@ax1@.0.1 (0,20)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,20)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 100 GFLOPS: 0.54 / 6.44 results: MeasureResult(cost:[0.0077], error_no:0, all_cost:2.74, Tstamp:1607327076.45)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,160)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for j.3 (0,5)
for k.2 (0,16)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 101 GFLOPS: 3.80 / 6.44 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.47, Tstamp:1607327077.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,125)
for j ((blockIdx.x*8),8)
for k.0 (0,256)
threadIdx.x k.1 (0,8)
T_dense = ...
threadIdx.x j.1 (0,8)
T_add = ...
==================================================
No: 102 GFLOPS: 4.33 / 6.44 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.53, Tstamp:1607327078.86)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,8)
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 103 GFLOPS: 0.41 / 6.44 results: MeasureResult(cost:[0.0099], error_no:0, all_cost:0.92, Tstamp:1607327079.59)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,10)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for j.3 (0,5)
for j.4 (0,2)
T_dense = ...
for j.3 (0,10)
T_add = ...
==================================================
No: 104 GFLOPS: 1.29 / 6.44 results: MeasureResult(cost:[0.0032], error_no:0, all_cost:0.89, Tstamp:1607327080.26)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,512)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 105 GFLOPS: 7.03 / 7.03 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.44, Tstamp:1607327081.48)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
threadIdx.x i.2@j.2@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,5)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,5)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 106 GFLOPS: 3.81 / 7.03 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:2.51, Tstamp:1607327082.61)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,8)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 107 GFLOPS: 0.48 / 7.03 results: MeasureResult(cost:[0.0085], error_no:0, all_cost:2.09, Tstamp:1607327083.82)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for j.3 (0,2)
for k.2 (0,16)
for j.4 (0,2)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 108 GFLOPS: 4.14 / 7.03 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.40, Tstamp:1607327085.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,8)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 109 GFLOPS: 0.06 / 7.03 results: MeasureResult(cost:[0.0697], error_no:0, all_cost:1.38, Tstamp:1607327086.10)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,8000)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
data.shared = ...
for k.1 (0,4)
for j.3 (0,125)
for k.2 (0,8)
for j.4 (0,2)
T_dense = ...
for j.3 (0,250)
T_add = ...
==================================================
No: 110 GFLOPS: 5.71 / 7.03 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.32, Tstamp:1607327087.26)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,40)
threadIdx.x i.2@j.2@ (0,25)
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,25)
data.shared = ...
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 111 GFLOPS: 1.57 / 7.03 results: MeasureResult(cost:[0.0026], error_no:0, all_cost:1.37, Tstamp:1607327088.48)
................****************
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,512)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,4)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 112 GFLOPS: 5.66 / 7.03 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.30, Tstamp:1607327089.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,4)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 113 GFLOPS: 1.08 / 7.03 results: MeasureResult(cost:[0.0038], error_no:0, all_cost:0.84, Tstamp:1607327096.85)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,200)
for k.0 (0,512)
for ax0@ax1@.0.0 (0,20)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.1 (0,4)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 114 GFLOPS: 0.99 / 7.03 results: MeasureResult(cost:[0.0041], error_no:0, all_cost:1.84, Tstamp:1607327098.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,80)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,16)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 115 GFLOPS: 0.10 / 7.03 results: MeasureResult(cost:[0.0429], error_no:0, all_cost:7.41, Tstamp:1607327098.90)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,25)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,320)
threadIdx.x ax0@ax1@.0.1 (0,25)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,25)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for j.3 (0,4)
for k.2 (0,2)
for j.4 (0,5)
T_dense = ...
for j.3 (0,20)
T_add = ...
==================================================
No: 116 GFLOPS: 3.36 / 7.03 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:0.84, Tstamp:1607327099.58)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 117 GFLOPS: 1.07 / 7.03 results: MeasureResult(cost:[0.0038], error_no:0, all_cost:1.37, Tstamp:1607327100.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,10)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,10)
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,10)
threadIdx.x ax0@ax1@.0.1 (0,10)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for j.3 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 118 GFLOPS: 3.69 / 7.03 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.32, Tstamp:1607327101.91)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,512)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,100)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 119 GFLOPS: 8.45 / 8.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.92, Tstamp:1607327103.01)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 120 GFLOPS: 0.54 / 8.45 results: MeasureResult(cost:[0.0076], error_no:0, all_cost:1.01, Tstamp:1607327103.82)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,20)
threadIdx.x i.2@j.2@ (0,10)
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,10)
vectorize ax0@ax1@.1 (0,16)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,10)
data.shared = ...
for j.3 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 121 GFLOPS: 0.75 / 8.45 results: MeasureResult(cost:[0.0055], error_no:0, all_cost:2.23, Tstamp:1607327104.50)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,10)
threadIdx.x i.2@j.2@ (0,20)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,160)
threadIdx.x ax0@ax1@.0.1 (0,20)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,20)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for j.3 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 122 GFLOPS: 5.88 / 8.45 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.43, Tstamp:1607327105.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,200)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,200)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,200)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 123 GFLOPS: 2.73 / 8.45 results: MeasureResult(cost:[0.0015], error_no:0, all_cost:1.65, Tstamp:1607327107.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,2)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,125)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,8)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 124 GFLOPS: 1.27 / 8.45 results: MeasureResult(cost:[0.0032], error_no:0, all_cost:1.64, Tstamp:1607327108.50)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,500)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,500)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,500)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 125 GFLOPS: 0.07 / 8.45 results: MeasureResult(cost:[0.0555], error_no:0, all_cost:1.48, Tstamp:1607327109.59)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
T_dense auto_unroll: 16
for k.0 (0,32)
for ax0@ax1@.0.0 (0,12800)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
data.shared = ...
for k.1 (0,2)
for j.3 (0,100)
for k.2 (0,32)
for j.4 (0,2)
T_dense = ...
for j.3 (0,200)
T_add = ...
==================================================
No: 126 GFLOPS: 3.11 / 8.45 results: MeasureResult(cost:[0.0013], error_no:0, all_cost:1.62, Tstamp:1607327110.97)
==================================================
Placeholder: data, weight, bias
blockIdx.x i@j.0@ (0,16)
T_dense auto_unroll: 1024
for j ((blockIdx.x*64),64)
for k.0 (0,32)
threadIdx.x k.1 (0,64)
T_dense = ...
threadIdx.x j.1 (0,64)
T_add = ...
==================================================
No: 127 GFLOPS: 0.54 / 8.45 results: MeasureResult(cost:[0.0075], error_no:0, all_cost:1.07, Tstamp:1607327111.85)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,4)
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 128 GFLOPS: 0.20 / 8.45 results: MeasureResult(cost:[0.0205], error_no:0, all_cost:2.23, Tstamp:1607327112.78)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,160)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for j.3 (0,5)
for k.2 (0,16)
T_dense = ...
for j.3 (0,5)
T_add = ...
Time elapsed for measurement: 75.66 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.14 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 56 fail_ct: 8136 Time elapsed: 1.42
GA Iter: 0 Max score: 0.7423 Min score: 0.0469 #Pop: 54 #M+: 0 #M-: 0
GA Iter: 4 Max score: 1.1749 Min score: 0.9942 #Pop: 128 #M+: 1391 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 3.69
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
......................************************==================================================
No: 129 GFLOPS: 4.65 / 8.45 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.25, Tstamp:1607327122.47)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for ax0@ax1@.0.0 (0,2048)
weight.shared = ...
for ax0@ax1@.0.0 (0,2048)
data.shared = ...
for k.1 (0,2048)
T_dense = ...
T_add = ...
==================================================
No: 130 GFLOPS: 4.65 / 8.45 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.49, Tstamp:1607327123.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for ax0@ax1@.0.0 (0,2048)
weight.shared = ...
for ax0@ax1@.0.0 (0,2048)
data.shared = ...
for k.1 (0,512)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 131 GFLOPS: 8.07 / 8.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.53, Tstamp:1607327125.09)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 132 GFLOPS: 8.20 / 8.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.05, Tstamp:1607327126.36)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 133 GFLOPS: 8.07 / 8.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.52, Tstamp:1607327127.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 134 GFLOPS: 7.60 / 8.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.75, Tstamp:1607327129.05)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,64)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,16)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 135 GFLOPS: 8.21 / 8.45 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.05, Tstamp:1607327130.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,64)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 136 GFLOPS: 14.55 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327131.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,2048)
weight.shared = ...
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 137 GFLOPS: 14.54 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.48, Tstamp:1607327132.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
for k.0 (0,2048)
weight.shared = ...
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 138 GFLOPS: 14.54 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327134.18)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,2048)
weight.shared = ...
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 139 GFLOPS: 7.80 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.76, Tstamp:1607327135.46)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,64)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 140 GFLOPS: 6.76 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.97, Tstamp:1607327136.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 64
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,64)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 141 GFLOPS: 5.04 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.87, Tstamp:1607327138.13)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,80)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 142 GFLOPS: 7.60 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.57, Tstamp:1607327139.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,16)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 143 GFLOPS: 14.54 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.49, Tstamp:1607327140.75)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,2048)
weight.shared = ...
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 144 GFLOPS: 2.11 / 14.55 results: MeasureResult(cost:[0.0019], error_no:0, all_cost:4.32, Tstamp:1607327142.08)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,2)
for ax0@ax1@.0.0 (0,2560)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,512)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,1024)
T_dense = ...
T_add = ...
==================================================
No: 145 GFLOPS: 5.86 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.45, Tstamp:1607327143.37)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,10)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 146 GFLOPS: 9.78 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327144.72)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 147 GFLOPS: 9.53 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327146.00)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,16)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 148 GFLOPS: 8.20 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.02, Tstamp:1607327147.28)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,128)
T_dense = ...
T_add = ...
==================================================
No: 149 GFLOPS: 6.51 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.53, Tstamp:1607327148.61)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,20)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 150 GFLOPS: 1.47 / 14.55 results: MeasureResult(cost:[0.0028], error_no:0, all_cost:1.58, Tstamp:1607327149.98)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
T_dense auto_unroll: 1024
for ax0@ax1@.0.0 (0,10240)
weight.shared = ...
for ax0@ax1@.0.0 (0,2048)
data.shared = ...
for k.1 (0,2048)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 151 GFLOPS: 8.07 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.48, Tstamp:1607327151.26)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 152 GFLOPS: 8.07 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.53, Tstamp:1607327152.57)
.................T***********************
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 153 GFLOPS: 8.07 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.36, Tstamp:1607327168.94)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 154 GFLOPS: 10.50 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.58, Tstamp:1607327170.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 155 GFLOPS: 8.37 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.03, Tstamp:1607327171.58)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,8)
for ax0@ax1@.0.0 (0,256)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 156 GFLOPS: 8.20 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.25, Tstamp:1607327172.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 157 GFLOPS: 8.20 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.27, Tstamp:1607327174.23)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 158 GFLOPS: 8.19 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.29, Tstamp:1607327175.55)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 159 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327176.82)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 160 GFLOPS: 10.50 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.56, Tstamp:1607327178.18)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,16)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 161 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327179.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 162 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327180.69)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 163 GFLOPS: 8.20 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.40, Tstamp:1607327182.08)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 164 GFLOPS: 0.42 / 14.55 results: MeasureResult(cost:[0.0098], error_no:0, all_cost:9.79, Tstamp:1607327182.96)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,640)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,128)
T_dense = ...
T_add = ...
==================================================
No: 165 GFLOPS: 1.42 / 14.55 results: MeasureResult(cost:[0.0029], error_no:0, all_cost:2.32, Tstamp:1607327184.29)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
T_dense auto_unroll: 1024
for ax0@ax1@.0.0 (0,10240)
weight.shared = ...
for ax0@ax1@.0.0 (0,2048)
data.shared = ...
for k.1 (0,32)
for k.2 (0,64)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 166 GFLOPS: 8.50 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.83, Tstamp:1607327185.67)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,64)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 167 GFLOPS: 8.19 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.17, Tstamp:1607327186.94)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 168 GFLOPS: 0.00 / 14.55 results: MeasureResult(error_type:BuildTimeoutError, error_msg:, all_cost:15.00, Tstamp:1607327186.95)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,2)
for ax0@ax1@.0.0 (0,1024)
weight.shared = ...
for ax0@ax1@.0.0 (0,512)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 169 GFLOPS: 8.19 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.16, Tstamp:1607327188.22)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 170 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327189.53)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 171 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.43, Tstamp:1607327190.81)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 172 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.42, Tstamp:1607327192.08)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 173 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.54, Tstamp:1607327193.47)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 174 GFLOPS: 8.07 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.54, Tstamp:1607327194.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 175 GFLOPS: 5.04 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:4.70, Tstamp:1607327196.11)
...............****************
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
for k.0 (0,4)
for ax0@ax1@.0.0 (0,1024)
weight.shared = ...
for ax0@ax1@.0.0 (0,256)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 176 GFLOPS: 2.97 / 14.55 results: MeasureResult(cost:[0.0014], error_no:0, all_cost:1.67, Tstamp:1607327197.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
for k.0 (0,64)
for ax0@ax1@.0.0 (0,64)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,16)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 177 GFLOPS: 5.04 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.85, Tstamp:1607327201.86)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,160)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 178 GFLOPS: 5.04 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:2.01, Tstamp:1607327203.21)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,160)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 179 GFLOPS: 2.89 / 14.55 results: MeasureResult(cost:[0.0014], error_no:0, all_cost:1.59, Tstamp:1607327204.54)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,160)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 180 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327205.81)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 181 GFLOPS: 6.27 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.55, Tstamp:1607327207.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,20)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 182 GFLOPS: 6.51 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.58, Tstamp:1607327208.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,40)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 183 GFLOPS: 8.20 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.23, Tstamp:1607327209.73)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,128)
weight.shared = ...
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,64)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 184 GFLOPS: 1.55 / 14.55 results: MeasureResult(cost:[0.0026], error_no:0, all_cost:4.63, Tstamp:1607327211.14)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
vthread i.1@j.1@ (0,5)
T_dense auto_unroll: 1024
for k.0 (0,2)
for ax0@ax1@.0.0 (0,5120)
weight.shared = ...
for ax0@ax1@.0.0 (0,512)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,1024)
T_dense = ...
T_add = ...
==================================================
No: 185 GFLOPS: 8.07 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.59, Tstamp:1607327212.46)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 186 GFLOPS: 8.07 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.54, Tstamp:1607327213.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 187 GFLOPS: 3.84 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:2.01, Tstamp:1607327215.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,200)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,80)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,16)
for j.4 (0,5)
T_dense = ...
for j.3 (0,5)
T_add = ...
==================================================
No: 188 GFLOPS: 8.19 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.26, Tstamp:1607327216.45)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 189 GFLOPS: 8.19 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:2.09, Tstamp:1607327217.74)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,8)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 190 GFLOPS: 8.20 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.99, Tstamp:1607327219.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,8)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 191 GFLOPS: 8.19 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.95, Tstamp:1607327220.34)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,128)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 192 GFLOPS: 10.69 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.43, Tstamp:1607327221.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
Time elapsed for measurement: 103.55 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.18 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 54 fail_ct: 6090 Time elapsed: 1.07
GA Iter: 0 Max score: 0.3385 Min score: 0.0067 #Pop: 53 #M+: 0 #M-: 0
GA Iter: 4 Max score: 1.4983 Min score: 1.3601 #Pop: 128 #M+: 1393 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.05
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
........................************************==================================================
No: 193 GFLOPS: 3.64 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.26, Tstamp:1607327228.73)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 194 GFLOPS: 3.64 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.57, Tstamp:1607327230.12)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 195 GFLOPS: 4.23 / 14.55 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.49, Tstamp:1607327231.43)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 196 GFLOPS: 4.23 / 14.55 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.47, Tstamp:1607327232.74)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 197 GFLOPS: 4.23 / 14.55 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.66, Tstamp:1607327234.13)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 198 GFLOPS: 3.64 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.56, Tstamp:1607327235.43)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 199 GFLOPS: 3.89 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.59, Tstamp:1607327236.73)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 200 GFLOPS: 3.35 / 14.55 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.68, Tstamp:1607327238.14)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 201 GFLOPS: 3.31 / 14.55 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.58, Tstamp:1607327239.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 202 GFLOPS: 3.31 / 14.55 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.58, Tstamp:1607327240.75)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 203 GFLOPS: 3.35 / 14.55 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.60, Tstamp:1607327242.11)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 204 GFLOPS: 8.68 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.58, Tstamp:1607327243.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 205 GFLOPS: 8.66 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.48, Tstamp:1607327244.70)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 206 GFLOPS: 5.55 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.61, Tstamp:1607327246.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 207 GFLOPS: 4.23 / 14.55 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.48, Tstamp:1607327247.37)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 16
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 208 GFLOPS: 3.64 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.44, Tstamp:1607327248.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 16
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 209 GFLOPS: 8.27 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.56, Tstamp:1607327250.05)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 210 GFLOPS: 8.27 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.50, Tstamp:1607327251.36)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 211 GFLOPS: 8.27 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.45, Tstamp:1607327252.64)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 212 GFLOPS: 3.89 / 14.55 results: MeasureResult(cost:[0.0011], error_no:0, all_cost:1.52, Tstamp:1607327254.00)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 213 GFLOPS: 14.07 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.39, Tstamp:1607327255.24)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
.......................************************
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 214 GFLOPS: 3.31 / 14.55 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.47, Tstamp:1607327256.55)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,2048)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 215 GFLOPS: 5.36 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.58, Tstamp:1607327257.91)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 216 GFLOPS: 3.90 / 14.55 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.53, Tstamp:1607327259.23)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,64)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,10)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 217 GFLOPS: 10.55 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.28, Tstamp:1607327261.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 218 GFLOPS: 10.39 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.63, Tstamp:1607327262.64)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 219 GFLOPS: 5.85 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.49, Tstamp:1607327263.93)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,1024)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 220 GFLOPS: 13.12 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327265.20)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 221 GFLOPS: 7.42 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.72, Tstamp:1607327266.55)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,64)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 222 GFLOPS: 7.77 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.54, Tstamp:1607327267.86)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,64)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 223 GFLOPS: 5.55 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.60, Tstamp:1607327269.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 224 GFLOPS: 5.55 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.69, Tstamp:1607327270.52)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,64)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 225 GFLOPS: 5.47 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.50, Tstamp:1607327271.84)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 226 GFLOPS: 5.24 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.50, Tstamp:1607327273.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 227 GFLOPS: 7.61 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.56, Tstamp:1607327274.53)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,512)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 228 GFLOPS: 5.23 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.48, Tstamp:1607327275.85)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 229 GFLOPS: 12.86 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.61, Tstamp:1607327277.11)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 230 GFLOPS: 7.61 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.60, Tstamp:1607327278.47)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 231 GFLOPS: 5.47 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.44, Tstamp:1607327279.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 232 GFLOPS: 7.31 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:2.00, Tstamp:1607327281.08)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,64)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 233 GFLOPS: 7.37 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.71, Tstamp:1607327282.48)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 234 GFLOPS: 7.29 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.95, Tstamp:1607327283.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,32)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 235 GFLOPS: 7.37 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.48, Tstamp:1607327285.08)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,512)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,16)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 236 GFLOPS: 11.83 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.55, Tstamp:1607327286.46)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 237 GFLOPS: 11.84 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.41, Tstamp:1607327287.72)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 238 GFLOPS: 11.84 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327288.99)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
................****************data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 239 GFLOPS: 7.06 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.51, Tstamp:1607327290.35)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,512)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,8)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 240 GFLOPS: 7.22 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.47, Tstamp:1607327291.67)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,512)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,8)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 241 GFLOPS: 4.78 / 14.55 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.27, Tstamp:1607327293.36)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,1024)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 242 GFLOPS: 4.79 / 14.55 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.52, Tstamp:1607327294.72)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,1024)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 243 GFLOPS: 7.61 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.54, Tstamp:1607327296.00)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 244 GFLOPS: 10.54 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.84, Tstamp:1607327297.27)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 245 GFLOPS: 7.37 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.75, Tstamp:1607327298.59)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 246 GFLOPS: 7.40 / 14.55 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.71, Tstamp:1607327299.87)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 247 GFLOPS: 5.76 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.45, Tstamp:1607327301.17)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 248 GFLOPS: 5.35 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.60, Tstamp:1607327302.56)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 249 GFLOPS: 5.35 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.48, Tstamp:1607327303.85)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 250 GFLOPS: 5.35 / 14.55 results: MeasureResult(cost:[0.0008], error_no:0, all_cost:1.54, Tstamp:1607327305.20)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 251 GFLOPS: 5.51 / 14.55 results: MeasureResult(cost:[0.0007], error_no:0, all_cost:1.58, Tstamp:1607327306.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 252 GFLOPS: 9.28 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327307.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 253 GFLOPS: 8.41 / 14.55 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.47, Tstamp:1607327309.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 254 GFLOPS: 4.33 / 14.55 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.50, Tstamp:1607327310.53)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,8)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,512)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 255 GFLOPS: 0.42 / 14.55 results: MeasureResult(cost:[0.0099], error_no:0, all_cost:1.05, Tstamp:1607327311.43)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 256 GFLOPS: 9.28 / 14.55 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.37, Tstamp:1607327312.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
Time elapsed for measurement: 85.74 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.24 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 62 fail_ct: 8130 Time elapsed: 1.43
GA Iter: 0 Max score: 0.4568 Min score: 0.0184 #Pop: 60 #M+: 0 #M-: 0
GA Iter: 4 Max score: 0.9958 Min score: 0.7919 #Pop: 128 #M+: 1401 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.18
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
........................************************==================================================
No: 257 GFLOPS: 14.53 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.19, Tstamp:1607327320.34)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,2048)
weight.shared = ...
data.shared = ...
T_dense = ...
T_add = ...
==================================================
No: 258 GFLOPS: 14.07 / 14.55 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327321.63)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 259 GFLOPS: 15.57 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.39, Tstamp:1607327322.86)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 260 GFLOPS: 15.57 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327324.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 261 GFLOPS: 15.57 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327325.46)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 262 GFLOPS: 15.57 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327326.70)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 263 GFLOPS: 14.07 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327328.01)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 264 GFLOPS: 15.57 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.42, Tstamp:1607327329.27)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 265 GFLOPS: 14.07 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327330.56)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 266 GFLOPS: 15.57 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.51, Tstamp:1607327331.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 267 GFLOPS: 9.98 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.64, Tstamp:1607327333.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 268 GFLOPS: 10.43 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327334.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 269 GFLOPS: 10.38 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.69, Tstamp:1607327335.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 270 GFLOPS: 10.43 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327337.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 271 GFLOPS: 7.80 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.44, Tstamp:1607327338.35)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 272 GFLOPS: 10.44 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327339.70)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 273 GFLOPS: 12.69 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327340.97)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 274 GFLOPS: 10.76 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.43, Tstamp:1607327342.25)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 275 GFLOPS: 10.75 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.50, Tstamp:1607327343.61)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 276 GFLOPS: 10.75 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327344.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
=========
........................************************=========================================
No: 277 GFLOPS: 10.76 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327346.17)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 278 GFLOPS: 9.96 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.65, Tstamp:1607327347.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for j.3 (0,2)
for k.2 (0,32)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 279 GFLOPS: 10.75 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.42, Tstamp:1607327348.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 280 GFLOPS: 10.75 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.37, Tstamp:1607327350.02)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 281 GFLOPS: 10.76 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.28, Tstamp:1607327351.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 282 GFLOPS: 10.75 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.44, Tstamp:1607327353.03)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 283 GFLOPS: 10.75 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327354.33)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 284 GFLOPS: 11.83 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.52, Tstamp:1607327355.64)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 285 GFLOPS: 10.21 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327356.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,512)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 286 GFLOPS: 9.87 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327358.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 287 GFLOPS: 13.13 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.49, Tstamp:1607327359.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 288 GFLOPS: 13.13 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.53, Tstamp:1607327360.79)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 289 GFLOPS: 13.12 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327362.09)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 290 GFLOPS: 14.07 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327363.38)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 291 GFLOPS: 8.44 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.56, Tstamp:1607327364.66)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 292 GFLOPS: 13.13 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327365.92)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 293 GFLOPS: 10.44 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327367.26)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 294 GFLOPS: 10.11 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.43, Tstamp:1607327368.57)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 295 GFLOPS: 10.65 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.40, Tstamp:1607327369.83)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 296 GFLOPS: 10.66 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327371.20)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 297 GFLOPS: 10.64 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.40, Tstamp:1607327372.47)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 298 GFLOPS: 12.86 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.58, Tstamp:1607327373.75)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 299 GFLOPS: 8.81 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.61, Tstamp:1607327375.12)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 300 GFLOPS: 8.81 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.57, Tstamp:1607327376.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 301 GFLOPS: 8.80 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.46, Tstamp:1607327377.70)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,1000)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 302 GFLOPS: 12.72 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327379.04)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 303 GFLOPS: 11.71 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327380.31)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
................****************
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 304 GFLOPS: 10.43 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327381.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
for j.4 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 305 GFLOPS: 6.64 / 15.57 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.86, Tstamp:1607327383.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,64)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,16)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for j.3 (0,4)
for k.2 (0,32)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 306 GFLOPS: 7.61 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.60, Tstamp:1607327384.81)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 307 GFLOPS: 10.65 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327386.10)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 308 GFLOPS: 10.65 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327387.39)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 309 GFLOPS: 13.13 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327388.64)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 310 GFLOPS: 7.76 / 15.57 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.50, Tstamp:1607327389.95)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 311 GFLOPS: 12.74 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.55, Tstamp:1607327391.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 312 GFLOPS: 13.13 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327392.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 313 GFLOPS: 13.11 / 15.57 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327393.87)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 314 GFLOPS: 10.65 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327395.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 315 GFLOPS: 10.65 / 15.57 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327396.48)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
vthread i.1@j.1@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 316 GFLOPS: 4.69 / 15.57 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.49, Tstamp:1607327397.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
T_dense auto_unroll: 512
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
data.shared = ...
for j.4 (0,4)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 317 GFLOPS: 4.00 / 15.57 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.51, Tstamp:1607327399.18)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
T_dense auto_unroll: 512
for k.0 (0,2048)
for ax0@ax1@.0.0 (0,4)
weight.shared = ...
data.shared = ...
for j.4 (0,4)
T_dense = ...
for j.3 (0,4)
T_add = ...
==================================================
No: 318 GFLOPS: 0.64 / 15.57 results: MeasureResult(cost:[0.0064], error_no:0, all_cost:1.17, Tstamp:1607327400.06)
==================================================
Placeholder: data, weight, bias
vthread i.1@j.1@ (0,4)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,8)
for j.3 (0,2)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 319 GFLOPS: 3.29 / 15.57 results: MeasureResult(cost:[0.0012], error_no:0, all_cost:1.01, Tstamp:1607327400.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,5)
threadIdx.x i.2@j.2@ (0,100)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for j.3 (0,2)
for k.2 (0,16)
T_dense = ...
for j.3 (0,2)
T_add = ...
==================================================
No: 320 GFLOPS: 0.98 / 15.57 results: MeasureResult(cost:[0.0042], error_no:0, all_cost:1.55, Tstamp:1607327402.30)
==================================================
Placeholder: data, weight, bias
threadIdx.x i.2@j.2@ (0,125)
for k.0 (0,128)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,16)
for j.3 (0,2)
for j.4 (0,4)
T_dense = ...
for j.3 (0,8)
T_add = ...
Time elapsed for measurement: 83.73 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.23 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 50 fail_ct: 6094 Time elapsed: 1.08
GA Iter: 0 Max score: 0.2953 Min score: -0.0289 #Pop: 49 #M+: 0 #M-: 0
GA Iter: 4 Max score: 0.9867 Min score: 0.7344 #Pop: 128 #M+: 1396 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.13
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
........................************************==================================================
No: 321 GFLOPS: 15.58 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.15, Tstamp:1607327409.38)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 322 GFLOPS: 15.57 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327410.63)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 323 GFLOPS: 15.58 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327411.87)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 324 GFLOPS: 15.58 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.52, Tstamp:1607327413.21)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 325 GFLOPS: 15.58 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327414.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 326 GFLOPS: 15.57 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327415.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 327 GFLOPS: 15.57 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327417.07)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 328 GFLOPS: 15.57 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327418.34)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 329 GFLOPS: 15.57 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327419.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 330 GFLOPS: 15.56 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.50, Tstamp:1607327420.94)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 331 GFLOPS: 14.06 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.41, Tstamp:1607327422.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 332 GFLOPS: 14.06 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327423.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 333 GFLOPS: 14.06 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.42, Tstamp:1607327424.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 334 GFLOPS: 14.07 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327426.05)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 335 GFLOPS: 14.07 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.38, Tstamp:1607327427.29)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 336 GFLOPS: 14.06 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.48, Tstamp:1607327428.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 337 GFLOPS: 14.05 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.39, Tstamp:1607327429.88)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 338 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327431.14)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 339 GFLOPS: 14.07 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327432.46)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
........................************************
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 340 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327433.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 341 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327435.10)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 342 GFLOPS: 14.08 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327436.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 343 GFLOPS: 14.06 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327437.74)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 344 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.40, Tstamp:1607327439.02)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 345 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.28, Tstamp:1607327440.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 346 GFLOPS: 10.44 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327442.11)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 347 GFLOPS: 14.07 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327443.39)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 348 GFLOPS: 13.11 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.50, Tstamp:1607327444.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 349 GFLOPS: 13.11 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327445.93)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 350 GFLOPS: 13.11 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327447.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 351 GFLOPS: 13.11 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327448.45)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 352 GFLOPS: 10.42 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327449.72)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 353 GFLOPS: 13.11 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327451.00)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 354 GFLOPS: 13.11 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.54, Tstamp:1607327452.34)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 355 GFLOPS: 10.42 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327453.61)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 356 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327454.88)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 357 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327456.21)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 358 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327457.51)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 359 GFLOPS: 13.12 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.39, Tstamp:1607327458.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 360 GFLOPS: 10.43 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327460.09)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 361 GFLOPS: 10.44 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327461.41)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 362 GFLOPS: 13.12 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327462.67)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 363 GFLOPS: 10.45 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327463.98)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 364 GFLOPS: 12.71 /
................****************15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327465.25)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 365 GFLOPS: 12.71 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.42, Tstamp:1607327466.52)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 366 GFLOPS: 9.73 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327467.84)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 367 GFLOPS: 6.95 / 15.58 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.55, Tstamp:1607327469.13)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 368 GFLOPS: 9.73 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.44, Tstamp:1607327470.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 369 GFLOPS: 9.72 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.28, Tstamp:1607327471.96)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 370 GFLOPS: 9.74 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.52, Tstamp:1607327473.28)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 371 GFLOPS: 11.68 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.43, Tstamp:1607327474.52)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 372 GFLOPS: 11.71 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.58, Tstamp:1607327475.87)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 373 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.60, Tstamp:1607327477.11)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 374 GFLOPS: 10.86 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.54, Tstamp:1607327478.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 375 GFLOPS: 10.85 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.59, Tstamp:1607327479.83)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 376 GFLOPS: 11.67 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.52, Tstamp:1607327481.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 377 GFLOPS: 11.70 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327482.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 378 GFLOPS: 10.85 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.56, Tstamp:1607327483.81)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 379 GFLOPS: 11.71 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.50, Tstamp:1607327485.10)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 380 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327486.40)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 381 GFLOPS: 4.77 / 15.58 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.51, Tstamp:1607327487.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 382 GFLOPS: 2.71 / 15.58 results: MeasureResult(cost:[0.0015], error_no:0, all_cost:1.06, Tstamp:1607327488.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,125)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,125)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,125)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 383 GFLOPS: 4.76 / 15.58 results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.39, Tstamp:1607327489.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,2)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 384 GFLOPS: 11.67 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327491.24)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
Time elapsed for measurement: 83.47 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.31 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 60 fail_ct: 6084 Time elapsed: 1.06
GA Iter: 0 Max score: 0.3132 Min score: 0.0005 #Pop: 59 #M+: 0 #M-: 0
GA Iter: 4 Max score: 0.8197 Min score: 0.6926 #Pop: 128 #M+: 1389 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.29
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
...............*****************E*******==================================================
No: 385 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.22, Tstamp:1607327498.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 386 GFLOPS: 12.84 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.62, Tstamp:1607327500.04)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 387 GFLOPS: 12.83 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.70, Tstamp:1607327501.39)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 388 GFLOPS: 12.96 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.41, Tstamp:1607327502.64)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 389 GFLOPS: 12.95 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327503.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 390 GFLOPS: 12.95 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.51, Tstamp:1607327505.24)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 391 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327506.52)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 392 GFLOPS: 12.71 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327507.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 393 GFLOPS: 10.10 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.93, Tstamp:1607327509.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,16)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 394 GFLOPS: 6.89 / 15.58 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.52, Tstamp:1607327510.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,4)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,256)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 395 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.49, Tstamp:1607327511.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 396 GFLOPS: 12.84 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.66, Tstamp:1607327513.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 397 GFLOPS: 10.11 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.82, Tstamp:1607327514.33)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,16)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 398 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.65, Tstamp:1607327515.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 399 GFLOPS: 12.84 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.67, Tstamp:1607327516.93)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 400 GFLOPS: 11.66 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327518.22)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 401 GFLOPS: 0.00 / 15.58 results: MeasureResult(error_type:RuntimeDeviceError, error_msg:Traceback (most recent call last):
File "/home/masa/projects/dev/tvm/python/tvm/auto_scheduler/measure.py", line 905, in _timed_rpc_run
costs = time_f(*args).results
File "/home/masa/projects/dev/tvm/python/tvm/runtime/module.py", line 226, in eval
...
b8) [0x7f7ca1943068]
[bt] (0) /home/masa/projects/dev/tvm/build/libtvm.so(+0x14498d8) [0x7f7ca193e8d8]
File "/home/masa/projects/dev/tvm/src/runtime/rpc/../../support/socket.h", line 360
TVMError: Socket SockChannel::Recv Error:Interrupted system call
, all_cost:1.20, Tstamp:1607327519.26)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 402 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.51, Tstamp:1607327520.83)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 403 GFLOPS: 12.72 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.49, Tstamp:1607327522.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 404 GFLOPS: 8.19 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.44, Tstamp:1607327523.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 405 GFLOPS: 8.20 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.49, Tstamp:1607327524.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 406 GFLOPS: 11.69 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327526.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 407 GFLOPS: 11.68 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327527.33)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 408 GFLOPS: 11.66 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327528.67)
==========================================
........................************************========
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 409 GFLOPS: 12.83 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.34, Tstamp:1607327530.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 410 GFLOPS: 11.66 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.44, Tstamp:1607327531.84)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 411 GFLOPS: 10.38 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.60, Tstamp:1607327533.17)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 412 GFLOPS: 12.96 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.51, Tstamp:1607327534.47)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 413 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327535.70)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 414 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.51, Tstamp:1607327537.04)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 415 GFLOPS: 11.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327538.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 416 GFLOPS: 11.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.48, Tstamp:1607327539.59)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 417 GFLOPS: 11.66 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.58, Tstamp:1607327540.94)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 418 GFLOPS: 11.67 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327542.23)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 419 GFLOPS: 7.86 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.46, Tstamp:1607327543.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 420 GFLOPS: 7.85 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.51, Tstamp:1607327544.82)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 421 GFLOPS: 11.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.41, Tstamp:1607327546.11)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 422 GFLOPS: 7.35 / 15.58 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.90, Tstamp:1607327547.38)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,16)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 423 GFLOPS: 11.65 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.55, Tstamp:1607327548.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 424 GFLOPS: 8.20 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.45, Tstamp:1607327550.09)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 425 GFLOPS: 8.21 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.42, Tstamp:1607327551.37)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 426 GFLOPS: 12.98 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.51, Tstamp:1607327552.72)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 427 GFLOPS: 6.92 / 15.58 results: MeasureResult(cost:[0.0006], error_no:0, all_cost:1.53, Tstamp:1607327554.04)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,4)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,128)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 428 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327555.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 429 GFLOPS: 3.03 / 15.58 results: MeasureResult(cost:[0.0014], error_no:0, all_cost:1.33, Tstamp:1607327556.24)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,64)
threadIdx.x ax0@ax1@.0.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 430 GFLOPS: 10.07 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.44, Tstamp:1607327557.51)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 431 GFLOPS: 10.06 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.42, Tstamp:1607327558.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 432 GFLOPS: 11.66 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327560.16)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@
................****************ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 433 GFLOPS: 0.48 / 15.58 results: MeasureResult(cost:[0.0085], error_no:0, all_cost:14.29, Tstamp:1607327574.48)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,4)
for ax0@ax1@.0.0 (0,128)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 434 GFLOPS: 10.09 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.52, Tstamp:1607327575.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 435 GFLOPS: 10.08 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.52, Tstamp:1607327577.09)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 436 GFLOPS: 10.10 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.50, Tstamp:1607327578.37)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 437 GFLOPS: 10.12 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.05, Tstamp:1607327579.22)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 438 GFLOPS: 10.11 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327580.53)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 439 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327581.77)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 440 GFLOPS: 12.72 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327583.05)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 441 GFLOPS: 7.86 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.50, Tstamp:1607327584.38)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 442 GFLOPS: 7.86 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.50, Tstamp:1607327585.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 443 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.60, Tstamp:1607327586.97)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 444 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.68, Tstamp:1607327588.31)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 445 GFLOPS: 11.80 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.39, Tstamp:1607327589.56)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 446 GFLOPS: 3.94 / 15.58 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.00, Tstamp:1607327590.44)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,4)
threadIdx.x i.2@j.2@ (0,250)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,250)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,250)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 447 GFLOPS: 4.12 / 15.58 results: MeasureResult(cost:[0.0010], error_no:0, all_cost:1.55, Tstamp:1607327591.88)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,10)
threadIdx.x i.2@j.2@ (0,100)
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
threadIdx.x ax0@ax1@.0.1 (0,100)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,100)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 448 GFLOPS: 12.99 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.45, Tstamp:1607327593.20)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
Time elapsed for measurement: 96.27 s
----------------------------------------------------------------------
------------------------------ [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.31 s
----------------------------------------------------------------------
------------------------------ [ Search ]
----------------------------------------------------------------------
Sample Initial Population #s: 55 fail_ct: 8137 Time elapsed: 1.40
GA Iter: 0 Max score: 0.3300 Min score: -0.0614 #Pop: 54 #M+: 0 #M-: 0
GA Iter: 4 Max score: 0.8394 Min score: 0.6883 #Pop: 128 #M+: 1397 #M-: 0
EvolutionarySearch #s: 128 Time elapsed: 4.24
----------------------------------------------------------------------
------------------------------ [ Measure ]
----------------------------------------------------------------------
Get 52 programs to measure:
........................************************==================================================
No: 449 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.22, Tstamp:1607327600.95)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 450 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.62, Tstamp:1607327602.27)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 451 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327603.53)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 452 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327604.78)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 453 GFLOPS: 12.98 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.54, Tstamp:1607327606.14)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 454 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.44, Tstamp:1607327607.42)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 455 GFLOPS: 12.84 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.57, Tstamp:1607327608.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 456 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.54, Tstamp:1607327610.05)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 457 GFLOPS: 12.75 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.43, Tstamp:1607327611.32)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 458 GFLOPS: 12.98 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.42, Tstamp:1607327612.59)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 459 GFLOPS: 12.97 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327613.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 460 GFLOPS: 12.72 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.47, Tstamp:1607327615.18)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 461 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.54, Tstamp:1607327616.41)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,2)
for k.2 (0,32)
T_dense = ...
T_add = ...
==================================================
No: 462 GFLOPS: 12.73 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.48, Tstamp:1607327617.74)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 463 GFLOPS: 10.05 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.43, Tstamp:1607327619.01)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 464 GFLOPS: 10.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.44, Tstamp:1607327620.28)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 465 GFLOPS: 10.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.52, Tstamp:1607327621.62)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 466 GFLOPS: 10.11 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.44, Tstamp:1607327622.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,2)
........................************************
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 467 GFLOPS: 10.04 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.04, Tstamp:1607327623.76)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,2)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 468 GFLOPS: 10.06 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327625.07)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 469 GFLOPS: 12.85 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.59, Tstamp:1607327626.36)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,32)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 470 GFLOPS: 10.04 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327627.71)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 471 GFLOPS: 10.12 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327629.03)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 472 GFLOPS: 10.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.47, Tstamp:1607327630.33)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 473 GFLOPS: 7.81 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.24, Tstamp:1607327632.06)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 474 GFLOPS: 9.64 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.79, Tstamp:1607327633.39)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,32)
data.shared = ...
for k.1 (0,8)
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 475 GFLOPS: 11.79 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.40, Tstamp:1607327634.63)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 476 GFLOPS: 7.80 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.48, Tstamp:1607327635.91)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 477 GFLOPS: 10.44 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.62, Tstamp:1607327637.27)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 478 GFLOPS: 11.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.46, Tstamp:1607327638.55)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 479 GFLOPS: 11.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.45, Tstamp:1607327639.82)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 480 GFLOPS: 11.82 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.57, Tstamp:1607327641.18)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 481 GFLOPS: 11.80 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.49, Tstamp:1607327642.47)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 482 GFLOPS: 10.39 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.59, Tstamp:1607327643.80)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 483 GFLOPS: 11.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.54, Tstamp:1607327645.15)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 484 GFLOPS: 10.41 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.56, Tstamp:1607327646.41)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,4)
for k.2 (0,16)
T_dense = ...
T_add = ...
==================================================
No: 485 GFLOPS: 7.80 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.42, Tstamp:1607327647.68)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,8)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 486 GFLOPS: 9.63 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.70, Tstamp:1607327649.01)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,32)
data.shared = ...
for k.2 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 487 GFLOPS: 10.05 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.51, Tstamp:1607327650.35)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,8)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 488 GFLOPS: 10.38 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.49, Tstamp:1607327651.60)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 489 GFLOPS: 7.79 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.56, Tstamp:1607327652.98)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 490 GFLOPS: 7.81 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.46, Tstamp:1607327654.28)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 491 GFLOPS: 7.80 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.46, Tstamp:1607327655.56)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
....****
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 492 GFLOPS: 7.80 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.52, Tstamp:1607327656.90)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,2)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 493 GFLOPS: 7.81 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.46, Tstamp:1607327658.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 494 GFLOPS: 7.80 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.48, Tstamp:1607327659.49)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 1024
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
vectorize ax0@ax1@.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 495 GFLOPS: 10.00 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.56, Tstamp:1607327660.89)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 16
for k.0 (0,128)
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,8)
data.shared = ...
for k.1 (0,4)
for k.2 (0,4)
T_dense = ...
T_add = ...
==================================================
No: 496 GFLOPS: 11.80 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.46, Tstamp:1607327662.19)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 497 GFLOPS: 11.83 / 15.58 results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.17, Tstamp:1607327663.56)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,500)
threadIdx.x i.2@j.2@ (0,2)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,2)
threadIdx.x ax0@ax1@.0.1 (0,2)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,2)
data.shared = ...
for k.1 (0,4)
for k.2 (0,2)
T_dense = ...
T_add = ...
==================================================
No: 498 GFLOPS: 7.80 / 15.58 results: MeasureResult(cost:[0.0005], error_no:0, all_cost:1.51, Tstamp:1607327664.91)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
vthread i.1@j.1@ (0,4)
T_dense auto_unroll: 512
for k.0 (0,256)
for ax0@ax1@.0.0 (0,8)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,2)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.2 (0,8)
T_dense = ...
T_add = ...
==================================================
No: 499 GFLOPS: 9.65 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.59, Tstamp:1607327666.18)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,32)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,2)
weight.shared = ...
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,32)
data.shared = ...
for k.1 (0,64)
T_dense = ...
T_add = ...
==================================================
No: 500 GFLOPS: 10.40 / 15.58 results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.50, Tstamp:1607327667.48)
==================================================
Placeholder: data, weight, bias
blockIdx.x i.0@j.0@ (0,250)
threadIdx.x i.2@j.2@ (0,4)
T_dense auto_unroll: 64
for k.0 (0,32)
for ax0@ax1@.0.0 (0,16)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
weight.shared = ...
for ax0@ax1@.0.0 (0,4)
threadIdx.x ax0@ax1@.0.1 (0,4)
vectorize ax0@ax1@.1 (0,4)
data.shared = ...
for k.1 (0,16)
for k.2 (0,4)
T_dense = ...
T_add = ...
Time elapsed for measurement: 68.28 s
----------------------------------------------------------------------
------------------------------ [ Done ]
----------------------------------------------------------------------
Lowered TIR:
primfn(data_1: handle, weight_1: handle, bias_1: handle, T_add_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {T_add: Buffer(T_add_2: Pointer(float32), float32, [1, 1000], []),
bias: Buffer(bias_2: Pointer(float32), float32, [1000], []),
weight: Buffer(weight_2: Pointer(float32), float32, [1000, 2048], []),
data: Buffer(data_2: Pointer(float32), float32, [1, 2048], [])}
buffer_map = {data_1: data, weight_1: weight, bias_1: bias, T_add_1: T_add} {
attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 250;
attr [T_dense: Pointer(float32)] "storage_scope" = "local";
allocate(T_dense, float32, [1]);
attr [data.shared: Pointer(float32)] "storage_scope" = "shared";
allocate(data.shared, float32, [8]);
attr [weight.shared: Pointer(float32)] "storage_scope" = "shared";
allocate(weight.shared, float32, [32]);
attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 4 {
T_dense[0] = 0f32
for (k.outer.outer: int32, 0, 256) {
attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 4;
data.shared[ramp((threadIdx.x_1*2), 1, 2)] = (float32x2*)data_2[ramp(((k.outer.outer*8) + (threadIdx.x_1*2)), 1, 2)]
attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 4;
weight.shared[ramp((threadIdx.x_2*4), 1, 4)] = (float32x4*)weight_2[ramp(((((blockIdx.x*8192) + (floordiv(threadIdx.x_2, 2)*2048)) + (k.outer.outer*8)) + (floormod(threadIdx.x_2, 2)*4)), 1, 4)]
attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 4;
weight.shared[ramp(((threadIdx.x_2*4) + 16), 1, 4)] = (float32x4*)weight_2[ramp((((((blockIdx.x*8192) + (floordiv(threadIdx.x_2, 2)*2048)) + (k.outer.outer*8)) + (floormod(threadIdx.x_2, 2)*4)) + 4096), 1, 4)]
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[0]*(float32*)weight.shared[(threadIdx.x*8)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[1]*(float32*)weight.shared[((threadIdx.x*8) + 1)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[2]*(float32*)weight.shared[((threadIdx.x*8) + 2)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[3]*(float32*)weight.shared[((threadIdx.x*8) + 3)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[4]*(float32*)weight.shared[((threadIdx.x*8) + 4)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[5]*(float32*)weight.shared[((threadIdx.x*8) + 5)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[6]*(float32*)weight.shared[((threadIdx.x*8) + 6)]))
T_dense[0] = ((float32*)T_dense[0] + ((float32*)data.shared[7]*(float32*)weight.shared[((threadIdx.x*8) + 7)]))
}
T_add_2[((blockIdx.x*4) + threadIdx.x)] = ((float32*)T_dense[0] + (float32*)bias_2[((blockIdx.x*4) + threadIdx.x)])
}
}
Latency: 0.263 ms, GFLOPS: 15.57
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment