Branch: cvecextensions_target
import loopy as lp
knl = lp.make_kernel(
"{[i, j1, j2, j3, j4]: 0<=i<10 and 0<=j1,j2,j3,j4<4}",
"""
<> temp1[j1] = x[i, j1]
<> temp2[j2] = temp1[j2] + 1 {inames=i:j2}
450 elements | |
--------------------------------------------------------------------------- | |
KERNEL: _pt_kernel | |
--------------------------------------------------------------------------- | |
ARGUMENTS: | |
_msh_inp_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global | |
_msh_inp_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global | |
_msh_inp_2_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global | |
_msh_out_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global | |
_msh_out_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global |
Metric Description | SCPT | Auto-tile | |
---|---|---|---|
Instructions per warp | 9903 | 5679.279265 | |
Warp Non-Predicated Execution Efficiency | 99.206680% | 89.364627% | |
Instruction Replay Overhead | 2.9E-05 | 4.5E-05 | |
Shared Memory Load Transactions Per Request | 0 | 2.006311 | |
Shared Memory Store Transactions Per Request | 0 | 2.961292 | |
Local Memory Load Transactions Per Request | 4 | 0 | |
Local Memory Store Transactions Per Request | 4 | 0 | |
Global Load Transactions Per Request | 1.605785 | 7.423965 | |
Global Store Transactions Per Request | 0 | 0 |
test | atts | units | median | |
---|---|---|---|---|
readConstantMemoryCoalesced | GrpSize:032 | GB/s | 664.378 | |
readConstantMemoryCoalesced | GrpSize:064 | GB/s | 648.894 | |
readConstantMemoryCoalesced | GrpSize:128 | GB/s | 652.453 | |
readConstantMemoryCoalesced | GrpSize:256 | GB/s | 658.546 | |
readConstantMemoryCoalesced | GrpSize:512 | GB/s | 639.496 | |
readGlobalMemoryCoalesced | GrpSize:032 | GB/s | 664.912 | |
readGlobalMemoryCoalesced | GrpSize:064 | GB/s | 652.379 | |
readGlobalMemoryCoalesced | GrpSize:128 | GB/s | 654.09 | |
readGlobalMemoryCoalesced | GrpSize:256 | GB/s | 657.666 |
import loopy as lp | |
import numpy as np | |
from time import time | |
knl = lp.make_kernel( | |
[ | |
"[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0 |
#include <isl/ctx.h> | |
#include <isl/aff.h> | |
#include <isl/set.h> | |
#include <sys/time.h> | |
#include <stdio.h> | |
#define TIME_DIFF(t2, t1) ((t2).tv_sec - (t1).tv_sec + ((t2).tv_usec - (t1).tv_usec)*1e-6) | |
int main() { | |
struct timeval start, end; |
import loopy as lp | |
import numpy as np | |
knl = lp.make_kernel( | |
[ | |
"[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0_inner <= 22 and icol1 |
Branch: cvecextensions_target
import loopy as lp
knl = lp.make_kernel(
"{[i, j1, j2, j3, j4]: 0<=i<10 and 0<=j1,j2,j3,j4<4}",
"""
<> temp1[j1] = x[i, j1]
<> temp2[j2] = temp1[j2] + 1 {inames=i:j2}
// Generated with SHOC <https://github.com/vetter/shoc> | |
// Commit: 0aea03beba2f09fcb5935cc11737372fe4de9ec0, date: Apr 17, 2020 | |
__kernel void readGlobalMemoryCoalesced(__global float *data, __global float *output, int size) | |
{ | |
int gid = get_global_id(0), num_thr = get_global_size(0), grpid=get_group_id(0), j = 0; | |
float sum = 0; | |
int s = gid; | |
for (j=0 ; j<1024 ; ++j) { | |
float a0 = data[(s+0)&(size-1)]; | |
float a1 = data[(s+32)&(size-1)]; |
test, atts, units, median, mean, stddev, min, max, trial0, trial1, trial2, trial3, trial4, trial5, trial6, trial7, trial8, trial9 | |
readConstantMemoryCoalesced, GrpSize:032, GB/s, 188.388, 188.682, 2.72905, 183.337, 193.514, 183.337, 193.514, 186.201, 190.98, 189.87, 187.736, 188.921, 187.855, 187.261, 191.144 | |
readConstantMemoryCoalesced, GrpSize:064, GB/s, 198.043, 198.141, 0.704771, 197.337, 199.217, 197.99, 198.95, 198.095, 199.153, 199.217, 197.356, 197.353, 197.337, 198.275, 197.688 | |
readConstantMemoryCoalesced, GrpSize:128, GB/s, 194.171, 193.219, 1.80562, 190.408, 194.921, 194.921, 194.135, 190.408, 194.023, 190.526, 194.392, 194.677, 194.375, 194.206, 190.525 | |
readConstantMemoryCoalesced, GrpSize:256, GB/s, 173.547, 174.337, 2.3963, 173.305, 181.513, 173.51, 181.513, 173.534, 173.559, 173.525, 173.305, 173.696, 173.642, 173.306, 173.777 | |
readConstantMemoryCoalesced, GrpSize:512, GB/s, 164.658, 165.009, 0.533506, 164.479, 165.718, 164.524, 164.479, 164.523, 165.68, 165.566, 164.681, 165.665, 164.634, 165.71 |
#define lid(N) ((int) get_local_id(N)) | |
#define gid(N) ((int) get_group_id(N)) | |
__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) rank_one(__global float const *__restrict__ a, __global float const *__restrict__ b, __global float *__restrict__ c, int const n) | |
{ | |
__local float a_fetch[256]; | |
__local float b_fetch[256]; | |
/* bulk slab for 'j_outer' */ | |
/* bulk slab for 'i_outer' */ |