Kaushik Kulkarni kaushikcfd

## wave_operator.lpknl
450 elements
---------------------------------------------------------------------------
KERNEL: _pt_kernel
---------------------------------------------------------------------------
ARGUMENTS:
_msh_inp_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_inp_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_inp_2_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_out_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_out_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global

## helmholtz5_scpt_autotile.csv

          
            Metric Description
            SCPT
            Auto-tile

            
              Instructions per warp
              9903
              5679.279265

            
              Warp Non-Predicated Execution Efficiency
              99.206680%
              89.364627%

            
              Instruction Replay Overhead
              2.9E-05
              4.5E-05

            
              Shared Memory Load Transactions Per Request
              0
              2.006311

            
              Shared Memory Store Transactions Per Request
              0
              2.961292

            
              Local Memory Load Transactions Per Request
              4
              0

            
              Local Memory Store Transactions Per Request
              4
              0

            
              Global Load Transactions Per Request
              1.605785
              7.423965

            
              Global Store Transactions Per Request
              0
              0

## titanV_shoc.csv

          
            test
             atts
             units
             median

            
              readConstantMemoryCoalesced
               GrpSize:032
               GB/s
               664.378

            
              readConstantMemoryCoalesced
               GrpSize:064
               GB/s
               648.894

            
              readConstantMemoryCoalesced
               GrpSize:128
               GB/s
               652.453

            
              readConstantMemoryCoalesced
               GrpSize:256
               GB/s
               658.546

            
              readConstantMemoryCoalesced
               GrpSize:512
               GB/s
               639.496

            
              readGlobalMemoryCoalesced
               GrpSize:032
               GB/s
               664.912

            
              readGlobalMemoryCoalesced
               GrpSize:064
               GB/s
               652.379

            
              readGlobalMemoryCoalesced
               GrpSize:128
               GB/s
               654.09

            
              readGlobalMemoryCoalesced
               GrpSize:256
               GB/s
               657.666

## compare_dim_max_perf.py
import loopy as lp
import numpy as np
from time import time

knl = lp.make_kernel(
    [
    "[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0

## isl_reproducer.c
#include <isl/ctx.h>
#include <isl/aff.h>
#include <isl/set.h>
#include <sys/time.h>
#include <stdio.h>

#define TIME_DIFF(t2, t1) ((t2).tv_sec - (t1).tv_sec + ((t2).tv_usec - (t1).tv_usec)*1e-6)

int main() {
  struct timeval start, end;

## slow_due_to_islpy.py
import loopy as lp
import numpy as np

knl = lp.make_kernel(
    [
    "[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0_inner <= 22 and icol1

## cvec_branch_unvectorize.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kaushikcfd
                / cvec_branch_unvectorize.md
            
            
              Created
              September 29, 2020 14:16
            
          
    Branch: cvecextensions_target
import loopy as lp

knl = lp.make_kernel(
        "{[i, j1, j2, j3, j4]: 0<=i<10 and 0<=j1,j2,j3,j4<4}",
        """
        <> temp1[j1] = x[i, j1]
        <> temp2[j2] = temp1[j2] + 1 {inames=i:j2}

  
## shoc_kernels.cl
// Generated with SHOC <https://github.com/vetter/shoc>
// Commit: 0aea03beba2f09fcb5935cc11737372fe4de9ec0, date: Apr 17, 2020
__kernel void readGlobalMemoryCoalesced(__global float *data, __global float *output, int size)
{
    int gid = get_global_id(0), num_thr = get_global_size(0), grpid=get_group_id(0), j = 0;
    float sum = 0;
    int s = gid;
    for (j=0 ; j<1024 ; ++j) {
       float a0 = data[(s+0)&(size-1)];
       float a1 = data[(s+32)&(size-1)];

## k40_shoc.csv
test,	atts,	units,	median,	mean,	stddev,	min,	max,	trial0,	trial1,	trial2,	trial3,	trial4,	trial5,	trial6,	trial7,	trial8,	trial9
readConstantMemoryCoalesced,	GrpSize:032,	GB/s,	188.388,	188.682,	2.72905,	183.337,	193.514,	183.337,	193.514,	186.201,	190.98,	189.87,	187.736,	188.921,	187.855,	187.261,	191.144
readConstantMemoryCoalesced,	GrpSize:064,	GB/s,	198.043,	198.141,	0.704771,	197.337,	199.217,	197.99,	198.95,	198.095,	199.153,	199.217,	197.356,	197.353,	197.337,	198.275,	197.688
readConstantMemoryCoalesced,	GrpSize:128,	GB/s,	194.171,	193.219,	1.80562,	190.408,	194.921,	194.921,	194.135,	190.408,	194.023,	190.526,	194.392,	194.677,	194.375,	194.206,	190.525
readConstantMemoryCoalesced,	GrpSize:256,	GB/s,	173.547,	174.337,	2.3963,	173.305,	181.513,	173.51,	181.513,	173.534,	173.559,	173.525,	173.305,	173.696,	173.642,	173.306,	173.777
readConstantMemoryCoalesced,	GrpSize:512,	GB/s,	164.658,	165.009,	0.533506,	164.479,	165.718,	164.524,	164.479,	164.523,	165.68,	165.566,	164.681,	165.665,	164.634,	165.71

## turtle_kernel.cl
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))

__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) rank_one(__global float const *__restrict__ a, __global float const *__restrict__ b, __global float *__restrict__ c, int const n)
{
  __local float a_fetch[256];
  __local float b_fetch[256];

  /* bulk slab for 'j_outer' */
  /* bulk slab for 'i_outer' */
	450 elements
	---------------------------------------------------------------------------
	KERNEL: _pt_kernel
	---------------------------------------------------------------------------
	ARGUMENTS:
	_msh_inp_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
	_msh_inp_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
	_msh_inp_2_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
	_msh_out_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
	_msh_out_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
Metric Description	SCPT	Auto-tile
Instructions per warp	9903	5679.279265
Warp Non-Predicated Execution Efficiency	99.206680%	89.364627%
Instruction Replay Overhead	2.9E-05	4.5E-05
Shared Memory Load Transactions Per Request	0	2.006311
Shared Memory Store Transactions Per Request	0	2.961292
Local Memory Load Transactions Per Request	4	0
Local Memory Store Transactions Per Request	4	0
Global Load Transactions Per Request	1.605785	7.423965
Global Store Transactions Per Request	0	0
test	atts	units	median
readConstantMemoryCoalesced	GrpSize:032	GB/s	664.378
readConstantMemoryCoalesced	GrpSize:064	GB/s	648.894
readConstantMemoryCoalesced	GrpSize:128	GB/s	652.453
readConstantMemoryCoalesced	GrpSize:256	GB/s	658.546
readConstantMemoryCoalesced	GrpSize:512	GB/s	639.496
readGlobalMemoryCoalesced	GrpSize:032	GB/s	664.912
readGlobalMemoryCoalesced	GrpSize:064	GB/s	652.379
readGlobalMemoryCoalesced	GrpSize:128	GB/s	654.09
readGlobalMemoryCoalesced	GrpSize:256	GB/s	657.666
	import loopy as lp
	import numpy as np
	from time import time

	knl = lp.make_kernel(
	[
	"[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0
	#include <isl/ctx.h>
	#include <isl/aff.h>
	#include <isl/set.h>
	#include <sys/time.h>
	#include <stdio.h>

	#define TIME_DIFF(t2, t1) ((t2).tv_sec - (t1).tv_sec + ((t2).tv_usec - (t1).tv_usec)*1e-6)

	int main() {
	struct timeval start, end;
	// Generated with SHOC <https://github.com/vetter/shoc>
	// Commit: 0aea03beba2f09fcb5935cc11737372fe4de9ec0, date: Apr 17, 2020
	__kernel void readGlobalMemoryCoalesced(__global float *data, __global float *output, int size)
	{
	int gid = get_global_id(0), num_thr = get_global_size(0), grpid=get_group_id(0), j = 0;
	float sum = 0;
	int s = gid;
	for (j=0 ; j<1024 ; ++j) {
	float a0 = data[(s+0)&(size-1)];
	float a1 = data[(s+32)&(size-1)];
	test, atts, units, median, mean, stddev, min, max, trial0, trial1, trial2, trial3, trial4, trial5, trial6, trial7, trial8, trial9
	readConstantMemoryCoalesced, GrpSize:032, GB/s, 188.388, 188.682, 2.72905, 183.337, 193.514, 183.337, 193.514, 186.201, 190.98, 189.87, 187.736, 188.921, 187.855, 187.261, 191.144
	readConstantMemoryCoalesced, GrpSize:064, GB/s, 198.043, 198.141, 0.704771, 197.337, 199.217, 197.99, 198.95, 198.095, 199.153, 199.217, 197.356, 197.353, 197.337, 198.275, 197.688
	readConstantMemoryCoalesced, GrpSize:128, GB/s, 194.171, 193.219, 1.80562, 190.408, 194.921, 194.921, 194.135, 190.408, 194.023, 190.526, 194.392, 194.677, 194.375, 194.206, 190.525
	readConstantMemoryCoalesced, GrpSize:256, GB/s, 173.547, 174.337, 2.3963, 173.305, 181.513, 173.51, 181.513, 173.534, 173.559, 173.525, 173.305, 173.696, 173.642, 173.306, 173.777
	readConstantMemoryCoalesced, GrpSize:512, GB/s, 164.658, 165.009, 0.533506, 164.479, 165.718, 164.524, 164.479, 164.523, 165.68, 165.566, 164.681, 165.665, 164.634, 165.71
	#define lid(N) ((int) get_local_id(N))
	#define gid(N) ((int) get_group_id(N))

	__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) rank_one(__global float const *__restrict__ a, __global float const *__restrict__ b, __global float *__restrict__ c, int const n)
	{
	__local float a_fetch[256];
	__local float b_fetch[256];

	/* bulk slab for 'j_outer' */
	/* bulk slab for 'i_outer' */