Skip to content

Instantly share code, notes, and snippets.

View kaushikcfd's full-sized avatar

Kaushik Kulkarni kaushikcfd

View GitHub Profile
450 elements
---------------------------------------------------------------------------
KERNEL: _pt_kernel
---------------------------------------------------------------------------
ARGUMENTS:
_msh_inp_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_inp_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_inp_2_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_out_0_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
_msh_out_1_0: type: np:dtype('float64'), shape: (450, 10), dim_tags: (N1:stride:10, N0:stride:1) aspace: global
Metric Description SCPT Auto-tile
Instructions per warp 9903 5679.279265
Warp Non-Predicated Execution Efficiency 99.206680% 89.364627%
Instruction Replay Overhead 2.9E-05 4.5E-05
Shared Memory Load Transactions Per Request 0 2.006311
Shared Memory Store Transactions Per Request 0 2.961292
Local Memory Load Transactions Per Request 4 0
Local Memory Store Transactions Per Request 4 0
Global Load Transactions Per Request 1.605785 7.423965
Global Store Transactions Per Request 0 0
test atts units median
readConstantMemoryCoalesced GrpSize:032 GB/s 664.378
readConstantMemoryCoalesced GrpSize:064 GB/s 648.894
readConstantMemoryCoalesced GrpSize:128 GB/s 652.453
readConstantMemoryCoalesced GrpSize:256 GB/s 658.546
readConstantMemoryCoalesced GrpSize:512 GB/s 639.496
readGlobalMemoryCoalesced GrpSize:032 GB/s 664.912
readGlobalMemoryCoalesced GrpSize:064 GB/s 652.379
readGlobalMemoryCoalesced GrpSize:128 GB/s 654.09
readGlobalMemoryCoalesced GrpSize:256 GB/s 657.666
import loopy as lp
import numpy as np
from time import time
knl = lp.make_kernel(
[
"[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0
#include <isl/ctx.h>
#include <isl/aff.h>
#include <isl/set.h>
#include <sys/time.h>
#include <stdio.h>
#define TIME_DIFF(t2, t1) ((t2).tv_sec - (t1).tv_sec + ((t2).tv_usec - (t1).tv_usec)*1e-6)
int main() {
struct timeval start, end;
import loopy as lp
import numpy as np
knl = lp.make_kernel(
[
"[end, start] -> { [i2, i1, iquad_tile, iblock, icell, irowtile_eval, icoltile0, icol0_inner, irowtile_quadr, icoltile1, icol1_inner, i_matvec0_prftch_outer, i_matvec0_prftch_inner_outer, i_matvec0_prftch_inner_inner, i_matvec1_prftch_outer, i_matvec1_prftch_inner_outer, i_matvec1_prftch_inner_inner, iprtftch_outer, iprtftch_inner_outer, iprtftch_inner_inner, form_t21_dim_0, form_t23_dim_0, form_t22_dim_0, irow_quadr_inner_inner, irow0_inner_inner, irow0_inner_outer, irow0_inner_outer_init, irow_eval_wrap_up_inner_inner, irow_eval_wrap_up_inner_outer, irow1_inner_inner, irow1_inner_outer, irow_quadr_wrap_up_inner_inner, irow_quadr_wrap_up_inner_outer, irow1_inner_outer_init] : exists (e3, e4: iquad_tile = 0 and 0 <= i2 <= 1 and 0 <= i1 <= 2 and icell >= start - 8iblock and 0 <= icell <= 7 and icell < end - 8iblock and irowtile_eval >= 0 and icol0_inner >= 0 and -23icoltile0 <= icol0_inner <= 44 - 23icoltile0 and icol0_inner <= 22 and icol1

Branch: cvecextensions_target

import loopy as lp

knl = lp.make_kernel(
        "{[i, j1, j2, j3, j4]: 0<=i<10 and 0<=j1,j2,j3,j4<4}",
        """
        <> temp1[j1] = x[i, j1]
        <> temp2[j2] = temp1[j2] + 1 {inames=i:j2}
// Generated with SHOC <https://github.com/vetter/shoc>
// Commit: 0aea03beba2f09fcb5935cc11737372fe4de9ec0, date: Apr 17, 2020
__kernel void readGlobalMemoryCoalesced(__global float *data, __global float *output, int size)
{
int gid = get_global_id(0), num_thr = get_global_size(0), grpid=get_group_id(0), j = 0;
float sum = 0;
int s = gid;
for (j=0 ; j<1024 ; ++j) {
float a0 = data[(s+0)&(size-1)];
float a1 = data[(s+32)&(size-1)];
We can make this file beautiful and searchable if this error is corrected: It looks like row 6 should actually have 18 columns, instead of 17 in line 5.
test, atts, units, median, mean, stddev, min, max, trial0, trial1, trial2, trial3, trial4, trial5, trial6, trial7, trial8, trial9
readConstantMemoryCoalesced, GrpSize:032, GB/s, 188.388, 188.682, 2.72905, 183.337, 193.514, 183.337, 193.514, 186.201, 190.98, 189.87, 187.736, 188.921, 187.855, 187.261, 191.144
readConstantMemoryCoalesced, GrpSize:064, GB/s, 198.043, 198.141, 0.704771, 197.337, 199.217, 197.99, 198.95, 198.095, 199.153, 199.217, 197.356, 197.353, 197.337, 198.275, 197.688
readConstantMemoryCoalesced, GrpSize:128, GB/s, 194.171, 193.219, 1.80562, 190.408, 194.921, 194.921, 194.135, 190.408, 194.023, 190.526, 194.392, 194.677, 194.375, 194.206, 190.525
readConstantMemoryCoalesced, GrpSize:256, GB/s, 173.547, 174.337, 2.3963, 173.305, 181.513, 173.51, 181.513, 173.534, 173.559, 173.525, 173.305, 173.696, 173.642, 173.306, 173.777
readConstantMemoryCoalesced, GrpSize:512, GB/s, 164.658, 165.009, 0.533506, 164.479, 165.718, 164.524, 164.479, 164.523, 165.68, 165.566, 164.681, 165.665, 164.634, 165.71
#define lid(N) ((int) get_local_id(N))
#define gid(N) ((int) get_group_id(N))
__kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) rank_one(__global float const *__restrict__ a, __global float const *__restrict__ b, __global float *__restrict__ c, int const n)
{
__local float a_fetch[256];
__local float b_fetch[256];
/* bulk slab for 'j_outer' */
/* bulk slab for 'i_outer' */