Skip to content

Instantly share code, notes, and snippets.

import random
import math
def sort_by_z_order(a, b):
    """Python 2 ``cmp``-style comparator ordering two objects by ``z_value``.

    Returns a negative number when *a* sorts before *b*, zero when the
    two ``z_value`` attributes are equal, and a positive number otherwise.
    """
    delta = a.z_value - b.z_value
    return delta
def push_zeros(a, n):
if len(a) < n:
for i in xrange(n-len(a)):
a = '0' + a
import random
import math
def sort_by_z_order(a, b):
    """Comparator for ``z_value``-ascending ordering (old ``cmp`` protocol).

    The sign of the result encodes the ordering: ``< 0`` means *a* comes
    first, ``0`` means the keys tie, ``> 0`` means *b* comes first.
    """
    return a.z_value - b.z_value
def push_zeros(a, n):
if len(a) < n:
for i in xrange(n-len(a)):
a = '0' + a
[23:36:46] src/codegen/llvm/codegen_amdgpu.cc:177: ; ModuleID = 'myadd__kernel0'
source_filename = "myadd__kernel0"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
target triple = "amdgcn-amd-amdhsa-hcc"
; Function Attrs: nounwind
define amdgpu_kernel void @myadd__kernel0(float addrspace(1)* noalias nocapture, float addrspace(1)* noalias nocapture readonly, float addrspace(1)* noalias nocapture readonly, i32) local_unnamed_addr #0 {
entry:
%4 = tail call i32 @llvm.amdgcn.workitem.id.x()
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x()
@masahi
masahi / rocm_kernel.s
Created October 10, 2017 15:17
The output of $ /opt/rocm/hcc/compiler/bin/llvm-objdump -disassemble -mcpu=gfx803 rocm_kernel.co
rocm_kernel.co: file format ELF64-amdgpu-hsacobj
Disassembly of section .text:
myadd__kernel0:
s_load_dword s0, s[4:5], 0x18 // 000000001100: C0020002 00000018
v_lshlrev_b32_e32 v0, 7, v0 // 000000001108: 24000087
s_waitcnt lgkmcnt(0) // 00000000110C: BF8C007F
v_sub_i32_e32 v1, vcc, s0, v0 // 000000001110: 34020000
@masahi
masahi / myadd_kernel.s
Created October 10, 2017 15:22
The output of $ llc-5.0 -march=amdgcn -mcpu=gfx803 myadd_kernel.ll
.text
.hsa_code_object_version 2,1
.hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
.globl myadd__kernel0 ; -- Begin function myadd__kernel0
.p2align 8
.type myadd__kernel0,@function
.amdgpu_hsa_kernel myadd__kernel0
myadd__kernel0: ; @myadd__kernel0
.amd_kernel_code_t
amd_code_version_major = 1
@masahi
masahi / conv2d_nchw_amd.py
Created October 11, 2017 00:45
Modified schedule for amd gpu
# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches
"""Schedule for conv2d_nchw with auto fusion"""
import tvm
from .. import util
from .. import tag
def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L):
"""Schedule conv2d for specific feature_in_out_filter pattern"""
# scheduler params
""" Benchmark script for performance on GPUs. For example, run the file with: `python cuda_imagenet_bench.py --model='mobilenet'`. For more details about how to set up the inference environment on GPUs, please refer to NNVM Tutorial: ImageNet Inference on the GPU """
import time
import argparse
import numpy as np
import tvm
import nnvm.compiler
import nnvm.testing
from tvm.contrib import util, nvcc
from tvm.contrib import graph_runtime as runtime
# pylint: disable=invalid-name, unused-argument
"""Definition of nn ops"""
from __future__ import absolute_import
import tvm
import topi
from topi.util import get_const_int
from .tensor import _fschedule_broadcast
from . import registry as reg
from .registry import OpPattern
# pylint: disable=invalid-name, unused-argument
"""Reduction ops"""
from __future__ import absolute_import
import tvm
import topi
import topi.cuda
from . import registry as reg
from .registry import OpPattern
# pylint: disable=invalid-name, unused-argument
"""Tensor ops"""
from __future__ import absolute_import
import tvm
import topi
import topi.cuda
from . import registry as reg
from .registry import OpPattern