Skip to content

Instantly share code, notes, and snippets.

# coding=utf-8
# Copyright (c) 2019-2020 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
PrimFunc([argsort_gpu.v0, argsort_gpu.v2, argsort_gpu.v3, argsort_gpu.v1, i_0, any_dim]) attrs={"target": vulkan -keys=vulkan,gpu -max_num_threads=256, "tir.noalias": 1, "global_symbol": "fused_argsort_kernel2", "tir.device_thread_axis": [iter_var(threadIdx.x, , threadIdx.x), iter_var(blockIdx.x, , blockIdx.x), iter_var(blockIdx.y, , blockIdx.y), iter_var(blockIdx.z, , blockIdx.z)], "calling_conv": 2} {
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 256
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// Algebraic data type for an optional value of element type A:
// either Some(A) carrying a value, or None carrying nothing.
type Option[A] {
Some(A),
None,
}
// ADT giving a statically-shaped (2, 4) float32 tensor a nominal type:
// either the empty/nil marker variant, or a constructor variant
// wrapping an actual Tensor[(2, 4), float32] value.
type static_tensor_float32_2_4_t {
tensor_nil_float32_2_4,
tensor_constructor_float32_2_4(Tensor[(2, 4), float32]),
}
@masahi
masahi / gist:d3db22563b3864baf7e7813ed7c6e0e1
Created February 22, 2021 11:04
fused_expand_dims_concatenate_1_kernel0.cu
extern "C" __global__ void fused_expand_dims_concatenate_1_kernel0(float* __restrict__ T_concat, float* __restrict__ placeholder, float* __restrict__ placeholder1, int any_dim, int stride, int stride1, int stride2, int stride3, int stride4) {
if (((int)blockIdx.x) < (((any_dim * 90000) + 90000) >> 9)) {
if ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / 90000) < (any_dim + 1)) {
if ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / 300) < ((any_dim * 300) + 300)) {
T_concat[(((((int)blockIdx.x) * 512) + ((int)threadIdx.x)))] = ((0 <= ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / 90000) - any_dim)) ? placeholder[(((((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % 90000) / 300) * stride) + ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % 300) * stride1)))] : placeholder1[(((((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / 90000) * stride2) + (((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % 90000) / 300) * stride3)) + ((((((int)blockIdx.x) * 512) + ((int)threadI
// Opaque storage-handle type: no constructors are declared here; values
// of type Storage[] are produced by memory.alloc_storage and consumed by
// memory.alloc_tensor in @main below.
type Storage {
}
def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] {
let %storage_0: Storage[] = memory.alloc_storage(602112 /* ty=int64 */, 64 /* ty=int64 */, meta[relay.attrs.AllocStorageAttrs][0]) /* ty=Storage[] */;
let %tensor_0: Tensor[(1, 224, 224, 3), float32] = memory.alloc_tensor(%storage_0, 0 /* ty=int64 */, meta[relay.Constant][0] /* ty=Tensor[(4), int64] */, meta[relay.attrs.AllocTensorAttrs][0]) /* ty=Tensor[(1, 224, 224, 3), float32] */;
%2 = fn (%p0: Tensor[(1, 3, 224, 224), float32], %p1: Tensor[(3, 1, 1), float32], %p2: Tensor[(3, 1, 1), float32], Primitive=1) -> Tensor[(1, 224, 224, 3), float32] {
%0 = multiply(%p0, %p1) /* ty=Tensor[(1, 3, 224, 224), float32] */;
%1 = add(%0, %p2) /* ty=Tensor[(1, 3, 224, 224), float32] */;
def @main(%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] {
%2 = fn (%p0: Tensor[(1, 3, 224, 224), float32], %p1: Tensor[(3, 1, 1), float32], %p2: Tensor[(3, 1, 1), float32], Primitive=1) -> Tensor[(1, 224, 224, 3), float32] {
%0 = multiply(%p0, %p1) /* ty=Tensor[(1, 3, 224, 224), float32] */;
%1 = add(%0, %p2) /* ty=Tensor[(1, 3, 224, 224), float32] */;
layout_transform(%1, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(1, 224, 224, 3), float32] */
};
%3 = %2(%data, meta[relay.Constant][0] /* ty=Tensor[(3, 1, 1), float32] */, meta[relay.Constant][1] /* ty=Tensor[(3, 1, 1), float32] */) /* ty=Tensor[(1, 224, 224, 3), float32] */;
%6 = fn (%p01: Tensor[(1, 224, 224, 3), float32], %p11: Tensor[(7, 7, 3, 64), float32], %p21: Tensor[(1, 1, 1, 64), float32], Primitive=1) -> Tensor[(1, 112, 112, 64), float32] {
%4 = nn.conv2d(%p01, %p11, strides=[2, 2], padding=[3, 3, 3, 3], channels=64, kernel_size=[7, 7], data_layout="NHWC", kernel_layout="HWIO") /* ty=Tensor[(
// Opaque storage-handle type with no constructors visible; presumably
// instantiated by the memory.alloc_* intrinsics elsewhere in this module
// — confirm against the full dump.
type Storage {
}
def @main(%data: Tensor[(1, 3, 224, 224), float32], %bn_data_gamma: Tensor[(3), float32], %bn_data_beta: Tensor[(3), float32], %bn_data_moving_mean: Tensor[(3), float32], %bn_data_moving_var: Tensor[(3), float32], %conv0_weight: Tensor[(64, 3, 7, 7), float32], %bn0_gamma: Tensor[(64), float32], %bn0_beta: Tensor[(64), float32], %bn0_moving_mean: Tensor[(64), float32], %bn0_moving_var: Tensor[(64), float32], %stage1_unit1_bn1_gamma: Tensor[(64), float32], %stage1_unit1_bn1_beta: Tensor[(64), float32], %stage1_unit1_bn1_moving_mean: Tensor[(64), float32], %stage1_unit1_bn1_moving_var: Tensor[(64), float32], %stage1_unit1_conv1_weight: Tensor[(64, 64, 1, 1), float32], %stage1_unit1_bn2_gamma: Tensor[(64), float32], %stage1_unit1_bn2_beta: Tensor[(64), float32], %stage1_unit1_bn2_moving_mean: Tensor[(64), float32], %stage1_unit1_bn2_moving_var: Tensor[(64), float32], %stage1_unit1_conv2_weight: Tensor[(64, 64, 3, 3), float32], %stage1_unit1_bn3_gamma: Tensor[(64), float32], %stage1_unit1_bn3_
extern "C" __global__ void fused_dyn_full_kernel0(float* __restrict__ T_full, int* __restrict__ placeholder, int any_dim, int any_dim1, int any_dim2, int any_dim3, int stride, int stride1, int stride2, int stride3) {
if (((int)blockIdx.x) < ((((any_dim * any_dim1) * any_dim2) * any_dim3) >> 9)) {
if (((((any_dim1 >= 0) && ((((((any_dim2 >= 0) && ((((((any_dim3 >= 0) && ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % any_dim3) >= 0)) || ((any_dim3 < 0) && ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % any_dim3) <= 0))) ? (((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / any_dim3) : ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / any_dim3) - 1)) % any_dim2) >= 0)) || ((any_dim2 < 0) && ((((((any_dim3 >= 0) && ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % any_dim3) >= 0)) || ((any_dim3 < 0) && ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) % any_dim3) <= 0))) ? (((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / any_dim3) : ((((((int)blockIdx.x) * 512) + ((int)threadIdx.x)) / a
import numpy as np
import cv2
import torch
import torchvision
# Square input resolution fed to the model.
in_size = 300
# A single image at that resolution — (1, 3, H, W); presumably NCHW with
# 3 color channels, matching the torch/torchvision imports above.
input_shape = (1, 3) + (in_size, in_size)
; ModuleID = 'TVMMod'
source_filename = "TVMMod"
; Host-side module: x86-64 (SysV-style) data layout and Linux target triple.
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Anonymous struct types referenced by the (not shown) function bodies.
%0 = type { double }
; NOTE(review): %1's field order (data ptr, %2, ndim, %3, shape ptr, strides
; ptr, byte offset) looks like a DLTensor-style descriptor — confirm against
; the TVM/DLPack runtime headers.
%1 = type { i8*, %2, i32, %3, i64*, i64*, i64 }
; Pair of i32 — presumably a device descriptor (device_type, device_id).
%2 = type { i32, i32 }
; Packed i8/i8/i16 — presumably a dtype descriptor (code, bits, lanes).
%3 = type { i8, i8, i16 }