Install
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build-time dependency
pip install -e .
pip uninstall pytorch-triton -y
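To verify that Python picks up the editable build rather than a leftover pytorch-triton wheel, a quick sanity check:

import triton
print(triton.__version__)  # should report the locally built checkout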
Expected result (-0.1250)
import torch
import time
import sys

def run(nelems, iters):
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)
    # Body below is a sketch of the truncated original: time a simple elementwise op.
    start = time.time()
    for _ in range(iters):
        tensor_a = tensor_a + 1.0
    if device.type == "cuda":
        torch.cuda.synchronize()  # wait for queued GPU work before reading the clock
    print(f"{iters} iters over {nelems} elems: {time.time() - start:.4f}s")
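A minimal invocation sketch; the default sizes are assumptions, since the excerpt stops before the call site (the sys import suggests the original read them from the command line):

if __name__ == "__main__":
    # Hypothetical defaults; the original likely parsed sys.argv instead.
    nelems = int(sys.argv[1]) if len(sys.argv) > 1 else 1 << 20
    iters = int(sys.argv[2]) if len(sys.argv) > 2 else 100
    run(nelems, iters)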
""" | |
Matrix Multiplication | |
===================== | |
In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves | |
performance on par with cuBLAS or rocBLAS. | |
You will specifically learn about: | |
* Block-level matrix multiplications. |
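To make the block-level idea in the excerpt above concrete, here is a minimal sketch of such a kernel. It assumes FP16 row-major inputs whose dimensions divide the block sizes evenly; the full tutorial adds masking, autotuning, and grouped program ordering, and matmul_kernel / matmul below are illustrative names, not the tutorial's exact code:

import torch
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Each program instance computes one BLOCK_M x BLOCK_N tile of C.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # accumulate in FP32
    for _ in range(0, K, BLOCK_K):
        # Multiply one K-slice of A and B tiles, then advance along K.
        acc += tl.dot(tl.load(a_ptrs), tl.load(b_ptrs))
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc.to(tl.float16))

def matmul(a, b):
    M, K = a.shape
    K, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float16)
    grid = (triton.cdiv(M, 64), triton.cdiv(N, 64))
    matmul_kernel[grid](a, b, c, M, N, K,
                        a.stride(0), a.stride(1), b.stride(0), b.stride(1),
                        c.stride(0), c.stride(1),
                        BLOCK_M=64, BLOCK_N=64, BLOCK_K=32)
    return c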
import torch
import sys

device = torch.device('cpu')
left = torch.zeros(100, device=device, requires_grad=True)
right = torch.zeros(100, device=device, requires_grad=True)
grad = torch.zeros(100, device=device)
for _ in range(10):
    output = torch.add(left, right)
    # Sketch of the truncated loop body (assumption): back-propagate the
    # incoming gradient; each iteration builds and frees a fresh graph.
    output.backward(grad)
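A quick check one might append after the completed loop above; gradients accumulate across backward() calls on leaf tensors:

# With `grad` all zeros, both accumulated sums stay at 0.0.
print(left.grad.sum().item(), right.grad.sum().item())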
import triton
import pytest
import torch
import triton.language as tl
import numpy as np
from numpy.random import RandomState

# The parameter grid is truncated in the original excerpt; the generator
# values and the test name below are illustrative assumptions, not the
# upstream test matrix.
@pytest.mark.parametrize("M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis",
                         [(*shape_nw, 'softmax', allow_tf32, in_dtype, out_dtype, axis)
                          for shape_nw in [(64, 64, 64, 4)]
                          for allow_tf32 in [True, False]
                          for in_dtype, out_dtype in [('float16', 'float16')]
                          for axis in [1]])
def test_dot(M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis):
    ...  # body elided in the original excerpt
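Once the decorator is complete, pytest expands each tuple in the grid into its own test case; pytest -q -k test_dot would select just this group (test_dot is the hypothetical name used in the completion above).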
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64
// .globl triton__0d1d2d3d4d56d7d89d1011d1213d1415d1617d1819d2021d2223d2425d2627d2829d3031d3233d3435d3637d3839d4041d42d
.extern .shared .align 1 .b8 global_smem[];
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64
// .globl triton__0d1d2d3d
.visible .entry triton__0d1d2d3d(
    .param .u64 triton__0d1d2d3d_param_0,
    .param .u64 triton__0d1d2d3d_param_1,
#define _GNU_SOURCE   // required for RTLD_NEXT
#include <dlfcn.h>
#include <stdio.h>
#include "tool.h"

int main() {
    //void *handle = dlopen("./tool.so", RTLD_NOW);
    // Look up `print` in the next object in the lookup order (e.g. an
    // LD_PRELOADed tool.so) rather than in this executable itself.
    print_t func = (print_t)dlsym(RTLD_NEXT, "print");
    if (!func) {
        fprintf(stderr, "dlsym: %s\n", dlerror());
        return 1;
    }
    func();
    return 0;
}
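A build-and-run sketch, assuming tool.h declares typedef void (*print_t)(void); and a tool.so providing print is preloaded (file names are assumptions): compile with gcc main.c -o main -ldl, then run LD_PRELOAD=./tool.so ./main.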