Install
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build-time dependency
pip install -e .
pip uninstall pytorch-triton -y
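To verify that Python picks up the editable build rather than a leftover pytorch-triton wheel, a quick sanity check:

import triton
print(triton.__version__)  # should report the locally built checkout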
Expected result (-0.1250)
import torch
import time
import sys

def run(nelems, iters):
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_a = torch.randn(nelems, dtype=torch.float32, device=device)
    # Body below is a sketch of the truncated original: time a simple elementwise op.
    start = time.time()
    for _ in range(iters):
        tensor_a = tensor_a + 1.0
    if device.type == "cuda":
        torch.cuda.synchronize()  # wait for queued GPU work before reading the clock
    print(f"{iters} iters over {nelems} elems: {time.time() - start:.4f}s")
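A minimal invocation sketch; the default sizes are assumptions, since the excerpt stops before the call site (the sys import suggests the original read them from the command line):

if __name__ == "__main__":
    # Hypothetical defaults; the original likely parsed sys.argv instead.
    nelems = int(sys.argv[1]) if len(sys.argv) > 1 else 1 << 20
    iters = int(sys.argv[2]) if len(sys.argv) > 2 else 100
    run(nelems, iters)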
""" | |
Matrix Multiplication | |
===================== | |
In this tutorial, you will write a very short high-performance FP16 matrix multiplication kernel that achieves | |
performance on par with cuBLAS or rocBLAS. | |
You will specifically learn about: | |
* Block-level matrix multiplications. |
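To make the block-level idea in the excerpt above concrete, here is a minimal sketch of such a kernel. It assumes FP16 row-major inputs whose dimensions divide the block sizes evenly; the full tutorial adds masking, autotuning, and grouped program ordering, and matmul_kernel / matmul below are illustrative names, not the tutorial's exact code:

import torch
import triton
import triton.language as tl

@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Each program instance computes one BLOCK_M x BLOCK_N tile of C.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)  # accumulate in FP32
    for _ in range(0, K, BLOCK_K):
        # Multiply one K-slice of A and B tiles, then advance along K.
        acc += tl.dot(tl.load(a_ptrs), tl.load(b_ptrs))
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc.to(tl.float16))

def matmul(a, b):
    M, K = a.shape
    K, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float16)
    grid = (triton.cdiv(M, 64), triton.cdiv(N, 64))
    matmul_kernel[grid](a, b, c, M, N, K,
                        a.stride(0), a.stride(1), b.stride(0), b.stride(1),
                        c.stride(0), c.stride(1),
                        BLOCK_M=64, BLOCK_N=64, BLOCK_K=32)
    return c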
import torch
import sys

device = torch.device('cpu')
left = torch.zeros(100, device=device, requires_grad=True)
right = torch.zeros(100, device=device, requires_grad=True)
grad = torch.zeros(100, device=device)
for _ in range(10):
    output = torch.add(left, right)
    # Sketch of the truncated loop body (assumption): back-propagate the
    # incoming gradient; each iteration builds and frees a fresh graph.
    output.backward(grad)
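A quick check one might append after the completed loop above; gradients accumulate across backward() calls on leaf tensors:

# With `grad` all zeros, both accumulated sums stay at 0.0.
print(left.grad.sum().item(), right.grad.sum().item())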
import triton
import pytest
import torch
import triton.language as tl
import numpy as np
from numpy.random import RandomState

# The parameter grid is truncated in the original excerpt; the generator
# values and the test name below are illustrative assumptions, not the
# upstream test matrix.
@pytest.mark.parametrize("M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis",
                         [(*shape_nw, 'softmax', allow_tf32, in_dtype, out_dtype, axis)
                          for shape_nw in [(64, 64, 64, 4)]
                          for allow_tf32 in [True, False]
                          for in_dtype, out_dtype in [('float16', 'float16')]
                          for axis in [1]])
def test_dot(M, N, K, num_warps, epilogue, allow_tf32, in_dtype, out_dtype, axis):
    ...  # body elided in the original excerpt
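Once the decorator is complete, pytest expands each tuple in the grid into its own test case; pytest -q -k test_dot would select just this group (test_dot is the hypothetical name used in the completion above).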
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64
// .globl triton__0d1d2d3d4d56d7d89d1011d1213d1415d1617d1819d2021d2223d2425d2627d2829d3031d3233d3435d3637d3839d4041d42d
.extern .shared .align 1 .b8 global_smem[];
//
// Generated by LLVM NVPTX Back-End
//
.version 8.0
.target sm_80
.address_size 64
// .globl triton__0d1d2d3d
.visible .entry triton__0d1d2d3d(
    .param .u64 triton__0d1d2d3d_param_0,
    .param .u64 triton__0d1d2d3d_param_1,
#define _GNU_SOURCE   // required for RTLD_NEXT
#include <dlfcn.h>
#include <stdio.h>
#include "tool.h"

int main() {
    //void *handle = dlopen("./tool.so", RTLD_NOW);
    // Look up `print` in the next object in the lookup order (e.g. an
    // LD_PRELOADed tool.so) rather than in this executable itself.
    print_t func = (print_t)dlsym(RTLD_NEXT, "print");
    if (!func) {
        fprintf(stderr, "dlsym: %s\n", dlerror());
        return 1;
    }
    func();
    return 0;
}
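A build-and-run sketch, assuming tool.h declares typedef void (*print_t)(void); and a tool.so providing print is preloaded (file names are assumptions): compile with gcc main.c -o main -ldl, then run LD_PRELOAD=./tool.so ./main.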