Test triton
import triton
import triton.language as tl


@triton.jit
def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
    # Copy 10 elements from in_ptr0 to out_ptr0, one program per grid slot.
    xnumel = 10
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel  # mask off indices past the 10 valid elements
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp0, xmask)


if __name__ == "__main__":
    import torch

    print(f"torch version {torch.__version__} triton version {triton.__version__}")
    inp = torch.randn(10, device='cuda')
    out = torch.randn(10, device='cuda')
    kernel[(10,)](inp, out, 10, XBLOCK=16)
    print(inp, out)
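
If the launch succeeds, out becomes an exact copy of inp. A minimal self-check sketch, assuming a CUDA-capable GPU and reusing inp/out from the script above (the assertion is an addition, not part of the original gist):

    # Verify the copy: after the launch, out must equal inp exactly.
    torch.testing.assert_close(out, inp)
    print("triton copy kernel OK")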
malfet commented Jan 6, 2024

An even simpler Triton test:

import torch

import triton
import triton.language as tl


@triton.jit
def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr):
    # Deliberately empty: merely compiling and launching this kernel
    # exercises the full Triton JIT pipeline.
    pass


X = torch.randn(1, device="cuda")
pgm = kernel[(1,)](X, 1, 1, BLOCK=1024)
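
A successful launch returns a compiled-kernel handle, so just reaching this line confirms the compiler toolchain and CUDA driver work. On Triton 2.x the handle also exposes the generated code; the snippet below is a hedged sketch of that inspection (the attribute layout varies across Triton versions):

    # Assumes a Triton version (e.g. 2.x) where a launch returns a
    # CompiledKernel carrying an `asm` dict of intermediate representations.
    print(pgm.asm.keys())        # e.g. dict_keys(['ttir', 'ttgir', 'llir', 'ptx', 'cubin'])
    print(pgm.asm["ptx"][:200])  # start of the generated PTX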
