Created
June 3, 2024 22:08
-
-
Save jjsjann123/2c4db9f6659cfe2cc8aa9503cb8a806c to your computer and use it in GitHub Desktop.
rope_nvfuser_prototype
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch

from nvfuser import FusionDefinition, DataType

# RoPE / model configuration for the prototype fusion below.
bsz = 2            # batch size
block_size = 1024  # sequence length
n_head = 16        # number of attention heads
head_size = 32     # per-head embedding size
rope_n_elem = 8    # leading channels of each head that RoPE rotates
def rope_fusion(fd: FusionDefinition) -> None:
    """Define the RoPE (rotary position embedding) prototype fusion on *fd*.

    Fusion inputs (definition order is also the ``fd.execute`` input order):
      q:   (bsz, n_head, block_size, head_size), bfloat16, contiguous
      cos: (block_size, rope_n_elem), bfloat16, contiguous
      sin: (block_size, rope_n_elem), bfloat16, contiguous

    Fusion output: q with RoPE applied to its first ``rope_n_elem`` channels;
    the remaining channels pass through unchanged, and the result is cast
    back to bfloat16.

    The rotate-half computation is expressed entirely with slice + zero-pad
    so every operand is padded out to the full ``head_size`` axis and the
    pieces combine with plain elementwise adds (cf. ``apply_rope`` below).
    """
    q = fd.define_tensor(
        shape=[bsz, n_head, block_size, head_size],
        contiguity=[True, True, True, True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[3, 2, 1, 0],
    )
    cos = fd.define_tensor(
        shape=[block_size, rope_n_elem],
        contiguity=[True, True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[1, 0],
    )
    sin = fd.define_tensor(
        shape=[block_size, rope_n_elem],
        contiguity=[True, True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[1, 0],
    )
    # The rotation swaps the two halves of the roped channels.
    offset_0 = rope_n_elem // 2
    # NOTE(review): the reversed(...) calls appear to convert torch-style pad
    # lists (innermost dimension first) into nvfuser's pad-width ordering --
    # confirm against the nvfuser python-frontend docs.
    # Split q into the roped channels and the pass-through remainder; the
    # remainder is zero-padded back to head_size so it can be merged by a sum.
    q_rope = fd.ops.slice(q, start_indices=[0, 0, 0, 0], end_indices=[bsz, n_head, block_size, rope_n_elem], strides=[1, 1, 1, 1])
    q_remainder = fd.ops.slice(q, start_indices=[0, 0, 0, rope_n_elem], end_indices=[bsz, n_head, block_size, head_size], strides=[1, 1, 1, 1])
    q_remainder = fd.ops.pad(q_remainder, list(reversed([0, 0, 0, 0, 0, 0, 0, rope_n_elem])))
    # First/second halves of the roped channels, each padded so the rotated
    # (-x2, x1) pieces land at their final head_size positions.
    q_left = fd.ops.slice(q_rope, start_indices=[0, 0, 0, 0], end_indices=[bsz, n_head, block_size, offset_0], strides=[1, 1, 1, 1])
    q_left = fd.ops.pad(q_left, list(reversed([0, 0, 0, 0, 0, 0, head_size - rope_n_elem, rope_n_elem - offset_0])))
    q_right = fd.ops.slice(q_rope, start_indices=[0, 0, 0, offset_0], end_indices=[bsz, n_head, block_size, rope_n_elem], strides=[1, 1, 1, 1])
    q_right = fd.ops.pad(q_right, list(reversed([0, 0, 0, 0, 0, 0, head_size - rope_n_elem + offset_0, 0])))
    # note that this is identical to q_left and q_right. We should be able to merge it back.
    q_left_cos = fd.ops.slice(q_rope, start_indices=[0, 0, 0, 0], end_indices=[bsz, n_head, block_size, offset_0], strides=[1, 1, 1, 1])
    q_left_cos = fd.ops.pad(q_left_cos, list(reversed([0, 0, 0, 0, 0, 0, head_size - rope_n_elem + offset_0, 0])))
    q_right_cos = fd.ops.slice(q_rope, start_indices=[0, 0, 0, offset_0], end_indices=[bsz, n_head, block_size, rope_n_elem], strides=[1, 1, 1, 1])
    q_right_cos = fd.ops.pad(q_right_cos, list(reversed([0, 0, 0, 0, 0, 0, head_size - rope_n_elem, rope_n_elem - offset_0])))
    # Slice cos/sin into matching halves, pad to head_size, and broadcast to
    # q's rank so the elementwise math below lines up.
    cos_left = fd.ops.slice(cos, start_indices=[0, 0], end_indices=[block_size, offset_0], strides=[1, 1])
    cos_left = fd.ops.pad(cos_left, list(reversed([0, 0, head_size - offset_0, 0])))
    cos_left = fd.ops.broadcast_in_dim(cos_left, shape=[1, 1, block_size, head_size], broadcast_dims=[2, 3])
    cos_right = fd.ops.slice(cos, start_indices=[0, offset_0], end_indices=[block_size, rope_n_elem], strides=[1, 1])
    cos_right = fd.ops.pad(cos_right, list(reversed([0, 0, head_size - rope_n_elem, offset_0])))
    cos_right = fd.ops.broadcast_in_dim(cos_right, shape=[1, 1, block_size, head_size], broadcast_dims=[2, 3])
    sin_left = fd.ops.slice(sin, start_indices=[0, 0], end_indices=[block_size, offset_0], strides=[1, 1])
    sin_left = fd.ops.pad(sin_left, list(reversed([0, 0, head_size - offset_0, 0])))
    sin_left = fd.ops.broadcast_in_dim(sin_left, shape=[1, 1, block_size, head_size], broadcast_dims=[2, 3])
    sin_right = fd.ops.slice(sin, start_indices=[0, offset_0], end_indices=[block_size, rope_n_elem], strides=[1, 1])
    sin_right = fd.ops.pad(sin_right, list(reversed([0, 0, head_size - rope_n_elem, offset_0])))
    sin_right = fd.ops.broadcast_in_dim(sin_right, shape=[1, 1, block_size, head_size], broadcast_dims=[2, 3])
    # Rotate-half: q0/q1 are the two halves of (q * cos + rotated(q) * sin),
    # each already placed at its final head_size offset by the pads above.
    q0 = (-q_right) * sin_left + cos_left * q_left_cos
    q1 = q_left * sin_right + cos_right * q_right_cos
    q_out = q0 + q1 + q_remainder
    q_out = fd.ops.cast(q_out, dtype=DataType.BFloat16)
    # (Removed a stray `q0 = fd.ops.cast(q0, ...)` from the original draft:
    # its result was never consumed nor registered as an output.)
    fd.add_output(q_out)
# Build the fusion and run it once on random bfloat16 CUDA inputs; the input
# list order must match the define_tensor order inside rope_fusion.
with FusionDefinition() as fd:
    rope_fusion(fd)

inputs = [
    torch.randn((bsz, n_head, block_size, head_size), dtype=torch.bfloat16, device="cuda:0"),
    torch.randn((block_size, rope_n_elem), dtype=torch.bfloat16, device="cuda:0"),
    torch.randn((block_size, rope_n_elem), dtype=torch.bfloat16, device="cuda:0"),
]
o = fd.execute(inputs)[0]
def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Eager-mode RoPE reference (rotate-half formulation).

    Args:
        x: (B, nh, T, hs) tensor whose entire last dimension is rotated.
        cos, sin: tables broadcastable against x's trailing dimensions.

    Returns:
        (x * cos) + (rotate_half(x) * sin), cast back to x's dtype.
    """
    hs = x.size(-1)  # local, not the module-level head_size constant
    x1 = x[..., : hs // 2]  # (B, nh, T, hs/2)
    x2 = x[..., hs // 2 :]  # (B, nh, T, hs/2)
    rotated = torch.cat((-x2, x1), dim=-1)  # (B, nh, T, hs)
    roped = (x * cos) + (rotated * sin)
    return roped.to(dtype=x.dtype)


def rope_one_entry(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, rope_n_elem: int) -> torch.Tensor:
    """Apply RoPE to the first ``rope_n_elem`` channels of x; pass the rest through."""
    x_roped = apply_rope(x[..., :rope_n_elem], cos, sin)
    return torch.cat((x_roped, x[..., rope_n_elem:]), dim=-1)
#import thunder | |
#thunder_rope_one = thunder.jit(rope_one_entry, nv_enable_bookend=False) | |
#o_ref = thunder_rope_one(*inputs, rope_n_elem) |
Generated CUDA kernel
======= Codegen output for kernel: nvfuser_pointwise_f0_c1_r0_g6 =======
// Auto-generated nvFuser pointwise kernel for the RoPE fusion above, as
// captured from the codegen dump (original indentation was lost in the paste).
// T4..T26 are the kernel's bf16 input tensors and T48 is the bf16 output;
// presumably T4..T26 correspond to the padded slice operands of the fusion.
__global__ void nvfuser_pointwise_f0_c1_r0_g6(Tensor<__bfloat, 4, 4> T10, Tensor<__bfloat, 4, 4> T8, Tensor<__bfloat, 2, 2> T22, Tensor<__bfloat, 2, 2> T14, Tensor<__bfloat, 2, 2> T18, Tensor<__bfloat, 4, 4> T12, Tensor<__bfloat, 2, 2> T26, Tensor<__bfloat, 4, 4> T4, Tensor<__bfloat, 4, 4> T6, Tensor<__bfloat, 4, 4> T48) {
NVFUSER_DEFINE_MAGIC_ZERO;
// blockIdx.y is decomposed as i0 = blockIdx.y / 32 and i3 = blockIdx.y % 32.
// 32 matches bsz * n_head = 32 for this problem size -- presumably i3 indexes
// the fused (batch, head) dimension; TODO confirm against the launch params.
nvfuser_index_t i0;
i0 = ((nvfuser_index_t)blockIdx.y) / 32;
nvfuser_index_t i1;
i1 = 8LL * ((nvfuser_index_t)blockDim.x);
// Per-input base offsets. The -4 / -8 terms appear to undo the pad offsets
// (offset_0 = 4, rope_n_elem = 8) baked into the padded operands -- unverified.
nvfuser_index_t i2;
i2 = ((-4 + ((8LL * T26.alloc_stride[0LL]) * ((nvfuser_index_t)threadIdx.x))) + i0) + ((i1 * T26.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i3;
i3 = ((nvfuser_index_t)blockIdx.y) % 32;
nvfuser_index_t i4;
i4 = (((-4 + ((8LL * T6.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x))) + i0) + ((1024LL * T6.alloc_stride[2LL]) * i3)) + ((i1 * T6.alloc_stride[2LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i5;
i5 = (((8LL * T14.alloc_stride[0LL]) * ((nvfuser_index_t)threadIdx.x)) + i0) + ((i1 * T14.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i6;
i6 = ((((8LL * T10.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + i0) + ((1024LL * T10.alloc_stride[2LL]) * i3)) + ((i1 * T10.alloc_stride[2LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i7;
i7 = ((((8LL * T8.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x)) + i0) + ((1024LL * T8.alloc_stride[2LL]) * i3)) + ((i1 * T8.alloc_stride[2LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i8;
i8 = (((8LL * T22.alloc_stride[0LL]) * ((nvfuser_index_t)threadIdx.x)) + i0) + ((i1 * T22.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i9;
i9 = (((-8 + ((8LL * T4.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x))) + i0) + ((1024LL * T4.alloc_stride[2LL]) * i3)) + ((i1 * T4.alloc_stride[2LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i10;
i10 = ((-4 + ((8LL * T18.alloc_stride[0LL]) * ((nvfuser_index_t)threadIdx.x))) + i0) + ((i1 * T18.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i11;
i11 = (((-4 + ((8LL * T12.alloc_stride[2LL]) * ((nvfuser_index_t)threadIdx.x))) + i0) + ((1024LL * T12.alloc_stride[2LL]) * i3)) + ((i1 * T12.alloc_stride[2LL]) * ((nvfuser_index_t)blockIdx.x));
nvfuser_index_t i12;
i12 = 8LL * ((nvfuser_index_t)threadIdx.x);
nvfuser_index_t i13;
i13 = i1 * ((nvfuser_index_t)blockIdx.x);
// Linear output offset for the 8-wide vector, and the tail-guard predicate
// (in-bounds test against an extent of 1024).
nvfuser_index_t i14;
i14 = (i12 + (1024LL * ((nvfuser_index_t)blockIdx.y))) + i13;
bool b15;
b15 = ((7 + i12) + i13) < 1024;
// Fast path: the whole 8-element vector is in bounds, so the per-element
// loads below need no b15 guard (only the pad-region ?: predicates remain).
if ((((i12 + 7) + i13) < 1024)) {
Array<__bfloat, 8, 8> T50;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 8; ++i16) {
nvfuser_index_t i17;
i17 = i16 + nvfuser_zero;
// Each ?: predicate selects the live region of a padded operand;
// out-of-range lanes read the pad value 0.
__bfloat T27[1];
T27[0] = 0;
T27[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T26[(i2 + (T26.alloc_stride[0LL] * i17))] : 0.0000e+00;
__bfloat T28[1];
T28[0]
= T27[0];
__bfloat T29[1];
T29[0]
= T28[0];
__bfloat T7[1];
T7[0] = 0;
T7[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T6[(i4 + (T6.alloc_stride[2LL] * i17))] : 0.0000e+00;
// All arithmetic is performed in fp32 after widening from bf16.
float T38[1];
T38[0]
= __bfloat2float(T7[0]);
float T39[1];
T39[0]
= __bfloat2float(T29[0]);
float T40[1];
T40[0]
= T38[0]
* T39[0];
__bfloat T15[1];
T15[0] = 0;
T15[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T14[(i5 + (T14.alloc_stride[0LL] * i17))] : 0.0000e+00;
__bfloat T16[1];
T16[0]
= T15[0];
__bfloat T17[1];
T17[0]
= T16[0];
__bfloat T11[1];
T11[0] = 0;
T11[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T10[(i6 + (T10.alloc_stride[2LL] * i17))] : 0.0000e+00;
float T35[1];
T35[0]
= __bfloat2float(T11[0]);
float T34[1];
T34[0]
= __bfloat2float(T17[0]);
float T36[1];
T36[0]
= T34[0]
* T35[0];
// Negated operand: corresponds to the (-q_right) * sin term of the fusion.
__bfloat T9[1];
T9[0] = 0;
T9[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T8[(i7 + (T8.alloc_stride[2LL] * i17))] : 0.0000e+00;
float T30[1];
T30[0]
= __bfloat2float(T9[0]);
float T31[1];
T31[0]
= -T30[0];
__bfloat T23[1];
T23[0] = 0;
T23[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T22[(i8 + (T22.alloc_stride[0LL] * i17))] : 0.0000e+00;
__bfloat T24[1];
T24[0]
= T23[0];
__bfloat T25[1];
T25[0]
= T24[0];
float T32[1];
T32[0]
= __bfloat2float(T25[0]);
float T33[1];
T33[0]
= T31[0]
* T32[0];
// T4/T5: presumably the pass-through q_remainder operand (note the wider
// live region, 24 = head_size - rope_n_elem).
__bfloat T5[1];
T5[0] = 0;
T5[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 8) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 8) < 24)) ? T4[(i9 + (T4.alloc_stride[2LL] * i17))] : 0.0000e+00;
float T46[1];
T46[0]
= __bfloat2float(T5[0]);
__bfloat T19[1];
T19[0] = 0;
T19[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T18[(i10 + (T18.alloc_stride[0LL] * i17))] : 0.0000e+00;
__bfloat T20[1];
T20[0]
= T19[0];
__bfloat T21[1];
T21[0]
= T20[0];
__bfloat T13[1];
T13[0] = 0;
T13[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T12[(i11 + (T12.alloc_stride[2LL] * i17))] : 0.0000e+00;
float T42[1];
T42[0]
= __bfloat2float(T13[0]);
float T41[1];
T41[0]
= __bfloat2float(T21[0]);
float T43[1];
T43[0]
= T41[0]
* T42[0];
// Final combination: sum of the four products plus the remainder, then
// narrowed back to bf16 into the vector register T50.
float T44[1];
T44[0]
= T40[0]
+ T43[0];
float T37[1];
T37[0]
= T33[0]
+ T36[0];
float T45[1];
T45[0]
= T37[0]
+ T44[0];
float T47[1];
T47[0]
= T45[0]
+ T46[0];
T50[i16]
= __float2bfloat(T47[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Vectorized 8 x bf16 store to the output tensor.
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T48[i14], &T50[0]);
} else {
// Tail path: identical computation, but every load and the final store are
// additionally guarded by the bounds predicate b15.
Array<__bfloat, 8, 8> T50;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 8; ++i16) {
nvfuser_index_t i18;
i18 = i16 + nvfuser_zero;
__bfloat T27[1];
T27[0] = 0;
if (b15) {
T27[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T26[(i2 + (T26.alloc_stride[0LL] * i18))] : 0.0000e+00;
}
__bfloat T28[1];
T28[0]
= T27[0];
__bfloat T29[1];
T29[0]
= T28[0];
__bfloat T7[1];
T7[0] = 0;
if (b15) {
T7[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T6[(i4 + (T6.alloc_stride[2LL] * i18))] : 0.0000e+00;
}
float T38[1];
T38[0]
= __bfloat2float(T7[0]);
float T39[1];
T39[0]
= __bfloat2float(T29[0]);
float T40[1];
T40[0]
= T38[0]
* T39[0];
__bfloat T15[1];
T15[0] = 0;
if (b15) {
T15[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T14[(i5 + (T14.alloc_stride[0LL] * i18))] : 0.0000e+00;
}
__bfloat T16[1];
T16[0]
= T15[0];
__bfloat T17[1];
T17[0]
= T16[0];
__bfloat T11[1];
T11[0] = 0;
if (b15) {
T11[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T10[(i6 + (T10.alloc_stride[2LL] * i18))] : 0.0000e+00;
}
float T35[1];
T35[0]
= __bfloat2float(T11[0]);
float T34[1];
T34[0]
= __bfloat2float(T17[0]);
float T36[1];
T36[0]
= T34[0]
* T35[0];
__bfloat T9[1];
T9[0] = 0;
if (b15) {
T9[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T8[(i7 + (T8.alloc_stride[2LL] * i18))] : 0.0000e+00;
}
float T30[1];
T30[0]
= __bfloat2float(T9[0]);
float T31[1];
T31[0]
= -T30[0];
__bfloat T23[1];
T23[0] = 0;
if (b15) {
T23[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 0) < 4)) ? T22[(i8 + (T22.alloc_stride[0LL] * i18))] : 0.0000e+00;
}
__bfloat T24[1];
T24[0]
= T23[0];
__bfloat T25[1];
T25[0]
= T24[0];
float T32[1];
T32[0]
= __bfloat2float(T25[0]);
float T33[1];
T33[0]
= T31[0]
* T32[0];
__bfloat T5[1];
T5[0] = 0;
if (b15) {
T5[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 8) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 8) < 24)) ? T4[(i9 + (T4.alloc_stride[2LL] * i18))] : 0.0000e+00;
}
float T46[1];
T46[0]
= __bfloat2float(T5[0]);
__bfloat T19[1];
T19[0] = 0;
if (b15) {
T19[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T18[(i10 + (T18.alloc_stride[0LL] * i18))] : 0.0000e+00;
}
__bfloat T20[1];
T20[0]
= T19[0];
__bfloat T21[1];
T21[0]
= T20[0];
__bfloat T13[1];
T13[0] = 0;
if (b15) {
T13[0]
= ((((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) >= 0) && (((((nvfuser_index_t)blockIdx.y) / (2 * 16)) - 4) < 4)) ? T12[(i11 + (T12.alloc_stride[2LL] * i18))] : 0.0000e+00;
}
float T42[1];
T42[0]
= __bfloat2float(T13[0]);
float T41[1];
T41[0]
= __bfloat2float(T21[0]);
float T43[1];
T43[0]
= T41[0]
* T42[0];
float T44[1];
T44[0]
= T40[0]
+ T43[0];
float T37[1];
T37[0]
= T33[0]
+ T36[0];
float T45[1];
T45[0]
= T37[0]
+ T44[0];
float T47[1];
T47[0]
= T45[0]
+ T46[0];
T50[i16]
= __float2bfloat(T47[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (b15) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T48[i14], &T50[0]);
}
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: all slices are segmented out as no-op kernels, and the entire operation runs as a single kernel afterwards.