George Hotz geohot

## llm.c
#include <stdlib.h>
#include <stdbool.h>
#include <tgmath.h>
#define max(x,y) ((x>y)?x:y)
#define half __fp16

/* Increment the int stored at data0[0] by one, in place. */
void E_(int* data0) {
  data0[0] += 1;
}

## gist:7dff8fd6259b1e6d57efb772b900fd69
[55883.721977] amdgpu:   map VA 0x702eae9d2000 - 0x702eae9d3000 in entry 0000000072d2b750
[55883.721996] amdgpu:   INC mapping count 1
[55883.722133] kfd kfd: amdgpu: ioctl cmd 0xc0184b0c (#0xc), arg 0x7ffe16172bef
[55883.722238] gmc_v11_0_process_interrupt: 6 callbacks suppressed
[55883.722250] amdgpu 0000:c3:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32774, for process python3 pid 356134 thread python3 pid 356134)
[55883.722343] amdgpu 0000:c3:00.0: amdgpu:   in page starting at address 0x00000000aabbc000 from client 10
[55883.722391] amdgpu 0000:c3:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00800A30
[55883.722429] amdgpu 0000:c3:00.0: amdgpu:      Faulty UTCL2 client ID: CPC (0x5)
[55883.722466] amdgpu 0000:c3:00.0: amdgpu:      MORE_FAULTS: 0x0
[55883.722497] amdgpu 0000:c3:00.0: amdgpu:      WALKER_ERROR: 0x0

## doorbell_mes_crash.py
# one hit, no loop needed
# this is caused by creating a KFD_IOC_QUEUE_TYPE_COMPUTE_AQL without an EOP buffer
# this causes the MES to page fault

import os, ctypes, pathlib, re, fcntl, functools, mmap, time
import tinygrad.runtime.autogen.kfd as kfd
from tinygrad.helpers import to_mv
from extra.hip_gpu_driver import hip_ioctl
import tinygrad.runtime.autogen.hsa as hsa

## hip.py
# -*- coding: utf-8 -*-
#
# TARGET arch is: ['-D__HIP_PLATFORM_AMD__', '-I/opt/rocm/include']
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes


## memcpy.py
# tiny@tiny9:~/tinygrad$ python3 examples/benchmark_copies.py
# CPU copy 6.18 ms, 16.28 GB/s
# GPU copy 4.38 ms, 23.00 GB/s
# GPU  6x  1.85 ms, 54.54 GB/s

import time
def timeit(fxn):
  tms = []
  for _ in range(10):
    st = time.perf_counter()

## matmul.cl
__kernel void matmul(__global float* data0, const __global float* data1, const __global float* data2) {
  int gidx0 = get_group_id(1); /* 512 */
  int gidx1 = get_group_id(0); /* 512 */
  float2 acc0 = (float2)(0.0f,0.0f);
  float2 acc1 = (float2)(0.0f,0.0f);
  for (int ridx0 = 0; ridx0 < 512; ++ridx0) {
    float2 val0 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2))));
    float2 val1 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2)+1024)));
    float2 val2 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048))));
    float2 val3 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048)+1024)));

## cifar_wino_kernels
***    0 E_64_32_6_6n5                         arg   2 sz [64, 1, 1]         [32, 1, 1]   OPs     33M/   0.00G  mem  3.07 GB tm      3.20us/     0.00ms (10483.20 GFLOPS,  297.02 GB/s)
***    1 r_128_31_31_3_2_3_2_2_2_8n26          arg   3 sz [31, 31, 128]      [2, 3, 1]    OPs    283M/   0.03G  mem  3.07 GB tm    218.44us/     0.22ms ( 1297.42 GFLOPS,  216.24 GB/s)
***    2 r_1024_32_16_2_3_4_4_8n6              arg   3 sz [32, 1024, 1]      [2, 16, 1]   OPs    805M/   0.32G  mem  3.07 GB tm     64.68us/     0.29ms (12450.43 GFLOPS, 1426.62 GB/s)

## test_allreduce.py
import os
import sys
import time
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def all_reduce_latency(nbytes, rank):
  buf = torch.randn(nbytes // 4).cuda(rank)

## mod_range.py
# given a number in the range [amin, amax] (inclusive)
# what are the min and max of that number after modding it by b?

# c style modulus
def modn(a, b):
  """Return a % b with C semantics (truncated division).

  The result carries the sign of the dividend `a` (or is zero), and its
  magnitude is abs(a) % abs(b). This also holds for a negative divisor,
  where Python's own `%` would follow the sign of `b` instead.
  Raises ZeroDivisionError when b is zero.
  """
  r = abs(a) % abs(b)
  return -r if a < 0 else r

# aka a fast version of
def slow_modrange(amin, amax, b):
  """Brute-force (min, max) of modn(x, b) over every integer x in [amin, amax].

  Both bounds are inclusive. An empty range (amin > amax) makes min() raise
  ValueError.
  """
  remainders = []
  for candidate in range(amin, amax + 1):
    remainders.append(modn(candidate, b))
  lowest, highest = min(remainders), max(remainders)
  return lowest, highest

## wtf_cuda.py
# Minimal repro: push a single value through a 1x1 bias-free Linear layer on CUDA.
import torch
# Turn autograd off globally; presumably needed so the in-place weight write below is allowed.
torch.set_grad_enabled(False)
# One input feature, one output feature, no bias, parameters moved to the GPU.
model = torch.nn.Linear(1, 1, bias=False).cuda()
# Overwrite the single weight with 1 so the layer should act as the identity.
model.weight[:] = 1.
# Mathematically the output is 2349.
# NOTE(review): the file name suggests the printed value differs on CUDA; confirm on hardware.
print(model(torch.Tensor([2349.]).cuda()))
	#include <stdlib.h>
	#include <stdbool.h>
	#include <tgmath.h>
	#define max(x,y) ((x>y)?x:y)
	#define half __fp16

	/* Increment the int stored at data0[0] by one, in place. */
	void E_(int* data0) {
	  data0[0] += 1;
	}
	[55883.721977] amdgpu: map VA 0x702eae9d2000 - 0x702eae9d3000 in entry 0000000072d2b750
	[55883.721996] amdgpu: INC mapping count 1
	[55883.722133] kfd kfd: amdgpu: ioctl cmd 0xc0184b0c (#0xc), arg 0x7ffe16172bef
	[55883.722238] gmc_v11_0_process_interrupt: 6 callbacks suppressed
	[55883.722250] amdgpu 0000:c3:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32774, for process python3 pid 356134 thread python3 pid 356134)
	[55883.722343] amdgpu 0000:c3:00.0: amdgpu: in page starting at address 0x00000000aabbc000 from client 10
	[55883.722391] amdgpu 0000:c3:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00800A30
	[55883.722429] amdgpu 0000:c3:00.0: amdgpu: Faulty UTCL2 client ID: CPC (0x5)
	[55883.722466] amdgpu 0000:c3:00.0: amdgpu: MORE_FAULTS: 0x0
	[55883.722497] amdgpu 0000:c3:00.0: amdgpu: WALKER_ERROR: 0x0
	# one hit, no loop needed
	# this is caused by creating a KFD_IOC_QUEUE_TYPE_COMPUTE_AQL without an EOP buffer
	# this causes the MES to page fault

	import os, ctypes, pathlib, re, fcntl, functools, mmap, time
	import tinygrad.runtime.autogen.kfd as kfd
	from tinygrad.helpers import to_mv
	from extra.hip_gpu_driver import hip_ioctl
	import tinygrad.runtime.autogen.hsa as hsa
	# -*- coding: utf-8 -*-
	#
	# TARGET arch is: ['-D__HIP_PLATFORM_AMD__', '-I/opt/rocm/include']
	# WORD_SIZE is: 8
	# POINTER_SIZE is: 8
	# LONGDOUBLE_SIZE is: 16
	#
	import ctypes
	# tiny@tiny9:~/tinygrad$ python3 examples/benchmark_copies.py
	# CPU copy 6.18 ms, 16.28 GB/s
	# GPU copy 4.38 ms, 23.00 GB/s
	# GPU 6x 1.85 ms, 54.54 GB/s

	import time
	def timeit(fxn):
	tms = []
	for _ in range(10):
	st = time.perf_counter()
	__kernel void matmul(__global float* data0, const __global float* data1, const __global float* data2) {
	int gidx0 = get_group_id(1); /* 512 */
	int gidx1 = get_group_id(0); /* 512 */
	float2 acc0 = (float2)(0.0f,0.0f);
	float2 acc1 = (float2)(0.0f,0.0f);
	for (int ridx0 = 0; ridx0 < 512; ++ridx0) {
	float2 val0 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2))));
	float2 val1 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2)+1024)));
	float2 val2 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048))));
	float2 val3 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048)+1024)));
	*** 0 E_64_32_6_6n5 arg 2 sz [64, 1, 1] [32, 1, 1] OPs 33M/ 0.00G mem 3.07 GB tm 3.20us/ 0.00ms (10483.20 GFLOPS, 297.02 GB/s)
	*** 1 r_128_31_31_3_2_3_2_2_2_8n26 arg 3 sz [31, 31, 128] [2, 3, 1] OPs 283M/ 0.03G mem 3.07 GB tm 218.44us/ 0.22ms ( 1297.42 GFLOPS, 216.24 GB/s)
	*** 2 r_1024_32_16_2_3_4_4_8n6 arg 3 sz [32, 1024, 1] [2, 16, 1] OPs 805M/ 0.32G mem 3.07 GB tm 64.68us/ 0.29ms (12450.43 GFLOPS, 1426.62 GB/s)
	import os
	import sys
	import time
	import torch
	import torch.distributed as dist
	import torch.multiprocessing as mp

	def all_reduce_latency(nbytes, rank):
	buf = torch.randn(nbytes // 4).cuda(rank)
	# given a number in the range [amin, amax] (inclusive)
	# what are the min and max of that number after modding it by b?

	# c style modulus
	def modn(a, b):
	  """Return a % b with C semantics (truncated division).

	  The result carries the sign of the dividend `a` (or is zero), and its
	  magnitude is abs(a) % abs(b). This also holds for a negative divisor,
	  where Python's own `%` would follow the sign of `b` instead.
	  Raises ZeroDivisionError when b is zero.
	  """
	  r = abs(a) % abs(b)
	  return -r if a < 0 else r

	# aka a fast version of
	def slow_modrange(amin, amax, b):
	  """Brute-force (min, max) of modn(x, b) over every integer x in [amin, amax].

	  Both bounds are inclusive. An empty range (amin > amax) makes min() raise
	  ValueError.
	  """
	  # The body is indented under the def; without that the function does not parse.
	  values = [modn(rv, b) for rv in range(amin, amax+1)]
	  return min(values), max(values)
	# Minimal repro: push a single value through a 1x1 bias-free Linear layer on CUDA.
	import torch
	# Turn autograd off globally; presumably needed so the in-place weight write below is allowed.
	torch.set_grad_enabled(False)
	# One input feature, one output feature, no bias, parameters moved to the GPU.
	model = torch.nn.Linear(1, 1, bias=False).cuda()
	# Overwrite the single weight with 1 so the layer should act as the identity.
	model.weight[:] = 1.
	# Mathematically the output is 2349.
	# NOTE(review): confirm on hardware whether the printed value matches.
	print(model(torch.Tensor([2349.]).cuda()))