George Hotz (geohot)

@geohot
geohot / run.txt
Created September 19, 2024 01:38
tinygrad 0.7 openpilot 0.9.7 run
comma@tiny24:/data/openpilot/tinygrad_repo$ python3 openpilot/compile2.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx: 100%|███████████████████████████████████████████| 51.5M/51.5M [00:00<00:00, 88.2MB/s]
cache is out of date, clearing it
/usr/local/pyenv/versions/3.11.4/lib/python3.11/site-packages/pyopencl/__init__.py:528: CompilerWarning: Non-empty compiler output encountered. Set the environment variable PYOPENCL_COMPILER_OUTPUT=1 to see more.
lambda: self._prg.build(options_bytes, devices),
190 schedule items depend on the input, 462 don't
7 inputs
13: rewrite input, image dtype dtypes.imageh((16, 2048, 4)), (View(shape=(1, 16, 32, 64, 2), strides=(0, 8192, 256, 4, 1), offset=0, mask=None, contiguous=False), View(shape=(1, 16, 32, 128), strides=(0, 4096, 128, 1), offset=0, mask=None, contiguous=True))
24: rewrite input, image dtype dtypes.imageh((8, 2048, 4)), (View(shap
from huggingface_hub import snapshot_download
from tinygrad import nn, Tensor, TinyJit, Device
import time
class Block:
  def __init__(self, in_dims, dims, stride=1):
    super().__init__()
    self.conv1 = nn.Conv2d(
      in_dims, dims, kernel_size=3, stride=stride, padding=1, bias=False
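The preview above cuts off inside conv1. A minimal sketch of how such a ResNet-style block is typically completed in tinygrad follows; the BatchNorm layers, the shortcut, and __call__ are assumptions for illustration, not the gist's actual code.

# Sketch: a hypothetical completion of the truncated Block above as a standard
# ResNet basic block (BatchNorm + identity/1x1-conv shortcut). The real gist
# may differ past the truncation.
from tinygrad import nn, Tensor

class Block:
  def __init__(self, in_dims, dims, stride=1):
    self.conv1 = nn.Conv2d(in_dims, dims, kernel_size=3, stride=stride, padding=1, bias=False)
    self.bn1 = nn.BatchNorm2d(dims)
    self.conv2 = nn.Conv2d(dims, dims, kernel_size=3, stride=1, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(dims)
    # 1x1 projection when the residual and main-path shapes differ
    self.shortcut = [nn.Conv2d(in_dims, dims, kernel_size=1, stride=stride, bias=False),
                     nn.BatchNorm2d(dims)] if stride != 1 or in_dims != dims else []

  def __call__(self, x: Tensor) -> Tensor:
    out = self.bn1(self.conv1(x)).relu()
    out = self.bn2(self.conv2(out))
    return (out + x.sequential(self.shortcut)).relu()

if __name__ == "__main__":
  y = Block(16, 32, stride=2)(Tensor.randn(1, 16, 32, 32))
  print(y.shape)  # (1, 32, 16, 16)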
@geohot
geohot / llm.c
Last active May 1, 2024 13:41
Outputted llm.c from tinygrad
#include <stdlib.h>
#include <stdbool.h>
#include <tgmath.h>
#define max(x,y) ((x>y)?x:y)
#define half __fp16
void E_(int* data0) {
  int val0 = data0[0];
  data0[0] = (val0+1);
}
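The emitted kernels are plain C functions, so one way to exercise them is to build the file as a shared object and call into it with ctypes. The sketch below is illustrative and not part of the gist; it assumes a compiler that accepts the file's __fp16 typedef (e.g. clang), and only E_ comes from the preview above.

# Sketch: compile the generated llm.c as a shared library and call the E_
# kernel from Python. clang is an assumption; the harness is illustrative.
import ctypes, subprocess, tempfile, pathlib

src = pathlib.Path("llm.c")                       # the generated file from the gist
lib = pathlib.Path(tempfile.mkdtemp()) / "llm.so"
subprocess.check_call(["clang", "-shared", "-fPIC", "-o", str(lib), str(src)])

llm = ctypes.CDLL(str(lib))
data0 = (ctypes.c_int * 1)(41)                    # int* data0 with one element
llm.E_(data0)                                     # E_ increments data0[0]
print(data0[0])                                   # -> 42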
[55883.721977] amdgpu: map VA 0x702eae9d2000 - 0x702eae9d3000 in entry 0000000072d2b750
[55883.721996] amdgpu: INC mapping count 1
[55883.722133] kfd kfd: amdgpu: ioctl cmd 0xc0184b0c (#0xc), arg 0x7ffe16172bef
[55883.722238] gmc_v11_0_process_interrupt: 6 callbacks suppressed
[55883.722250] amdgpu 0000:c3:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32774, for process python3 pid 356134 thread python3 pid 356134)
[55883.722343] amdgpu 0000:c3:00.0: amdgpu: in page starting at address 0x00000000aabbc000 from client 10
[55883.722391] amdgpu 0000:c3:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00800A30
[55883.722429] amdgpu 0000:c3:00.0: amdgpu: Faulty UTCL2 client ID: CPC (0x5)
[55883.722466] amdgpu 0000:c3:00.0: amdgpu: MORE_FAULTS: 0x0
[55883.722497] amdgpu 0000:c3:00.0: amdgpu: WALKER_ERROR: 0x0
@geohot
geohot / doorbell_mes_crash.py
Last active March 25, 2024 19:05
MES Crash in KFD_IOC_QUEUE_TYPE_COMPUTE_AQL created without an EOP buffer
# one hit, no loop needed
# this is caused by creating a KFD_IOC_QUEUE_TYPE_COMPUTE_AQL without an EOP buffer
# this causes the MES to page fault
import os, ctypes, pathlib, re, fcntl, functools, mmap, time
import tinygrad.runtime.autogen.kfd as kfd
from tinygrad.helpers import to_mv
from extra.hip_gpu_driver import hip_ioctl
import tinygrad.runtime.autogen.hsa as hsa
@geohot
geohot / hip.py
Created November 25, 2023 23:28
Wrapper for HIP
# -*- coding: utf-8 -*-
#
# TARGET arch is: ['-D__HIP_PLATFORM_AMD__', '-I/opt/rocm/include']
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes
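A sketch of the kind of call such a ctypes wrapper is generated for: loading the HIP runtime and querying the device count. hipGetDeviceCount is a real HIP API; the library name assumes a standard ROCm install, and the rest is illustrative rather than the gist's code.

# Sketch: call the HIP runtime directly through ctypes, as the generated
# wrapper does. Library name is an assumption for a standard ROCm install.
import ctypes

hip = ctypes.CDLL("libamdhip64.so")                    # HIP runtime shipped with ROCm
count = ctypes.c_int(0)
status = hip.hipGetDeviceCount(ctypes.byref(count))    # hipError_t hipGetDeviceCount(int*)
assert status == 0, f"hipGetDeviceCount failed with error {status}"
print(f"{count.value} HIP device(s) found")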
@geohot
geohot / memcpy.py
Created November 21, 2023 19:21
Fast memcpy using GPUs
# tiny@tiny9:~/tinygrad$ python3 examples/benchmark_copies.py
# CPU copy 6.18 ms, 16.28 GB/s
# GPU copy 4.38 ms, 23.00 GB/s
# GPU 6x 1.85 ms, 54.54 GB/s
import time
def timeit(fxn):
  tms = []
  for _ in range(10):
    st = time.perf_counter()
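The preview stops inside timeit. The usual shape of such a helper, plus a plain CPU-copy baseline like the first line of the benchmark output, might look like the sketch below; the gist's actual body may differ.

# Sketch: a complete timing helper in the style of the truncated preview,
# reporting the fastest of ten runs to filter out scheduling noise.
import time

def timeit(fxn):
  tms = []
  for _ in range(10):
    st = time.perf_counter()
    fxn()
    tms.append(time.perf_counter() - st)
  return min(tms)

if __name__ == "__main__":
  sz = 100_000_000                    # ~100 MB, the scale implied by the numbers above
  src, dst = bytearray(sz), bytearray(sz)
  def cpu_copy(): dst[:] = src        # plain host memcpy as the baseline
  tm = timeit(cpu_copy)
  print(f"CPU copy {tm*1e3:.2f} ms, {sz/tm*1e-9:.2f} GB/s")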
@geohot
geohot / matmul.cl
Last active October 19, 2023 22:21
A 1024x1024x1024 matmul with a 2x2x2 core in OpenCL
__kernel void matmul(__global float* data0, const __global float* data1, const __global float* data2) {
  int gidx0 = get_group_id(1); /* 512 */
  int gidx1 = get_group_id(0); /* 512 */
  float2 acc0 = (float2)(0.0f,0.0f);
  float2 acc1 = (float2)(0.0f,0.0f);
  for (int ridx0 = 0; ridx0 < 512; ++ridx0) {
    float2 val0 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2))));
    float2 val1 = (float2)(*((__global float2*)(data1+(gidx0*2048)+(ridx0*2)+1024)));
    float2 val2 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048))));
    float2 val3 = (float2)(*((__global float2*)(data2+(gidx1*2)+(ridx0*2048)+1024)));
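A sketch of how this kernel could be driven from pyopencl (which already appears in the compile log above). The work sizes below are inferred from the /* 512 */ comments, one work-group per 2x2 output tile; the full kernel file from the gist is needed since the preview is truncated.

# Sketch: launch matmul.cl with pyopencl. Work sizes are inferred from the
# kernel's comments; buffer setup is an assumption, not part of the gist.
import numpy as np
import pyopencl as cl

N = 1024
a = np.random.rand(N, N).astype(np.float32)
b = np.random.rand(N, N).astype(np.float32)
out = np.empty((N, N), dtype=np.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, open("matmul.cl").read()).build()

mf = cl.mem_flags
buf0 = cl.Buffer(ctx, mf.WRITE_ONLY, out.nbytes)
buf1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
buf2 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)

# 512x512 work-groups of one work-item each; the kernel indexes by group id
prg.matmul(queue, (512, 512), (1, 1), buf0, buf1, buf2)
cl.enqueue_copy(queue, out, buf0)
queue.finish()

np.testing.assert_allclose(out, a @ b, rtol=1e-3)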
@geohot
geohot / cifar_wino_kernels
Created October 19, 2023 05:48
kernels for BS=1024 CIFAR BEAM=2 WINO=1
*** 0 E_64_32_6_6n5 arg 2 sz [64, 1, 1] [32, 1, 1] OPs 33M/ 0.00G mem 3.07 GB tm 3.20us/ 0.00ms (10483.20 GFLOPS, 297.02 GB/s)
*** 1 r_128_31_31_3_2_3_2_2_2_8n26 arg 3 sz [31, 31, 128] [2, 3, 1] OPs 283M/ 0.03G mem 3.07 GB tm 218.44us/ 0.22ms ( 1297.42 GFLOPS, 216.24 GB/s)
*** 2 r_1024_32_16_2_3_4_4_8n6 arg 3 sz [32, 1024, 1] [2, 16, 1] OPs 805M/ 0.32G mem 3.07 GB tm 64.68us/ 0.29ms (12450.43 GFLOPS, 1426.62 GB/s)
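For context, BEAM and WINO are tinygrad environment variables (beam-search kernel tuning and Winograd convolutions), and per-kernel timing lines like the ones above are what tinygrad prints at DEBUG=2. A hypothetical reproduction sketch follows; the script name and flag handling are assumptions.

# Hypothetical sketch: run tinygrad's CIFAR example with beam search and
# Winograd enabled, plus DEBUG=2 to print per-kernel timings like those above.
import os, subprocess

env = dict(os.environ, BS="1024", BEAM="2", WINO="1", DEBUG="2")
subprocess.run(["python3", "examples/hlb_cifar10.py"], env=env, check=True)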
@geohot
geohot / test_allreduce.py
Created March 14, 2023 07:19
Test bandwidth of all_reduce
import os
import sys
import time
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
def all_reduce_latency(nbytes, rank):
  buf = torch.randn(nbytes // 4).cuda(rank)
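The preview ends right after allocating the buffer. A minimal sketch of how such an all-reduce bandwidth test is usually structured with torch.distributed is below; the NCCL backend, env-var rendezvous, and iteration counts are assumptions, and the gist's actual harness may differ.

# Sketch: measure all_reduce time and bandwidth per GPU with torch.distributed.
# NCCL backend and rendezvous settings are assumptions, not the gist's code.
import os, time
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank, world_size, nbytes):
  os.environ["MASTER_ADDR"] = "127.0.0.1"
  os.environ["MASTER_PORT"] = "29500"
  dist.init_process_group("nccl", rank=rank, world_size=world_size)
  buf = torch.randn(nbytes // 4).cuda(rank)       # float32 -> 4 bytes per element
  for _ in range(5): dist.all_reduce(buf)         # warmup
  torch.cuda.synchronize(rank)
  iters = 20
  st = time.perf_counter()
  for _ in range(iters): dist.all_reduce(buf)
  torch.cuda.synchronize(rank)
  tm = (time.perf_counter() - st) / iters
  if rank == 0: print(f"{nbytes/1e6:.0f} MB all_reduce: {tm*1e3:.2f} ms, {nbytes/tm*1e-9:.2f} GB/s")
  dist.destroy_process_group()

if __name__ == "__main__":
  world_size = torch.cuda.device_count()
  mp.spawn(worker, args=(world_size, 256 * 1024 * 1024), nprocs=world_size)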