Skip to content

Instantly share code, notes, and snippets.

View zhuangh's full-sized avatar
🎯
Focusing

Hao Zhuang zhuangh

🎯
Focusing
View GitHub Profile
@zhuangh
zhuangh / mqa_reshape_go_faster.py
Last active May 5, 2024 02:00
MQA reshape_go_faster.py
"""
baseline runtime(s) 0.5243263244628906
with reshape runtime (s) 0.0022399425506591797
@ cpu
=========
baseline runtime (s) 0.25386476516723633
with reshape runtime (s) 0.0008966922760009766
@ cuda:0
"""
import torch
@zhuangh
zhuangh / vs-config-gdbserver.json
Created May 4, 2024 23:58
vscode-beyond gdb - gdb server
// https://marketplace.visualstudio.com/items?itemName=coolchyni.beyond-debug
{
"inputs": [
{
"id": "hostname",
"description": "xxx",
"default": "localhost",
"type": "promptString"
},
{
@zhuangh
zhuangh / single_gpu_ddp.py
Last active May 4, 2024 23:51
single_gpu_ddp.py
# python single_gpu_ddp.py
# https://discuss.pytorch.org/t/single-machine-single-gpu-distributed-best-practices/169243
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
def setup(rank, world_size):
@zhuangh
zhuangh / matmul_gtx1060.ir
Created November 22, 2023 07:54
matmul_gtx1060.ir
IR module {
tt.func public @matmul_kernel_0d1d2d3d4c5d6c7d8c(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%c16_i32 = arith.constant 16 : i32
%c1024_i32 = arith.constant 1024 : i32
%c0_i32 = arith.constant 0 : i32
%cst = arith.constant dense<16> : tensor<16x16xi32>
%cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32>
%0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
%1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
%2 = tt.splat %arg3 : (i32) -> tensor<16x1xi32>
@zhuangh
zhuangh / run_matmul_gtx1060.py
Created November 22, 2023 07:52
run_matmul_gtx1060.py
import torch
import triton
import triton.language as tl
import torch.nn.functional as F
@triton.jit
def matmul_kernel(
a_ptr, b_ptr, c_ptr,
stride_am, stride_ak,
stride_bk, stride_bn,
@zhuangh
zhuangh / run_triton.py
Last active November 22, 2023 01:50
run_triton.py
import torch
import triton
import triton.language as tl
import torch.nn.functional as F
import time
@triton.jit
def add_kernel(x_ptr, y_ptr, output_ptr, N,
BLOCK_SIZE: tl.constexpr):
pid = tl.program_id(0)
@zhuangh
zhuangh / cudagraph_decorator.py
Last active November 19, 2023 06:46
cudagraph_decorator.py
import torch
# acknowledgement: https://gist.github.com/bwasti/7e4cb9bd1aaddeb09bd360b570a486b1
def cudagraph(f):
_graphs = {}
def f_(*args):
key = hash(tuple(tuple(a.shape) for a in args))
if key in _graphs:
wrapped, *_ = _graphs[key]
@zhuangh
zhuangh / vec2dIterator.cc
Created April 16, 2018 05:33
2d iterator with remove()
class Vector2D {
private:
vector<vector<int>>::iterator row, iBegin, iEnd;
vector<int>::iterator col;
public:
Vector2D(vector<vector<int>>& vec2d) {
iBegin = row = vec2d.begin();
iEnd = vec2d.end();
if(vec2d.size())
col = row->begin();
#include<string>
#include<iostream>
#include<vector>
using namespace std;
class ShortPalindromes{
public:
string solve(const string & s, int i, int j,
/**
* Definition for an interval.
* struct Interval {
* int start;
* int end;
* Interval() : start(0), end(0) {}
* Interval(int s, int e) : start(s), end(e) {}
* };
*/