Skip to content

Instantly share code, notes, and snippets.

View Chillee's full-sized avatar

Horace He Chillee

View GitHub Profile
@Chillee
Chillee / attention_dim_bench.py
Created April 12, 2024 05:13
You Could Have Invented Flash-Attention!
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench
torch.set_default_device('cuda')
def get_flops_achieved(f):
flop_counter = FlopCounterMode(display=False)
with flop_counter:
f()
total_flops = flop_counter.get_total_flops()
@Chillee
Chillee / mfu_compute.py
Last active April 11, 2024 17:17
Compute Flop Utilization in PyTorch
import torch
from torch.utils.flop_counter import FlopCounterMode
from triton.testing import do_bench
def get_flops_achieved(f):
flop_counter = FlopCounterMode(display=False)
with flop_counter:
f()
total_flops = flop_counter.get_total_flops()
ms_per_iter = do_bench(f)
@Chillee
Chillee / Q1.py
Last active April 8, 2024 04:07
What Shapes Do Matrix Multiplications Like?
import torch
from triton.testing import do_bench
# Allocate every tensor created below on the CUDA device by default.
torch.set_default_device('cuda')

# Time a bfloat16 matrix multiply for three shapes; in each shape exactly one
# of M, K, N is 2047 while the other two are 2048. For every shape the
# dimensions are printed, followed by the value returned by do_bench.
for M, K, N in ((2047, 2048, 2048), (2048, 2047, 2048), (2048, 2048, 2047)):
    lhs = torch.randn(M, K, dtype=torch.bfloat16)
    rhs = torch.randn(K, N, dtype=torch.bfloat16)
    print(f"M={M}, K={K}, N={N}")
    # do_bench receives a zero-argument callable that performs the (M, K) x (K, N) product.
    timing = do_bench(lambda: torch.mm(lhs, rhs))
    print(timing)
@Chillee
Chillee / 1-pw_op_fusion.py
Last active February 26, 2024 20:45
PT 2.0 Benchmarks
import torch
import torch._inductor.config
import time
torch._inductor.config.triton.cudagraphs = False
torch.set_float32_matmul_precision('high')
def bench(f, name=None, iters=100, warmup=5, display=True, profile=False):
for _ in range(warmup):
f()
@Chillee
Chillee / lora_example.py
Last active May 14, 2023 09:45
lora_example.py
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as parametrize
from torch.utils._pytree import tree_map
class LoraTensor(object):
def __init__(self, weights, A, B):
self.weights = weights
self.A = A
self.B = B
@Chillee
Chillee / dinic.cpp
Last active July 14, 2022 02:10
Max Flow (Dinic's, HLPP)
template <int MAXV, class T = int> struct Dinic {
const static bool SCALING = false; // non-scaling = V^2E, Scaling=VElog(U) with higher constant
int lim = 1;
const T INF = numeric_limits<T>::max();
struct edge {
int to, rev;
T cap, flow;
};
int s = MAXV - 2, t = MAXV - 1;
@Chillee
Chillee / fft.cpp
Last active January 24, 2022 21:03
FFT
template <int maxn> struct FFT {
constexpr static int lg2(int n) { return 32 - __builtin_clz(n - 1); }
const static int MAXN = 1 << lg2(maxn);
typedef complex<double> cpx;
int rev[MAXN];
cpx rt[MAXN];
FFT() {
rt[1] = cpx{1, 0};
for (int k = 2; k < MAXN; k *= 2) {
cpx z[] = {1, polar(1.0, M_PI / k)};
@Chillee
Chillee / hash table.cpp
Last active January 15, 2022 00:53
Policy based data structures
#include <ext/pb_ds/assoc_container.hpp>
using namespace __gnu_pbds;
struct chash {
const int RANDOM = (long long)(make_unique<char>().get()) ^ chrono::high_resolution_clock::now().time_since_epoch().count();
static unsigned long long hash_f(unsigned long long x) {
x += 0x9e3779b97f4a7c15;
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9;
x = (x ^ (x >> 27)) * 0x94d049bb133111eb;
return x ^ (x >> 31);
@Chillee
Chillee / mobius.cpp
Last active March 30, 2021 15:50
Number Theory Sieves
int mobius[MAXPR];
bool sieve[MAXPR];
vector<int> primes;
void calcMobius() {
mobius[1] = 1;
for (int i = 2; i < MAXPR; i++) {
if (!sieve[i]) {
primes.push_back(i);
mobius[i] = -1;
}
@Chillee
Chillee / Lazy.cpp
Last active May 25, 2020 11:28
Aho-Corasick (with leaf links)
struct AhoCorasick {
struct Vertex {
int next[MAXCHAR], go[MAXCHAR];
int leaf = -1;
int p = -1;
char pch;
int link = -1, leaflink = -1;
Vertex(int p = -1, char ch = '$') : p(p), pch(ch) {
fill(begin(next), end(next), -1);