Skip to content

Instantly share code, notes, and snippets.

View msaroufim's full-sized avatar
🤖
Putting the finishing touches on my robot army

Mark Saroufim msaroufim

🤖
Putting the finishing touches on my robot army
View GitHub Profile
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import glob
from datetime import datetime
from setuptools import find_packages, setup
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py:
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py:
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py:
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3
~ nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas
⚡ ~ ./attention_forward 1
Using kernel 1
-0.529510 -0.529510
0.889394 0.889394
0.881674 0.881674
0.651789 0.651789
-0.483486 -0.483486
Results match!
block_size 32 | time 7618.906250 ms
import time
from typing import Callable, List
import torch
torch.set_printoptions(threshold=10000)
# Llama-7B
SIZES = [torch.Size([32000, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([40
import time
from typing import Callable, List
import torch
torch.set_printoptions(threshold=10000)
# Llama-7B
SIZES = [torch.Size([32000, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([40
#!/bin/bash
set -eo pipefail
VER_IPEX=main
# Mode: Select which components to install. PyTorch and Intel® Extension for PyTorch* are always installed.
# High bit: 8 7 6 5 4 3 2 1 :Low bit
# | | | | | | | └- torch-ccl
# | | | | | | └--- TorchAudio
# | | | | | └----- TorchVision

How to build a Discord community. TL;DR: Be responsive, have a bold raison d’être, make sure people have both low- and high-effort things to do, impact the real world with as many artifacts as possible, and share that impact with external partners.

A lot of the leading applied research in ML these days is happening on Discord, so a common question I get asked is “Hey Mark, which Discord group should I join?” That’s an easy enough question to answer these days: just subscribe to https://buttondown.email/ainews. But then I always make sure to remind people, “You should probably create your own Discord community,” and I feel like people don’t quite like it when I say this because, well, how do you create a Discord community from scratch?

I’ve created 3 communities so far, and each one has grown larger more quickly than the last, so hopefully some of these lessons apply to you as well.

Robot Overlords: Took about a year to reach 450 people. NeurIPS LLM Efficiency Competition: Took about 6 months to reach 1,300 people. Learn m

#include <iostream>
#include <cuda.h>
#define BLOCK_DIM 1024
#define COARSE_FACTOR 2
#define NUM_DEVICES 2
__global__ void CoarsenedReduction(float* input, float* output, int size) {
__shared__ float input_s[BLOCK_DIM];
# Compilation provided by Compiler Explorer at https://godbolt.org/
# Stores the incoming value of %rdi into the global symbol
# __nv_fatbinhandle_for_managed_rt and returns.
# NOTE(review): %rdi is presumably the void** parameter named in the label
# (first integer argument under the System V x86-64 ABI) - confirm.
__nv_save_fatbinhandle_for_managed_rt(void**):
# Standard frame setup: save the caller's frame pointer, then make %rsp the new frame base.
pushq %rbp
movq %rsp, %rbp
# Spill %rdi to the stack slot at -8(%rbp), then reload it into %rax.
movq %rdi, -8(%rbp)
movq -8(%rbp), %rax
# Write the value to the global via RIP-relative addressing.
movq %rax, __nv_fatbinhandle_for_managed_rt(%rip)
nop
# Restore the caller's frame pointer and return.
popq %rbp
ret
import torch
import torch.nn as nn
import torch.optim as optim
import os
torch.set_default_device("cpu")
torch.set_default_dtype(torch.float32)
class SimpleNet(nn.Module):
def __init__(self):