Mark Saroufim (msaroufim)
🤖 Putting the finishing touches on my robot army
import ast
from pathlib import Path
from typing import Set, Dict
from collections import defaultdict

def analyze_imports(file_path: str) -> Dict[str, Set[str]]:
    """Analyze Python file imports and return a dictionary of package dependencies."""
    # Map each top-level package to the full module paths imported from it.
    deps: Dict[str, Set[str]] = defaultdict(set)
    for node in ast.walk(ast.parse(Path(file_path).read_text())):
        if isinstance(node, ast.Import):
            for alias in node.names:
                deps[alias.name.split(".")[0]].add(alias.name)
        elif isinstance(node, ast.ImportFrom) and node.module:
            deps[node.module.split(".")[0]].add(node.module)
    return dict(deps)
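A quick usage sketch (the target filename is hypothetical):

deps = analyze_imports("my_module.py")
for package, modules in sorted(deps.items()):
    print(package, "->", sorted(modules))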
msaroufim / 🍿.md
Created November 7, 2024 18:34
Project Popcorn: Generate SOTA kernels with LLMs in public

TL;DR: We're building an LLM that can code-generate efficient CUDA kernels, in public. Today, models like ChatGPT are terrible at systems programming: they don't seem to understand how GPUs work and frequently hallucinate. However, projects like llm.c, which pair a smart human in the loop with an LLM, have shown that this should be possible. There's a lot we need to innovate on: how we create more kernel tokens, what the right abstractions for LLMs are, and how to scale test-time compute. Considering how hard this is, we want to do everything in public on Discord. We will share infra, loss curves, and chat messages there, and try to include as many people as possible so we can actually crack this problem.
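The loop this implies — generate a kernel, verify it against a trusted reference, keep it only if it is both correct and faster — can be sketched in a few lines. This is purely illustrative, not Popcorn's actual infrastructure:

import time
import torch

def time_fn(fn, inputs, iters: int = 50) -> float:
    # Crude wall-clock timing; a real harness would use CUDA events and warmup.
    start = time.perf_counter()
    for _ in range(iters):
        fn(*inputs)
    return time.perf_counter() - start

def accept_candidate(candidate, reference, make_inputs) -> bool:
    # Gate an LLM-generated kernel: numerically correct first, faster second.
    inputs = make_inputs()
    if not torch.allclose(candidate(*inputs), reference(*inputs), atol=1e-4):
        return False
    return time_fn(candidate, inputs) < time_fn(reference, inputs)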

Logistics

We're a distributed research effort, so we mostly chat async on discord.gg/gpumode in the popcorn channel

If you prefer longer-form content, you can check out https://drive.google.com/drive/folders/1nt2KcRRKb8YdySxkRxUu5PR4c7UPM_rK

Top level goals for the n

run_vit_b.py run_vit_b_quant.py
(ao) [marksaroufim@devvm4567.ash0 ~/ao/tutorials/quantize_vit (main)]$ python run_vit_b_quant.py
Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /home/marksaroufim/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|█████████████████████████████████████████████████████████████████████████████████| 330M/330M [00:01<00:00, 209MB/s]
AUTOTUNE convolution(1x3x224x224, 768x3x16x16)
triton_convolution_4 0.1184 ms 100.0%
convolution 0.1450 ms 81.7%
triton_convolution_3 0.2024 ms 58.5%
triton_convolution_5 0.2268 ms 52.2%
triton_convolution_6 0.2445 ms 48.4%
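The AUTOTUNE table above is TorchInductor benchmarking candidate convolution implementations (Triton templates vs. the ATen fallback) and keeping the fastest. It appears when the model is compiled with autotuning enabled, roughly like this (a sketch mirroring the tutorial's model; the quantization step is elided):

import torch
from torchvision.models import vit_b_16, ViT_B_16_Weights

model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1).eval().cuda()
compiled = torch.compile(model, mode="max-autotune")
with torch.no_grad():
    # The first call triggers compilation and the per-op autotuning above.
    compiled(torch.randn(1, 3, 224, 224, device="cuda"))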
*Nim Sum Dim Sum*, a bustling local dumpling restaurant, has two game-theory-loving servers named, you guessed it, Alice and Bob. Its dining area can be represented as a two-dimensional grid of \(R\) rows (numbered \(1..R\) from top to bottom) by \(C\) columns (numbered \(1..C\) from left to right).
Currently, both of them are standing at coordinates \((1, 1)\) where there is a big cart of dim sum. Their job is to work together to push the cart to a customer at coordinates \((R, C)\). To make the job more interesting, they've turned it into a game.
Alice and Bob will take turns pushing the cart. On Alice's turn, the cart must be moved between \(1\) and \(A\) units down. On Bob's turn, the cart must be moved between \(1\) and \(B\) units to the right. The cart may not be moved out of the grid. If the cart is already at row \(R\) on Alice's turn or column \(C\) on Bob's turn, then that person loses their turn.
The "winner" is the person to ultimately move the cart to \((R, C)\) and thus get all the recognit
import torch
# For context: a Python bool is a full heap-allocated PyObject, so sys.getsizeof
# reports 28 bytes for what is logically a single bit of information.
# >>> import sys
# >>> size_of_bool = sys.getsizeof(True)  # or sys.getsizeof(False)
# >>> print(size_of_bool)
# 28
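For contrast (a sketch assuming a recent PyTorch): a torch.bool tensor stores one byte per element in a flat buffer, so a million flags cost about 1 MB rather than a million 28-byte PyObjects.

import sys
import torch

print(sys.getsizeof(True))  # 28 on 64-bit CPython: a full heap object

flags = torch.zeros(1_000_000, dtype=torch.bool)
print(flags.element_size())                   # 1 byte per element
print(flags.element_size() * flags.numel())   # ~1e6 bytes for the whole buffer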
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import glob
from datetime import datetime
from setuptools import find_packages, setup
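The preview stops before the setup() call. For orientation, a minimal file with these imports typically continues along these lines (the name and version scheme here are placeholders, not torchao's actual values):

version = datetime.now().strftime("%Y.%m.%d")  # date-stamped nightly-style version
setup(
    name="example-package",  # placeholder; not the real package name
    version=version,
    packages=find_packages(),
)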
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py:
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py:
Autotune Best Config: BLOCK_M: 16, BLOCK_N: 32, BLOCK_K: 32, SPLIT_K: 1, num_warps: 2, num_ctas: 1, num_stages: 5
INFO:/home/marksaroufim/.local/lib/python3.10/site-packages/torchao/prototype/galore/kernels/custom_autotune.py:
Autotune Best Config: BLOCK_M: 32, BLOCK_N: 128, BLOCK_K: 32, SPLIT_K: 1, num_warps: 4, num_ctas: 1, num_stages: 3
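These "Autotune Best Config" lines are what Triton-style autotuning prints after timing each candidate config and keeping the fastest. The mechanism, sketched (the two configs are taken from the log; the kernel body is a stand-in, not GaLore's actual matmul):

import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({"BLOCK_M": 32, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1},
                      num_warps=4, num_stages=3),
        triton.Config({"BLOCK_M": 16, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1},
                      num_warps=2, num_stages=5),
    ],
    key=["M", "N", "K"],  # re-run the tuning sweep when these change
)
@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,
                  BLOCK_K: tl.constexpr, SPLIT_K: tl.constexpr):
    # Stand-in body; a real kernel computes one BLOCK_M x BLOCK_N output tile.
    pid = tl.program_id(0)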
~ nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas
⚡ ~ ./attention_forward 1
Using kernel 1
-0.529510 -0.529510
0.889394 0.889394
0.881674 0.881674
0.651789 0.651789
-0.483486 -0.483486
Results match!
block_size 32 | time 7618.906250 ms
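The harness prints a few values from the custom CUDA kernel next to a trusted reference before declaring "Results match!". The same pattern in PyTorch terms (a sketch; shapes and tolerance are assumptions):

import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 12, 64, 64) for _ in range(3))
out_ref = F.scaled_dot_product_attention(q, k, v)  # trusted reference
out_custom = out_ref.clone()                       # stand-in for the custom kernel's output

# Print a handful of values side by side, as the C harness does, then assert.
for a, b in zip(out_ref.flatten()[:5], out_custom.flatten()[:5]):
    print(f"{a.item():.6f} {b.item():.6f}")
assert torch.allclose(out_ref, out_custom, atol=1e-4)
print("Results match!")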
import time
from typing import Callable, List
import torch
torch.set_printoptions(threshold=10000)
# Llama-7B
SIZES = [torch.Size([32000, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([4096]), torch.Size([4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([4096, 4096]), torch.Size([11008, 4096]), torch.Size([4096, 11008]), torch.Size([11008, 4096]), torch.Size([40
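SIZES enumerates Llama-7B's weight shapes layer by layer (the preview truncates the list), and the time/Callable imports suggest a per-shape timing loop. A sketch of that harness (the op being timed is an assumption):

def benchmark(fn: Callable[[torch.Tensor], torch.Tensor], sizes: List[torch.Size]) -> None:
    # Time fn once per weight shape; a real harness would warm up and average.
    for size in sizes:
        x = torch.randn(size)
        start = time.perf_counter()
        fn(x)
        print(f"{tuple(size)}: {(time.perf_counter() - start) * 1e3:.3f} ms")

# Subset only, since the full SIZES list is cut off above.
subset = [torch.Size([32000, 4096]), torch.Size([4096, 4096]), torch.Size([4096])]
benchmark(lambda t: t.to(torch.float16), subset)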