kimbochen

## device_query.cpp
#include <iostream>
#include <cuda_runtime.h>

int main() {
    // Ref: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html
    // Compilation command: nvcc device_query.cpp -arch=sm_100 -o device_query && ./device_query
    int device = 0;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);


## enable_gqa_repro.py
import torch
import torch.nn.functional as F


@torch.compile()
def baseline(q_BHTD, k_BJTD, v_BJTD, gq_ratio):
    """Causal attention with the key/value heads expanded by hand.

    Every slice of ``k_BJTD`` and ``v_BJTD`` along dim 1 is repeated
    ``gq_ratio`` times in place (consecutive copies), and the expanded
    tensors are fed to ``F.scaled_dot_product_attention`` together with
    ``q_BHTD`` using ``is_causal=True``.

    NOTE(review): the name suffixes suggest (batch, heads, time, head_dim)
    layouts with J = H // gq_ratio -- confirm against the callers.

    Returns:
        The attention output produced by ``F.scaled_dot_product_attention``.
    """
    # Expand dim 1 of K and V so it lines up with dim 1 of the query.
    keys = torch.repeat_interleave(k_BJTD, gq_ratio, dim=1)
    values = torch.repeat_interleave(v_BJTD, gq_ratio, dim=1)
    return F.scaled_dot_product_attention(q_BHTD, keys, values, is_causal=True)

## llama.py
import functools
from dataclasses import asdict, dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass

## prefix-flash-attn.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / prefix-flash-attn.ipynb
            
            
              Created
              July 17, 2024 04:36
            
              
                prefix-flash-attn
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## speculative-sampling.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / speculative-sampling.ipynb
            
            
              Last active
              July 17, 2024 04:27
            
              
                speculative-sampling.ipynb
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## notes.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / notes.md
            
            
              Created
              July 7, 2024 05:05
            
              
                ML Efficiency Notes
              
          
    ML Efficiency Notes

GPU Specs


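| Name | FP16 Compute | Memory Bandwidth | Memory Size | TDP |
|------|--------------|------------------|-------------|-----|
| A100 | 312 TFLOP/s | 2 TB/s | 40 GB | 250 W |
| H100 | 750 TFLOP/s | 2 TB/s | 80 GB | 350 W |
| A10 | 125 TFLOP/s | 0.6 TB/s | 24 GB | 150 W |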


## flash-attn-triton.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / flash-attn-triton.ipynb
            
            
              Last active
              July 17, 2024 04:27
            
              
                flash-attn-triton.ipynb
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## triton-puzzles.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / triton-puzzles.ipynb
            
            
              Last active
              May 3, 2024 22:31
            
              
                triton-puzzles.ipynb
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## pytorch-practice.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / pytorch-practice.ipynb
            
            
              Last active
              February 26, 2024 01:59
            
              
                PyTorch Practice
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## hand-code-mlp-backprop.ipynb

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                kimbochen
                / hand-code-mlp-backprop.ipynb
            
            
              Last active
              February 26, 2024 00:23
            
              
                Hand-code MLP Backprop
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	#include <iostream>
	#include <cuda_runtime.h>

	int main() {
	// Ref: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html
	// Compilation command: nvcc device_query.cpp -arch=sm_100 -o device_query && ./device_query
	int device = 0;
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, device);
	import torch
	import torch.nn.functional as F


	@torch.compile()
	def baseline(q_BHTD, k_BJTD, v_BJTD, gq_ratio):
	k_BHTD = k_BJTD.repeat_interleave(gq_ratio, 1)
	v_BHTD = v_BJTD.repeat_interleave(gq_ratio, 1)
	o_BHTD = F.scaled_dot_product_attention(q_BHTD, k_BHTD, v_BHTD, is_causal=True)
	return o_BHTD
	import functools
	from dataclasses import asdict, dataclass
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	@dataclass
Name	FP16 Compute	Memory Bandwidth	Memory Size	TDP
A100	312 TFLOP/s	2 TB/s	40 GB	250 W
H100	750 TFLOP/s	2 TB/s	80 GB	350 W
A10	125 TFLOP/s	0.6 TB/s	24 GB	150 W