Yi Liu yiliu30

## test_inplace.py
import torch
import torch.nn as nn
from torchao.quantization.GPTQ_MT import MultiTensor


class InplaceMod(nn.Module):
    """Module that owns a square, zero-filled buffer named ``cache``.

    Args:
        N: Side length of the ``cache`` buffer; the buffer has shape ``(N, N)``.
    """

    def __init__(self, N=10):
        super().__init__()
        # Registered as a buffer (not a parameter) under the name "cache".
        cache_shape = (N, N)
        self.register_buffer("cache", torch.zeros(cache_shape))

## inspect_exported_qmodel.py
from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
    X86InductorQuantizer,
    get_default_x86_inductor_quantization_config,
)
import torch

from torch._export import capture_pre_autograd_graph

from torch.ao.quantization.quantize_pt2e import (
    convert_pt2e,

## cude_device_info.cu
// Copied from https://github.com/siboehm/SGEMM_CUDA

// nvcc -o cuda_info cuda_info.cu -std=c++11 && ./cuda_info


#include <cuda_runtime.h>
#include <stdio.h>


void CudaDeviceInfo() {

## mutli_tensor_check_model.py
# Adapted from: https://gist.github.com/HDCharles/a1b575bbf8875f994af8a01b225e1227
import torch
import torch.nn as nn
from torch.utils._pytree import tree_flatten, tree_unflatten
import gc
class MultiTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, input, **kwargs):
        if isinstance(input, (list, tuple)):
            input = input[0]

## bench_autoround.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                yiliu30
                / bench_autoround.md
            
            
              Last active
              September 3, 2024 15:05
            
          
    1. Install the required packages

pip install -r torchao/prototype/autoround/requirements.txt 
2. Run the benchmark with different configs


Benchmark llama2/llama3 with the lightweight config. This depends on a small fix in pytorch/ao#769.

# auto-round w/ quant_lm_head
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround

  
## inc_pt2e.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                yiliu30
                / inc_pt2e.md
            
            
              Last active
              August 29, 2024 07:20
            
          
user-facing config
https://github.com/intel/neural-compressor/blob/f2c454f88c0ffbb4d30d66eedaa6fc56ad47f804/neural_compressor/torch/quantization/config.py#L1317
        w_granularity: str = "per_channel",
        w_algo: str = "minmax",
        act_dtype: str = "uint8",


## gptq_with_multi_tensors.py
# Adapted from https://raw.githubusercontent.com/jerryzh168/ao/module-hook/tutorials/calibration_flow/gptq_like.py

"""
This is an example flow for GPTQ-like calibration flows, where we:
(1) optimize (quantize) one module at a time
(2) with each optimization step, we need to get a set of all calibration data
(3) the output of each module is calculated based on the optimized (quantized) module, and then passed down to the next module

In this tutorial we mainly use two things:
(1) MultiTensor subclass https://gist.github.com/HDCharles/a1b575bbf8875f994af8a01b225e1227

## bench_int4.py
import argparse
import logging

import torch
from torchao.utils import benchmark_model, TORCH_VERSION_AFTER_2_4


def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
    import transformers

## update_model.py
import torch

def _update_model_and_children(model, replacement_fn):
    replacement_fn(model)
    for name, child in model.named_children():
        new_child = _update_model_and_children(child, replacement_fn)
        if new_child is not child:
            setattr(model, name, new_child)
    return model

## pymem.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                yiliu30
                / pymem.md
            
            
              Created
              August 7, 2024 11:24
                — forked from vahbuna/pymem.md
            
              
                Debugging PyTorch memory use with snapshots - Zach's Blog
              
          
    https://zdevito.github.io/2022/08/16/memory-snapshots.html
# enable the recording of stack frame information for each allocation
import torch
torch.cuda.memory._record_memory_history(True)

from torchvision.models import resnet18
from pprint import pprint
	import torch
	import torch.nn as nn
	from torchao.quantization.GPTQ_MT import MultiTensor


	class InplaceMod(torch.nn.Module):
	def __init__(self, N=10):
	super().__init__()
	self.register_buffer("cache", torch.zeros(N, N))
	from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
	X86InductorQuantizer,
	get_default_x86_inductor_quantization_config,
	)
	import torch

	from torch._export import capture_pre_autograd_graph

	from torch.ao.quantization.quantize_pt2e import (
	convert_pt2e,
	// Copied from https://github.com/siboehm/SGEMM_CUDA

	// nvcc -o cuda_info cuda_info.cu -std=c++11 && ./cuda_info


	#include <cuda_runtime.h>
	#include <stdio.h>


	void CudaDeviceInfo() {
	# Adapted from: https://gist.github.com/HDCharles/a1b575bbf8875f994af8a01b225e1227
	import torch
	import torch.nn as nn
	from torch.utils._pytree import tree_flatten, tree_unflatten
	import gc
	class MultiTensor(torch.Tensor):
	@staticmethod
	def __new__(cls, input, **kwargs):
	if isinstance(input, (list, tuple)):
	input = input[0]
	# Adapted from https://raw.githubusercontent.com/jerryzh168/ao/module-hook/tutorials/calibration_flow/gptq_like.py

	"""
	This is a example flow for GPTQ like calibration flows, where we:
	(1) optimize (quantize) one module at a time
	(2) with each optimization step, we need to get a set of all calibration data
	(3) the output of each module is calculated based on the optimized (quantized) module, and then pass down to the next module

	In this tutorial we mainly use two things:
	(1) MultiTensor subclass https://gist.github.com/HDCharles/a1b575bbf8875f994af8a01b225e1227
	import argparse
	import logging

	import torch
	from torchao.utils import benchmark_model, TORCH_VERSION_AFTER_2_4


	def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
	import transformers
	import torch

	def _update_model_and_children(model, replacement_fn):
	replacement_fn(model)
	for name, child in model.named_children():
	new_child = _update_model_and_children(child, replacement_fn)
	if new_child is not child:
	setattr(model, name, new_child)
	return model