@haotian-liu
Created July 31, 2023 16:32
AutoGPTQ quantization for LLaVA
import logging

from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
"""
Download https://huggingface.co/liuhaotian/llava-llama-2-13b-chat-lightning-preview to local
Make following edits to the config.json
LlavaLlamaForCausalLM -> LlamaForCausalLM
"model_type": "llava" -> "llama"
"""
pretrained_model_dir = "./checkpoints/llava-llama-2-13b-chat-lightning-preview"
quantized_model_dir = "llava-llama-2-13b-chat-lightning-4bit-128g"
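
# Optional: a minimal sketch (not part of the original gist) that applies the
# config.json edits described in the docstring above programmatically instead of
# by hand. It assumes the checkpoint has already been downloaded to
# pretrained_model_dir.
import json
import os

config_path = os.path.join(pretrained_model_dir, "config.json")
with open(config_path) as f:
    config = json.load(f)
config["architectures"] = ["LlamaForCausalLM"]
config["model_type"] = "llama"
with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
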
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    )
]
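
# Note (not part of the original gist): a single short sentence is only a toy
# calibration set; GPTQ quantization quality generally improves with more and
# longer calibration samples. The extra sample below is just an illustration.
examples.append(
    tokenizer("LLaVA is a large language-and-vision assistant built on top of LLaMA.")
)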
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize the model to 4-bit
    group_size=128,  # it is recommended to set this value to 128
    desc_act=False,  # setting this to False significantly speeds up inference, but perplexity may be slightly worse
)
# load the un-quantized model; by default, the model is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose only keys are "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
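
# A minimal usage sketch (not part of the original gist): reload the quantized
# checkpoint and run a quick generation to sanity-check it. Assumes a CUDA GPU
# is available; the prompt is just an illustration.
quantized_model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir, device="cuda:0", use_safetensors=True
)
pipeline = TextGenerationPipeline(model=quantized_model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])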