@haotian-liu
Created July 31, 2023 16:32
AutoGPTQ quantization for LLaVA
import logging

from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
)
"""
Download https://huggingface.co/liuhaotian/llava-llama-2-13b-chat-lightning-preview to local
Make following edits to the config.json
LlavaLlamaForCausalLM -> LlamaForCausalLM
"model_type": "llava" -> "llama"
"""
pretrained_model_dir = "./checkpoints/llava-llama-2-13b-chat-lightning-preview"
quantized_model_dir = "llava-llama-2-13b-chat-lightning-4bit-128g"
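
# Optional: a minimal sketch (not part of the original gist) that applies the
# config.json edits described in the docstring above programmatically instead of
# by hand. It assumes the checkpoint has already been downloaded to
# pretrained_model_dir.
import json
import os

config_path = os.path.join(pretrained_model_dir, "config.json")
with open(config_path) as f:
    config = json.load(f)
config["architectures"] = ["LlamaForCausalLM"]
config["model_type"] = "llama"
with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
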
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    )
]
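
# Note (not part of the original gist): a single short sentence is only a toy
# calibration set; GPTQ quantization quality generally improves with more and
# longer calibration samples. The extra sample below is just an illustration.
examples.append(
    tokenizer("LLaVA is a large language-and-vision assistant built on top of LLaMA.")
)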
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize the model to 4-bit
    group_size=128,  # it is recommended to set this value to 128
    desc_act=False,  # setting this to False significantly speeds up inference, but perplexity may be slightly worse
)
# load the un-quantized model; by default, the model is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose only keys are "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
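
# A minimal usage sketch (not part of the original gist): reload the quantized
# checkpoint and run a quick generation to sanity-check it. Assumes a CUDA GPU
# is available; the prompt is just an illustration.
quantized_model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir, device="cuda:0", use_safetensors=True
)
pipeline = TextGenerationPipeline(model=quantized_model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])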