Last active
November 5, 2023 21:47
-
-
Save thistleknot/625c99bec5c954bda29cff935cdc8394 to your computer and use it in GitHub Desktop.
gptq quantization
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quantize a local causal LM checkpoint to 4-bit GPTQ with AutoGPTQ, using a
# small set of English prompts as calibration data, and write the result to
# `quantized_model_dir`.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

pretrained_model_dir = "/home/user/text-generation-webui/models/open_llama_3b_v2/"
# NOTE: the directory name is spelled "qptq" (sic); it is kept unchanged
# because the manual `mv` steps at the bottom refer to the same path.
quantized_model_dir = "/home/user/text-generation-webui/models/open_llama_3b_v2_qptq/"

quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize the model to 4-bit
    group_size=32,  # NOTE: 128 is the commonly recommended value; this script uses 32
    desc_act=False,  # False can significantly speed up inference, at the cost of slightly worse perplexity
)

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Load the un-quantized model; by default it is always loaded into CPU memory.
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# Calibration texts fed to the quantizer.
text_examples = [
    "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm.",
    "What's the weather like today in New York City? I'm planning to visit Central Park in the afternoon.",
    "Hey, check out this article on the benefits of a plant-based diet: https://www.example.com/plant-based-diet",
    "Can you recommend any good science fiction books? I love stories about time travel and space exploration.",
    "How can I learn a new language quickly? I'm planning to move to Spain next year and need to learn Spanish.",
    "Just finished watching The Matrix. What are some other popular sci-fi movies to watch this weekend?",
    "I'm feeling stressed lately. What are some effective ways to deal with stress and improve my mental health?",
    "What are the top tourist attractions in Paris? I'll be visiting the city for a week and want to make the most of my time there.",
    "Tell me a joke about computers. I need something to cheer me up after a long day at work.",
    "How do I cook spaghetti carbonara? Can you share a simple recipe that I can follow at home?",
    "Can you give me a brief summary of the latest news? I haven't had the chance to catch up on current events.",
    "What's the difference between a psychologist and a psychiatrist? I'm considering therapy but not sure which one to see.",
    "I'm planning to start my own online store. What are the steps to start a small business and make it successful?",
]
examples = [tokenizer(text, truncation=True) for text in text_examples]

# Quantize the model. The examples should be a list of dicts whose keys can
# only be "input_ids" and "attention_mask". All calibration examples are
# processed in a single batch, so the batch size tracks the list length
# instead of a hard-coded count.
model.quantize(examples, batch_size=len(examples))
model.save_pretrained(quantized_model_dir)
# NOTE(review): only the model is saved here; the tokenizer is not written to
# `quantized_model_dir` by this script — confirm whether the loader needs it.

# Manual follow-up steps noted by the author (shell commands, not executed by
# this script); they rename the saved weight and config files:
#   mv /home/user/text-generation-webui/models/open_llama_3b_v2_qptq/gptq_model-4bit-32g.bin /home/user/text-generation-webui/models/open_llama_3b_v2_qptq/pytorch_model.bin
#   mv /home/user/text-generation-webui/models/open_llama_3b_v2_qptq/quantize_config.json /home/user/text-generation-webui/models/open_llama_3b_v2_qptq/quantization_config.json
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment