@zhangw
Last active February 21, 2024 02:52
load-codefuse-codellama-34B-4bits
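This gist loads the GPTQ 4-bit quantization of CodeFuse-CodeLlama-34B from ModelScope with auto-gptq, then runs two sample prompts (one English, one Chinese) and prints the generated token count, the decoded answer, and the end-to-end latency.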
import os
import time

import torch
from modelscope import AutoTokenizer, snapshot_download
from auto_gptq import AutoGPTQForCausalLM

# Avoid the tokenizers fork/parallelism warning during generation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def load_model_tokenizer(model_path):
    """
    Load the model and tokenizer from the given model name or the local path of a downloaded model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False, legacy=False)
    # Left-pad batched inputs so generated tokens start right after each prompt.
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
    print(f"tokenizer loaded: {tokenizer}")
    model = AutoGPTQForCausalLM.from_quantized(model_path, inject_fused_attention=False, inject_fused_mlp=False, use_safetensors=False, use_cuda_fp16=True, disable_exllama=False, device_map='auto')
    print(f"model loaded: {model}")
    return model, tokenizer


def inference(model, tokenizer, prompt):
    """
    Use the given model and tokenizer to generate an answer for the specified prompt.
    """
    st = time.time()
    prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'
    # Wrap the prompt in the CodeFuse chat template: a human turn followed by an empty bot slot.
    inputs = f"<|role_start|>human<|role_end|>{prompt}<|role_start|>bot<|role_end|>"
    input_ids = tokenizer.encode(inputs, return_tensors="pt", padding=True, add_special_tokens=False).to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, top_p=0.95, temperature=0.1, do_sample=True, max_new_tokens=512, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
    print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # Drop the echoed prompt so only the newly generated text is printed.
    print(f'generated text is {outputs[0][len(inputs):]}')
    latency = time.time() - st
    print(f'latency is {latency} seconds')
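

# The template above generalizes to multi-turn conversations. A minimal sketch,
# assuming each turn is wrapped the same way and the history ends with an empty
# "bot" slot; build_prompt is a hypothetical helper, not part of the original gist.
def build_prompt(turns):
    """turns: a list of (role, text) pairs, with role being "human" or "bot"."""
    parts = []
    for role, text in turns:
        text = text if text.endswith('\n') else f'{text}\n'
        parts.append(f"<|role_start|>{role}<|role_end|>{text}")
    parts.append("<|role_start|>bot<|role_end|>")  # the model continues as the bot
    return "".join(parts)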


if __name__ == "__main__":
    # Download the quantized weights from ModelScope (cached locally after the first run).
    model_path = snapshot_download('codefuse-ai/CodeFuse-CodeLlama-34B-4bits', revision='v1.0.0')
    print(f"model_path: {model_path}")
    model, tokenizer = load_model_tokenizer(model_path)
    prompt = 'Please write a QuickSort program in Python'
    inference(model, tokenizer, prompt)
    # "For a Spring Boot application, how can the startup speed be optimized?"
    prompt = 'SpringBoot的程序,如何进行启动速度的优化?'
    inference(model, tokenizer, prompt)
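To run this you need a CUDA-capable GPU and, assuming the usual package names for the imports above, the modelscope, auto-gptq, and torch packages installed; the first run downloads the quantized checkpoint from ModelScope, so expect some delay before the prompts execute.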