@thistleknot
Last active December 12, 2023 00:08
basic inference
#export LD_LIBRARY_PATH=/usr/local/cuda/targets/x86_64-linux/lib/
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from datasets import load_dataset
import json
import torch
from tqdm import tqdm
if torch.cuda.is_available():
    device = torch.device("cuda")

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("/workspace/OpenHermes-2.5-neural-chat-v3-3-Slerp")
#tokenizer = AutoTokenizer.from_pretrained("/workspace/Marcoroni-7B-v3")
model = AutoModelForCausalLM.from_pretrained("/workspace/OpenHermes-2.5-neural-chat-v3-3-Slerp", quantization_config=nf4_config)
#model = AutoModelForCausalLM.from_pretrained("/workspace/Marcoroni-7B-v3", quantization_config=nf4_config)
# Add a dedicated pad token and resize the embedding matrix to match the new vocabulary size
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
model.resize_token_embeddings(len(tokenizer))
dataset = load_dataset("xquad",'xquad.en')
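# The original snippet references `prompts` without defining it; a minimal
# sketch, assuming prompts are built from the context/question fields of the
# xquad validation split:
prompts = [
    f"Context: {row['context']}\nQuestion: {row['question']}\nAnswer:"
    for row in dataset['validation']
]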
# Tokenize the first prompt and move the tensors to the same device as the model
inputs = tokenizer(prompts[0], return_tensors='pt')
input_ids = inputs['input_ids'].to(model.device)
attention_mask = inputs['attention_mask'].to(model.device)
# Sampling with temperature, top-k, and top-p (nucleus) filtering
sample_output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    do_sample=True,
    max_length=1024,
    top_k=50,
    top_p=0.9,
    temperature=0.7
)
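# The original gist stops after generation; a minimal follow-up sketch that
# decodes the sampled token IDs back to text (assumes the settings above):
generated_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(generated_text)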