@nebrelbug
Last active November 8, 2023 01:50
Rebuilding Alpaca with the Hugging Face Trainer Class (see https://bengubler.com/posts/rebuilding-alpaca-huggingface-trainer)
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
use_cpu: false
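
This accelerate config keeps everything on a single local machine and a single process, with no DeepSpeed and no mixed precision; it just needs to be in place before the training script further down is started with accelerate launch.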
from datasets import load_dataset
original_dataset = load_dataset("tatsu-lab/alpaca")["train"]

template_no_context = """Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

template_context = """Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

def data_to_string(data):
    instruction = data["instruction"]
    context = data["input"]
    response = data["output"]

    template = template_context if len(context) > 0 else template_no_context
    source = template.format(instruction=instruction, input=context)

    return {
        "source": source,
        "text": source + response,
    }
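
As a quick illustration (the record below is made up for this sketch, not pulled from the dataset), data_to_string returns both the bare prompt ("source") and the prompt plus response ("text"):

# Illustrative only: a hypothetical record, not taken from tatsu-lab/alpaca.
example = {
    "instruction": "Translate the sentence to French.",
    "input": "I like apples.",
    "output": "J'aime les pommes.",
}

formatted = data_to_string(example)
print(formatted["source"])  # context template with instruction and input filled in
print(formatted["text"])    # the same prompt followed by the response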

dataset = original_dataset.map(
    data_to_string
).remove_columns(['instruction', 'input', 'output'])
processed_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = processed_dataset["train"]
eval_dataset = processed_dataset["test"]

IGNORE_TOKEN = -100  # the label index ignored by PyTorch's cross-entropy loss

def data_collator(features, tokenizer):
    sources = [feature["source"] for feature in features]
    targets = [feature["text"] for feature in features]

    source_tokens = tokenizer(
        sources,
        return_tensors="pt",
        padding='longest',
        max_length=None,
    )
    target_tokens = tokenizer(
        targets,
        return_tensors="pt",
        padding='longest',
        max_length=None,
    )

    labels = target_tokens["input_ids"].clone()

    # Mask out the prompt tokens so the loss is only computed on the response.
    for i in range(len(labels)):
        source_len = source_tokens["attention_mask"][i].sum()
        labels[i, :source_len] = IGNORE_TOKEN

    res = {
        "input_ids": target_tokens["input_ids"],
        "attention_mask": target_tokens["attention_mask"],
        "labels": labels,
    }

    return res
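
The training script below imports this module as get_data, so a __main__ guard can hold a quick sanity check of the collator without running at import time. This is a sketch only, assuming access to the same Llama 2 tokenizer the training script loads:

# Sketch (not part of the original): quick check that prompt tokens are masked.
if __name__ == "__main__":
    from transformers import LlamaTokenizer

    tok = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", legacy=False)
    tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    batch = data_collator([train_dataset[0], train_dataset[1]], tok)
    print(batch["input_ids"].shape)                 # (2, longest sequence in the pair)
    print((batch["labels"] == IGNORE_TOKEN).sum())  # count of masked prompt tokens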
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from accelerate import Accelerator
from get_data import train_dataset, eval_dataset, data_collator
accelerator = Accelerator()
MODEL_PATH = "meta-llama/Llama-2-7b-hf" # path to Llama on Hugging Face Hub
OUTPUT_DIR = "../finetunes/alpaca-7b" # where to save the fine-tuned model
tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, legacy=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # not set by default, strangely
model = LlamaForCausalLM.from_pretrained(
    MODEL_PATH, device_map="auto"
)

training_args = TrainingArguments(
    output_dir='checkpoints', # where Trainer will save model checkpoints
    num_train_epochs=1, # start with a low number of epochs for testing
    learning_rate=2e-5,
    logging_steps=10,
    per_device_train_batch_size=8,
    remove_unused_columns=False, # keep the raw "source"/"text" columns for the custom collator
    save_steps=1000,
    save_total_limit=1,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=lambda x: data_collator(x, tokenizer),
)
trainer.train()
trainer.evaluate()
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
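
Since checkpoints are written to the checkpoints directory every 1000 steps (and only the most recent is kept), an interrupted run can be resumed rather than restarted. A minimal sketch, reusing the trainer defined above:

# Sketch: pick up from the most recent checkpoint in output_dir instead of starting over.
trainer.train(resume_from_checkpoint=True)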
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
template = """Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""
model_path = f"./finetunes/alpaca-7b"
tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", local_files_only=True
)
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
return_full_text=False,
do_sample=True,
temperature=0.9,
max_new_tokens=200,
)
def prompt_model():
prompt = input("Enter your question: ")
prompt = template.format(instruction=prompt)
answer = pipe(prompt)
print(answer[0]["generated_text"])
while True:
prompt_model()
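
For a quick non-interactive smoke test, the same pipeline can be called once directly instead of looping over input(); the instruction string here is only an example:

# Sketch: a single programmatic call instead of the interactive loop.
prompt = template.format(instruction="Give three tips for staying healthy.")
print(pipe(prompt)[0]["generated_text"])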