Rebuilding Alpaca with the Hugging Face Trainer Class (see https://bengubler.com/posts/rebuilding-alpaca-huggingface-trainer)
# Accelerate config: a single local process, no DeepSpeed, no mixed precision
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 1
use_cpu: false
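This is the sort of file the accelerate config command produces: one local process, no DeepSpeed, no mixed precision. With it saved as the default Accelerate config, the training script below can be started with accelerate launch (or with plain python, since nothing here is distributed).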
# get_data.py — load the Alpaca dataset, build prompt strings, and split into train/eval
from datasets import load_dataset

original_dataset = load_dataset("tatsu-lab/alpaca")["train"]

template_no_context = """Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

template_context = """Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

def data_to_string(data):
    instruction = data["instruction"]
    context = data["input"]
    response = data["output"]
    template = template_context if len(context) > 0 else template_no_context
    source = template.format(instruction=instruction, input=context)
    return {
        "source": source,           # the prompt only
        "text": source + response,  # the prompt with the reference response appended
    }

dataset = original_dataset.map(
    data_to_string
).remove_columns(['instruction', 'input', 'output'])

processed_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = processed_dataset["train"]
eval_dataset = processed_dataset["test"]
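As a quick illustration of the formatting step (the record below is made up, not taken from the dataset), data_to_string turns one Alpaca record into a prompt-only string and a full prompt-plus-response string:

example = {
    "instruction": "Translate the sentence to French.",
    "input": "I like apples.",
    "output": "J'aime les pommes.",
}
formatted = data_to_string(example)
print(formatted["source"])  # instruction + input, ending with "### Response:\n"
print(formatted["text"])    # the same prompt with the reference response appended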
IGNORE_TOKEN = -100  # label value that the loss function ignores

def data_collator(features, tokenizer):
    sources = [feature["source"] for feature in features]
    targets = [feature["text"] for feature in features]

    source_tokens = tokenizer(
        sources,
        return_tensors="pt",
        padding='longest',
        max_length=None,
    )
    target_tokens = tokenizer(
        targets,
        return_tensors="pt",
        padding='longest',
        max_length=None,
    )

    labels = target_tokens["input_ids"].clone()

    # Mask out the prompt (source) tokens so the loss is computed only on the response
    for i in range(len(labels)):
        source_len = source_tokens["attention_mask"][i].sum()
        labels[i, :source_len] = IGNORE_TOKEN

    res = {
        "input_ids": target_tokens["input_ids"],
        "attention_mask": target_tokens["attention_mask"],
        "labels": labels,
    }

    return res
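A minimal sanity check of the collator, as a sketch that assumes you have access to the gated Llama 2 tokenizer used in the training script below. With right-side padding, the masked region of each row should cover exactly the prompt tokens:

from transformers import LlamaTokenizer

tok = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", legacy=False)
tok.pad_token = tok.eos_token
tok.padding_side = "right"

batch = data_collator([train_dataset[0], train_dataset[1]], tok)
print(batch["input_ids"].shape)                      # (2, longest sequence in the pair)
print((batch["labels"] == IGNORE_TOKEN).sum(dim=1))  # masked prompt tokens per example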
# Training script — fine-tune Llama 2 7B on the Alpaca data with the Hugging Face Trainer
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from accelerate import Accelerator

from get_data import train_dataset, eval_dataset, data_collator

accelerator = Accelerator()

MODEL_PATH = "meta-llama/Llama-2-7b-hf"  # path to Llama on Hugging Face Hub
OUTPUT_DIR = "../finetunes/alpaca-7b"  # where to save the fine-tuned model

tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, legacy=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # not set by default, strangely

model = LlamaForCausalLM.from_pretrained(
    MODEL_PATH, device_map="auto"
)

training_args = TrainingArguments(
    output_dir='checkpoints',  # where Trainer will save model checkpoints
    num_train_epochs=1,  # start with a low number of epochs for testing
    learning_rate=2e-5,
    logging_steps=10,
    per_device_train_batch_size=8,
    remove_unused_columns=False,  # keep the "source"/"text" columns for the collator
    save_steps=1000,
    save_total_limit=1,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=lambda x: data_collator(x, tokenizer),
)

trainer.train()
trainer.evaluate()

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
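Because save_steps and save_total_limit keep a rolling checkpoint under checkpoints/, an interrupted run does not have to start over; a sketch using the standard Trainer argument:

# Resume from the most recent checkpoint in output_dir instead of starting from scratch
trainer.train(resume_from_checkpoint=True)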
# Inference script — load the fine-tuned model and answer questions from the command line
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

template = """Below is an instruction that describes a task. \
Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

model_path = "./finetunes/alpaca-7b"

tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", local_files_only=True
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,  # print only the newly generated tokens, not the prompt
    do_sample=True,
    temperature=0.9,
    max_new_tokens=200,
)

def prompt_model():
    prompt = input("Enter your question: ")
    prompt = template.format(instruction=prompt)
    answer = pipe(prompt)
    print(answer[0]["generated_text"])

while True:
    prompt_model()
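For a one-off, non-interactive call (the instruction below is only an example, and the output will vary from run to run because sampling is enabled):

prompt = template.format(instruction="Give three tips for staying healthy.")
print(pipe(prompt)[0]["generated_text"])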