@honglu2875 · Last active June 12, 2023
The scripts used in HumanEval evaluation with CFG
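All of the scripts below apply the same idea at every decoding step: classifier-free guidance (CFG) blends the model's prompt-conditioned next-token log-probabilities with those of an "unconditional" forward pass of the same model (run on a truncated or empty context), using the rule cfg * log p(token | prompt) + (1 - cfg) * log p(token | unconditional context). A minimal, self-contained sketch of that combination rule follows; the function name and variables are illustrative, not taken from the gist.

import torch
import torch.nn.functional as F

def cfg_combine(cond_logits: torch.Tensor, uncond_logits: torch.Tensor, cfg: float) -> torch.Tensor:
    """Blend conditional and unconditional next-token logits with guidance strength cfg.

    cfg == 1 reproduces ordinary sampling; cfg > 1 pushes the distribution further
    toward tokens favored by the prompt-conditioned pass.
    """
    cond = F.log_softmax(cond_logits, dim=-1)
    uncond = F.log_softmax(uncond_logits, dim=-1)
    return cfg * cond + (1 - cfg) * uncond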
#!/bin/bash
#SBATCH --job-name="eval"
#SBATCH --partition=a100-cu117
#SBATCH --mem-per-cpu=16GB # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 # One task per GPU
#SBATCH --cpus-per-task=6 # Number of cores per tasks
#SBATCH --hint=nomultithread # We get physical cores not logical
#SBATCH --gres=gpu:8 # Number of gpus
#SBATCH --output=%x_%j.out # Set this dir where you want slurm outs to go
#SBATCH --error=%x_%j.out # Set this dir where you want slurm outs to go
#SBATCH --exclusive # Turn off node sharing
#SBATCH --account=elm
source /fsx/home-honglu/miniconda3/bin/activate
conda activate cfg
NAME=Salesforce/codegen-350m-mono
TEMP=${2:-1.0}
#for CFG in 1 1.25 1.5 1.75 2 3 4 5 6 7
for CFG in $1
do
    echo $CFG $NAME $TEMP
    #torchrun --standalone --nnodes=1 --nproc-per-node=8 test_ds.py $CFG $NAME $TEMP
    deepspeed test_ds.py --cfg=$CFG --model-name=$NAME --temp=$TEMP
done
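The job script takes the CFG scale as its first positional argument and an optional sampling temperature as the second (defaulting to 1.0), so a run might be submitted as, e.g., `sbatch eval.sh 1.5 0.8` (the script filename here is a placeholder; use whatever the file is saved as).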
# ===== separate gist file: merge and post-process the per-rank sample files =====
import sys

from human_eval.data import stream_jsonl, write_jsonl

model = sys.argv[1]
#model_name = "Salesforce/codegen-350m-mono"
model_name = f"Salesforce/codegen-{model}-mono"


def postprocess(completion: str) -> str:
    # Truncate at the end-of-text token, if present.
    if completion.find("<|endoftext|>") != -1:
        completion = completion[:completion.find("<|endoftext|>")]
    lines = completion.split("\n")
    # Keep everything up to and including the first "def" line ...
    first_def = [r.startswith("def") for r in lines].index(True)
    initial = "\n".join(lines[:first_def + 1])
    completion = "\n".join(lines[first_def + 1:])
    # ... then keep only the leading run of indented (non-empty) lines, i.e. the function body.
    b = [r.startswith(" ") or r.startswith("\t") for r in completion.split("\n") if r]
    if not all(b):
        cutoff = b.index(False)
    else:
        cutoff = len(b)
    r = "\n".join(completion.split("\n")[:cutoff])
    return initial + "\n" + r


data = []
#prefix = "samples_use_def"
prefix = "samples"
for local_rank in range(8):
    filename = f"{prefix}_{float(sys.argv[2])}_{model_name.split('/')[-1]}_{local_rank}.jsonl"
    for item in stream_jsonl(filename):
        item["completion"] = postprocess(item["completion"])
        data.append(item)
write_jsonl(f"{prefix}_{float(sys.argv[2])}_{model_name.split('/')[-1]}.jsonl", data)
# ===== separate gist file: HumanEval sampling with CFG (torch.distributed version, positional arguments) =====
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor
import os
import sys
import numpy as np
import torch
import tqdm
from transformers import LogitsWarper, LogitsProcessorList
from transformers.generation import LogitNormalization
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from human_eval.data import write_jsonl, read_problems

_sanity_check = True


class CFGLogits(LogitsWarper):
    """Logits processor that applies classifier-free guidance.

    The unconditional branch re-runs the same model starting from the last prompt token,
    caching its key/values so that each later step only needs a single-token forward pass.
    """

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            # Later steps: feed only the newest token and reuse the cached keys/values.
            assert self.prompt_len > 0
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                use_cache=True,
                past_key_values=self.past_kv,
                attention_mask=torch.ones_like(input_ids[:, -1:]),
            )
            global _sanity_check
            if _sanity_check:
                # One-off check that the cached forward pass matches a full forward pass
                # over the same (truncated) context.
                mo = self.model(
                    input_ids=input_ids[:, self.prompt_len - 1:],
                    attention_mask=torch.ones_like(input_ids[:, self.prompt_len - 1:]),
                )
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-3))
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First step: the unconditional branch sees only the last token of the prompt.
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                attention_mask=torch.ones_like(input_ids[:, -1:]),
                use_cache=True,
            )
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        # CFG: interpolate (cfg < 1) or extrapolate (cfg > 1) between unconditional
        # and conditional log-probabilities.
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def setup(rank, world_size):
    print(f"rank {rank} is being set up.")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, init_method='env://', rank=rank)
    print(f"rank {rank} setup finished")


def cleanup():
    torch.distributed.destroy_process_group()


def main():
    cfg = 1.0 if len(sys.argv) < 2 else float(sys.argv[1])
    model_name = "Salesforce/codegen-350m-mono" if len(sys.argv) < 3 else sys.argv[2]
    temp = 1.0 if len(sys.argv) < 4 else float(sys.argv[3])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    setup(local_rank, world_size)
    _dev = torch.device(f"cuda:{local_rank}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 50256  # <|endoftext|> for the CodeGen tokenizer
    l = 1000
    batch_size = 8

    def completion(prompt: str):
        cfg_logits = CFGLogits(cfg, model)
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
        outputs = model.generate(
            **inputs,
            max_new_tokens=l,
            do_sample=True,
            temperature=temp,
            #min_length=l,
            #repetition_penalty=1.2,
            pad_token_id=50256,
            logits_processor=LogitsProcessorList([cfg_logits]),
        )
        codes = tokenizer.batch_decode(outputs)
        for code in codes:
            yield code

    test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
    print(f"----------test on rank {local_rank}---------")
    print(list(completion(test))[0])
    print()

    problems = read_problems()
    num_samples_per_task = 4
    samples = []
    with torch.inference_mode():
        for task_id in tqdm.tqdm(problems):
            for _ in range(num_samples_per_task):
                for code in completion(problems[task_id]["prompt"]):
                    samples.append(
                        dict(task_id=task_id, completion=code)
                    )
    write_jsonl(f"samples_{cfg}_{model_name.split('/')[-1]}_temp_{temp}_{local_rank}.jsonl", samples)
    cleanup()


if __name__ == "__main__":
    main()
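This version reads its settings positionally (CFG scale, model name, temperature) and relies on LOCAL_RANK / WORLD_SIZE from the launcher, matching the commented-out line in the job script above, e.g. `torchrun --standalone --nnodes=1 --nproc-per-node=8 test_ds.py 1.5 Salesforce/codegen-350m-mono 0.8`. Each rank writes its own `samples_{cfg}_{model}_temp_{temp}_{rank}.jsonl` file.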
# ===== separate gist file: HumanEval sampling with CFG (DeepSpeed inference version; test_ds.py in the job script) =====
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor
import argparse
import os
import sys
import numpy as np
import torch
import tqdm
import deepspeed
from transformers import LogitsWarper, LogitsProcessorList
from transformers.generation import LogitNormalization
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from human_eval.data import write_jsonl, read_problems

_sanity_check = True


class CFGLogits(LogitsWarper):
    """Same CFG logits processor as in the torch.distributed script above."""

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                use_cache=True,
                past_key_values=self.past_kv,
                attention_mask=torch.ones_like(input_ids[:, -1:]),
            )
            global _sanity_check
            if _sanity_check:
                mo = self.model(
                    input_ids=input_ids[:, self.prompt_len - 1:],
                    attention_mask=torch.ones_like(input_ids[:, self.prompt_len - 1:]),
                )
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-3))
                print("Sanity test passed")
                _sanity_check = False
        else:
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                attention_mask=torch.ones_like(input_ids[:, -1:]),
                use_cache=True,
            )
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def main():
    parser = argparse.ArgumentParser(description='CFG')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('--cfg', type=float, default=1.0)
    parser.add_argument('--model-name', type=str, default='Salesforce/codegen-350m-mono')
    parser.add_argument('--temp', type=float, default=1.0)
    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    cmd_args = parser.parse_args()
    """
    cfg = 1.0 if len(sys.argv) < 2 else float(sys.argv[1])
    model_name = "Salesforce/codegen-350m-mono" if len(sys.argv) < 3 else sys.argv[2]
    temp = 1.0 if len(sys.argv) < 4 else float(sys.argv[3])
    """
    cfg = cmd_args.cfg
    model_name = cmd_args.model_name
    temp = cmd_args.temp
    #local_rank = int(os.environ["LOCAL_RANK"])
    local_rank = cmd_args.local_rank
    world_size = int(os.environ["WORLD_SIZE"])
    #world_size = 1
    print(local_rank, world_size)
    _dev = torch.device(f"cuda:{local_rank}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
    # Wrap the model for DeepSpeed inference (kernel injection, one shard per rank).
    model = deepspeed.init_inference(model,
                                     mp_size=world_size,
                                     dtype=torch.float,
                                     replace_with_kernel_inject=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 50256  # <|endoftext|> for the CodeGen tokenizer
    l = 500
    batch_size = 8

    def completion(prompt: str):
        cfg_logits = CFGLogits(cfg, model)
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
        outputs = model.generate(
            **inputs,
            max_new_tokens=l,
            do_sample=True,
            temperature=temp,
            #min_length=l,
            #repetition_penalty=1.2,
            pad_token_id=50256,
            logits_processor=LogitsProcessorList([cfg_logits]),
        )
        codes = tokenizer.batch_decode(outputs)
        for code in codes:
            yield code

    test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
    print(f"----------test on rank {local_rank}---------")
    print(list(completion(test))[0])
    print()

    problems = read_problems()
    num_samples_per_task = 4
    samples = []
    with torch.inference_mode():
        for task_id in tqdm.tqdm(problems):
            for _ in range(num_samples_per_task):
                for code in completion(problems[task_id]["prompt"]):
                    samples.append(
                        dict(task_id=task_id, completion=code)
                    )
    write_jsonl(f"samples_{cfg}_{model_name.split('/')[-1]}_temp_{temp}_{local_rank}.jsonl", samples)


if __name__ == "__main__":
    main()
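This is the variant launched by the job script's uncommented line, e.g. `deepspeed test_ds.py --cfg=1.5 --model-name=Salesforce/codegen-350m-mono --temp=0.8`; the DeepSpeed launcher supplies `--local_rank` and the WORLD_SIZE environment variable.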
# ===== separate gist file: HumanEval sampling with CFG, using only the "def" line as the unconditional prompt =====
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor
import os
import sys
import numpy as np
import torch
import tqdm
from transformers import LogitsWarper, LogitsProcessorList
from transformers.generation import LogitNormalization
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from human_eval.data import write_jsonl, read_problems

_sanity_check = True


class CFGLogits(LogitsWarper):
    """CFG logits processor whose unconditional context is just the prompt's "def ..." line."""

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                use_cache=True,
                past_key_values=self.past_kv,
                attention_mask=torch.ones_like(input_ids[:, -1:]),
            )
            global _sanity_check
            if _sanity_check:
                # One-off check: recompute the unconditional branch without the KV cache.
                full_prompt = {"input_ids": torch.cat([self.preamble["input_ids"], input_ids[:, -1:]], dim=1)}
                full_prompt["attention_mask"] = torch.ones_like(full_prompt["input_ids"])
                mo = self.model(**full_prompt)
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-4))
                print(f"Def Line: {self.def_line}")
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First step: extract the function signature line from the prompt and use it
            # (alone) as the unconditional context.
            original = tokenizer.decode(input_ids[0])
            lines = original.split("\n")
            self.def_line = [line for line in lines if line.startswith("def")][0]
            self.preamble = tokenizer([self.def_line] * input_ids.size(0), return_tensors="pt").to(_dev)
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(**self.preamble)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def setup(rank, world_size):
    print(f"rank {rank} is being set up.")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, init_method='env://', rank=rank)
    print(f"rank {rank} setup finished")


def cleanup():
    torch.distributed.destroy_process_group()


local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
setup(local_rank, world_size)
_dev = torch.device(f"cuda:{local_rank}")
model_name = sys.argv[2]
model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 50256  # <|endoftext|> for the CodeGen tokenizer
l = 1000
batch_size = 25


def completion(prompt: str):
    cfg_logits = CFGLogits(float(sys.argv[1]), model)
    inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
    outputs = model.generate(
        **inputs,
        max_new_tokens=l,
        do_sample=True,
        temperature=1.0,
        #min_length=l,
        #repetition_penalty=1.2,
        pad_token_id=50256,
        logits_processor=LogitsProcessorList([cfg_logits]),
    )
    codes = tokenizer.batch_decode(outputs)
    for code in codes:
        yield code


test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
print(f"----------test on rank {local_rank}---------")
print(list(completion(test))[0])
print()

problems = read_problems()
num_samples_per_task = 1
samples = []
with torch.inference_mode():
    for task_id in tqdm.tqdm(problems):
        for code in completion(problems[task_id]["prompt"]):
            samples.append(
                dict(task_id=task_id, completion=code)
            )
write_jsonl(f"samples_use_def_{float(sys.argv[1])}_{model_name.split('/')[-1]}_{local_rank}.jsonl", samples)
cleanup()
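The only substantive change from the earlier torch.distributed script is the unconditional context: instead of restarting from the last prompt token, this variant uses just the function signature line extracted from the HumanEval prompt, and it writes the `samples_use_def_*` files that the post-processing script's commented-out prefix expects. A tiny illustration of that extraction on the test prompt above (variable names are illustrative):

prompt = (
    "from typing import List\n\n\n"
    "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n"
    "    \"\"\" Check if ... \"\"\"\n"
)
def_line = [line for line in prompt.split("\n") if line.startswith("def")][0]
print(def_line)
# -> def has_close_elements(numbers: List[float], threshold: float) -> bool: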
# ===== separate gist file: Streamlit demo comparing generations across CFG scales =====
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsWarper, LogitsProcessorList
import torch
import torch.nn.functional as F

_MODEL_REGISTRY = {
    "GPT-2": "gpt2",
    "RedPajama-INCITE-Instruct": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
    "GPT4ALL-J": "nomic-ai/gpt4all-j",
    "Pythia-410M": "EleutherAI/pythia-410m-v0",
    "CodeGen-350M": "Salesforce/codegen-350M-mono",
}
_CFGS = [1.0, 1.25, 1.5, 1.75, 2.0]
models = {}
tokenizers = {}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CFGLogits(LogitsWarper):
    """CFG logits processor with a user-supplied unconditional prompt and one guidance
    strength per row of the batch."""

    def __init__(self, cfg, model, uncond=None):
        global device
        # One CFG value per batch row, broadcast over the vocabulary dimension.
        self.cfg = torch.tensor(cfg, device=device)[:, None]
        self.model = model
        self.uncond = uncond
        self.reset()

    def reset(self):
        self.past_kv = None

    def __call__(self, input_ids, scores):
        #if self.cfg == 1:
        #    return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            model_output = self.model(input_ids=input_ids[:, -1:], use_cache=True, past_key_values=self.past_kv, attention_mask=torch.ones_like(input_ids[:, -1:]))
        else:
            # First step: run the user-supplied unconditional prompt.
            model_output = self.model(**self.uncond, use_cache=True)
            #model_output = self.model(input_ids=input_ids[:, -1:], attention_mask=torch.ones_like(input_ids[:, -1:]), use_cache=True)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def generate():
    model_label = st.session_state.model
    cond = st.session_state.cond
    uncond = st.session_state.uncond
    max_len = st.session_state.max_len
    temp = st.session_state.temperature
    global models, tokenizers
    if model_label not in models:
        if model_label == "GPT4ALL-J":
            model = AutoModelForCausalLM.from_pretrained("nomic-ai/gpt4all-j", revision="v1.3-groovy").half().to(device)
        else:
            model = AutoModelForCausalLM.from_pretrained(_MODEL_REGISTRY[model_label]).half().to(device)
        tokenizer = AutoTokenizer.from_pretrained(_MODEL_REGISTRY[model_label])
        models[model_label] = model
        tokenizers[model_label] = tokenizer
    tokenizer = tokenizers[model_label]
    model = models[model_label]
    # The prompt is repeated once per CFG value; all continuations are generated in one batch.
    uncond_input = tokenizer([uncond or "<|endoftext|>"] * len(_CFGS), return_tensors='pt').to(device)
    input_ids = tokenizer([cond] * len(_CFGS), return_tensors='pt').to(device)
    logits_processor = LogitsProcessorList([CFGLogits(_CFGS, model, uncond=uncond_input)])
    with torch.no_grad():
        output_ids = model.generate(input_ids["input_ids"], do_sample=True, temperature=temp, max_length=max_len, logits_processor=logits_processor)
    output = tokenizer.batch_decode(output_ids)
    for i, cfg in enumerate(_CFGS):
        st.session_state[f"cfg_{cfg}"] = output[i]


def main():
    st.set_page_config(layout="wide")
    st.title("Text Generation with different CFGs")
    col1, col2 = st.columns([1, 3])
    with col1:
        model = st.radio("Model", list(_MODEL_REGISTRY.keys()), key='model')
        with st.form(key='prompts'):
            max_len = st.number_input("Max Length", min_value=1, max_value=1000, value=100, step=1, key='max_len')
            temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.9, step=0.01, key='temperature')
            cond = st.text_area(label='conditional prompt', key='cond')
            uncond = st.text_area(label='unconditional prompt', key='uncond')
            submit_button = st.form_submit_button(label='Submit', on_click=generate)
    with col2:
        for i in _CFGS:
            st.text_area(label=f"CFG = {i}", key=f"cfg_{i}", height=200)


if __name__ == "__main__":
    main()
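The demo is started with the usual Streamlit entry point, e.g. `streamlit run app.py` (the filename is a placeholder). It generates one continuation of the same conditional prompt for each value in `_CFGS` in a single batch, using the text entered in the "unconditional prompt" box (or `<|endoftext|>` when left empty) as the unconditional context, and shows the outputs side by side.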