@honglu2875 · Last active June 12, 2023
The scripts used in HumanEval evaluation with CFG
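All of the scripts below apply the same idea at every decoding step: classifier-free guidance (CFG) blends the model's prompt-conditioned next-token log-probabilities with those of an "unconditional" forward pass of the same model (run on a truncated or empty context), using the rule cfg * log p(token | prompt) + (1 - cfg) * log p(token | unconditional context). A minimal, self-contained sketch of that combination rule follows; the function name and variables are illustrative, not taken from the gist.

import torch
import torch.nn.functional as F

def cfg_combine(cond_logits: torch.Tensor, uncond_logits: torch.Tensor, cfg: float) -> torch.Tensor:
    """Blend conditional and unconditional next-token logits with guidance strength cfg.

    cfg == 1 reproduces ordinary sampling; cfg > 1 pushes the distribution further
    toward tokens favored by the prompt-conditioned pass.
    """
    cond = F.log_softmax(cond_logits, dim=-1)
    uncond = F.log_softmax(uncond_logits, dim=-1)
    return cfg * cond + (1 - cfg) * uncond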
#!/bin/bash
#SBATCH --job-name="eval"
#SBATCH --partition=a100-cu117
#SBATCH --mem-per-cpu=16GB # Amount of CPU memory
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8 # One task per GPU
#SBATCH --cpus-per-task=6 # Number of cores per tasks
#SBATCH --hint=nomultithread # We get physical cores not logical
#SBATCH --gres=gpu:8 # Number of gpus
#SBATCH --output=%x_%j.out # Set this dir where you want slurm outs to go
#SBATCH --error=%x_%j.out # Set this dir where you want slurm outs to go
#SBATCH --exclusive # Turn off node sharing
#SBATCH --account=elm
source /fsx/home-honglu/miniconda3/bin/activate
conda activate cfg
NAME=Salesforce/codegen-350m-mono
TEMP=${2:-1.0}
#for CFG in 1 1.25 1.5 1.75 2 3 4 5 6 7
for CFG in $1
do
    echo $CFG $NAME $TEMP
    #torchrun --standalone --nnodes=1 --nproc-per-node=8 test_ds.py $CFG $NAME $TEMP
    deepspeed test_ds.py --cfg=$CFG --model-name=$NAME --temp=$TEMP
done
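The job script takes the CFG scale as its first positional argument and an optional sampling temperature as the second (defaulting to 1.0), so a run might be submitted as, e.g., `sbatch eval.sh 1.5 0.8` (the script filename here is a placeholder; use whatever the file is saved as).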
# ===== separate gist file: merge and post-process the per-rank sample files =====
import sys

from human_eval.data import stream_jsonl, write_jsonl

model = sys.argv[1]
#model_name = "Salesforce/codegen-350m-mono"
model_name = f"Salesforce/codegen-{model}-mono"


def postprocess(completion: str) -> str:
    # Truncate at the end-of-text token, if present.
    if completion.find("<|endoftext|>") != -1:
        completion = completion[:completion.find("<|endoftext|>")]
    lines = completion.split("\n")
    # Keep everything up to and including the first "def" line ...
    first_def = [r.startswith("def") for r in lines].index(True)
    initial = "\n".join(lines[:first_def + 1])
    completion = "\n".join(lines[first_def + 1:])
    # ... then keep only the leading run of indented (non-empty) lines, i.e. the function body.
    b = [r.startswith(" ") or r.startswith("\t") for r in completion.split("\n") if r]
    if not all(b):
        cutoff = b.index(False)
    else:
        cutoff = len(b)
    r = "\n".join(completion.split("\n")[:cutoff])
    return initial + "\n" + r


data = []
#prefix = "samples_use_def"
prefix = "samples"
for local_rank in range(8):
    filename = f"{prefix}_{float(sys.argv[2])}_{model_name.split('/')[-1]}_{local_rank}.jsonl"
    for item in stream_jsonl(filename):
        item["completion"] = postprocess(item["completion"])
        data.append(item)
write_jsonl(f"{prefix}_{float(sys.argv[2])}_{model_name.split('/')[-1]}.jsonl", data)
# ===== separate gist file: HumanEval sampling with CFG (torch.distributed version, positional arguments) =====
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor
import os
import sys
import numpy as np
import torch
import tqdm
from transformers import LogitsWarper, LogitsProcessorList
from transformers.generation import LogitNormalization
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from human_eval.data import write_jsonl, read_problems

_sanity_check = True


class CFGLogits(LogitsWarper):
    """Logits processor that applies classifier-free guidance.

    The unconditional branch re-runs the same model starting from the last prompt token,
    caching its key/values so that each later step only needs a single-token forward pass.
    """

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            # Later steps: feed only the newest token and reuse the cached keys/values.
            assert self.prompt_len > 0
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                use_cache=True,
                past_key_values=self.past_kv,
                attention_mask=torch.ones_like(input_ids[:, -1:]),
            )
            global _sanity_check
            if _sanity_check:
                # One-off check that the cached forward pass matches a full forward pass
                # over the same (truncated) context.
                mo = self.model(
                    input_ids=input_ids[:, self.prompt_len - 1:],
                    attention_mask=torch.ones_like(input_ids[:, self.prompt_len - 1:]),
                )
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-3))
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First step: the unconditional branch sees only the last token of the prompt.
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                attention_mask=torch.ones_like(input_ids[:, -1:]),
                use_cache=True,
            )
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        # CFG: interpolate (cfg < 1) or extrapolate (cfg > 1) between unconditional
        # and conditional log-probabilities.
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def setup(rank, world_size):
    print(f"rank {rank} is being set up.")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, init_method='env://', rank=rank)
    print(f"rank {rank} setup finished")


def cleanup():
    torch.distributed.destroy_process_group()


def main():
    cfg = 1.0 if len(sys.argv) < 2 else float(sys.argv[1])
    model_name = "Salesforce/codegen-350m-mono" if len(sys.argv) < 3 else sys.argv[2]
    temp = 1.0 if len(sys.argv) < 4 else float(sys.argv[3])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    setup(local_rank, world_size)
    _dev = torch.device(f"cuda:{local_rank}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 50256  # <|endoftext|> for the CodeGen tokenizer
    l = 1000
    batch_size = 8

    def completion(prompt: str):
        cfg_logits = CFGLogits(cfg, model)
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
        outputs = model.generate(
            **inputs,
            max_new_tokens=l,
            do_sample=True,
            temperature=temp,
            #min_length=l,
            #repetition_penalty=1.2,
            pad_token_id=50256,
            logits_processor=LogitsProcessorList([cfg_logits]),
        )
        codes = tokenizer.batch_decode(outputs)
        for code in codes:
            yield code

    test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
    print(f"----------test on rank {local_rank}---------")
    print(list(completion(test))[0])
    print()

    problems = read_problems()
    num_samples_per_task = 4
    samples = []
    with torch.inference_mode():
        for task_id in tqdm.tqdm(problems):
            for _ in range(num_samples_per_task):
                for code in completion(problems[task_id]["prompt"]):
                    samples.append(
                        dict(task_id=task_id, completion=code)
                    )
    write_jsonl(f"samples_{cfg}_{model_name.split('/')[-1]}_temp_{temp}_{local_rank}.jsonl", samples)
    cleanup()


if __name__ == "__main__":
    main()
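This version reads its settings positionally (CFG scale, model name, temperature) and relies on LOCAL_RANK / WORLD_SIZE from the launcher, matching the commented-out line in the job script above, e.g. `torchrun --standalone --nnodes=1 --nproc-per-node=8 test_ds.py 1.5 Salesforce/codegen-350m-mono 0.8`. Each rank writes its own `samples_{cfg}_{model}_temp_{temp}_{rank}.jsonl` file.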
# ===== separate gist file: HumanEval sampling with CFG (DeepSpeed inference version; test_ds.py in the job script) =====
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor
import argparse
import os
import sys
import numpy as np
import torch
import tqdm
import deepspeed
from transformers import LogitsWarper, LogitsProcessorList
from transformers.generation import LogitNormalization
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from human_eval.data import write_jsonl, read_problems

_sanity_check = True


class CFGLogits(LogitsWarper):
    """Same CFG logits processor as in the torch.distributed script above."""

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                use_cache=True,
                past_key_values=self.past_kv,
                attention_mask=torch.ones_like(input_ids[:, -1:]),
            )
            global _sanity_check
            if _sanity_check:
                mo = self.model(
                    input_ids=input_ids[:, self.prompt_len - 1:],
                    attention_mask=torch.ones_like(input_ids[:, self.prompt_len - 1:]),
                )
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-3))
                print("Sanity test passed")
                _sanity_check = False
        else:
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                attention_mask=torch.ones_like(input_ids[:, -1:]),
                use_cache=True,
            )
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def main():
    parser = argparse.ArgumentParser(description='CFG')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed from distributed launcher')
    parser.add_argument('--cfg', type=float, default=1.0)
    parser.add_argument('--model-name', type=str, default='Salesforce/codegen-350m-mono')
    parser.add_argument('--temp', type=float, default=1.0)
    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)
    cmd_args = parser.parse_args()
    """
    cfg = 1.0 if len(sys.argv) < 2 else float(sys.argv[1])
    model_name = "Salesforce/codegen-350m-mono" if len(sys.argv) < 3 else sys.argv[2]
    temp = 1.0 if len(sys.argv) < 4 else float(sys.argv[3])
    """
    cfg = cmd_args.cfg
    model_name = cmd_args.model_name
    temp = cmd_args.temp
    #local_rank = int(os.environ["LOCAL_RANK"])
    local_rank = cmd_args.local_rank
    world_size = int(os.environ["WORLD_SIZE"])
    #world_size = 1
    print(local_rank, world_size)
    _dev = torch.device(f"cuda:{local_rank}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
    # Wrap the model for DeepSpeed inference (kernel injection, one shard per rank).
    model = deepspeed.init_inference(model,
                                     mp_size=world_size,
                                     dtype=torch.float,
                                     replace_with_kernel_inject=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token_id = 50256  # <|endoftext|> for the CodeGen tokenizer
    l = 500
    batch_size = 8

    def completion(prompt: str):
        cfg_logits = CFGLogits(cfg, model)
        inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
        outputs = model.generate(
            **inputs,
            max_new_tokens=l,
            do_sample=True,
            temperature=temp,
            #min_length=l,
            #repetition_penalty=1.2,
            pad_token_id=50256,
            logits_processor=LogitsProcessorList([cfg_logits]),
        )
        codes = tokenizer.batch_decode(outputs)
        for code in codes:
            yield code

    test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
    print(f"----------test on rank {local_rank}---------")
    print(list(completion(test))[0])
    print()

    problems = read_problems()
    num_samples_per_task = 4
    samples = []
    with torch.inference_mode():
        for task_id in tqdm.tqdm(problems):
            for _ in range(num_samples_per_task):
                for code in completion(problems[task_id]["prompt"]):
                    samples.append(
                        dict(task_id=task_id, completion=code)
                    )
    write_jsonl(f"samples_{cfg}_{model_name.split('/')[-1]}_temp_{temp}_{local_rank}.jsonl", samples)


if __name__ == "__main__":
    main()
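This is the variant launched by the job script's uncommented line, e.g. `deepspeed test_ds.py --cfg=1.5 --model-name=Salesforce/codegen-350m-mono --temp=0.8`; the DeepSpeed launcher supplies `--local_rank` and the WORLD_SIZE environment variable.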
# ===== separate gist file: HumanEval sampling with CFG, using only the "def" line as the unconditional prompt =====
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor
import os
import sys
import numpy as np
import torch
import tqdm
from transformers import LogitsWarper, LogitsProcessorList
from transformers.generation import LogitNormalization
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel
from human_eval.data import write_jsonl, read_problems

_sanity_check = True


class CFGLogits(LogitsWarper):
    """CFG logits processor whose unconditional context is just the prompt's "def ..." line."""

    def __init__(self, cfg, model):
        self.cfg = cfg
        self.model = model
        self.reset()

    def reset(self):
        self.prompt_len = -1
        self.past_kv = None

    def __call__(self, input_ids, scores):
        if self.cfg == 1:
            return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            assert self.prompt_len > 0
            model_output = self.model(
                input_ids=input_ids[:, -1:],
                use_cache=True,
                past_key_values=self.past_kv,
                attention_mask=torch.ones_like(input_ids[:, -1:]),
            )
            global _sanity_check
            if _sanity_check:
                # One-off check: recompute the unconditional branch without the KV cache.
                full_prompt = {"input_ids": torch.cat([self.preamble["input_ids"], input_ids[:, -1:]], dim=1)}
                full_prompt["attention_mask"] = torch.ones_like(full_prompt["input_ids"])
                mo = self.model(**full_prompt)
                assert torch.all(torch.isclose(mo[0][:, -1:], model_output[0], atol=1e-4))
                print(f"Def Line: {self.def_line}")
                print("Sanity test passed")
                _sanity_check = False
        else:
            # First step: extract the function signature line from the prompt and use it
            # (alone) as the unconditional context.
            original = tokenizer.decode(input_ids[0])
            lines = original.split("\n")
            self.def_line = [line for line in lines if line.startswith("def")][0]
            self.preamble = tokenizer([self.def_line] * input_ids.size(0), return_tensors="pt").to(_dev)
            self.prompt_len = input_ids.shape[1]
            model_output = self.model(**self.preamble)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def setup(rank, world_size):
    print(f"rank {rank} is being set up.")
    torch.distributed.init_process_group(backend='nccl', world_size=world_size, init_method='env://', rank=rank)
    print(f"rank {rank} setup finished")


def cleanup():
    torch.distributed.destroy_process_group()


local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
setup(local_rank, world_size)
_dev = torch.device(f"cuda:{local_rank}")
model_name = sys.argv[2]
model = AutoModelForCausalLM.from_pretrained(model_name).to(_dev)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 50256  # <|endoftext|> for the CodeGen tokenizer
l = 1000
batch_size = 25


def completion(prompt: str):
    cfg_logits = CFGLogits(float(sys.argv[1]), model)
    inputs = tokenizer([prompt] * batch_size, return_tensors="pt").to(_dev)
    outputs = model.generate(
        **inputs,
        max_new_tokens=l,
        do_sample=True,
        temperature=1.0,
        #min_length=l,
        #repetition_penalty=1.2,
        pad_token_id=50256,
        logits_processor=LogitsProcessorList([cfg_logits]),
    )
    codes = tokenizer.batch_decode(outputs)
    for code in codes:
        yield code


test = "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n"
print(f"----------test on rank {local_rank}---------")
print(list(completion(test))[0])
print()

problems = read_problems()
num_samples_per_task = 1
samples = []
with torch.inference_mode():
    for task_id in tqdm.tqdm(problems):
        for code in completion(problems[task_id]["prompt"]):
            samples.append(
                dict(task_id=task_id, completion=code)
            )
write_jsonl(f"samples_use_def_{float(sys.argv[1])}_{model_name.split('/')[-1]}_{local_rank}.jsonl", samples)
cleanup()
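The only substantive change from the earlier torch.distributed script is the unconditional context: instead of restarting from the last prompt token, this variant uses just the function signature line extracted from the HumanEval prompt, and it writes the `samples_use_def_*` files that the post-processing script's commented-out prefix expects. A tiny illustration of that extraction on the test prompt above (variable names are illustrative):

prompt = (
    "from typing import List\n\n\n"
    "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n"
    "    \"\"\" Check if ... \"\"\"\n"
)
def_line = [line for line in prompt.split("\n") if line.startswith("def")][0]
print(def_line)
# -> def has_close_elements(numbers: List[float], threshold: float) -> bool: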
# ===== separate gist file: Streamlit demo comparing generations across CFG scales =====
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsWarper, LogitsProcessorList
import torch
import torch.nn.functional as F

_MODEL_REGISTRY = {
    "GPT-2": "gpt2",
    "RedPajama-INCITE-Instruct": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
    "GPT4ALL-J": "nomic-ai/gpt4all-j",
    "Pythia-410M": "EleutherAI/pythia-410m-v0",
    "CodeGen-350M": "Salesforce/codegen-350M-mono",
}
_CFGS = [1.0, 1.25, 1.5, 1.75, 2.0]
models = {}
tokenizers = {}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CFGLogits(LogitsWarper):
    """CFG logits processor with a user-supplied unconditional prompt and one guidance
    strength per row of the batch."""

    def __init__(self, cfg, model, uncond=None):
        global device
        # One CFG value per batch row, broadcast over the vocabulary dimension.
        self.cfg = torch.tensor(cfg, device=device)[:, None]
        self.model = model
        self.uncond = uncond
        self.reset()

    def reset(self):
        self.past_kv = None

    def __call__(self, input_ids, scores):
        #if self.cfg == 1:
        #    return scores
        scores = F.log_softmax(scores, dim=-1)
        if self.past_kv is not None:
            model_output = self.model(input_ids=input_ids[:, -1:], use_cache=True, past_key_values=self.past_kv, attention_mask=torch.ones_like(input_ids[:, -1:]))
        else:
            # First step: run the user-supplied unconditional prompt.
            model_output = self.model(**self.uncond, use_cache=True)
            #model_output = self.model(input_ids=input_ids[:, -1:], attention_mask=torch.ones_like(input_ids[:, -1:]), use_cache=True)
        uncond_output, self.past_kv = model_output.logits, model_output.past_key_values
        unconditional_logits = F.log_softmax(uncond_output, dim=-1)[:, -1, :]
        scores = self.cfg * scores + (1 - self.cfg) * unconditional_logits
        return scores


def generate():
    model_label = st.session_state.model
    cond = st.session_state.cond
    uncond = st.session_state.uncond
    max_len = st.session_state.max_len
    temp = st.session_state.temperature
    global models, tokenizers
    if model_label not in models:
        if model_label == "GPT4ALL-J":
            model = AutoModelForCausalLM.from_pretrained("nomic-ai/gpt4all-j", revision="v1.3-groovy").half().to(device)
        else:
            model = AutoModelForCausalLM.from_pretrained(_MODEL_REGISTRY[model_label]).half().to(device)
        tokenizer = AutoTokenizer.from_pretrained(_MODEL_REGISTRY[model_label])
        models[model_label] = model
        tokenizers[model_label] = tokenizer
    tokenizer = tokenizers[model_label]
    model = models[model_label]
    # The prompt is repeated once per CFG value; all continuations are generated in one batch.
    uncond_input = tokenizer([uncond or "<|endoftext|>"] * len(_CFGS), return_tensors='pt').to(device)
    input_ids = tokenizer([cond] * len(_CFGS), return_tensors='pt').to(device)
    logits_processor = LogitsProcessorList([CFGLogits(_CFGS, model, uncond=uncond_input)])
    with torch.no_grad():
        output_ids = model.generate(input_ids["input_ids"], do_sample=True, temperature=temp, max_length=max_len, logits_processor=logits_processor)
    output = tokenizer.batch_decode(output_ids)
    for i, cfg in enumerate(_CFGS):
        st.session_state[f"cfg_{cfg}"] = output[i]


def main():
    st.set_page_config(layout="wide")
    st.title("Text Generation with different CFGs")
    col1, col2 = st.columns([1, 3])
    with col1:
        model = st.radio("Model", list(_MODEL_REGISTRY.keys()), key='model')
        with st.form(key='prompts'):
            max_len = st.number_input("Max Length", min_value=1, max_value=1000, value=100, step=1, key='max_len')
            temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.9, step=0.01, key='temperature')
            cond = st.text_area(label='conditional prompt', key='cond')
            uncond = st.text_area(label='unconditional prompt', key='uncond')
            submit_button = st.form_submit_button(label='Submit', on_click=generate)
    with col2:
        for i in _CFGS:
            st.text_area(label=f"CFG = {i}", key=f"cfg_{i}", height=200)


if __name__ == "__main__":
    main()
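The demo is started with the usual Streamlit entry point, e.g. `streamlit run app.py` (the filename is a placeholder). It generates one continuation of the same conditional prompt for each value in `_CFGS` in a single batch, using the text entered in the "unconditional prompt" box (or `<|endoftext|>` when left empty) as the unconditional context, and shows the outputs side by side.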