@harry-stark · Created January 19, 2023 16:33
Convert a Salesforce CodeGen checkpoint to GPT-J / FasterTransformer format for Triton:

```
python convertFt.py --output_dir= --n_gpus=8
```
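This gist appears to bundle three files, in order: `codegen_gptj_converter.py` (CodeGen → GPT-J weight conversion), the driver script invoked above (presumably `convertFt.py`), and `gptj_ftconverter.py` (GPT-J → FasterTransformer weight splitting). First, `codegen_gptj_converter.py`: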
import torch
from transformers import GPTJForCausalLM, GPTJConfig
from transformers import CodeGenTokenizer, CodeGenForCausalLM
def cg2gptj(code_model):
    cg_model = CodeGenForCausalLM.from_pretrained(code_model, torch_dtype="auto")
    cg_config = cg_model.config

    # Create empty GPTJ model
    print('Creating empty GPTJ model')
    config = GPTJConfig(
        vocab_size=cg_config.vocab_size,
        n_positions=cg_config.n_positions,
        n_embd=cg_config.n_embd,
        n_layer=cg_config.n_layer,
        n_head=cg_config.n_head,
        rotary_dim=cg_config.rotary_dim,
        n_inner=cg_config.n_inner,
        activation_function=cg_config.activation_function,
        resid_pdrop=cg_config.resid_pdrop,
        embd_pdrop=cg_config.embd_pdrop,
        attn_pdrop=cg_config.attn_pdrop,
        layer_norm_epsilon=cg_config.layer_norm_epsilon,
        initializer_range=cg_config.initializer_range,
        scale_attn_weights=cg_config.scale_attn_weights,
        use_cache=cg_config.use_cache,
        bos_token_id=cg_config.bos_token_id,
        eos_token_id=cg_config.eos_token_id,
        torch_dtype=cg_config.torch_dtype,
    )
    # Fix tokenizer type
    config.tokenizer_class = 'CodeGenTokenizer'

    gptj_model = GPTJForCausalLM(config)
    embed_dim = config.n_embd

    def replace(model, weights, name):
        model.state_dict()[name].copy_(weights.detach())

    def replace_by_name(dest_model, src_model, old_name, new_name):
        assert old_name in src_model.state_dict()
        assert new_name in dest_model.state_dict()
        replace(dest_model, src_model.state_dict()[old_name], new_name)

    print('Converting...')
    # Copy weights from CodeGen model
    with torch.no_grad():
        cg_model.eval()
        gptj_model.eval()

        for name, param in cg_model.named_parameters():
            # print(f'Converting {name}')
            # Handle the qkv weights separately because we need to split them
            if 'qkv_proj' in name:
                qkv_proj = param.detach().clone()
                mp_num = 4  # number of cores on their TPU I guess?
                local_dim = embed_dim // mp_num
                # GPT-J and CodeGen slice up the qkv projection slightly differently.
                # After a great deal of pain, I figured out that this permutation on
                # the weights of the qkv_proj fixes it.
                base_permutation = [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]
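                # NB (added): with mp_num = 4 the 3*embed_dim rows of qkv_proj form 12
                # blocks of local_dim rows, laid out per shard as (q, v, k); the
                # permutation regroups them into all-q, all-v, all-k so the split below works.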
                permutation = torch.cat([torch.arange(i * local_dim, (i + 1) * local_dim) for i in base_permutation])
                # NB: we permute the *rows* here because the computation is xA.T
                new_qkv_proj = qkv_proj[permutation, :]
                # NB: the name QKV is misleading here; they are actually stored in
                # the order QVK
                query, value, key = torch.split(new_qkv_proj, embed_dim, dim=0)

                replace(gptj_model, query, name.replace('qkv_proj', 'q_proj'))
                replace(gptj_model, key, name.replace('qkv_proj', 'k_proj'))
                replace(gptj_model, value, name.replace('qkv_proj', 'v_proj'))
            else:
                replace_by_name(gptj_model, cg_model, name, name)

    return gptj_model
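A quick way to sanity-check the conversion (not part of the gist; the module name is taken from the import below, the model name from the script's default) is to run the original CodeGen model and the converted GPT-J copy on the same prompt and compare logits:

```python
import torch
from transformers import AutoTokenizer, CodeGenForCausalLM
from codegen_gptj_converter import cg2gptj

cg_name = "Salesforce/codegen-350M-mono"
tok = AutoTokenizer.from_pretrained(cg_name)

# Load both models in fp32 on CPU for an apples-to-apples comparison
cg_model = CodeGenForCausalLM.from_pretrained(cg_name, torch_dtype="auto").float().eval()
gptj_model = cg2gptj(cg_name).float().eval()

inputs = tok("def hello_world():", return_tensors="pt")
with torch.no_grad():
    cg_logits = cg_model(**inputs).logits
    gptj_logits = gptj_model(**inputs).logits

# If the qkv permutation is correct, the two should agree up to floating-point noise
print(torch.allclose(cg_logits, gptj_logits, atol=1e-4))
```

Next, the driver script (presumably `convertFt.py`, per the command at the top), which builds the Triton `config.pbtxt` from a template and calls both converters: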
import argparse
import os
from string import Template
from transformers import GPTJConfig, AutoTokenizer
import torch
from codegen_gptj_converter import cg2gptj
from gptj_ftconverter import split_and_convert_main
def round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder
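# e.g. round_up(50257, 1024) == 51200; used below to pad the tokenizer vocab size
# up to a multiple of 1024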
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')
# Generate a config file for a CodeGen model for use with Triton
parser = argparse.ArgumentParser('Convert Salesforce CodeGen model to GPT-J FT')
parser.add_argument('--code_model', default='Salesforce/codegen-350M-mono', help='which Salesforce model to convert')
parser.add_argument('--template', default=CONFIG_TEMPLATE_PATH, help='Path to the config template')
parser.add_argument('--tokenizer', default='Salesforce/codegen-16B-multi', help='Name or path to the tokenizer')
parser.add_argument('--output_dir', required=True, help='Where to store the converted model')
parser.add_argument('--n_gpus', '--num_gpus', help='Number of GPUs to use for inference', type=int, default=1)
parser.add_argument('--t_gpus','--train_gpus', help='Number of GPUs used for training', type=int, default=1)
parser.add_argument("--processes", "--p", type=int, help="How many processes to spawn for conversion (default: 4)", default=4)
parser.add_argument("--weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"], help="output weight data type")
args = parser.parse_args()
# Vars we need to fill in:
# name
# tensor_para_size
# max_seq_len
# is_half
# head_num
# size_per_head
# inter_size
# vocab_size
# start_id
# end_id
# decoder_layers
# name
# rotary_embedding
# checkpoint_path
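# Added illustration: the placeholders in config_template.pbtxt are presumably
# ${var}-style string.Template fields, filled from the params dict built below, e.g.
#   Template('name: "${name}"').substitute({'name': 'codegen-350M-mono'})
#   == 'name: "codegen-350M-mono"'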
# Global options
gptj_model = cg2gptj(args.code_model)
config = gptj_model.config
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
max_seq_len = config.n_positions
is_half = '1' if config.torch_dtype == torch.float16 else '0'
# Read in the template config file
with open(args.template, 'r') as f:
    template = Template(f.read())
#from code_model
model_name = args.code_model.split('/')[-1]
version = '1'
params = {}
params['tensor_para_size'] = args.n_gpus
params['name'] = model_name
params['max_seq_len'] = max_seq_len
params['is_half'] = is_half
params['head_num'] = config.n_head
params['size_per_head'] = config.n_embd // config.n_head
params['inter_size'] = 4*config.n_embd
# Vocab size gets rounded up to a multiple of 1024
params['vocab_size'] = round_up(tokenizer.vocab_size, 1024)
params['start_id'] = tokenizer.eos_token_id
params['end_id'] = tokenizer.eos_token_id
params['decoder_layers'] = config.n_layer
params['rotary_embedding'] = config.rotary_dim
# NOTE: this assumes that the model dir follows the format used by the other conversion scripts
model_dir = os.path.join(args.output_dir, f'{model_name}-{args.n_gpus}gpu')
os.makedirs(model_dir, exist_ok=True)
weights_path = os.path.join(model_dir, 'fastertransformer', f'{version}', f'{args.n_gpus}-gpu')
params['checkpoint_path'] = weights_path
triton_config = template.substitute(params)
assert '${' not in triton_config
# Make directory structure
os.makedirs(weights_path, exist_ok=True)
# Write config file
config_path = os.path.join(model_dir, 'fastertransformer', 'config.pbtxt')
with open(config_path, 'w') as f:
    f.write(triton_config)
print('==========================================================')
print(f'Created config file for {model_name}')
print(f' Config: {config_path}')
# Convert the model weights
# args: [gptj model] [weights path] [inference gpus] [training gpus] [weight dtype] [processes]
split_and_convert_main(gptj_model, weights_path, args.n_gpus, args.t_gpus, args.weight_data_type, args.processes)
print('==========================================================')
print(f'Converted weights for {model_name}')
print(f' Weights: {weights_path}')
print('==========================================================')
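For reference, given the paths constructed above and the weight files written by the converter below, the output directory should end up looking roughly like this (sketch for `--n_gpus=8` and the default 350M model):

```
<output_dir>/codegen-350M-mono-8gpu/
└── fastertransformer/
    ├── config.pbtxt
    └── 1/
        └── 8-gpu/
            ├── config.ini
            ├── model.wte.bin
            ├── model.final_layernorm.{weight,bias}.bin
            ├── model.lm_head.{weight,bias}.bin
            └── model.layers.<i>.*.bin   # per-layer (and per-rank) weight files
```

Finally, `gptj_ftconverter.py`, adapted from NVIDIA's FasterTransformer conversion code (see the license header below):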
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
# Modified by Brendan Dolan-Gavitt, 2022
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import configparser
import multiprocessing
import numpy as np
from pathlib import Path
import torch
import os
import sys
from transformers import GPTJForCausalLM
# dir_path = os.path.dirname(os.path.realpath(__file__))
# sys.path.append(dir_path + "/../../../..")
# sys.path.append(dir_path)
def get_weight_data_type(data_type):
    if data_type == "fp32":
        return np.float32
    elif data_type == "fp16":
        return np.float16
    else:
        assert False, f"Invalid weight data type {data_type}"
def split_and_convert_process_2(i, saved_dir, factor, key, val):
    if key.find("input_layernorm.weight") != -1 or key.find("input_layernorm.bias") != -1 or \
            key.find("attention.dense.bias") != -1 or key.find("post_attention_layernorm.weight") != -1 or \
            key.find("post_attention_layernorm.bias") != -1 or key.find("mlp.dense_4h_to_h.bias") != -1 or \
            key.find("final_layernorm.weight") != -1 or key.find("final_layernorm.bias") != -1:
        # shared weights, only need to convert the weights of rank 0
        if i == 0:
            saved_path = saved_dir + "/model." + key + ".bin"
            val.tofile(saved_path)
    elif key.find("attention.dense.weight") != -1 or key.find("mlp.dense_4h_to_h.weight") != -1:
        split_vals = np.split(val, factor, axis=0)
        for j in range(factor):
            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            split_vals[j].tofile(saved_path)
    elif key.find("mlp.dense_h_to_4h.weight") != -1 or key.find("mlp.dense_h_to_4h.bias") != -1:
        split_vals = np.split(val, factor, axis=-1)
        for j in range(factor):
            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            split_vals[j].tofile(saved_path)
    elif key.find("attention.query_key_value.weight") != -1:
        split_vals = np.split(val, factor, axis=-1)
        for j in range(factor):
            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            split_vals[j].tofile(saved_path)
    else:
        print("[ERROR] cannot find key '{}'".format(key))
def split_and_convert_main(gptjmodel, weights_path, n_gpu, t_gpu, weight_data_type, processes):
    saved_dir = weights_path
    t_gpu_num = t_gpu
    i_gpu_num = n_gpu
    print(f"t_gpu_num: {t_gpu_num}, i_gpu_num: {i_gpu_num}")
    assert i_gpu_num % t_gpu_num == 0

    factor = int(i_gpu_num / t_gpu_num)

    model = gptjmodel
    if weight_data_type == "fp16":
        model = model.half()

    try:
        config = configparser.ConfigParser()
        config["gpt"] = {}
        config["gpt"]["weights_path"] = saved_dir
        config["gpt"]["trained_gpu_num"] = f"{t_gpu}"
        config["gpt"]["inference_gpu_num"] = f"{n_gpu}"
        config["gpt"]["processes"] = f"{processes}"
        for k, v in vars(model.config).items():
            config["gpt"][k] = f"{v}"
        config["gpt"]["weight_data_type"] = weight_data_type
        with open((Path(saved_dir) / "config.ini").as_posix(), 'w') as configfile:
            config.write(configfile)
    except Exception as e:
        print("Failed to save the config in config.ini.")
        print(e)

    np_weight_data_type = get_weight_data_type(weight_data_type)
    huggingface_model_name_pattern = [
        "ln_1.bias",
        "ln_1.weight",
        "attn.q_proj.weight",
        "attn.out_proj.weight",
        "mlp.fc_in.bias",
        "mlp.fc_in.weight",
        "mlp.fc_out.bias",
        "mlp.fc_out.weight",
    ]
    ft_model_name_pattern = [
        "input_layernorm.bias",
        "input_layernorm.weight",
        "attention.query_key_value.weight",
        "attention.dense.weight",
        "mlp.dense_h_to_4h.bias",
        "mlp.dense_h_to_4h.weight",
        "mlp.dense_4h_to_h.bias",
        "mlp.dense_4h_to_h.weight",
    ]
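    # Example of the renaming performed below (added for clarity):
    #   transformer.h.0.ln_1.weight        -> layers.0.input_layernorm.weight
    #   transformer.h.0.mlp.fc_in.weight   -> layers.0.mlp.dense_h_to_4h.weight
    #   transformer.h.0.attn.q_proj.weight -> layers.0.attention.query_key_value.weight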
    # torch.multiprocessing.set_start_method("spawn")
    # with multiprocessing.Pool(processes) as pool:
    for name, param in model.named_parameters():
        if name.find("weight") == -1 and name.find("bias") == -1:
            continue
        print(name)
        if name == 'transformer.wte.weight':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.wte.bin")
        elif name == 'transformer.ln_f.bias':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.final_layernorm.bias.bin")
        elif name == 'transformer.ln_f.weight':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.final_layernorm.weight.bin")
        elif name == 'lm_head.weight':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.lm_head.weight.bin")
        elif name == 'lm_head.bias':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.lm_head.bias.bin")
        else:
            for i in range(len(huggingface_model_name_pattern)):
                if name.find(huggingface_model_name_pattern[i]) != -1:
                    # Special case for QKV weights
                    if name.find("attn.q_proj.weight") != -1:
                        layer = name.split('.')[2]
                        base_k = f'transformer.h.{layer}.'
                        w = model.state_dict()
                        QKV_w = torch.stack([
                            w[base_k + "attn.q_proj.weight"],
                            w[base_k + "attn.k_proj.weight"],
                            w[base_k + "attn.v_proj.weight"],
                        ])  # [qkv, n_heads * dim_head, latent_space]
                        QKV_w = QKV_w.permute(2, 0, 1)
                        weights = QKV_w.detach().cpu().numpy().astype(np_weight_data_type)
                    else:
                        weights = param.detach().cpu().numpy().astype(np_weight_data_type)

                    # Some weights need to be transposed
                    if name.find("mlp.fc_in.weight") != -1 or \
                            name.find("mlp.fc_out.weight") != -1 or \
                            name.find("attn.out_proj.weight") != -1:
                        weights = weights.T

                    new_name = name.replace("transformer.h.", "layers.").replace(huggingface_model_name_pattern[i], ft_model_name_pattern[i])
                    # NOTE: the multiprocessing pool above is disabled, so convert in-process
                    split_and_convert_process_2(0, saved_dir, factor, new_name, weights)