@harry-stark · Created January 19, 2023 16:33
Convert a Salesforce CodeGen checkpoint to GPT-J / FasterTransformer format for Triton:

```
python convertFt.py --output_dir= --n_gpus=8
```
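This gist appears to bundle three files, in order: `codegen_gptj_converter.py` (CodeGen → GPT-J weight conversion), the driver script invoked above (presumably `convertFt.py`), and `gptj_ftconverter.py` (GPT-J → FasterTransformer weight splitting). First, `codegen_gptj_converter.py`: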
import torch
from transformers import GPTJForCausalLM, GPTJConfig
from transformers import CodeGenTokenizer, CodeGenForCausalLM
def cg2gptj(code_model):
    cg_model = CodeGenForCausalLM.from_pretrained(code_model, torch_dtype="auto")
    cg_config = cg_model.config

    # Create empty GPTJ model
    print('Creating empty GPTJ model')
    config = GPTJConfig(
        vocab_size=cg_config.vocab_size,
        n_positions=cg_config.n_positions,
        n_embd=cg_config.n_embd,
        n_layer=cg_config.n_layer,
        n_head=cg_config.n_head,
        rotary_dim=cg_config.rotary_dim,
        n_inner=cg_config.n_inner,
        activation_function=cg_config.activation_function,
        resid_pdrop=cg_config.resid_pdrop,
        embd_pdrop=cg_config.embd_pdrop,
        attn_pdrop=cg_config.attn_pdrop,
        layer_norm_epsilon=cg_config.layer_norm_epsilon,
        initializer_range=cg_config.initializer_range,
        scale_attn_weights=cg_config.scale_attn_weights,
        use_cache=cg_config.use_cache,
        bos_token_id=cg_config.bos_token_id,
        eos_token_id=cg_config.eos_token_id,
        torch_dtype=cg_config.torch_dtype,
    )
    # Fix tokenizer type
    config.tokenizer_class = 'CodeGenTokenizer'

    gptj_model = GPTJForCausalLM(config)
    embed_dim = config.n_embd

    def replace(model, weights, name):
        model.state_dict()[name].copy_(weights.detach())

    def replace_by_name(dest_model, src_model, old_name, new_name):
        assert old_name in src_model.state_dict()
        assert new_name in dest_model.state_dict()
        replace(dest_model, src_model.state_dict()[old_name], new_name)

    print('Converting...')
    # Copy weights from CodeGen model
    with torch.no_grad():
        cg_model.eval()
        gptj_model.eval()

        for name, param in cg_model.named_parameters():
            # print(f'Converting {name}')
            # Handle the qkv weights separately because we need to split them
            if 'qkv_proj' in name:
                qkv_proj = param.detach().clone()
                mp_num = 4  # number of cores on their TPU I guess?
                local_dim = embed_dim // mp_num
                # GPT-J and CodeGen slice up the qkv projection slightly differently.
                # After a great deal of pain, I figured out that this permutation on
                # the weights of the qkv_proj fixes it.
                base_permutation = [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]
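                # NB (added): with mp_num = 4 the 3*embed_dim rows of qkv_proj form 12
                # blocks of local_dim rows, laid out per shard as (q, v, k); the
                # permutation regroups them into all-q, all-v, all-k so the split below works.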
                permutation = torch.cat([torch.arange(i * local_dim, (i + 1) * local_dim) for i in base_permutation])
                # NB: we permute the *rows* here because the computation is xA.T
                new_qkv_proj = qkv_proj[permutation, :]
                # NB: the name QKV is misleading here; they are actually stored in
                # the order QVK
                query, value, key = torch.split(new_qkv_proj, embed_dim, dim=0)

                replace(gptj_model, query, name.replace('qkv_proj', 'q_proj'))
                replace(gptj_model, key, name.replace('qkv_proj', 'k_proj'))
                replace(gptj_model, value, name.replace('qkv_proj', 'v_proj'))
            else:
                replace_by_name(gptj_model, cg_model, name, name)

    return gptj_model
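A quick way to sanity-check the conversion (not part of the gist; the module name is taken from the import below, the model name from the script's default) is to run the original CodeGen model and the converted GPT-J copy on the same prompt and compare logits:

```python
import torch
from transformers import AutoTokenizer, CodeGenForCausalLM
from codegen_gptj_converter import cg2gptj

cg_name = "Salesforce/codegen-350M-mono"
tok = AutoTokenizer.from_pretrained(cg_name)

# Load both models in fp32 on CPU for an apples-to-apples comparison
cg_model = CodeGenForCausalLM.from_pretrained(cg_name, torch_dtype="auto").float().eval()
gptj_model = cg2gptj(cg_name).float().eval()

inputs = tok("def hello_world():", return_tensors="pt")
with torch.no_grad():
    cg_logits = cg_model(**inputs).logits
    gptj_logits = gptj_model(**inputs).logits

# If the qkv permutation is correct, the two should agree up to floating-point noise
print(torch.allclose(cg_logits, gptj_logits, atol=1e-4))
```

Next, the driver script (presumably `convertFt.py`, per the command at the top), which builds the Triton `config.pbtxt` from a template and calls both converters: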
import argparse
import os
from string import Template
from transformers import GPTJConfig, AutoTokenizer
import torch
from codegen_gptj_converter import cg2gptj
from gptj_ftconverter import split_and_convert_main
def round_up(x, multiple):
    remainder = x % multiple
    return x if remainder == 0 else x + multiple - remainder
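# e.g. round_up(50257, 1024) == 51200; used below to pad the tokenizer vocab size
# up to a multiple of 1024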
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')
# Generate a config file for a CodeGen model for use with Triton
parser = argparse.ArgumentParser('Convert Salesforce CodeGen model to GPT-J FT')
parser.add_argument('--code_model', default='Salesforce/codegen-350M-mono', help='which Salesforce model to convert')
parser.add_argument('--template', default=CONFIG_TEMPLATE_PATH, help='Path to the config template')
parser.add_argument('--tokenizer', default='Salesforce/codegen-16B-multi', help='Name or path to the tokenizer')
parser.add_argument('--output_dir', required=True, help='Where to store the converted model')
parser.add_argument('--n_gpus', '--num_gpus', help='Number of GPUs to use for inference', type=int, default=1)
parser.add_argument('--t_gpus','--train_gpus', help='Number of GPUs used for training', type=int, default=1)
parser.add_argument("--processes", "--p", type=int, help="How many processes to spawn for conversion (default: 4)", default=4)
parser.add_argument("--weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"], help="output weight data type")
args = parser.parse_args()
# Vars we need to fill in:
# name
# tensor_para_size
# max_seq_len
# is_half
# head_num
# size_per_head
# inter_size
# vocab_size
# start_id
# end_id
# decoder_layers
# name
# rotary_embedding
# checkpoint_path
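# Added illustration: the placeholders in config_template.pbtxt are presumably
# ${var}-style string.Template fields, filled from the params dict built below, e.g.
#   Template('name: "${name}"').substitute({'name': 'codegen-350M-mono'})
#   == 'name: "codegen-350M-mono"'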
# Global options
gptj_model = cg2gptj(args.code_model)
config = gptj_model.config
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
max_seq_len = config.n_positions
is_half = '1' if config.torch_dtype == torch.float16 else '0'
# Read in the template config file
with open(args.template, 'r') as f:
    template = Template(f.read())
#from code_model
model_name = args.code_model.split('/')[-1]
version = '1'
params = {}
params['tensor_para_size'] = args.n_gpus
params['name'] = model_name
params['max_seq_len'] = max_seq_len
params['is_half'] = is_half
params['head_num'] = config.n_head
params['size_per_head'] = config.n_embd // config.n_head
params['inter_size'] = 4*config.n_embd
# Vocab size gets rounded up to a multiple of 1024
params['vocab_size'] = round_up(tokenizer.vocab_size, 1024)
params['start_id'] = tokenizer.eos_token_id
params['end_id'] = tokenizer.eos_token_id
params['decoder_layers'] = config.n_layer
params['rotary_embedding'] = config.rotary_dim
# NOTE: this assumes that the model dir follows the format used by the other conversion scripts
model_dir = os.path.join(args.output_dir, f'{model_name}-{args.n_gpus}gpu')
os.makedirs(model_dir, exist_ok=True)
weights_path = os.path.join(model_dir, 'fastertransformer', f'{version}', f'{args.n_gpus}-gpu')
params['checkpoint_path'] = weights_path
triton_config = template.substitute(params)
assert '${' not in triton_config
# Make directory structure
os.makedirs(weights_path, exist_ok=True)
# Write config file
config_path = os.path.join(model_dir, 'fastertransformer', 'config.pbtxt')
with open(config_path, 'w') as f:
    f.write(triton_config)
print('==========================================================')
print(f'Created config file for {model_name}')
print(f' Config: {config_path}')
# Convert the model weights
# args: [gptj model] [weights path] [inference gpus] [training gpus] [weight dtype] [processes]
split_and_convert_main(gptj_model, weights_path, args.n_gpus, args.t_gpus, args.weight_data_type, args.processes)
print('==========================================================')
print(f'Converted weights for {model_name}')
print(f' Weights: {weights_path}')
print('==========================================================')
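For reference, given the paths constructed above and the weight files written by the converter below, the output directory should end up looking roughly like this (sketch for `--n_gpus=8` and the default 350M model):

```
<output_dir>/codegen-350M-mono-8gpu/
└── fastertransformer/
    ├── config.pbtxt
    └── 1/
        └── 8-gpu/
            ├── config.ini
            ├── model.wte.bin
            ├── model.final_layernorm.{weight,bias}.bin
            ├── model.lm_head.{weight,bias}.bin
            └── model.layers.<i>.*.bin   # per-layer (and per-rank) weight files
```

Finally, `gptj_ftconverter.py`, adapted from NVIDIA's FasterTransformer conversion code (see the license header below):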
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
# Modified by Brendan Dolan-Gavitt, 2022
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import configparser
import multiprocessing
import numpy as np
from pathlib import Path
import torch
import os
import sys
from transformers import GPTJForCausalLM
# dir_path = os.path.dirname(os.path.realpath(__file__))
# sys.path.append(dir_path + "/../../../..")
# sys.path.append(dir_path)
def get_weight_data_type(data_type):
    if data_type == "fp32":
        return np.float32
    elif data_type == "fp16":
        return np.float16
    else:
        assert False, f"Invalid weight data type {data_type}"
def split_and_convert_process_2(i, saved_dir, factor, key, val):
    if key.find("input_layernorm.weight") != -1 or key.find("input_layernorm.bias") != -1 or \
            key.find("attention.dense.bias") != -1 or key.find("post_attention_layernorm.weight") != -1 or \
            key.find("post_attention_layernorm.bias") != -1 or key.find("mlp.dense_4h_to_h.bias") != -1 or \
            key.find("final_layernorm.weight") != -1 or key.find("final_layernorm.bias") != -1:
        # shared weights, only need to convert the weights of rank 0
        if i == 0:
            saved_path = saved_dir + "/model." + key + ".bin"
            val.tofile(saved_path)
    elif key.find("attention.dense.weight") != -1 or key.find("mlp.dense_4h_to_h.weight") != -1:
        split_vals = np.split(val, factor, axis=0)
        for j in range(factor):
            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            split_vals[j].tofile(saved_path)
    elif key.find("mlp.dense_h_to_4h.weight") != -1 or key.find("mlp.dense_h_to_4h.bias") != -1:
        split_vals = np.split(val, factor, axis=-1)
        for j in range(factor):
            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            split_vals[j].tofile(saved_path)
    elif key.find("attention.query_key_value.weight") != -1:
        split_vals = np.split(val, factor, axis=-1)
        for j in range(factor):
            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
            split_vals[j].tofile(saved_path)
    else:
        print("[ERROR] cannot find key '{}'".format(key))
def split_and_convert_main(gptjmodel, weights_path, n_gpu, t_gpu, weight_data_type, processes):
    saved_dir = weights_path
    t_gpu_num = t_gpu
    i_gpu_num = n_gpu
    print(f"t_gpu_num: {t_gpu_num}, i_gpu_num: {i_gpu_num}")
    assert i_gpu_num % t_gpu_num == 0

    factor = int(i_gpu_num / t_gpu_num)

    model = gptjmodel
    if weight_data_type == "fp16":
        model = model.half()

    try:
        config = configparser.ConfigParser()
        config["gpt"] = {}
        config["gpt"]["weights_path"] = saved_dir
        config["gpt"]["trained_gpu_num"] = f"{t_gpu}"
        config["gpt"]["inference_gpu_num"] = f"{n_gpu}"
        config["gpt"]["processes"] = f"{processes}"
        for k, v in vars(model.config).items():
            config["gpt"][k] = f"{v}"
        config["gpt"]["weight_data_type"] = weight_data_type
        with open((Path(saved_dir) / "config.ini").as_posix(), 'w') as configfile:
            config.write(configfile)
    except Exception as e:
        print("Failed to save the config in config.ini.")
        print(e)

    np_weight_data_type = get_weight_data_type(weight_data_type)
    huggingface_model_name_pattern = [
        "ln_1.bias",
        "ln_1.weight",
        "attn.q_proj.weight",
        "attn.out_proj.weight",
        "mlp.fc_in.bias",
        "mlp.fc_in.weight",
        "mlp.fc_out.bias",
        "mlp.fc_out.weight",
    ]
    ft_model_name_pattern = [
        "input_layernorm.bias",
        "input_layernorm.weight",
        "attention.query_key_value.weight",
        "attention.dense.weight",
        "mlp.dense_h_to_4h.bias",
        "mlp.dense_h_to_4h.weight",
        "mlp.dense_4h_to_h.bias",
        "mlp.dense_4h_to_h.weight",
    ]
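    # Example of the renaming performed below (added for clarity):
    #   transformer.h.0.ln_1.weight        -> layers.0.input_layernorm.weight
    #   transformer.h.0.mlp.fc_in.weight   -> layers.0.mlp.dense_h_to_4h.weight
    #   transformer.h.0.attn.q_proj.weight -> layers.0.attention.query_key_value.weight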
    # torch.multiprocessing.set_start_method("spawn")
    # with multiprocessing.Pool(processes) as pool:
    for name, param in model.named_parameters():
        if name.find("weight") == -1 and name.find("bias") == -1:
            continue
        print(name)
        if name == 'transformer.wte.weight':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.wte.bin")
        elif name == 'transformer.ln_f.bias':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.final_layernorm.bias.bin")
        elif name == 'transformer.ln_f.weight':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.final_layernorm.weight.bin")
        elif name == 'lm_head.weight':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.lm_head.weight.bin")
        elif name == 'lm_head.bias':
            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(saved_dir + "/model.lm_head.bias.bin")
        else:
            for i in range(len(huggingface_model_name_pattern)):
                if name.find(huggingface_model_name_pattern[i]) != -1:
                    # Special case for QKV weights
                    if name.find("attn.q_proj.weight") != -1:
                        layer = name.split('.')[2]
                        base_k = f'transformer.h.{layer}.'
                        w = model.state_dict()
                        QKV_w = torch.stack([
                            w[base_k + "attn.q_proj.weight"],
                            w[base_k + "attn.k_proj.weight"],
                            w[base_k + "attn.v_proj.weight"],
                        ])  # [qkv, n_heads * dim_head, latent_space]
                        QKV_w = QKV_w.permute(2, 0, 1)
                        weights = QKV_w.detach().cpu().numpy().astype(np_weight_data_type)
                    else:
                        weights = param.detach().cpu().numpy().astype(np_weight_data_type)

                    # Some weights need to be transposed
                    if name.find("mlp.fc_in.weight") != -1 or \
                            name.find("mlp.fc_out.weight") != -1 or \
                            name.find("attn.out_proj.weight") != -1:
                        weights = weights.T

                    new_name = name.replace("transformer.h.", "layers.").replace(huggingface_model_name_pattern[i], ft_model_name_pattern[i])
                    # NOTE: the multiprocessing pool above is disabled, so convert in-process
                    split_and_convert_process_2(0, saved_dir, factor, new_name, weights)