Created
April 2, 2023 19:35
-
-
Save ahoho/57f5c3dcdce4be522ca13dfb96cc1eb8 to your computer and use it in GitHub Desktop.
Convert huggingface model to pytorch checkpoint (modified from alpaca-lora)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert a huggingface LLaMA checkpoint to an (unsharded) pytorch checkpoint | |
# comes from https://github.com/tloen/alpaca-lora/blob/main/export_state_dict_checkpoint.py | |
import argparse | |
import json | |
from pathlib import Path | |
import torch | |
import transformers | |
from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402 | |
# Two required positionals: `base_model` is handed straight to
# `from_pretrained`, `size_key` selects an entry of `params_by_model`.
parser = argparse.ArgumentParser()
parser.add_argument("base_model")
parser.add_argument("size_key")
args = parser.parse_args()

tokenizer = LlamaTokenizer.from_pretrained(args.base_model)

# Load the checkpoint on the CPU as float16, with 8-bit loading disabled.
base_model = LlamaForCausalLM.from_pretrained(
    args.base_model,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
    load_in_8bit=False,
)
# Switch to inference mode before snapshotting the weights.
base_model.train(mode=False)
base_model_sd = base_model.state_dict()
# Hyperparameters written to params.json, keyed by lower-case model size.
# Only dim / n_heads / n_layers vary between sizes; the remaining fields are
# the same for every entry. Key order inside each entry is kept stable because
# the dict is serialized with json.dump.
params_by_model = {
    size_key: {
        "dim": width,
        "multiple_of": 256,
        "n_heads": heads,
        "n_layers": depth,
        "norm_eps": 1e-06,
        "vocab_size": -1,
    }
    for size_key, width, heads, depth in (
        ("7b", 4096, 32, 32),
        ("13b", 5120, 40, 40),
        ("30b", 6656, 52, 60),
        ("65b", 8192, 64, 80),
    )
}
# Select the hyperparameters for the requested size; the key is matched
# case-insensitively and an unknown size raises KeyError.
params = params_by_model[args.size_key.lower()]
n_layers, n_heads, dim = params["n_layers"], params["n_heads"], params["dim"]
dims_per_head = dim // n_heads

# One value per pair of per-head dimensions: 1 / base ** (2i / dims_per_head).
# NOTE(review): `inv_freq` is not referenced by the rest of the visible script.
base = 10000.0
inv_freq = 1.0 / (
    base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
)
def permute(w):
    """Reorder the rows of a (dim, dim) weight within each attention head.

    The tensor is viewed as (n_heads, half_head, 2, dim), the two middle axes
    are swapped, and the result is flattened back to (dim, dim). Uses the
    module-level `n_heads` and `dim`.
    """
    half_head = dim // n_heads // 2
    per_head = w.view(n_heads, half_head, 2, dim)
    return per_head.transpose(1, 2).reshape(dim, dim)
def unpermute(w):
    """Undo the row reordering applied by `permute` on a (dim, dim) weight.

    The tensor is viewed as (n_heads, 2, half_head, dim), the two middle axes
    are swapped, and the result is flattened back to (dim, dim). Uses the
    module-level `n_heads` and `dim`.
    """
    half_head = dim // n_heads // 2
    per_head = w.view(n_heads, 2, half_head, dim)
    return per_head.transpose(1, 2).reshape(dim, dim)
def translate_state_dict_key(k):
    """Map a Hugging Face state-dict key to its name in the exported checkpoint.

    Any "base_model.model." fragment is removed first. Returns the new key,
    or None for entries that are dropped (rotary `inv_freq` buffers and any
    per-layer key containing "lora"). Keys that match nothing are printed and
    NotImplementedError is raised.
    """
    k = k.replace("base_model.model.", "")

    # Tensors that live outside the transformer layers.
    top_level = {
        "model.embed_tokens.weight": "tok_embeddings.weight",
        "model.norm.weight": "norm.weight",
        "lm_head.weight": "output.weight",
    }
    if k in top_level:
        return top_level[k]

    if not k.startswith("model.layers."):
        print(k)
        raise NotImplementedError

    # "model.layers.<n>.…" -> the third dotted component is the layer index.
    layer = k.split(".")[2]
    per_layer = (
        (".self_attn.q_proj.weight", "attention.wq.weight"),
        (".self_attn.k_proj.weight", "attention.wk.weight"),
        (".self_attn.v_proj.weight", "attention.wv.weight"),
        (".self_attn.o_proj.weight", "attention.wo.weight"),
        (".mlp.gate_proj.weight", "feed_forward.w1.weight"),
        (".mlp.down_proj.weight", "feed_forward.w2.weight"),
        (".mlp.up_proj.weight", "feed_forward.w3.weight"),
        (".input_layernorm.weight", "attention_norm.weight"),
        (".post_attention_layernorm.weight", "ffn_norm.weight"),
    )
    for suffix, target in per_layer:
        if k.endswith(suffix):
            return f"layers.{layer}.{target}"

    # Checked only after the known suffixes, exactly like the original chain.
    if k.endswith("rotary_emb.inv_freq") or "lora" in k:
        return None

    print(layer, k)
    raise NotImplementedError
# Re-key every tensor for the exported checkpoint. Keys that translate to
# None are dropped; wq/wk weights are passed through unpermute(), all other
# tensors are stored unchanged.
new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is None:
        continue
    if "wq" in new_k or "wk" in new_k:
        new_state_dict[new_k] = unpermute(v)
    else:
        new_state_dict[new_k] = v

# Everything is written to <base_model>/consolidated/.
out_path = Path(args.base_model, "consolidated")
# parents=True: `base_model` is not necessarily an existing local directory
# (it is whatever was given to from_pretrained), and failing here would throw
# away the already-loaded model.
out_path.mkdir(parents=True, exist_ok=True)
torch.save(new_state_dict, out_path / "consolidated.00.pth")
with open(out_path / "params.json", "w", encoding="utf-8") as f:
    json.dump(params, f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# minor modification of the original file from llama.cpp
# to account for the unsharded checkpoint; | |
# call with `convert-pth-to-ggml.py <output dir of convert-hf-to-pth.py> 1 1` | |
import argparse | |
import os | |
import sys | |
import json | |
import struct | |
import numpy as np | |
import torch | |
from sentencepiece import SentencePieceProcessor | |
# Number of elements per block for the Q4 types (also used as the tensor data
# alignment when writing the file).
QK = 32

# ggml tensor type ids, numbered consecutively from 0.
(
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
) = range(7)

# Command-line / header `ftype` value -> ggml tensor type id.
WTYPES = dict(
    enumerate(
        (GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
    )
)

# Elements per block for each type: QK for the Q4 types, 1 for the rest.
GGML_BLCK_SIZE = {
    **dict.fromkeys((GGML_TYPE_Q4_0, GGML_TYPE_Q4_1), QK),
    **dict.fromkeys(
        (
            GGML_TYPE_I8,
            GGML_TYPE_I16,
            GGML_TYPE_I32,
            GGML_TYPE_F16,
            GGML_TYPE_F32,
        ),
        1,
    ),
}

# Bytes per block for each type (see ggml_nbytes for how the two tables
# combine).
GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: QK // 2 + 4,
    GGML_TYPE_Q4_1: QK // 2 + 4 * 2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}
def ggml_nelements(shape):
    """Return the product of all entries of *shape* (1 for an empty shape)."""
    total = 1
    for extent in shape:
        total = total * extent
    return total
def ggml_nbytes(shape, ftype):
    """Return the number of bytes a tensor of *shape* occupies for *ftype*.

    *ftype* is a key of WTYPES; the element count is multiplied by the
    per-block byte size and then floor-divided by the per-block element count.
    """
    ggml_type = WTYPES[ftype]
    scaled = ggml_nelements(shape) * GGML_TYPE_SIZE[ggml_type]
    return scaled // GGML_BLCK_SIZE[ggml_type]
def parse_args():
    """Parse the converter's positional command-line arguments.

    Returns the argparse namespace with `dir_model`, `ftype`, `vocab_only`
    and `n_parts`. Only `vocab_only` is optional (nargs='?'); when three
    values are given the third one is bound to `n_parts` and `vocab_only`
    keeps its default of 0.
    """
    parser = argparse.ArgumentParser(
        description='Convert a LLaMA model checkpoint to a ggml compatible file',
    )
    parser.add_argument(
        'dir_model',
        help='directory containing the model checkpoint',
    )
    parser.add_argument(
        'ftype',
        type=int,
        choices=[0, 1],
        default=1,
        help='file type (0: float32, 1: float16)',
    )
    parser.add_argument(
        'vocab_only',
        type=int,
        nargs='?',
        default=0,
        help='only write vocab to file',
    )
    parser.add_argument('n_parts', type=int, default=None)
    return parser.parse_args()
def get_n_parts(dim):
    """Return the number of checkpoint parts associated with model width *dim*.

    Prints the result before returning it. For a width that is not in the
    table, prints an error and exits the process with status 1.
    """
    try:
        n_parts = {4096: 1, 5120: 2, 6656: 4, 8192: 8}[dim]
    except KeyError:
        print(f"Invalid dim: {dim}")
        sys.exit(1)
    print(f"n_parts = {n_parts}\n")
    return n_parts
def load_hparams_and_tokenizer(dir_model):
    """Load params.json from *dir_model* and the tokenizer from its parent dir.

    Returns `(hparams, tokenizer)`, where `hparams` is the parsed JSON dict
    with "vocab_size" overwritten by the tokenizer's vocabulary size and
    `tokenizer` is the SentencePieceProcessor for `tokenizer.model`.
    """
    # `dir_model` is something like `models/7B` or `models/7B/`.
    # "tokenizer.model" is expected under model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
    # Let's use the model's parent dir directly.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
    fname_hparams = f"{dir_model}/params.json"
    # os.path.join rather than an f-string: for a bare directory name such as
    # `7B` the parent is "", and "" + "/tokenizer.model" would point at the
    # filesystem root instead of the current directory.
    fname_tokenizer = os.path.join(model_parent_dir, "tokenizer.model")

    with open(fname_hparams, "r", encoding="utf-8") as f:
        hparams = json.load(f)
        print(hparams)

    tokenizer = SentencePieceProcessor(fname_tokenizer)
    # The tokenizer's vocabulary size replaces whatever params.json carries.
    hparams.update({"vocab_size": tokenizer.vocab_size()})
    return hparams, tokenizer
def write_header(fout, hparams, ftype):
    """Write the file header to *fout* as nine native-endian C ints.

    Field order: magic, file version, vocab_size, dim, multiple_of, n_heads,
    n_layers, dim // n_heads, ftype.
    """
    fields = (
        0x67676a74,  # magic: ggjt in hex
        1,  # file version
        hparams["vocab_size"],
        hparams["dim"],
        hparams["multiple_of"],
        hparams["n_heads"],
        hparams["n_layers"],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
        ftype,
    )
    fout.write(struct.pack(f"{len(fields)}i", *fields))
def write_tokens(fout, tokenizer):
    """Write one record per vocabulary entry of *tokenizer* to *fout*.

    Each record is: int byte length, the token bytes, float score.
    Unknown tokens are written as " \\u2047 ", control tokens as empty bytes,
    byte tokens as their single raw byte, and all other pieces as UTF-8 text
    with U+2581 replaced by a space. A byte token whose piece is not exactly
    six characters long aborts the process with status 1.
    """
    for token_id in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(token_id):
            payload = " \u2047 ".encode()
        elif tokenizer.is_control(token_id):
            payload = b""
        elif tokenizer.is_byte(token_id):
            piece = tokenizer.id_to_piece(token_id)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # The hex digits sit between the 3-character prefix and the
            # final character of the piece.
            payload = struct.pack("B", int(piece[3:-1], 16))
        else:
            piece = tokenizer.id_to_piece(token_id)
            payload = piece.replace("\u2581", " ").encode()

        fout.write(struct.pack("i", len(payload)))
        fout.write(payload)
        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    """Write every tensor of one checkpoint part into the output file.

    Args:
        fout: seekable binary file positioned at the start of the tensor
            section.
        model: mapping of tensor name to torch tensor for this part.
        ftype: requested file type (0: float32, 1: float16).
        part_id: zero-based index of the part being processed.
        n_parts: total number of parts the checkpoint is split into.

    For every tensor a header (n_dims, name length, type, dimensions in
    reverse order, name) is written, followed by padding up to a multiple of
    QK bytes and then the data. With several parts, each part seeks to its own
    slice of the full tensor; afterwards the file position is moved to the end
    of the full tensor so the next header lands at the same offset for every
    part.
    """
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue

        # remove dimensions with a single element
        data = datao.numpy().squeeze()
        partshape = data.shape
        n_dims = len(data.shape)
        assert n_dims in (1, 2)

        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")

        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print(" Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        elif data.dtype != np.float16:
            # The header declares float16 and all offsets below assume two
            # bytes per element, so data of any other dtype must be converted
            # or the file layout is corrupted.
            data = data.astype(np.float16)
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 (also the fallback for any other 2-D tensor):
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            if "tok_embeddings" in name:
                split_dim = 1
            elif "layers" in name:
                if "attention.wo.weight" in name or "feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif "output" in name:
                split_dim = 0
            else:
                split_dim = 1

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        padding = -tensor_data_offset % QK
        if padding:
            fout.write(b"\x00" * padding)
            tensor_data_offset += padding

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece;
            # only the first part writes it
            if part_id == 0:
                data.tofile(fout)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            data.tofile(fout)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                data[row].tofile(fout)

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
def main():
    """Convert the checkpoint in `dir_model` into a single ggml file.

    Writes `ggml-vocab.bin` (header and vocabulary only) when `vocab_only` is
    set; otherwise writes `ggml-model-<f32|f16>.bin` containing the header,
    the vocabulary and the tensors of every checkpoint part.
    """
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{fname_model}'\n")
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
        print(f"Done. Output file: {fname_out}\n")
        return

    n_parts = args.n_parts if args.n_parts is not None else get_n_parts(hparams["dim"])
    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"

    # we output a single file for ggml
    with open(fname_out, "wb") as fout:
        write_header(fout, hparams, ftype)
        write_tokens(fout, tokenizer)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            # every part rewinds to the start of the tensor section and fills
            # in its own slices
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            # zero-pad to two digits so that part ids >= 10 do not turn into
            # a three-digit suffix such as "010"
            fname_model = f"{dir_model}/consolidated.{part_id:02d}.pth"
            # NOTE(security): torch.load unpickles the file; only run this on
            # checkpoints from a trusted source.
            model = torch.load(fname_model, map_location="cpu")
            process_and_write_variables(fout, model, ftype, part_id, n_parts)
            # release this part before the next one is loaded
            del model

    print(f"Done. Output file: {fname_out}\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment