Skip to content

Instantly share code, notes, and snippets.

@ahoho
Created April 2, 2023 19:35
Show Gist options
  • Save ahoho/57f5c3dcdce4be522ca13dfb96cc1eb8 to your computer and use it in GitHub Desktop.
Save ahoho/57f5c3dcdce4be522ca13dfb96cc1eb8 to your computer and use it in GitHub Desktop.
Convert huggingface model to pytorch checkpoint (modified from alpaca-lora)
# Convert a huggingface LLaMA checkpoint to an (unsharded) pytorch checkpoint
# comes from https://github.com/tloen/alpaca-lora/blob/main/export_state_dict_checkpoint.py
import argparse
import json
from pathlib import Path
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402
parser = argparse.ArgumentParser()
parser.add_argument("base_model")
parser.add_argument("size_key")
args = parser.parse_args()
tokenizer = LlamaTokenizer.from_pretrained(args.base_model)
base_model = LlamaForCausalLM.from_pretrained(
args.base_model,
load_in_8bit=False,
torch_dtype=torch.float16,
device_map={"": "cpu"},
)
base_model.train(False)
base_model_sd = base_model.state_dict()
params_by_model = {
"7b": {
"dim": 4096,
"multiple_of": 256,
"n_heads": 32,
"n_layers": 32,
"norm_eps": 1e-06,
"vocab_size": -1,
},
"13b": {
"dim": 5120,
"multiple_of": 256,
"n_heads": 40,
"n_layers": 40,
"norm_eps": 1e-06,
"vocab_size": -1,
},
"30b": {
"dim": 6656,
"multiple_of": 256,
"n_heads": 52,
"n_layers": 60,
"norm_eps": 1e-06,
"vocab_size": -1,
},
"65b": {
"dim": 8192,
"multiple_of": 256,
"n_heads": 64,
"n_layers": 80,
"norm_eps": 1e-06,
"vocab_size": -1,
},
}
params = params_by_model[args.size_key.lower()]
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (
base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)
)
def permute(w):
return (
w.view(n_heads, dim // n_heads // 2, 2, dim)
.transpose(1, 2)
.reshape(dim, dim)
)
def unpermute(w):
return (
w.view(n_heads, 2, dim // n_heads // 2, dim)
.transpose(1, 2)
.reshape(dim, dim)
)
def translate_state_dict_key(k): # noqa: C901
k = k.replace("base_model.model.", "")
if k == "model.embed_tokens.weight":
return "tok_embeddings.weight"
elif k == "model.norm.weight":
return "norm.weight"
elif k == "lm_head.weight":
return "output.weight"
elif k.startswith("model.layers."):
layer = k.split(".")[2]
if k.endswith(".self_attn.q_proj.weight"):
return f"layers.{layer}.attention.wq.weight"
elif k.endswith(".self_attn.k_proj.weight"):
return f"layers.{layer}.attention.wk.weight"
elif k.endswith(".self_attn.v_proj.weight"):
return f"layers.{layer}.attention.wv.weight"
elif k.endswith(".self_attn.o_proj.weight"):
return f"layers.{layer}.attention.wo.weight"
elif k.endswith(".mlp.gate_proj.weight"):
return f"layers.{layer}.feed_forward.w1.weight"
elif k.endswith(".mlp.down_proj.weight"):
return f"layers.{layer}.feed_forward.w2.weight"
elif k.endswith(".mlp.up_proj.weight"):
return f"layers.{layer}.feed_forward.w3.weight"
elif k.endswith(".input_layernorm.weight"):
return f"layers.{layer}.attention_norm.weight"
elif k.endswith(".post_attention_layernorm.weight"):
return f"layers.{layer}.ffn_norm.weight"
elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
return None
else:
print(layer, k)
raise NotImplementedError
else:
print(k)
raise NotImplementedError
new_state_dict = {}
for k, v in base_model_sd.items():
new_k = translate_state_dict_key(k)
if new_k is not None:
if "wq" in new_k or "wk" in new_k:
new_state_dict[new_k] = unpermute(v)
else:
new_state_dict[new_k] = v
out_path = Path(args.base_model, "consolidated")
out_path.mkdir(exist_ok=True)
torch.save(new_state_dict, out_path / "consolidated.00.pth")
with open(out_path / "params.json", "w") as f:
json.dump(params, f)
# minor modification of the original file from llama.ccp
# to account for the unsharded checkpoint;
# call with `convert-pth-to-ggml.py <output dir of convert-hf-to-pth.py> 1 1`
import argparse
import os
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def parse_args():
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
parser.add_argument('n_parts', type=int, default=None)
return parser.parse_args()
def get_n_parts(dim):
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
n_parts = mappings.get(dim)
if n_parts is None:
print(f"Invalid dim: {dim}")
sys.exit(1)
print(f"n_parts = {n_parts}\n")
return n_parts
def load_hparams_and_tokenizer(dir_model):
# `dir_model` is something like `models/7B` or `models/7B/`.
# "tokenizer.model" is expected under model's parent dir.
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
# Let's use the model's parent dir directly.
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
fname_hparams = f"{dir_model}/params.json"
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
with open(fname_hparams, "r") as f:
hparams = json.load(f)
print(hparams)
tokenizer = SentencePieceProcessor(fname_tokenizer)
hparams.update({"vocab_size": tokenizer.vocab_size()})
return hparams, tokenizer
def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
0x67676a74, # magic: ggjt in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
ftype
]
fout.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
for name, datao in model.items():
if name.endswith("freqs"):
continue
# remove dimensions with a single element
data = datao.numpy().squeeze()
partshape = data.shape
n_dims = len(data.shape)
assert n_dims in (1, 2)
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
# coerce single-dimensional tensors from float16 to float32
ftype_cur = 1
if ftype == 0 or n_dims == 1:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if "tok_embeddings" in name:
split_dim = 1
elif "layers" in name:
if "attention.wo.weight" in name:
split_dim = 1
elif "feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif "output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
sname = name.encode()
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(sname)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
data.tofile(fout)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
data.tofile(fout)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
data[row].tofile(fout)
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
def main():
args = parse_args()
dir_model = args.dir_model
ftype = args.ftype
ftype_str = ["f32", "f16"]
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
print(args)
# if only writing vocab to file
if args.vocab_only:
fname_model = f"{dir_model}/consolidated.00.pth"
fname_out = f"{dir_model}/ggml-vocab.bin"
print(f"Extracting only the vocab from '{fname_model}'\n")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
print(f"Done. Output file: {fname_out}\n")
return
n_parts = args.n_parts if args.n_parts is not None else get_n_parts(hparams["dim"])
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
# we output a single file for ggml
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
model = torch.load(fname_model, map_location="cpu")
process_and_write_variables(fout, model, ftype, part_id, n_parts)
del model
print(f"Done. Output file: {fname_out}\n")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment