@antmikinka
Created May 29, 2024 03:24
forked from pcuenq
import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM
# When using float16, all predicted logits are 0. To be debugged.
from coremltools.optimize.torch.palettization import (
    DKMPalettizer,
    DKMPalettizerConfig,
    ModuleDKMPalettizerConfig,
)

def selector(op):
    return op.op_type != "l2_norm"
compute_precision = ct.transform.FP16ComputePrecision(op_selector=selector)
#compute_precision = ct.precision.FLOAT16
#compute_precision = ct.precision.FLOAT32
#compute_precision = ct.transform.FP16ComputePrecision(op_selector)
#compute_units = ct.ComputeUnit.CPU_ONLY
compute_units = ct.ComputeUnit.CPU_AND_NE
# Fixed sequence length
shape = (1, 128)
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()
model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"
print(model_id)
print(outpath)
# OpenELM uses the Llama tokenizer, see https://huggingface.co/apple/OpenELM-270M-Instruct/blob/main/generate_openelm.py#L21.
# It also uses custom code.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()
'''
# Configuration details based on config.json
context_length = 2048  # max_context_length from config.json
vocab_size = 32001  # Derived from _anchor_vocab_size with padding token
padding_index = 32000  # _anchor_padding_index
forward_dtype = torch.bfloat16  # torch_dtype from config.json
backward_dtype = torch.float32

# Apply mixed precision
model = model.to(forward_dtype)

# Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=padding_index)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0024, betas=(0.9, 0.95), eps=1.e-8, weight_decay=0.1)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=350000, eta_min=0.00024)

# Function to create data
def create_data():
    # Placeholder data creation function
    # Replace with actual data loading and preprocessing
    inputs = torch.randint(0, vocab_size, (16, context_length)).to(torch.long)  # Example input tensor
    labels = torch.randint(0, vocab_size, (16, context_length)).to(torch.long)  # Example label tensor
    return [(inputs, labels)]

data = create_data()

# Prepare model for palettization
config = DKMPalettizerConfig(global_config=ModuleDKMPalettizerConfig(
    n_bits=6,
    weight_threshold=1024,
    quantize_activations=True,
    quant_min=0,
    quant_max=100
))
palettizer = DKMPalettizer(model, config)
prepared_model = palettizer.prepare()

# Fine-tune the model for a few epochs
for epoch in range(1):  # Replace 1 with the number of epochs you want
    for inputs, labels in data:
        optimizer.zero_grad()  # Clear the gradients
        logits = model(inputs).logits
        loss = loss_fn(logits.view(-1, vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
        palettizer.step()
    scheduler.step()  # Update learning rate

# Finalize the model
finalized_model = palettizer.finalize(inplace=True)
'''
## Palettization may need to happen after the code below; not sure it will work, but let's try.
inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=False)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        input_ids = args[0]
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False,
            **kwargs
        )

to_jit = Wrapper(model)
jit_inputs = list(t_inputs.values())
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)
assert torch.allclose(output_jit[0], outputs["logits"])
## Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
#coreml_output_types = [ct.TensorType(name=name) for name in outputs.keys()]
coreml_output_types = [ct.TensorType(name=name, dtype=np.float32) for name in outputs.keys()]
# Conversion fails with `Conversion for torch.repeat_interleave with non-zero dim has not been implemented`.
# We hack a special case shortcut when the first dim is `1`.
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil import register_torch_op
from coremltools.converters.mil.frontend.torch.torch_op_registry import _TORCH_OPS_REGISTRY
from coremltools.converters.mil.frontend.torch.ops import _get_inputs
del _TORCH_OPS_REGISTRY["repeat_interleave"]
@register_torch_op
def repeat_interleave(context, node):
"""
Copy from https://github.com/apple/coremltools/blob/0bef2d6aabd122527cf86cc983c08fb16a4041b5/coremltools/converters/mil/frontend/torch/ops.py#L5174
plus special case for dim=1 and bs=1
"""
x, repeats, dim, _ = _get_inputs(context, node, expected=4)
special_case = dim.val == 1 and x.shape[0] == 1
if special_case:
x = mb.reshape(x=x, shape=(x.shape[1:]))
repeats_val = repeats.val
if isinstance(repeats_val, np.ndarray):
repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
if np.any(repeats_val != repeats_val0):
raise NotImplementedError(
"Conversion for torch.repeat_interleave with Tensor repeats has not been implemented"
)
repeats_val = repeats_val0
# This would operate on the flattened input tensor
if dim is None:
x = mb.reshape(x=x, shape=(-1,))
else:
if dim.val != 0 and not special_case:
raise NotImplementedError(
"Conversion for torch.repeat_interleave with non-zero dim has not been implemented"
)
"""
on a high level:
x
| tile in dim 0
v
[x, x, ...]
| reshape to split the repeats
v
[[x],
[x],
...]
| transpose(1, 0)
V
[x^T, x^T, ...]
| flatten
V
result
"""
reps = [1] * x.rank
reps[0] = repeats_val
x_tiled = mb.tile(x=x, reps=reps)
split_reps = [repeats_val] + list(x.shape)
x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))
perm = [*range(x.rank + 1)]
perm[0] = 1
perm[1] = 0
x_transposed = mb.transpose(x=x_reshaped, perm=perm)
result_shape = list(x.shape)
result_shape[0] = -1
if special_case:
result_shape = [1] + result_shape
result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
context.add(result)
eps = 1e-6
###
def stable_l2_norm(x, eps):
    max_val = x.abs().max(axis=-1, keepdim=True).values
    max_val = torch.clamp(max_val, min=eps)
    xscaled = x / max_val
    scaled_norm = torch.linalg.norm(xscaled, dim=-1, keepdim=True)
    return x / torch.clamp(scaled_norm, min=eps), max_val
###
class CustomRMSNorm(nn.Module):
    def __init__(self, weight, eps):
        super().__init__()
        self.weight = weight
        self.eps = eps

    def forward(self, x):
        x, max_val = stable_l2_norm(x, self.eps)
        return x * (x.size(-1) ** 0.5 / max_val) * self.weight
###
model.transformer.norm = CustomRMSNorm(model.transformer.norm.weight, model.transformer.norm.eps)
for layer in model.transformer.layers:
    layer.attn.q_norm = CustomRMSNorm(layer.attn.q_norm.weight, layer.attn.q_norm.eps)
    layer.attn.k_norm = CustomRMSNorm(layer.attn.k_norm.weight, layer.attn.k_norm.eps)
    layer.ffn_norm = CustomRMSNorm(layer.ffn_norm.weight, layer.ffn_norm.eps)
    layer.attn_norm = CustomRMSNorm(layer.attn_norm.weight, layer.attn_norm.eps)
coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units,
    # pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION,  # palettization
)
import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")
# Override tokenizer
model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type
'''
user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": compute_precision,
}
'''
# Assuming `compute_precision` is used to apply float16 precision selectively
precision_description = "FP16 for all ops except l2_norm"
user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}
spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)
coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
- Sequence length: {shape[-1]}, fixed.
- Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)
@antmikinka (Author)

v7 here - https://chatgpt.com/g/g-dws2ovt4u-apple-coreml-complete-code-expert

The error indicates that the function DynamicCache.from_legacy_cache expects past_key_values to contain pairs of key and value states, but it seems that the placeholder tensor does not meet this expectation.

To handle this correctly, we need to ensure that the placeholder tensor is structured properly to reflect the expected shape of past_key_values. Typically, past_key_values is a tuple containing pairs of tensors (key states and value states) for each layer.

Let's update the placeholder to correctly reflect this structure:

1. Structure past_key_values as a tuple of pairs of tensors for each layer.
2. Ensure the past_key_values_placeholder correctly mimics the actual structure expected during inference.
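For reference, a minimal sketch (shapes are illustrative assumptions, not taken from the OpenELM config) of the legacy layout that DynamicCache.from_legacy_cache expects, namely one (key_states, value_states) pair per layer:

import torch
from transformers.cache_utils import DynamicCache

# Illustrative sizes only; the real values come from the model config.
num_layers, num_kv_heads, head_dim = 16, 3, 64

# Legacy format: one (key, value) pair per layer, each of shape
# (batch, num_kv_heads, past_seq_len, head_dim).
legacy_cache = tuple(
    (torch.zeros(1, num_kv_heads, 0, head_dim),
     torch.zeros(1, num_kv_heads, 0, head_dim))
    for _ in range(num_layers)
)
cache = DynamicCache.from_legacy_cache(legacy_cache)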

revised code:

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=False)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.num_layers = model.config.num_transformer_layers
        self.num_heads = model.config.num_query_heads[0]  # Assuming all layers have the same number of query heads
        self.head_dim = model.config.model_dim // self.num_heads

    def forward(self, input_ids, past_key_values):
        if past_key_values is None:
            # Create a placeholder tensor with the appropriate shape if past_key_values is None
            past_key_values = tuple(
                (torch.zeros((input_ids.shape[0], self.num_heads, 0, self.head_dim), dtype=torch.float16, device=input_ids.device),
                 torch.zeros((input_ids.shape[0], self.num_heads, 0, self.head_dim), dtype=torch.float16, device=input_ids.device))
                for _ in range(self.num_layers)
            )
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)
# Use a placeholder tensor for past_key_values during tracing
past_key_values_placeholder = tuple(
    (torch.zeros((1, model.config.num_query_heads[0], 0, model.config.model_dim // model.config.num_query_heads[0]), dtype=torch.float16),
     torch.zeros((1, model.config.num_query_heads[0], 0, model.config.model_dim // model.config.num_query_heads[0]), dtype=torch.float16))
    for _ in range(model.config.num_transformer_layers)
)
jitted_model = torch.jit.trace(to_jit, (t_inputs["input_ids"], past_key_values_placeholder))
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(t_inputs["input_ids"], past_key_values_placeholder)

assert torch.allclose(output_jit[0], outputs["logits"])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

v7

@antmikinka (Author)

error from v7

  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv7.py", line 82, in <module>
    jitted_model = torch.jit.trace(to_jit, (t_inputs["input_ids"], past_key_values_placeholder))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 806, in trace
    return trace_module(
           ^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 1074, in trace_module
    module._c._create_method_from_trace(
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv7.py", line 68, in forward
    return self.model(
           ^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 864, in forward
    outputs = self.transformer(
              ^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 704, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 508, in forward
    hidden_states, self_attn_weights, present_key_value = self.attn(
                                                          ^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 369, in forward
    keys, values = past_key_value.update(
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/cache_utils.py", line 146, in update
    self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 12 but got size 3 for tensor number 1 in the list.

@antmikinka (Author) commented May 29, 2024

v8 here

The error indicates a mismatch in tensor sizes when attempting to concatenate the cached key and value states. This often occurs when the initial past key values tensor dimensions do not match the expected dimensions during inference.

We need to ensure that the past key values tensor has the correct shape, matching the expected number of attention heads and the hidden state dimensions.

Here's the revised script to fix the dimension issue:

1. Ensure past_key_values dimensions match those expected by the model.
2. Correctly initialize past_key_values as empty tensors with the correct dimensions.
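The error ("Expected size 12 but got size 3") suggests the placeholder was sized with the query-head count while the cached keys and values use the KV-head count. A hedged sketch of a placeholder built from per-layer KV heads, assuming the OpenELM config exposes num_kv_heads (a per-layer list) and head_dim, as the 270M config.json does:

import torch

def make_kv_placeholder(config, batch_size=1, past_len=0, dtype=torch.float16):
    # One (key, value) pair per layer; OpenELM varies the KV-head count per layer.
    return tuple(
        (torch.zeros(batch_size, n_kv, past_len, config.head_dim, dtype=dtype),
         torch.zeros(batch_size, n_kv, past_len, config.head_dim, dtype=dtype))
        for n_kv in config.num_kv_heads
    )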

revised code:

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=False)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.num_layers = model.config.num_transformer_layers
        self.num_heads = model.config.num_query_heads[0]  # Assuming all layers have the same number of query heads
        self.head_dim = model.config.model_dim // self.num_heads

    def forward(self, input_ids, past_key_values):
        if past_key_values is None:
            # Create a placeholder tensor with the appropriate shape if past_key_values is None
            past_key_values = tuple(
                (torch.zeros((input_ids.shape[0], self.num_heads, input_ids.shape[1], self.head_dim), dtype=torch.float16, device=input_ids.device),
                 torch.zeros((input_ids.shape[0], self.num_heads, input_ids.shape[1], self.head_dim), dtype=torch.float16, device=input_ids.device))
                for _ in range(self.num_layers)
            )
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)
# Use a placeholder tensor for past_key_values during tracing
past_key_values_placeholder = tuple(
    (torch.zeros((1, model.config.num_query_heads[0], shape[1], model.config.model_dim // model.config.num_query_heads[0]), dtype=torch.float16),
     torch.zeros((1, model.config.num_query_heads[0], shape[1], model.config.model_dim // model.config.num_query_heads[0]), dtype=torch.float16))
    for _ in range(model.config.num_transformer_layers)
)
jitted_model = torch.jit.trace(to_jit, (t_inputs["input_ids"], past_key_values_placeholder))
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(t_inputs["input_ids"], past_key_values_placeholder)

assert torch.allclose(output_jit[0], outputs["logits"])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

v8

@antmikinka
Copy link
Author

error from v8

/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:759: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if seq_length > self.causal_mask.shape[-1]:
Traceback (most recent call last):
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv8.py", line 82, in <module>
    jitted_model = torch.jit.trace(to_jit, (t_inputs["input_ids"], past_key_values_placeholder))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 806, in trace
    return trace_module(
           ^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 1074, in trace_module
    module._c._create_method_from_trace(
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv8.py", line 68, in forward
    return self.model(
           ^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 864, in forward
    outputs = self.transformer(
              ^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 704, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 508, in forward
    hidden_states, self_attn_weights, present_key_value = self.attn(
                                                          ^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 369, in forward
    keys, values = past_key_value.update(
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/cache_utils.py", line 146, in update
    self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 12 but got size 3 for tensor number 1 in the list.

@antmikinka (Author) commented May 29, 2024

v9

The error message indicates that there's a mismatch in the sizes of the tensors being concatenated. Specifically, the size of the key_states tensor in dimension 2 does not match the size of the key_cache tensor in the same dimension.

Let's address this by ensuring that the past_key_values placeholder tensors have the correct shapes and sizes, which align with the model's expectations during concatenation.

1. Use a placeholder with the correct sequence length and head dimension.
2. Ensure all tensors used in concatenation have matching dimensions.
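Rather than guessing the shapes from the config, one sanity check is to run the model once with the cache enabled and print what it actually returns; this sketch assumes past_key_values is still iterable as per-layer (key, value) pairs:

import torch

with torch.no_grad():
    probe = model(**t_inputs, use_cache=True)

# Print the per-layer key/value shapes the model actually produces,
# then size the tracing placeholder to match.
for i, (k, v) in enumerate(probe.past_key_values):
    print(f"layer {i}: key {tuple(k.shape)}, value {tuple(v.shape)}")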

revised code:

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=False)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.num_layers = model.config.num_transformer_layers
        self.num_heads = model.config.num_query_heads[0]  # Assuming all layers have the same number of query heads
        self.head_dim = model.config.model_dim // self.num_heads

    def forward(self, input_ids, past_key_values):
        if past_key_values is None:
            # Create a placeholder tensor with the appropriate shape if past_key_values is None
            past_key_values = tuple(
                (torch.zeros((input_ids.shape[0], self.num_heads, 0, self.head_dim), dtype=torch.float16, device=input_ids.device),
                 torch.zeros((input_ids.shape[0], self.num_heads, 0, self.head_dim), dtype=torch.float16, device=input_ids.device))
                for _ in range(self.num_layers)
            )
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)
# Use a placeholder tensor for past_key_values during tracing
past_key_values_placeholder = tuple(
    (torch.zeros((1, model.config.num_query_heads[0], shape[1], model.config.model_dim // model.config.num_query_heads[0]), dtype=torch.float16),
     torch.zeros((1, model.config.num_query_heads[0], shape[1], model.config.model_dim // model.config.num_query_heads[0]), dtype=torch.float16))
    for _ in range(model.config.num_transformer_layers)
)
jitted_model = torch.jit.trace(to_jit, (t_inputs["input_ids"], past_key_values_placeholder))
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(t_inputs["input_ids"], past_key_values_placeholder)

assert torch.allclose(output_jit[0], outputs["logits"])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)


v9

@antmikinka (Author)

I must note that around v6 or so, I changed which OpenELM model I was converting, from the 1B to the 270M, because it's faster to convert.

I gave the Core ML GPT the configuration_openelm.py file for, I think, the 270M, so hopefully it just took the principles...
I'm wondering whether these past two errors come from the 1B-to-270M switch or not...

As I am writing this, I just tried v9 with the 1B and it failed with the caching issue again, and then again with the 270M.

@antmikinka (Author)

v9 error

Traceback (most recent call last):
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv9.py", line 82, in <module>
    jitted_model = torch.jit.trace(to_jit, (t_inputs["input_ids"], past_key_values_placeholder))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 806, in trace
    return trace_module(
           ^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 1074, in trace_module
    module._c._create_method_from_trace(
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv9.py", line 68, in forward
    return self.model(
           ^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 864, in forward
    outputs = self.transformer(
              ^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 704, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 508, in forward
    hidden_states, self_attn_weights, present_key_value = self.attn(
                                                          ^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 369, in forward
    keys, values = past_key_value.update(
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/cache_utils.py", line 146, in update
    self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 12 but got size 3 for tensor number 1 in the list.

@antmikinka (Author)

V10

After reviewing gpt2.py from smpanaro/more-ane-transformers:

Updated Code

Let's incorporate these fixes and handle the issues with tensor sizes and concatenation:

1. Ensure compatibility in concatenation:
   - Make sure that all tensors have compatible sizes before concatenation.
   - Align the slicing operations to match the expected dimensions.
2. Fix concatenation issues in the Wrapper class:
   - Modify the Wrapper class to correctly handle the past key values.
   - Ensure the correct dimensions are used when updating the cache.
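As an aside, the original forked script avoids this whole class of cache errors for a fixed-length conversion by tracing without a KV cache at all; a minimal sketch of that fallback (the NoCacheWrapper name is just illustrative):

import torch
import torch.nn as nn

class NoCacheWrapper(nn.Module):
    # Trace without past_key_values; fine when the sequence length is fixed.
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(input_ids=input_ids, return_dict=False, use_cache=False)

traced = torch.jit.trace(NoCacheWrapper(model).eval(), (t_inputs["input_ids"],))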

revised code below:

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=True)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, past_key_values=None):
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)
# Placeholder for past_key_values
past_key_values_placeholder = tuple([torch.zeros((1, model.config.num_attention_heads, 0, model.config.hidden_size // model.config.num_attention_heads), dtype=torch.float16)] * 2 * model.config.num_hidden_layers)
jit_inputs = [t_inputs["input_ids"], past_key_values_placeholder]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

@antmikinka (Author)

v10 error

 python openelm-coremlv10.py --model_id apple/OpenELM-270M-Instruct --output_dir Users/anthonymikinka/Downloads
scikit-learn version 1.4.1.post1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.
Torch version 2.2.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.
apple/OpenELM-270M-Instruct
Users/anthonymikinka/Downloads/OpenELM-270M-Instruct-128-ComputePrecision.FLOAT16.mlpackage
/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Traceback (most recent call last):
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv10.py", line 68, in <module>
    past_key_values_placeholder = tuple([torch.zeros((1, model.config.num_attention_heads, 0, model.config.hidden_size // model.config.num_attention_heads), dtype=torch.float16)] * 2 * model.config.num_hidden_layers)
                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/configuration_utils.py", line 263, in __getattribute__
    return super().__getattribute__(key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'OpenELMConfig' object has no attribute 'num_attention_heads'

@antmikinka (Author)

V11

The OpenELMConfig object in the error message doesn't have a num_attention_heads attribute. Based on the provided config.json, the relevant attributes are num_query_heads and num_kv_heads.

To resolve this, we'll use those attributes to initialize past_key_values_placeholder properly. Each layer has num_query_heads heads for the queries and num_kv_heads heads for the keys and values.

Here's how to adjust the code:

    - Use num_query_heads and num_kv_heads from the configuration.
    - Adjust the placeholder initialization for past_key_values.

Revised code below.


import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=True)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, past_key_values=None):
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)

# Initialize past_key_values with the correct shapes
past_key_values_placeholder = tuple(
    [(torch.zeros((1, num_heads, 0, model.config.model_dim // num_heads), dtype=torch.float16),
      torch.zeros((1, num_heads, 0, model.config.model_dim // num_heads), dtype=torch.float16))
     for num_heads in model.config.num_query_heads]
)

jit_inputs = [t_inputs["input_ids"], past_key_values_placeholder]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)
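
(Aside on the custom repeat_interleave registration above: when every repeat count is identical, the tile → reshape → transpose → reshape sequence reproduces torch.repeat_interleave along a given dimension. A tiny torch check of that equivalence for dim 0:)

# Equivalence check: repeat_interleave(x, r, dim=0) == tile -> reshape -> transpose -> reshape
import torch

x = torch.arange(6).reshape(3, 2)
r = 4
ref = torch.repeat_interleave(x, r, dim=0)

tiled = x.tile((r, 1))                                          # (r*3, 2): r stacked copies of x
stacked = tiled.reshape(r, *x.shape)                            # (r, 3, 2)
interleaved = stacked.transpose(0, 1).reshape(-1, x.shape[1])   # (3*r, 2): each row repeated r times
assert torch.equal(ref, interleaved)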

@antmikinka
Author

v11 error

Traceback (most recent call last):
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv11.py", line 76, in <module>
    jitted_model = torch.jit.trace(to_jit, jit_inputs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 806, in trace
    return trace_module(
           ^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/jit/_trace.py", line 1074, in trace_module
    module._c._create_method_from_trace(
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv11.py", line 59, in forward
    return self.model(
           ^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 864, in forward
    outputs = self.transformer(
              ^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 704, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 508, in forward
    hidden_states, self_attn_weights, present_key_value = self.attn(
                                                          ^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _slow_forward
    result = self.forward(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py", line 369, in forward
    keys, values = past_key_value.update(
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/cache_utils.py", line 146, in update
    self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 12 but got size 3 for tensor number 1 in the list.
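
(The mismatch, expected 12 but got 3, lines up with OpenELM's per-layer grouped-query attention: the placeholder was sized with num_query_heads, while the cache update for that layer receives num_kv_heads key/value heads. A hedged sketch of a per-layer placeholder along the lines the next revision adopts; head_dim here is taken from the checkpoint's config.json and is not what the versions below use for the last dimension:)

# Hypothetical per-layer KV-cache placeholder for tracing; assumes config.num_kv_heads is a
# per-layer list and config.head_dim exists, as in the OpenELM config.json.
config = model.config
past_key_values_placeholder = tuple(
    (
        torch.zeros((1, config.num_kv_heads[i], 0, config.head_dim), dtype=torch.float16),  # keys
        torch.zeros((1, config.num_kv_heads[i], 0, config.head_dim), dtype=torch.float16),  # values
    )
    for i in range(config.num_transformer_layers)
)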

@antmikinka
Author

V12

Quite a lot has changed since v11: the tokenizer is loaded from meta-llama/Llama-2-7b-hf again, and the past_key_values placeholder is now built per layer from config.num_kv_heads and config.model_dim instead of from num_query_heads.

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

#tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") # this was removed some versions ago
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=True)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, past_key_values=None):
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)

# Initialize past_key_values with the correct shapes using configuration
config = model.config
past_key_values_placeholder = tuple(
    (torch.zeros((1, config.num_kv_heads[i], 0, config.model_dim // config.num_kv_heads[i]), dtype=torch.float16),
     torch.zeros((1, config.num_kv_heads[i], 0, config.model_dim // config.num_kv_heads[i]), dtype=torch.float16))
    for i in range(config.num_transformer_layers)
)

jit_inputs = [t_inputs["input_ids"], past_key_values_placeholder]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

@antmikinka
Author

v12.5 (no past-key-value implementation)

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False
        )

to_jit = Wrapper(model)
jit_inputs = [t_inputs["input_ids"]]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

@antmikinka
Author

v12.5 (no past-key values) error

 python openelm-coremlv125.py --model_id apple/OpenELM-270M-Instruct --output_dir Users/anthonymikinka/Downloads
scikit-learn version 1.4.1.post1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.
Torch version 2.2.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.
apple/OpenELM-270M-Instruct
Users/anthonymikinka/Downloads/OpenELM-270M-Instruct-128-ComputePrecision.FLOAT16.mlpackage
/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/modeling_utils.py:4371: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
  warnings.warn(
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:759: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if seq_length > self.causal_mask.shape[-1]:
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:230: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert dim == self.model_dim
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:240: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  key_len >= query_len
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:175: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  key_len > self._cached_seq_length
Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   0%|                                                                                                                                                             | 0/1755 [00:00<?, ? ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:   7%|█████████▉                                                                                                                                       | 120/1755 [00:00<00:00, 3292.99 ops/s]
Traceback (most recent call last):
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv125.py", line 125, in <module>
    coreml_model = ct.convert(
                   ^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/_converters_entry.py", line 574, in convert
    mlmodel = mil_convert(
              ^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 188, in mil_convert
    return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 212, in _mil_convert
    proto, mil_program = mil_convert_to_proto(
                         ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 286, in mil_convert_to_proto
    prog = frontend_converter(model, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 108, in __call__
    return load(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 80, in load
    return _perform_torch_convert(converter, debug)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 107, in _perform_torch_convert
    raise e
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 99, in _perform_torch_convert
    prog = converter.convert()
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 519, in convert
    convert_nodes(self.context, self.graph)
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 88, in convert_nodes
    add_op(context, node)
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 4812, in repeat_interleave
    raise NotImplementedError(
NotImplementedError: Conversion for torch.repeat_interleave with non-zero dim has not been implemented
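
(Note that the failing frame is coremltools' built-in repeat_interleave in ops.py, not the custom op registered in the script, which suggests the custom registration isn't replacing the built-in implementation. register_torch_op takes an override flag; a hedged sketch, worth verifying against the installed coremltools version:)

# Possible fix sketch: explicitly override the built-in repeat_interleave conversion.
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op

@register_torch_op(override=True)
def repeat_interleave(context, node):
    ...  # same body as the custom op defined in the script above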

@antmikinka
Author

v13.5 (no past-key values)

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False
        )

to_jit = Wrapper(model)
jit_inputs = [t_inputs["input_ids"]]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val not in [0, 1]:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim other than 1 has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

@antmikinka
Author

v14.5 (no past-key values)

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False
        )

to_jit = Wrapper(model)
jit_inputs = [t_inputs["input_ids"]]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=(x.shape[1:],))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val not in [0, 1]:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim other than 1 has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs[0].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)
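
(For a quick sanity check of the saved package on macOS, something along these lines should work; a minimal sketch assuming the fixed (1, 128) input and the "logits" output name from the conversion above, with the .mlpackage filename following the pattern the script prints:)

# Minimal sketch: load the saved .mlpackage and greedily pick the next token.
import numpy as np
import coremltools as ct
from transformers import AutoTokenizer

mlmodel = ct.models.MLModel("OpenELM-270M-Instruct-128-ComputePrecision.FLOAT16.mlpackage")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

ids = tokenizer("Once upon a time", return_tensors="np")["input_ids"].astype(np.int32)
input_ids = np.zeros((1, 128), dtype=np.int32)             # pad to the fixed sequence length
input_ids[0, : ids.shape[1]] = ids[0]

logits = mlmodel.predict({"input_ids": input_ids})["logits"]
next_token = int(logits[0, ids.shape[1] - 1].argmax())     # logits at the last real position
print(tokenizer.decode([next_token]))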
