@antmikinka
Created May 29, 2024 03:24
forked from pcuenq
import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM
# When using float16, all predicted logits are 0. To be debugged.
from coremltools.optimize.torch.palettization import (
    DKMPalettizer,
    DKMPalettizerConfig,
    ModuleDKMPalettizerConfig,
)

def selector(op):
    return op.op_type != "l2_norm"
compute_precision = ct.transform.FP16ComputePrecision(op_selector=selector)
#compute_precision = ct.precision.FLOAT16
#compute_precision = ct.precision.FLOAT32
#compute_precision = ct.transform.FP16ComputePrecision(op_selector)
#compute_units = ct.ComputeUnit.CPU_ONLY
compute_units = ct.ComputeUnit.CPU_AND_NE
# Fixed sequence length
shape = (1, 128)
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()
model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"
print(model_id)
print(outpath)
# OpenELM uses the Llama tokenizer, see https://huggingface.co/apple/OpenELM-270M-Instruct/blob/main/generate_openelm.py#L21.
# It also uses custom code.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()
'''
# Configuration details based on config.json
context_length = 2048  # max_context_length from config.json
vocab_size = 32001  # Derived from _anchor_vocab_size with padding token
padding_index = 32000  # _anchor_padding_index
forward_dtype = torch.bfloat16  # torch_dtype from config.json
backward_dtype = torch.float32

# Apply mixed precision
model = model.to(forward_dtype)

# Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=padding_index)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0024, betas=(0.9, 0.95), eps=1.e-8, weight_decay=0.1)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=350000, eta_min=0.00024)

# Function to create data
def create_data():
    # Placeholder data creation function
    # Replace with actual data loading and preprocessing
    inputs = torch.randint(0, vocab_size, (16, context_length)).to(torch.long)  # Example input tensor
    labels = torch.randint(0, vocab_size, (16, context_length)).to(torch.long)  # Example label tensor
    return [(inputs, labels)]

data = create_data()

# Prepare model for palettization
config = DKMPalettizerConfig(global_config=ModuleDKMPalettizerConfig(
    n_bits=6,
    weight_threshold=1024,
    quantize_activations=True,
    quant_min=0,
    quant_max=100
))
palettizer = DKMPalettizer(model, config)
prepared_model = palettizer.prepare()

# Fine-tune the model for a few epochs
for epoch in range(1):  # Replace 1 with the number of epochs you want
    for inputs, labels in data:
        optimizer.zero_grad()  # Clear the gradients
        outputs = model(inputs)
        loss = loss_fn(outputs.view(-1, vocab_size), labels.view(-1))
        loss.backward()
        optimizer.step()
        palettizer.step()
        scheduler.step()  # Update learning rate

# Finalize the model
finalized_model = palettizer.finalize(inplace=True)
'''
## Palettization may need to happen after the conversion below instead; unsure whether it will work, but worth trying (see the commented-out post-conversion sketch after ct.convert).
inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=False)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        input_ids = args[0]
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False,
            **kwargs
        )

to_jit = Wrapper(model)
jit_inputs = list(t_inputs.values())
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)
assert torch.allclose(output_jit[0], outputs["logits"])
## Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
#coreml_output_types = [ct.TensorType(name=name) for name in outputs.keys()]
coreml_output_types = [ct.TensorType(name=name, dtype=np.float32) for name in outputs.keys()]
# Conversion fails with `Conversion for torch.repeat_interleave with non-zero dim has not been implemented`.
# We hack a special case shortcut when the first dim is `1`.
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil import register_torch_op
from coremltools.converters.mil.frontend.torch.torch_op_registry import _TORCH_OPS_REGISTRY
from coremltools.converters.mil.frontend.torch.ops import _get_inputs
del _TORCH_OPS_REGISTRY["repeat_interleave"]
@register_torch_op
def repeat_interleave(context, node):
    """
    Copy from https://github.com/apple/coremltools/blob/0bef2d6aabd122527cf86cc983c08fb16a4041b5/coremltools/converters/mil/frontend/torch/ops.py#L5174
    plus special case for dim=1 and bs=1
    """
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)

    special_case = dim.val == 1 and x.shape[0] == 1
    if special_case:
        x = mb.reshape(x=x, shape=(x.shape[1:]))

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError(
                "Conversion for torch.repeat_interleave with Tensor repeats has not been implemented"
            )
        repeats_val = repeats_val0

    # This would operate on the flattened input tensor
    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and not special_case:
            raise NotImplementedError(
                "Conversion for torch.repeat_interleave with non-zero dim has not been implemented"
            )

    """
    on a high level:
    x
    | tile in dim 0
    v
    [x, x, ...]
    | reshape to split the repeats
    v
    [[x],
     [x],
     ...]
    | transpose(1, 0)
    V
    [x^T, x^T, ...]
    | flatten
    V
    result
    """
    reps = [1] * x.rank
    reps[0] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0] = 1
    perm[1] = 0
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[0] = -1
    if special_case:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)
eps = 1e-6
###
# Rescale by the per-row max before taking the L2 norm, to keep float16 activations in range.
def stable_l2_norm(x, eps):
    max_val = x.abs().max(axis=-1, keepdim=True).values
    max_val = torch.clamp(max_val, min=eps)
    xscaled = x / max_val
    scaled_norm = torch.linalg.norm(xscaled, dim=-1, keepdim=True)
    return x / torch.clamp(scaled_norm, min=eps), max_val
###
class CustomRMSNorm(nn.Module):
    def __init__(self, weight, eps):
        super().__init__()
        self.weight = weight
        self.eps = eps

    def forward(self, x):
        x, max_val = stable_l2_norm(x, self.eps)
        return x * (x.size(-1) ** 0.5 / max_val) * self.weight
###
model.transformer.norm = CustomRMSNorm(model.transformer.norm.weight, model.transformer.norm.eps)
for layer in model.transformer.layers:
    layer.attn.q_norm = CustomRMSNorm(layer.attn.q_norm.weight, layer.attn.q_norm.eps)
    layer.attn.k_norm = CustomRMSNorm(layer.attn.k_norm.weight, layer.attn.k_norm.eps)
    layer.ffn_norm = CustomRMSNorm(layer.ffn_norm.weight, layer.ffn_norm.eps)
    layer.attn_norm = CustomRMSNorm(layer.attn_norm.weight, layer.attn_norm.eps)
coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units,
    #pass_pipeline=ct.PassPipeline.DEFAULT_PALETTIZATION,  # palettization
)
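
# If DKM fine-tuning (the commented-out block above) is not practical, a possible alternative
# is post-training palettization of the already-converted mlprogram with coremltools'
# data-free API. Untested sketch: the 6-bit k-means settings simply mirror the DKM config above.
#
# from coremltools.optimize.coreml import (
#     OpPalettizerConfig,
#     OptimizationConfig,
#     palettize_weights,
# )
# palettization_config = OptimizationConfig(
#     global_config=OpPalettizerConfig(mode="kmeans", nbits=6)
# )
# coreml_model = palettize_weights(coreml_model, config=palettization_config)
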
import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")
# Override tokenizer
model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type
'''
user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": compute_precision,
}
'''
# Assuming `compute_precision` is used to apply float16 precision selectively
precision_description = "FP16 for all ops except l2_norm"
user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}
spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)
coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
- Sequence length: {shape[-1]}, fixed.
- Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
f.write(card)
@antmikinka commented:

V12

lots has happened

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

#tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") # this was removed some versions ago
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs, use_cache=True)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, past_key_values=None):
        return self.model(
            input_ids=input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True
        )

to_jit = Wrapper(model)

# Initialize past_key_values with the correct shapes using configuration
config = model.config
past_key_values_placeholder = tuple(
    (torch.zeros((1, config.num_kv_heads[i], 0, config.model_dim // config.num_kv_heads[i]), dtype=torch.float16),
     torch.zeros((1, config.num_kv_heads[i], 0, config.model_dim // config.num_kv_heads[i]), dtype=torch.float16))
    for i in range(config.num_transformer_layers)
)

jit_inputs = [t_inputs["input_ids"], past_key_values_placeholder]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=x.shape[1:])

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)
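
One thing to note about this version: the wrapper is traced with use_cache=True, so the jitted model returns the per-layer key/value tensors in addition to the logits, while coreml_output_types only names "logits". ct.convert will most likely want an output type for every tensor in that flattened tuple. A sketch of what that could look like (the present_key_/present_value_ names are assumptions, not something the converter requires):

coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]
for i in range(config.num_transformer_layers):
    coreml_output_types.append(ct.TensorType(name=f"present_key_{i}", dtype=np.float16))
    coreml_output_types.append(ct.TensorType(name=f"present_value_{i}", dtype=np.float16))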

@antmikinka commented:

v12.5 (no past-key value implementation)

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False
        )

to_jit = Wrapper(model)
jit_inputs = [t_inputs["input_ids"]]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=x.shape[1:])

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val != 0 and dim.val != 1:
            raise NotImplementedError("Conversion for torch.repeat_interleave with non-zero dim has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

@antmikinka commented:

v12.5 (no past-key values) error

 python openelm-coremlv125.py --model_id apple/OpenELM-270M-Instruct --output_dir Users/anthonymikinka/Downloads
scikit-learn version 1.4.1.post1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.
Torch version 2.2.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.
apple/OpenELM-270M-Instruct
Users/anthonymikinka/Downloads/OpenELM-270M-Instruct-128-ComputePrecision.FLOAT16.mlpackage
/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/transformers/modeling_utils.py:4371: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
  warnings.warn(
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:759: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if seq_length > self.causal_mask.shape[-1]:
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:230: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  assert dim == self.model_dim
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:240: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  key_len >= query_len
/Users/anthonymikinka/.cache/huggingface/modules/transformers_modules/apple/OpenELM-270M-Instruct/eb111ff2e6724348e5b905984063d4064d4bc579/modeling_openelm.py:175: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  key_len > self._cached_seq_length
Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   0%|                                                                                                                                                             | 0/1755 [00:00<?, ? ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:   7%|█████████▉                                                                                                                                       | 120/1755 [00:00<00:00, 3292.99 ops/s]
Traceback (most recent call last):
  File "/Users/anthonymikinka/corenet/mlx_examples/open_elm/openelm-coremlv125.py", line 125, in <module>
    coreml_model = ct.convert(
                   ^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/_converters_entry.py", line 574, in convert
    mlmodel = mil_convert(
              ^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 188, in mil_convert
    return _mil_convert(model, convert_from, convert_to, ConverterRegistry, MLModel, compute_units, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 212, in _mil_convert
    proto, mil_program = mil_convert_to_proto(
                         ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 286, in mil_convert_to_proto
    prog = frontend_converter(model, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/converter.py", line 108, in __call__
    return load(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 80, in load
    return _perform_torch_convert(converter, debug)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 107, in _perform_torch_convert
    raise e
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/load.py", line 99, in _perform_torch_convert
    prog = converter.convert()
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/converter.py", line 519, in convert
    convert_nodes(self.context, self.graph)
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 88, in convert_nodes
    add_op(context, node)
  File "/Users/anthonymikinka/corenet/venv/lib/python3.11/site-packages/coremltools/converters/mil/frontend/torch/ops.py", line 4812, in repeat_interleave
    raise NotImplementedError(
NotImplementedError: Conversion for torch.repeat_interleave with non-zero dim has not been implemented
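
Note on this traceback: the NotImplementedError is raised from coremltools' own ops.py, so the stock repeat_interleave translation is still the one being invoked and the @register_torch_op version defined in the v12.5 script never takes effect. The script at the top of this gist handles this by deleting the built-in registry entry before re-registering; a minimal sketch of that workaround (it touches a private coremltools structure, so treat it as fragile):

from coremltools.converters.mil.frontend.torch.torch_op_registry import (
    _TORCH_OPS_REGISTRY,
    register_torch_op,
)

# Drop the built-in translation so the custom op below is the one the converter uses.
del _TORCH_OPS_REGISTRY["repeat_interleave"]

@register_torch_op
def repeat_interleave(context, node):
    ...  # custom implementation, as in the scripts above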

@antmikinka commented:

v13.5 (no past-key values)

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False
        )

to_jit = Wrapper(model)
jit_inputs = [t_inputs["input_ids"]]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=x.shape[1:])

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val not in [0, 1]:
            raise NotImplementedError("Conversion for torch.repeat_interleave with dim other than 0 or 1 has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs['logits'].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)

@antmikinka commented:

v14.5 (no past-key values)

import argparse
import numpy as np
import torch
import torch.nn as nn
import coremltools as ct
from transformers import AutoTokenizer, AutoModelForCausalLM

def selector(op):
    return op.op_type != "l2_norm"

compute_precision = ct.precision.FLOAT16
compute_units = ct.ComputeUnit.CPU_AND_NE

# Fixed sequence length
shape = (1, 128)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_id",
    required=True,
    help="OpenELM checkpoint from the Hub. Example: 'apple/OpenELM-1_1B-Instruct'",
)
parser.add_argument(
    "--output_dir",
    required=True,
    help="Parent folder to save the converted Core ML model",
)
args = parser.parse_args()

model_id = args.model_id
basename = model_id.split("/")[-1]
outpath = f"{args.output_dir}/{basename}-{shape[1]}-{compute_precision}.mlpackage"

print(model_id)
print(outpath)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
model.eval()

# Convert the model weights to float16
model.half()

inputs = {
    "input_ids": np.random.randint(0, tokenizer.vocab_size, shape),
}

with torch.no_grad():
    t_inputs = {k: torch.tensor(v, dtype=torch.int32) for k, v in inputs.items()}
    outputs = model(**t_inputs)

class Wrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids):
        return self.model(
            input_ids=input_ids,
            return_dict=False,
            use_cache=False
        )

to_jit = Wrapper(model)
jit_inputs = [t_inputs["input_ids"]]
jitted_model = torch.jit.trace(to_jit, jit_inputs)
jitted_model.eval()

with torch.no_grad():
    output_jit = jitted_model(*jit_inputs)

assert torch.allclose(output_jit[0], outputs[0])

# Register the custom repeat_interleave operation
from coremltools.converters.mil import Builder as mb
from coremltools.converters.mil.frontend.torch.torch_op_registry import register_torch_op
from coremltools.converters.mil.frontend.torch.ops import _get_inputs

@register_torch_op
def repeat_interleave(context, node):
    x, repeats, dim, _ = _get_inputs(context, node, expected=4)
    if dim.val == 1 and x.shape[0] == 1:
        x = mb.reshape(x=x, shape=x.shape[1:])

    repeats_val = repeats.val
    if isinstance(repeats_val, np.ndarray):
        repeats_val0 = np.expand_dims(repeats_val, 0).reshape(-1)[0]
        if np.any(repeats_val != repeats_val0):
            raise NotImplementedError("Conversion for torch.repeat_interleave with Tensor repeats has not been implemented")
        repeats_val = repeats_val0

    if dim is None:
        x = mb.reshape(x=x, shape=(-1,))
    else:
        if dim.val not in [0, 1]:
            raise NotImplementedError("Conversion for torch.repeat_interleave with dim other than 0 or 1 has not been implemented")

    reps = [1] * x.rank
    reps[dim.val] = repeats_val
    x_tiled = mb.tile(x=x, reps=reps)

    split_reps = [repeats_val] + list(x.shape)
    x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps))

    perm = [*range(x.rank + 1)]
    perm[0], perm[1] = perm[1], perm[0]
    x_transposed = mb.transpose(x=x_reshaped, perm=perm)

    result_shape = list(x.shape)
    result_shape[dim.val] = -1
    if dim.val == 1:
        result_shape = [1] + result_shape

    result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name)
    context.add(result)

# Core ML conversion
coreml_input_types = [ct.TensorType(
    name="input_ids",
    shape=ct.Shape(shape=shape),
    dtype=np.int32,
)]
coreml_output_types = [ct.TensorType(name="logits", dtype=np.float16)]

coreml_model = ct.convert(
    jitted_model,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.macOS14,
    inputs=coreml_input_types,
    outputs=coreml_output_types,
    compute_precision=compute_precision,
    compute_units=compute_units
)

import sys
if sys.platform == "darwin":
    coreml_outputs = coreml_model.predict(t_inputs)
    print(f"Converted, max diff for random inputs: {abs(coreml_outputs['logits'] - outputs[0].numpy()).max()}")

model_name = "pcuenq/test-llama-tokenizer"
architecture = model.config.model_type

precision_description = "FP16 for all ops"

user_defined_metadata = {
    "co.huggingface.exporters.name": model_name,
    "co.huggingface.exporters.task": "text-generation",
    "co.huggingface.exporters.architecture": architecture,
    "co.huggingface.exporters.framework": "pytorch",
    "co.huggingface.exporters.precision": precision_description,
}

spec = coreml_model._spec
spec.description.metadata.userDefined.update(user_defined_metadata)

coreml_model.save(outpath)
card = f"""
This repository contains a Core ML conversion of [{model_id}](https://hf.co/{model_id}) with the following characteristics:
    - Sequence length: {shape[-1]}, fixed.
    - Precision: {precision_description}.
Please, check the [original model card](https://hf.co/{model_id}) for additional details on the model.
"""
with open(f"{args.output_dir}/README.md", "w") as f:
    f.write(card)
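
Since all of these variants convert without a KV cache and with a fixed (1, 128) input, text generation has to re-run the full window on every step. A rough sketch of greedy decoding against the saved package (the package path, prompt, and loop length are placeholders; this is not part of the conversion scripts above):

import numpy as np
import coremltools as ct
from transformers import AutoTokenizer

mlmodel = ct.models.MLModel("OpenELM-270M-Instruct-128-ComputePrecision.FLOAT16.mlpackage")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

tokens = tokenizer("Once upon a time", return_tensors="np")["input_ids"][0].tolist()
for _ in range(20):
    if len(tokens) >= 128:
        break
    # Pad the running sequence out to the fixed 128-token window.
    input_ids = np.zeros((1, 128), dtype=np.int32)
    input_ids[0, : len(tokens)] = tokens
    logits = mlmodel.predict({"input_ids": input_ids})["logits"]
    # Greedy pick from the position of the last real token.
    tokens.append(int(np.argmax(logits[0, len(tokens) - 1])))

print(tokenizer.decode(tokens))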
