Model Registry Catalog
{
"$id": "https://kubeflow.org/model-registry/catalog.yaml",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "Model Catalog",
"type": "object",
"properties": {
"source": {
"type": "string",
"description": "The name of the catalog provider.",
"example": "Red Hat"
},
"models": {
"type": "array",
"description": "List of models available in the catalog. `repository` and `name` are used\nto uniquely identify a model, and should be unique within the catalog.",
"items": {
"type": "object",
"required": [
"repository",
"name"
],
"properties": {
"repository": {
"type": "string",
"description": "Name of the repository in the catalog.",
"example": "ibm-granite"
},
"name": {
"type": "string",
"description": "Code name of the model.",
"example": "granite-3.1-8b-base"
},
"provider": {
"type": "string",
"description": "Name of the organization or entity that provides the model.",
"example": "IBM"
},
"description": {
"type": "string",
"description": "Short description of the model."
},
"longDescription": {
"type": "string",
"description": "Longer description of the model."
},
"logo": {
"type": "string",
"format": "uri",
"description": "URL to the model's logo. A [data\nURL](https://developer.mozilla.org/en-US/docs/Web/URI/Schemes/data)\nis recommended."
},
"readme": {
"type": "string",
"description": "Model documentation in Markdown."
},
"language": {
"type": "array",
"description": "List of supported languages (https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes).",
"items": {
"type": "string"
},
"example": [
"en",
"es",
"cz"
]
},
"license": {
"type": "string",
"description": "Short name of the model's license.",
"example": "apache-2.0"
},
"licenseLink": {
"type": "string",
"format": "uri",
"description": "URL to the license text."
},
"maturity": {
"type": "string",
"description": "Maturity level of the model.",
"example": "Generally Available"
},
"libraryName": {
"type": "string",
"example": "transformers"
},
"baseModel": {
"type": "array",
"description": "Reference to the base model (if any).",
"items": {
"type": "object",
"properties": {
"catalog": {
"type": "string",
"description": "Name of the catalog for an external base model. Omit for\nmodels in the same catalog.",
"example": "huggingface.io"
},
"repository": {
"type": "string",
"description": "Name of the repository in an external catalog where the base\nmodel exists. Omit for models in the same catalog.",
"example": "ibm-granite"
},
"name": {
"type": "string",
"example": "granite-3.1-8b-base"
}
}
}
},
"labels": {
"type": "array",
"description": "List of labels for categorization.",
"example": [
"language"
],
"items": {
"type": "string"
}
},
"tasks": {
"type": "array",
"description": "List of tasks the model is designed for.",
"items": {
"type": "string"
},
"example": [
"text-generation"
]
},
"createTimeSinceEpoch": {
"description": "Creation time in milliseconds since epoch.",
"type": "integer"
},
"lastUpdateTimeSinceEpoch": {
"description": "Last update time in milliseconds since epoch.",
"type": "integer"
},
"artifacts": {
"type": "array",
"description": "If a model has multiple versions, each version should have a\nseparate artifact.",
"items": {
"type": "object",
"properties": {
"createTimeSinceEpoch": {
"description": "Creation time in milliseconds since epoch.",
"type": "integer"
},
"protocol": {
"type": "string",
"description": "The protocol used to access the artifact (only `oci` for now).",
"enum": [
"oci"
]
},
"tags": {
"type": "array",
"description": "List of tags for the artifact. These are for information\nonly. It is recommended that this list only include immutable\ntags (e.g. `1.2.3` instead of ephemeral/floating tags such as\n`1`, `1.2`, or `latest`).",
"example": [
"2.1.2"
],
"items": {
"type": "string"
}
},
"uri": {
"type": "string",
"description": "Artifact URI."
}
}
}
}
}
}
}
}
}
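The schema targets JSON Schema draft 2020-12, so any draft-2020-12 validator can check a catalog file against it. Below is a minimal validation sketch in Python; the filenames `catalog-schema.json` and `catalog.yaml` are hypothetical, and it assumes `pip install jsonschema pyyaml`.

```python
import json

import yaml
from jsonschema import Draft202012Validator

# Load the schema (the JSON document above) and a catalog file.
# Both filenames are hypothetical; adjust to your layout.
with open("catalog-schema.json") as f:
    schema = json.load(f)
with open("catalog.yaml") as f:
    catalog = yaml.safe_load(f)

# Report every validation error instead of stopping at the first one.
validator = Draft202012Validator(schema)
errors = list(validator.iter_errors(catalog))
for error in errors:
    print(f"{list(error.path)}: {error.message}")
if not errors:
    print("catalog is valid")
```

The same schema, expressed as YAML: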
$id: https://kubeflow.org/model-registry/catalog.yaml
$schema: https://json-schema.org/draft/2020-12/schema
title: Model Catalog
type: object
properties:
source:
type: string
description: The name of the catalog provider.
example: Red Hat
models:
type: array
description: |-
List of models available in the catalog. `repository` and `name` are used
to uniquely identify a model, and should be unique within the catalog.
items:
type: object
required:
- repository
- name
properties:
repository:
type: string
description: Name of the repository in the catalog.
example: ibm-granite
name:
type: string
description: Code name of the model.
example: granite-3.1-8b-base
provider:
type: string
description: Name of the organization or entity that provides the model.
example: IBM
description:
type: string
description: Short description of the model.
longDescription:
type: string
description: Longer description of the model.
logo:
type: string
format: uri
description: |-
URL to the model's logo. A [data
URL](https://developer.mozilla.org/en-US/docs/Web/URI/Schemes/data)
is recommended.
readme:
type: string
description: Model documentation in Markdown.
language:
type: array
description: List of supported languages, as ISO 639 codes (https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes).
items:
type: string
example:
- en
- es
- cs
license:
type: string
description: Short name of the model's license.
example: apache-2.0
licenseLink:
type: string
format: uri
description: URL to the license text.
maturity:
type: string
description: Maturity level of the model.
example: Generally Available
libraryName:
type: string
description: Name of the library used to run the model.
example: transformers
baseModel:
type: array
description: Reference to the base model (if any).
items:
type: object
properties:
catalog:
type: string
description: |-
Name of the catalog for an external base model. Omit for
models in the same catalog.
example: huggingface.io
repository:
type: string
description: |-
Name of the repository where the base model exists. For base models
in an external catalog, this is the repository within that catalog.
example: ibm-granite
name:
type: string
example: granite-3.1-8b-base
labels:
type: array
description: List of labels for categorization.
example:
- language
items:
type: string
tasks:
type: array
description: List of tasks the model is designed for.
items:
type: string
example:
- text-generation
createTimeSinceEpoch:
description: Creation time in milliseconds since epoch.
type: integer
lastUpdateTimeSinceEpoch:
description: Last update time in milliseconds since epoch.
type: integer
artifacts:
type: array
description: |-
List of artifacts for the model. If a model has multiple versions,
each version should have a separate artifact.
items:
type: object
properties:
createTimeSinceEpoch:
description: Creation time in milliseconds since epoch.
type: integer
protocol:
type: string
description: The protocol used to access the artifact (only `oci` for now).
enum:
- oci
tags:
type: array
description: |-
List of tags for the artifact. These are for information
only. It is recommended that this list only include immutable
tags (e.g. `1.2.3` instead of ephemeral/floating tags such as
`1`, `1.2`, or `latest`).
example: ["2.1.2"]
items:
type: string
uri:
type: string
description: Artifact URI.
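What follows is a sample catalog that conforms to the schema. As a sketch of how a consumer might read it (assumptions: the file is saved as `catalog.yaml` and PyYAML is installed), the snippet below indexes models by the `(repository, name)` pair the schema requires to be unique, converts the millisecond epoch timestamps, and resolves same-catalog `baseModel` references:

```python
from datetime import datetime, timezone

import yaml

# Hypothetical filename for the sample catalog below.
with open("catalog.yaml") as f:
    catalog = yaml.safe_load(f)

# (repository, name) must uniquely identify a model within the catalog.
index = {}
for model in catalog.get("models", []):
    key = (model["repository"], model["name"])
    if key in index:
        raise ValueError(f"duplicate model {key}")
    index[key] = model

for (repo, name), model in index.items():
    # Timestamps are in milliseconds since the epoch.
    ts = model.get("createTimeSinceEpoch")
    created = (
        datetime.fromtimestamp(ts / 1000, tz=timezone.utc).date() if ts else "unknown"
    )
    print(f"{repo}/{name} (created {created})")
    for base in model.get("baseModel", []):
        if "catalog" in base:
            # External reference; nothing to resolve locally.
            print(f"  base model: {base['name']} (external catalog {base['catalog']})")
        else:
            # Assumption: an omitted repository defaults to the referencing
            # model's own repository.
            base_key = (base.get("repository", repo), base["name"])
            status = "resolved" if base_key in index else "missing"
            print(f"  base model: {base_key[0]}/{base_key[1]} ({status})")
```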
source: Red Hat
models:
- repository: rhelai1
name: granite-8b-code-base
provider: IBM
description: A decoder-only code model designed for code generative tasks
longDescription: |-
Granite-8B-Code-Base is a decoder-only code model designed for code
generative tasks (e.g., code generation, code explanation, code fixing,
etc.). It is trained from scratch with a two-phase training strategy. In
phase 1, our model is trained on 4 trillion tokens sourced from 116
programming languages, ensuring a comprehensive understanding of
programming languages and syntax. In phase 2, our model is trained on 500
billion tokens with a carefully designed mixture of high-quality data from
code and natural language domains to improve the models’ ability to reason
and follow instructions.
readme: |-
---
license: apache-2.0
library_name: transformers
tags:
- language
- granite-3.1
---
# Granite-3.1-8B-Base
**Model Summary:**
Granite-3.1-8B-Base extends the context length of Granite-3.0-8B-Base from 4K to 128K using a progressive training strategy by increasing the supported context length in increments while adjusting RoPE theta until the model has successfully adapted to the desired length of 128K. This long-context pre-training stage was performed using approximately 500B tokens.
- **Developers:** Granite Team, IBM
- **GitHub Repository:** [ibm-granite/granite-3.1-language-models](https://github.com/ibm-granite/granite-3.1-language-models)
- **Website**: [Granite Docs](https://www.ibm.com/granite/docs/)
- **Paper:** [Granite 3.1 Language Models (coming soon)](https://huggingface.co/collections/ibm-granite/granite-31-language-models-6751dbbf2f3389bec5c6f02d)
- **Release Date**: December 18th, 2024
- **License:** [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
**Supported Languages:**
English, German, Spanish, French, Japanese, Portuguese, Arabic, Czech, Italian, Korean, Dutch, and Chinese. Users may finetune Granite 3.1 models for languages beyond these 12 languages.
**Intended Use:**
Prominent use cases of LLMs in text-to-text generation include summarization, text classification, extraction, question-answering, and other long-context tasks. All Granite Base models are able to handle these tasks as they were trained on a large amount of data from various domains. Moreover, they can serve as a baseline for creating specialized models for specific application scenarios.
**Generation:**
This is a simple example of how to use the Granite-3.1-8B-Base model.
Install the following libraries:
```shell
pip install torch torchvision torchaudio
pip install accelerate
pip install transformers
```
Then, copy the code snippet below to run the example.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_path = "ibm-granite/granite-3.1-8b-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# drop device_map if running on CPU
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()
# change input text as desired
input_text = "Where is the Thomas J. Watson Research Center located?"
# tokenize the text and move it to the model's device
# (tensors cannot be moved with .to("auto"), so use model.device)
input_tokens = tokenizer(input_text, return_tensors="pt").to(model.device)
# generate output tokens
output = model.generate(**input_tokens, max_length=4000)
# decode output tokens into text
output = tokenizer.batch_decode(output)
# print output
print(output)
```
**Evaluation Results:**
<table>
<caption><b>HuggingFace Open LLM Leaderboard V1</b></caption>
<thead>
<tr>
<th style="text-align:left; background-color: #001d6c; color: white;">Models</th>
<th style="text-align:center; background-color: #001d6c; color: white;">ARC-Challenge</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Hellaswag</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Winogrande</th>
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Avg</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:left; background-color: #DAE8FF; color: black;">Granite-3.1-8B-Base</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">63.99</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">83.27</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">63.45</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">51.29</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">78.92</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">60.19</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">66.85</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-2B-Base</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">53.58</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">77.67</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.86</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">39.02</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.84</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.99</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.32</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-3B-A800M-Base</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.76</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.45</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">48.31</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">39.91</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.29</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">40.56</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">53.88</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-1B-A400M-Base</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">39.42</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.13</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">26.53</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">37.67</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">2.03</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.87</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">31.78</td>
</tr>
</tbody></table>
<table>
<caption><b>HuggingFace Open LLM Leaderboard V2</b></caption>
<thead>
<tr>
<th style="text-align:left; background-color: #001d6c; color: white;">Models</th>
<th style="text-align:center; background-color: #001d6c; color: white;">IFEval</th>
<th style="text-align:center; background-color: #001d6c; color: white;">BBH</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MATH Lvl 5</th>
<th style="text-align:center; background-color: #001d6c; color: white;">GPQA</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MUSR</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU-Pro</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Avg</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:left; background-color: #DAE8FF; color: black;">Granite-3.1-8B-Base</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">42.21</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">26.02</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">9.52</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">9.51</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8.36</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">24.8</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">20.07</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-2B-Base</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">35.22</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">16.84</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">5.59</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">3.69</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">3.9</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.9</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.19</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-3B-A800M-Base</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">29.96</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">11.91</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">4</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">3.69</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">1.11</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">8.81</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.91</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-1B-A400M-Base</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">25.19</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">6.43</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">2.19</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">0.22</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">1.76</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">1.55</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">6.22</td>
</tr>
</tbody></table>
**Model Architecture:**
Granite-3.1-8B-Base is based on a decoder-only dense transformer architecture. Core components of this architecture are: GQA and RoPE, MLP with SwiGLU, RMSNorm, and shared input/output embeddings.
<table>
<thead>
<tr>
<th style="text-align:left; background-color: #001d6c; color: white;">Model</th>
<th style="text-align:center; background-color: #001d6c; color: white;">2B Dense</th>
<th style="text-align:center; background-color: #001d6c; color: white;">8B Dense</th>
<th style="text-align:center; background-color: #001d6c; color: white;">1B MoE</th>
<th style="text-align:center; background-color: #001d6c; color: white;">3B MoE</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Embedding size</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">2048</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">4096</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">1024</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">1536</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of layers</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">40</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">40</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">24</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Attention head size</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">128</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of attention heads</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">32</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">16</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">24</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of KV heads</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">MLP hidden size</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8192</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">12800</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">512</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">512</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">MLP activation</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">SwiGLU</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of experts</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">—</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">—</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">40</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">MoE TopK</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">—</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">—</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Initialization std</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">0.1</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Sequence length</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">128K</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Position embedding</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">RoPE</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;"># Parameters</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">2.5B</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8.1B</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">1.3B</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">3.3B</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;"># Active parameters</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">2.5B</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8.1B</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">400M</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">800M</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;"># Training tokens</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">12T</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">12T</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">10T</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">10T</td>
</tr>
</tbody></table>
**Training Data:**
This model is trained on a mix of open source and proprietary data following a three-stage training strategy.
* Stage 1 data: The data for stage 1 is sourced from diverse domains such as web, code, academic sources, books, and math data.
* Stage 2 data: The data for stage 2 comprises a curated mix of high-quality data from the same domains, plus multilingual and instruction data. The goal of this second training phase is to enhance the model’s performance on specific tasks.
* Stage 3 data: The data for stage 3 consists of the original stage-2 pretraining data with additional synthetic long-context data in the form of QA/summary pairs where the answer contains a recitation of the related paragraph before the answer.
A detailed attribution of datasets can be found in the [Granite 3.0 Technical Report](https://github.com/ibm-granite/granite-3.0-language-models/blob/main/paper.pdf), [Granite 3.1 Technical Report (coming soon)](https://huggingface.co/collections/ibm-granite/granite-31-language-models-6751dbbf2f3389bec5c6f02d), and [Accompanying Author List](https://github.com/ibm-granite/granite-3.0-language-models/blob/main/author-ack.pdf).
**Infrastructure:**
We train Granite 3.1 Language Models using IBM's supercomputing cluster, Blue Vela, which is outfitted with NVIDIA H100 GPUs. This cluster provides a scalable and efficient infrastructure for training our models over thousands of GPUs.
**Ethical Considerations and Limitations:**
The use of Large Language Models involves risks and ethical considerations people must be aware of, including but not limited to: bias and fairness, misinformation, and autonomous decision-making. The Granite-3.1-8B-Base model is no exception in this regard. Even though this model is suited for multiple generative AI tasks, it has not undergone any safety alignment, so it may produce problematic outputs. Additionally, it remains uncertain whether smaller models might exhibit increased susceptibility to hallucination in generation scenarios by copying text verbatim from the training dataset due to their reduced sizes and memorization capacities. This aspect is currently an active area of research, and we anticipate more rigorous exploration, comprehension, and mitigations in this domain. Regarding ethics, a latent risk associated with all Large Language Models is their malicious utilization. We urge the community to use the Granite-3.1-8B-Base model with ethical intentions and in a responsible way.
**Resources**
- ⭐️ Learn about the latest updates with Granite: https://www.ibm.com/granite
- 📄 Get started with tutorials, best practices, and prompt engineering advice: https://www.ibm.com/granite/docs/
- 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1733514949000
lastUpdateTimeSinceEpoch: 1734637721000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1733514949000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-code-base:1.3-1732870892
- repository: rhelai1
name: granite-8b-code-instruct
provider: IBM
description: A fine-tuned model based on Granite 8B Code Base
longDescription: |-
Granite-8B-Code-Instruct is an 8B parameter model fine-tuned from
Granite-8B-Code-Base on a combination of permissively licensed instruction
data to enhance instruction-following capabilities, including logical
reasoning and problem-solving skills.
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
readme: |-
---
pipeline_tag: text-generation
inference: false
license: apache-2.0
library_name: transformers
tags:
- language
- granite-3.1
base_model:
- ibm-granite/granite-3.1-8b-base
---
# Granite-3.1-8B-Instruct
**Model Summary:**
Granite-3.1-8B-Instruct is an 8B parameter long-context instruct model finetuned from Granite-3.1-8B-Base using a combination of open-source instruction datasets with permissive licenses and internally collected synthetic datasets tailored for solving long-context problems. This model is developed using a diverse set of techniques with a structured chat format, including supervised finetuning, model alignment using reinforcement learning, and model merging.
- **Developers:** Granite Team, IBM
- **GitHub Repository:** [ibm-granite/granite-3.1-language-models](https://github.com/ibm-granite/granite-3.1-language-models)
- **Website**: [Granite Docs](https://www.ibm.com/granite/docs/)
- **Paper:** [Granite 3.1 Language Models (coming soon)](https://huggingface.co/collections/ibm-granite/granite-31-language-models-6751dbbf2f3389bec5c6f02d)
- **Release Date**: December 18th, 2024
- **License:** [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
**Supported Languages:**
English, German, Spanish, French, Japanese, Portuguese, Arabic, Czech, Italian, Korean, Dutch, and Chinese. Users may finetune Granite 3.1 models for languages beyond these 12 languages.
**Intended Use:**
The model is designed to respond to general instructions and can be used to build AI assistants for multiple domains, including business applications.
*Capabilities*
* Summarization
* Text classification
* Text extraction
* Question-answering
* Retrieval Augmented Generation (RAG)
* Code related tasks
* Function-calling tasks
* Multilingual dialog use cases
* Long-context tasks including long document/meeting summarization, long document QA, etc.
**Generation:**
This is a simple example of how to use the Granite-3.1-8B-Instruct model.
Install the following libraries:
```shell
pip install torch torchvision torchaudio
pip install accelerate
pip install transformers
```
Then, copy the snippet from the section that is relevant for your use case.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_path = "ibm-granite/granite-3.1-8b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# drop device_map if running on CPU
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
model.eval()
# change input text as desired
chat = [
{ "role": "user", "content": "Please list one IBM Research laboratory located in the United States. You should only output its name and location." },
]
chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# tokenize the text and move it to the model's device
# (tensors cannot be moved with .to("auto"), so use model.device)
input_tokens = tokenizer(chat, return_tensors="pt").to(model.device)
# generate output tokens
output = model.generate(**input_tokens, max_new_tokens=100)
# decode output tokens into text
output = tokenizer.batch_decode(output)
# print output
print(output)
```
**Evaluation Results:**
<table>
<caption><b>HuggingFace Open LLM Leaderboard V1</b></caption>
<thead>
<tr>
<th style="text-align:left; background-color: #001d6c; color: white;">Models</th>
<th style="text-align:center; background-color: #001d6c; color: white;">ARC-Challenge</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Hellaswag</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Winogrande</th>
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Avg</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:left; background-color: #DAE8FF; color: black;">Granite-3.1-8B-Instruct</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">62.62</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">84.48</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">65.34</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">66.23</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">75.37</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">73.84</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">71.31</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-2B-Instruct</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.61</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">75.14</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">55.31</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.42</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.48</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.76</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">60.79</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-3B-A800M-Instruct</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.42</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.01</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.19</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.71</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.87</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">48.97</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">56.53</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-1B-A400M-Instruct</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">42.66</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.97</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">26.13</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">46.77</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">62.35</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">33.88</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">46.29</td>
</tr>
</tbody></table>
<table>
<caption><b>HuggingFace Open LLM Leaderboard V2</b></caption>
<thead>
<tr>
<th style="text-align:left; background-color: #001d6c; color: white;">Models</th>
<th style="text-align:center; background-color: #001d6c; color: white;">IFEval</th>
<th style="text-align:center; background-color: #001d6c; color: white;">BBH</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MATH Lvl 5</th>
<th style="text-align:center; background-color: #001d6c; color: white;">GPQA</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MUSR</th>
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU-Pro</th>
<th style="text-align:center; background-color: #001d6c; color: white;">Avg</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:left; background-color: #DAE8FF; color: black;">Granite-3.1-8B-Instruct</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">72.08</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">34.09</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">21.68</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8.28</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">19.01</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">28.19</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">30.55</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-2B-Instruct</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">62.86</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">21.82</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">11.33</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">5.26</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">4.87</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.21</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">21.06</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-3B-A800M-Instruct</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">55.16</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">16.69</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">10.35</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">5.15</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">2.51</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">12.75</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">17.1</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: #2D2D2D;">Granite-3.1-1B-A400M-Instruct</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">46.86</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">6.18</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">4.08</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">0</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">0.78</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">2.41</td>
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">10.05</td>
</tr>
</tbody></table>
**Model Architecture:**
Granite-3.1-8B-Instruct is based on a decoder-only dense transformer architecture. Core components of this architecture are: GQA and RoPE, MLP with SwiGLU, RMSNorm, and shared input/output embeddings.
<table>
<thead>
<tr>
<th style="text-align:left; background-color: #001d6c; color: white;">Model</th>
<th style="text-align:center; background-color: #001d6c; color: white;">2B Dense</th>
<th style="text-align:center; background-color: #001d6c; color: white;">8B Dense</th>
<th style="text-align:center; background-color: #001d6c; color: white;">1B MoE</th>
<th style="text-align:center; background-color: #001d6c; color: white;">3B MoE</th>
</tr></thead>
<tbody>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Embedding size</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">2048</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">4096</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">1024</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">1536</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of layers</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">40</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">40</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">24</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Attention head size</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">128</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">64</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of attention heads</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">32</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">16</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">24</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of KV heads</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">MLP hidden size</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8192</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">12800</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">512</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">512</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">MLP activation</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">SwiGLU</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">SwiGLU</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Number of experts</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">—</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">—</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">32</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">40</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">MoE TopK</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">—</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">—</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">8</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Initialization std</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">0.1</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">0.1</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Sequence length</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">128K</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">128K</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;">Position embedding</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">RoPE</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">RoPE</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;"># Parameters</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">2.5B</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8.1B</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">1.3B</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">3.3B</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;"># Active parameters</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">2.5B</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">8.1B</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">400M</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">800M</td>
</tr>
<tr>
<td style="text-align:left; background-color: #FFFFFF; color: black;"># Training tokens</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">12T</td>
<td style="text-align:center; background-color: #DAE8FF; color: black;">12T</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">10T</td>
<td style="text-align:center; background-color: #FFFFFF; color: black;">10T</td>
</tr>
</tbody></table>
**Training Data:**
Overall, our SFT data is largely composed of three key sources: (1) publicly available datasets with permissive licenses, (2) internal synthetic data targeting specific capabilities including long-context tasks, and (3) very small amounts of human-curated data. A detailed attribution of datasets can be found in the [Granite 3.0 Technical Report](https://github.com/ibm-granite/granite-3.0-language-models/blob/main/paper.pdf), [Granite 3.1 Technical Report (coming soon)](https://huggingface.co/collections/ibm-granite/granite-31-language-models-6751dbbf2f3389bec5c6f02d), and [Accompanying Author List](https://github.com/ibm-granite/granite-3.0-language-models/blob/main/author-ack.pdf).
**Infrastructure:**
We train Granite 3.1 Language Models using IBM's supercomputing cluster, Blue Vela, which is outfitted with NVIDIA H100 GPUs. This cluster provides a scalable and efficient infrastructure for training our models over thousands of GPUs.
**Ethical Considerations and Limitations:**
Granite 3.1 Instruct Models are primarily finetuned using instruction-response pairs mostly in English, but also multilingual data covering eleven languages. Although this model can handle multilingual dialog use cases, its performance might not match that on English tasks. In such cases, introducing a small number of examples (few-shot) can help the model generate more accurate outputs. While this model has been aligned by keeping safety in consideration, the model may in some cases produce inaccurate, biased, or unsafe responses to user prompts. We therefore urge the community to use this model with proper safety testing and tuning tailored for their specific tasks.
**Resources**
- ⭐️ Learn about the latest updates with Granite: https://www.ibm.com/granite
- 📄 Get started with tutorials, best practices, and prompt engineering advice: https://www.ibm.com/granite/docs/
- 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
baseModel:
- repository: rhelai1
name: granite-8b-code-base
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1733514949000
lastUpdateTimeSinceEpoch: 1734637721000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1733514949000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-code-instruct:1.3-1732870892
- repository: rhelai1
name: granite-8b-lab-v2-preview
provider: IBM
description: A derivative of granite-8b-base trained with the LAB methodology
longDescription: |-
LAB: Large-scale Alignment for chatBots is a novel synthetic data-based
alignment tuning method for LLMs from IBM Research.
Granite-8b-lab-v2-preview is a Granite-8b-base derivative model trained
with the LAB methodology, using Mixtral-8x7b-Instruct as a teacher model.
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
baseModel:
- repository: rhelai1
name: granite-8b-code-base
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1732870892000
lastUpdateTimeSinceEpoch: 1732870892000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1732870892000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-lab-v2-preview:1.3-1732870892
- repository: rhelai1
name: granite-8b-starter-v1
provider: IBM
description: Custom Red Hat phase00 tuned base model.
longDescription: |-
A custom Red Hat base model, instruct-tuned only for phase 00, produced by IBM Research specifically for RHEL AI.
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1732870892000
lastUpdateTimeSinceEpoch: 1732870892000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1732870892000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-starter-v1:1.3-1732870892
- repository: rhelai1
name: granite-8b-lab-v1
provider: IBM
description: A derivative of granite-8b-base trained with the LAB methodology
longDescription: |-
LAB: Large-scale Alignment for chatBots is a novel synthetic data-based
alignment tuning method for LLMs from IBM Research. Granite-8b-lab-v1 is
a Granite-8b-base derivative model trained with the LAB methodology,
using Mixtral-8x7b-Instruct as a teacher model.
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
baseModel:
- repository: rhelai1
name: granite-8b-code-base
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1732870892000
lastUpdateTimeSinceEpoch: 1732870892000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1732870892000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-lab-v1:1.3-1732870892
- repository: rhelai1
name: granite-7b-starter
provider: IBM
description: Custom Red Hat phase-00 tuned base model
longDescription: |-
A custom Red Hat base model, instruct-tuned only for phase 00, produced by IBM Research specifically for RHEL AI.
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
language: ["en"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
tasks:
- text-generation
createTimeSinceEpoch: 1732870892000
lastUpdateTimeSinceEpoch: 1732870892000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1732870892000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-7b-starter:1.3-1732870892
- repository: rhelai1
name: granite-7b-redhat
provider: IBM
description: A derivative of granite-7b-base trained with the LAB methodology
longDescription: |-
LAB: Large-scale Alignment for chatBots is a novel synthetic data-based
alignment tuning method for LLMs from IBM Research. Granite-7b-lab is a
Granite-7b-base derivative model trained with the LAB methodology, using
Mixtral-8x7b-Instruct as a teacher model.
logo: data:image/svg+xml;base64,PHN2ZyBpZD0iTGF5ZXJfMSIgZGF0YS1uYW1lPSJMYXllciAxIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAxOTIgMTQ1Ij48ZGVmcz48c3R5bGU+LmNscy0xe2ZpbGw6I2UwMDt9PC9zdHlsZT48L2RlZnM+PHRpdGxlPlJlZEhhdC1Mb2dvLUhhdC1Db2xvcjwvdGl0bGU+PHBhdGggZD0iTTE1Ny43Nyw2Mi42MWExNCwxNCwwLDAsMSwuMzEsMy40MmMwLDE0Ljg4LTE4LjEsMTcuNDYtMzAuNjEsMTcuNDZDNzguODMsODMuNDksNDIuNTMsNTMuMjYsNDIuNTMsNDRhNi40Myw2LjQzLDAsMCwxLC4yMi0xLjk0bC0zLjY2LDkuMDZhMTguNDUsMTguNDUsMCwwLDAtMS41MSw3LjMzYzAsMTguMTEsNDEsNDUuNDgsODcuNzQsNDUuNDgsMjAuNjksMCwzNi40My03Ljc2LDM2LjQzLTIxLjc3LDAtMS4wOCwwLTEuOTQtMS43My0xMC4xM1oiLz48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik0xMjcuNDcsODMuNDljMTIuNTEsMCwzMC42MS0yLjU4LDMwLjYxLTE3LjQ2YTE0LDE0LDAsMCwwLS4zMS0zLjQybC03LjQ1LTMyLjM2Yy0xLjcyLTcuMTItMy4yMy0xMC4zNS0xNS43My0xNi42QzEyNC44OSw4LjY5LDEwMy43Ni41LDk3LjUxLjUsOTEuNjkuNSw5MCw4LDgzLjA2LDhjLTYuNjgsMC0xMS42NC01LjYtMTcuODktNS42LTYsMC05LjkxLDQuMDktMTIuOTMsMTIuNSwwLDAtOC40MSwyMy43Mi05LjQ5LDI3LjE2QTYuNDMsNi40MywwLDAsMCw0Mi41Myw0NGMwLDkuMjIsMzYuMywzOS40NSw4NC45NCwzOS40NU0xNjAsNzIuMDdjMS43Myw4LjE5LDEuNzMsOS4wNSwxLjczLDEwLjEzLDAsMTQtMTUuNzQsMjEuNzctMzYuNDMsMjEuNzdDNzguNTQsMTA0LDM3LjU4LDc2LjYsMzcuNTgsNTguNDlhMTguNDUsMTguNDUsMCwwLDEsMS41MS03LjMzQzIyLjI3LDUyLC41LDU1LC41LDc0LjIyYzAsMzEuNDgsNzQuNTksNzAuMjgsMTMzLjY1LDcwLjI4LDQ1LjI4LDAsNTYuNy0yMC40OCw1Ni43LTM2LjY1LDAtMTIuNzItMTEtMjcuMTYtMzAuODMtMzUuNzgiLz48L3N2Zz4=
language: ["en"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
labels:
- language
tasks:
- text-generation
createTimeSinceEpoch: 1732870892000
lastUpdateTimeSinceEpoch: 1732870892000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1732870892000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-7b-redhat-lab:1.3-1732870892
---
source: Red Hat
models:
- repository: rhelai1
name: granite-8b-code-base
provider: IBM
description: A decoder-only code model designed for code generative tasks
longDescription: |-
Granite-8B-Code-Base is a decoder-only code model designed for code
generative tasks (e.g., code generation, code explanation, code fixing,
etc.). It is trained from scratch with a two-phase training strategy. In
phase 1, our model is trained on 4 trillion tokens sourced from 116
programming languages, ensuring a comprehensive understanding of
programming languages and syntax. In phase 2, our model is trained on 500
billion tokens with a carefully designed mixture of high-quality data from
code and natural language domains to improve the model’s ability to reason
and follow instructions.
readme: |-
# Granite-3.1-8B-Base
**Model Summary:**
Granite-3.1-8B-Base extends the context length of Granite-3.0-8B-Base from 4K to 128K using a progressive training strategy, increasing the supported context length in increments while adjusting RoPE theta until the model successfully adapted to the desired length of 128K. This long-context pre-training stage was performed using approximately 500B tokens.
...
logo: data:image/gif;base64,R0lGODlhIAAgAMZLAAAAAAEAAAIAAAQAAAYAAAcAAAgAAAoAAAsAAA0AAA4AAA8AABEAABMAABwAAB4AAB8AACAAACQAACYAACkAAC8AADAAADsAAEIAAEMAAEkAAEoAAE4AAFAAAFYAAFkAAF8AAHAAAHMAAHQAAHsAAHwAAIcAAJMAAJcAAJoAAKYAAKwAAK0AAK4AALAAALYAALcAALkAALoAAL4AAMAAAMUAAMgAAMsAANAAANEAANcAANgAANoAANwAAOEAAOUAAOcAAOgAAOkAAOoAAOwAAO0AAO4AAO8AAPAAAPEAAPIAAP///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////yH5BAEKAH8ALAAAAAAgACAAAAf3gH+Cg4SFhoeIiYqLjI2Oj5CCRkaDk5SRhZaam5iSm5+TkaCjl46ko4+noKaqn441Sa2cixobPbGyoYeWRB0AHTm4sruWRyEAABYqQ8KkRYabSDYTyAkfMUJKSKNHQCaZ0SwRyAAMHCk4QklKSknrSkQpGj6EoEgyFwLkAw8aIygsYMRwccKDAgo9jtT7lIRHCQgByCELQAABAgMRAWD4caQUtyM3SFQwILEkAAcrtun6QwoJkh4sRGSQ0OBAAQMLJoCY0VFTJVJH3AXRUeNFCxc0dhRpttLTqaDvkvR0BS7XKURWUS3KaimV1U4sSYEdS7as2bKBAAA7
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1733514949000
lastUpdateTimeSinceEpoch: 1734637721000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1733514949000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-code-base:1.3-1732870892
- repository: rhelai1
name: granite-8b-code-instruct
provider: IBM
description: A fine-tuned model based on Granite 8B Code Base
longDescription: |-
Granite-8B-Code-Instruct is an 8B parameter model fine-tuned from
Granite-8B-Code-Base on a combination of permissively licensed instruction
data to enhance instruction-following capabilities, including logical
reasoning and problem-solving skills.
logo: data:image/gif;base64,R0lGODlhIAAgAMZLAAAAAAEAAAIAAAQAAAYAAAcAAAgAAAoAAAsAAA0AAA4AAA8AABEAABMAABwAAB4AAB8AACAAACQAACYAACkAAC8AADAAADsAAEIAAEMAAEkAAEoAAE4AAFAAAFYAAFkAAF8AAHAAAHMAAHQAAHsAAHwAAIcAAJMAAJcAAJoAAKYAAKwAAK0AAK4AALAAALYAALcAALkAALoAAL4AAMAAAMUAAMgAAMsAANAAANEAANcAANgAANoAANwAAOEAAOUAAOcAAOgAAOkAAOoAAOwAAO0AAO4AAO8AAPAAAPEAAPIAAP///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////yH5BAEKAH8ALAAAAAAgACAAAAf3gH+Cg4SFhoeIiYqLjI2Oj5CCRkaDk5SRhZaam5iSm5+TkaCjl46ko4+noKaqn441Sa2cixobPbGyoYeWRB0AHTm4sruWRyEAABYqQ8KkRYabSDYTyAkfMUJKSKNHQCaZ0SwRyAAMHCk4QklKSknrSkQpGj6EoEgyFwLkAw8aIygsYMRwccKDAgo9jtT7lIRHCQgByCELQAABAgMRAWD4caQUtyM3SFQwILEkAAcrtun6QwoJkh4sRGSQ0OBAAQMLJoCY0VFTJVJH3AXRUeNFCxc0dhRpttLTqaDvkvR0BS7XKURWUS3KaimV1U4sSYEdS7as2bKBAAA7
readme: |-
# Granite-3.1-8B-Instruct
**Model Summary:**
Granite-3.1-8B-Instruct is an 8B parameter long-context instruct model fine-tuned from Granite-3.1-8B-Base using a combination of open-source instruction datasets with permissive licenses and internally collected synthetic datasets tailored for solving long-context problems. This model is developed using a diverse set of techniques with a structured chat format, including supervised fine-tuning, model alignment using reinforcement learning, and model merging.
...
language: ["ar", "cs", "de", "en", "es", "fr", "it", "ja", "ko", "nl", "pt", "zh"]
license: apache-2.0
licenseLink: https://www.apache.org/licenses/LICENSE-2.0.txt
maturity: Generally Available
libraryName: transformers
baseModel:
- repository: rhelai1
name: granite-8b-code-base
labels:
- language
- granite-3.1
tasks:
- text-generation
createTimeSinceEpoch: 1733514949000
lastUpdateTimeSinceEpoch: 1734637721000
artifacts:
- protocol: oci
createTimeSinceEpoch: 1733514949000
tags: ["1.3.0"]
uri: oci://registry.redhat.io/rhelai1/granite-8b-code-instruct:1.3-1732870892
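
A minimal sketch of how a catalog file like the one above could be checked against the Model Catalog schema, using Python with the PyYAML and jsonschema packages. The file names catalog.schema.json and sample-catalog.yaml are placeholders for wherever the schema and sample are saved; they are not part of the gist:

# validate_catalog.py -- a sketch, not part of the catalog itself.
# Assumes the schema was saved as JSON and the sample as a multi-document
# YAML stream (note the `---` separator between the two `source` documents).
import json

import yaml  # PyYAML
from jsonschema import Draft202012Validator

with open("catalog.schema.json") as f:  # placeholder path
    schema = json.load(f)

validator = Draft202012Validator(schema)

with open("sample-catalog.yaml") as f:  # placeholder path
    # safe_load_all handles the multi-document stream, yielding one
    # catalog (source + models) per document.
    for i, doc in enumerate(yaml.safe_load_all(f)):
        errors = list(validator.iter_errors(doc))
        for err in errors:
            path = "/".join(str(p) for p in err.absolute_path)
            print(f"document {i}: /{path}: {err.message}")
        if not errors:
            print(f"document {i}: OK")

Each YAML document is validated independently, which matches the schema's top-level shape (a single object with `source` and `models`).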