This document is a collection of thoughts and observations about the tokenizers used in llama-rooted large language models.
Most llama-derived models use LlamaTokenizer (or its fast counterpart, LlamaTokenizerFast) from Hugging Face transformers, a byte-pair-encoding tokenizer built on SentencePiece.
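As a minimal sketch of what that looks like in practice (assuming the transformers library and a hypothetical llama-family checkpoint id; any llama-2-style repo works the same way):

from transformers import LlamaTokenizer

# Hypothetical checkpoint id; substitute any llama-family repo or local path.
tok = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

text = "Hello, world!"
ids = tok.encode(text)                       # token ids, with a leading BOS token by default
print(ids)
print(tok.convert_ids_to_tokens(ids))        # SentencePiece pieces, e.g. '▁Hello'
print(tok.decode(ids, skip_special_tokens=True))  # back to readable text
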
# Imports for a typical fine-tuning setup: dataset handling, PEFT adapters, training config.
import contextlib
import json
import os
import random

import datasets
from datasets.combine import concatenate_datasets
import pandas as pd
from peft import LoftQConfig, PeftConfig, PeftModel
import torch
from transformers import TrainingArguments
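One tokenizer detail that bites fine-tuning setups like the one above: llama tokenizers ship without a padding token, which Trainer-based pipelines usually need for batching. A minimal sketch of the common fix (assuming the same hypothetical checkpoint as before):

from transformers import AutoModelForCausalLM, LlamaTokenizer

# Hypothetical checkpoint id, as above.
tok = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

tok.add_special_tokens({"pad_token": "<pad>"})
model.resize_token_embeddings(len(tok))  # grow the embedding matrix to match the new vocab size

An alternative that avoids resizing the embedding matrix is to reuse the EOS token as the pad token (tok.pad_token = tok.eos_token).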
# engine/contextflow.py
# Low-level imports for driving llama.cpp through the llama-cpp-python bindings.
from ctypes import byref, c_char, c_float, c_int, c_int8, c_int32, c_size_t, c_uint8, c_void_p, pointer
import logging
import multiprocessing
import os
from typing import Any, Dict, List, Optional

import numpy as np

import llama_cpp
from llama_cpp._internals import _LlamaTokenDataArray  # internal helper wrapping the token/logit arrays
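For comparison with the transformers path, here is a minimal sketch of tokenizing through the llama-cpp-python bindings (assuming a local GGUF file at a hypothetical path; vocab_only loads just the tokenizer, not the weights):

import llama_cpp

# Hypothetical model path; point this at any llama-family GGUF file.
llm = llama_cpp.Llama(model_path="models/llama-2-7b.Q4_K_M.gguf", vocab_only=True)

ids = llm.tokenize(b"Hello, world!", add_bos=True)  # note: bytes in, token ids out
print(ids)
print(llm.detokenize(ids))                          # bytes back out

Unlike the transformers tokenizer, this API works on raw bytes rather than Python strings, which is worth keeping in mind when comparing token ids between the two stacks.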