A set of scripts for training a small tokenizer in a new language, merging it with an existing tokenizer, and saving the combined tokenizer along with the resized model.
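The training script itself is not shown on this page. As a rough sketch of that first step, assuming a ByteLevelBPETokenizer trained on a plain-text corpus in the new language (the corpus file name, vocabulary size, and output directory below are illustrative, not the gist's actual settings):

# Sketch only: train a small ByteLevel BPE tokenizer on a new-language corpus.
# "hebrew_corpus.txt", the 14k vocab size, and the output path are assumptions.
from tokenizers import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer()
tok.train(
    files=["hebrew_corpus.txt"],   # one or more plain-text files
    vocab_size=14_000,             # small vocabulary for the new language
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
# Writes vocab.json and merges.txt, the two files combine_tokenizers.py reads below.
tok.save_model("./hebrew_bpe_14k")

The merging step is handled by combine_tokenizers.py: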
"""
Given two tokenizers, combine them and create a new tokenizer
Usage: python combine_tokenizers.py --tokenizer1 ../config/en/roberta_8 --tokenizer2 ../config/hi/roberta_8 --save_dir ../config/en/en_hi/roberta_8
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989
"""
# Libraries for tokenizer
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import argparse
import json
import os
from tqdm import tqdm
from transformers import AutoTokenizer
from timeit import default_timer as timer
import sys
def combine_tokenizers(args):
# Load both the json files, take the union, and store it
json1 = json.load(open(os.path.join(args.tokenizer1, 'vocab.json')))
json2 = json.load(open(os.path.join(args.tokenizer2, 'vocab.json')))
# Create a new vocabulary
new_vocab = {}
idx = 0
for word in json1.keys():
if word not in new_vocab.keys():
new_vocab[word] = idx
idx += 1
# Add words from second tokenizer
for word in json2.keys():
if word not in new_vocab.keys():
new_vocab[word] = idx
idx += 1
# Make the directory if necessary
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
# Save the vocab
with open(os.path.join(args.save_dir, 'vocab.json'), 'w') as fp:
json.dump(new_vocab, fp, ensure_ascii=False)
# Merge the two merges file. Don't handle duplicates here
# Concatenate them, but ignore the first line of the second file
os.system('cat {} > {}'.format(os.path.join(args.tokenizer1, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))
os.system('tail -n +2 -q {} >> {}'.format(os.path.join(args.tokenizer2, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt')))
# Save other files
os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'special_tokens_map.json'), args.save_dir))
os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'tokenizer_config.json'), args.save_dir))
# Instantiate the new tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True)
tokenizer.save_pretrained(args.save_dir+'/tokenizer')
def main():
parser = argparse.ArgumentParser()
# Dataset Arguments
parser.add_argument("--tokenizer1", type=str, required=True, help="")
parser.add_argument("--tokenizer2", type=str, required=True, help="")
parser.add_argument("--save_dir", type=str, required=True, help="")
args = parser.parse_args()
combine_tokenizers(args)
if __name__ == '__main__':
main()
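After running the script, it is worth confirming that the combined tokenizer loads cleanly and covers both source vocabularies. A quick check, reusing the save directory from the usage line above (the sample strings are only illustrative):

# Sanity check of the merged tokenizer (paths and sample strings are illustrative).
from transformers import AutoTokenizer

merged = AutoTokenizer.from_pretrained("../config/en/en_hi/roberta_8", use_fast=True)
print(len(merged))                                  # combined vocabulary size
print(merged.tokenize("def print_hello_world():"))  # text covered by tokenizer1
print(merged.tokenize("नमस्ते दुनिया"))               # text covered by tokenizer2

The second script loads the combined tokenizer alongside the base model, resizes the embedding matrix to the new vocabulary size, and saves the result: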
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"  # or "cuda" for GPU usage / "mps" for Apple Silicon

# Load the combined tokenizer (base SmolLM tokenizer plus ~14k added Hebrew tokens)
tokenizer = AutoTokenizer.from_pretrained("./SmolLM-tokenizer-with-added-hebrew-14k")

# For multiple GPUs, install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained("./SmolLM-135M").to(device)

# Grow the embedding (and output) matrix to match the enlarged vocabulary
model.resize_token_embeddings(len(tokenizer))

# Quick generation test with the resized model
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

# Save the resized model together with the combined tokenizer
model.save_pretrained("./Heb-SmolLM-135M")
tokenizer.save_pretrained("./Heb-SmolLM-135M")
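Note that the embedding rows added by resize_token_embeddings are freshly initialized, so the saved model still needs continued pretraining or fine-tuning on the new language before the added tokens become useful. A minimal consistency check (not part of the gist) is that the model and tokenizer agree on vocabulary size:

# After resizing, the input and output embedding matrices must match the tokenizer.
assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)
assert model.get_output_embeddings().weight.shape[0] == len(tokenizer)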