import os
from pathlib import Path

import pandas as pd
from tokenizers import BertWordPieceTokenizer
def add_vocab_to_model(df, model, tokenizer, old_vocab, vocab_size=30000):
    """Adds new vocab to the tokenizer and randomly initialises embedding rows for the new tokens in the model."""
    PATH = Path('/tmp/lm_data')
    PATH.mkdir(exist_ok=True)
    df.dropna(inplace=True)
    lm_text = ' '.join(df['text'])

    # Dump the corpus to a text file so the tokenizer trainer can read it
    with open('/tmp/lm_data/data.txt', mode='w') as f:
        f.write(lm_text)
    paths = [str(x) for x in Path('/tmp/lm_data/').glob('**/*.txt')]

    # Initialize a WordPiece tokenizer seeded with the old vocab and train it on the new corpus
    tokenizer_new = BertWordPieceTokenizer(old_vocab, lowercase=True)
    tokenizer_new.train(files=paths, vocab_size=vocab_size, min_frequency=2)
    tokenizer_new.save('/tmp', 'new')  # writes /tmp/new-vocab.txt (newer tokenizers versions use save_model instead)

    new_vocab = [tok for tok in open('/tmp/new-vocab.txt').read().split('\n') if tok]

    print('Adding new tokens to vocab')
    n_orig_tokens = len(tokenizer)
    tokenizer.add_tokens(new_vocab)
    print('Original no. of tokens: %s' % n_orig_tokens)
    print('Final no. of tokens: %s' % len(tokenizer))

    # Resize the embedding matrix; rows for the new tokens are randomly initialised
    print('Initialised random embeddings for new tokens')
    model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer
# Insert the code below after the tokenizer and model are loaded in run_language_modeling.py
df = pd.read_csv(args.train_data_file)  # df must have a column called 'text' containing the text rows
df.dropna(inplace=True)

# The BERT-of-Theseus vocab == bert-base-uncased vocab == TinyBERT vocab; it contains ~30.5k tokens
# Vocab link from https://huggingface.co/canwenxu/BERT-of-Theseus-MNLI
os.system('wget -O vocab.txt https://s3.amazonaws.com/models.huggingface.co/bert/canwenxu/BERT-of-Theseus-MNLI/vocab.txt')

# Add new vocab to the tokenizer and randomly initialise embedding rows for the new tokens in the model
model, tokenizer = add_vocab_to_model(df, model, tokenizer, 'vocab.txt', vocab_size=10000)
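
# --- Minimal standalone sketch (not part of the original gist) ---
# The snippet above assumes `model`, `tokenizer` and `args` already exist inside
# run_language_modeling.py. If you want to try add_vocab_to_model on its own, the lines
# below show one way to load a compatible model/tokenizer pair with transformers; the
# checkpoint name and CSV path are placeholders, so adjust them to your setup.
from transformers import BertForMaskedLM, BertTokenizer

base_model_name = 'bert-base-uncased'  # placeholder checkpoint
tokenizer = BertTokenizer.from_pretrained(base_model_name)
model = BertForMaskedLM.from_pretrained(base_model_name)

df = pd.read_csv('train.csv')  # placeholder path; the CSV needs a 'text' column
model, tokenizer = add_vocab_to_model(df, model, tokenizer, 'vocab.txt', vocab_size=10000)  # uses the vocab.txt downloaded above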