import os
from pathlib import Path

import pandas as pd
from tokenizers import BertWordPieceTokenizer
def add_vocab_to_model(df, model, tokenizer, old_vocab, vocab_size=30000):
    """Adds new vocab to the tokenizer and randomly initialises embedding rows for the new tokens in the model."""
    PATH = Path('/tmp/lm_data')
    PATH.mkdir(exist_ok=True)
    df.dropna(inplace=True)
    lm_text = ' '.join(df['text'])
    with open('/tmp/lm_data/data.txt', mode='w') as f:
        f.write(lm_text)
    paths = [str(x) for x in Path("/tmp/lm_data/").glob("**/*.txt")]

    # Initialize a new WordPiece tokenizer seeded with the existing vocab
    tokenizer_new = BertWordPieceTokenizer(old_vocab, lowercase=True)

    # Train it on the domain corpus
    tokenizer_new.train(files=paths, vocab_size=vocab_size, min_frequency=2)
    tokenizer_new.save_model("/tmp", "new")  # writes /tmp/new-vocab.txt (older tokenizers releases used tokenizer.save(folder, name))

    with open('/tmp/new-vocab.txt', 'r') as f:
        new_vocab = f.read().split('\n')
    new_vocab = [tok for tok in new_vocab if tok]  # drop empty lines

    print('Adding new tokens to vocab')
    n_orig_tokens = len(tokenizer)
    tokenizer.add_tokens(new_vocab)  # tokens already in the vocab are skipped
    print('Original no. of tokens: %s' % n_orig_tokens)
    print('Final no. of tokens: %s' % len(tokenizer))

    # Resize the embedding matrix; the rows for new tokens are randomly initialised
    print('Initialised random emb for new tokens')
    model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer
# Insert the code below after the tokenizer and model are loaded in run_language_modeling.py
df = pd.read_csv(args.train_data_file)  # df must have a 'text' column containing the text rows
df.dropna(inplace=True)

# The Theseus vocab == bert-base-uncased vocab == TinyBERT vocab; it contains ~30.5k tokens.
# Vocab link: https://huggingface.co/canwenxu/BERT-of-Theseus-MNLI
os.system('wget -O vocab.txt https://s3.amazonaws.com/models.huggingface.co/bert/canwenxu/BERT-of-Theseus-MNLI/vocab.txt')

# Add new vocab to the tokenizer and randomly initialise embedding rows for the new tokens in the model
model, tokenizer = add_vocab_to_model(df, model, tokenizer, 'vocab.txt', vocab_size=10000)
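
# Optional sanity check (a minimal sketch, not part of the original script): after
# resizing, the input embedding matrix should have one row per token, and a token
# added from the domain corpus should map to a single id instead of being split
# into word pieces. 'somenewword' below is a hypothetical placeholder -- substitute
# any token you know was added.
assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)
print(tokenizer.tokenize('somenewword'))  # expect a single piece if it was added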