# Source: https://www.kaggle.com/riblidezso/finetune-xlm-roberta-on-jigsaw-test-data-with-mlm
import numpy as np


def prepare_mlm_input_and_labels(X):
    """Apply BERT-style dynamic masking to a batch of XLM-R token ids."""
    # Select 15% of tokens for masking (the standard BERT masking rate).
    inp_mask = np.random.rand(*X.shape) < 0.15
    # Never mask special tokens (<s>=0, <pad>=1, </s>=2).
    inp_mask[X <= 2] = False
    # Default label is -1, meaning "ignore this position in the loss".
    labels = np.full(X.shape, -1, dtype=int)
    # Masked positions keep their original token id as the label.
    labels[inp_mask] = X[inp_mask]
    # Prepare the input: start from a copy of the original tokens.
    X_mlm = np.copy(X)
    # Replace 90% of the selected tokens with the <mask> token
    # (id 250001, the last entry in the XLM-R vocabulary);
    # the remaining 10% are left unchanged.
    inp_mask_2mask = inp_mask & (np.random.rand(*X.shape) < 0.90)
    X_mlm[inp_mask_2mask] = 250001
    # Overwrite 1/9 of those with a random non-special token, which
    # yields the usual 80% <mask> / 10% random / 10% unchanged split.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*X.shape) < 1 / 9)
    X_mlm[inp_mask_2random] = np.random.randint(3, 250001, inp_mask_2random.sum())
    return X_mlm, labels
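
A minimal usage sketch: the token ids below are made up for illustration (in the kernel, X comes from encoding the Jigsaw test data with the XLM-R tokenizer into a padded integer array).

import numpy as np

# Toy padded batch of XLM-R token ids (values are illustrative):
# 0 = <s>, 2 = </s>, 1 = <pad>; ids >= 3 are ordinary subwords.
X = np.array([
    [0, 513, 7844, 120, 2, 1],
    [0, 988, 3500,   2, 1, 1],
])

X_mlm, labels = prepare_mlm_input_and_labels(X)

# Special-token positions are never selected, so their labels stay -1;
# each masked position carries its original token id in `labels`.
assert (labels[X <= 2] == -1).all()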
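
The -1 labels only do something if the loss skips them. A minimal sketch of such a loss, assuming a TensorFlow/Keras training setup; the helper name and its exact form are mine, not from the kernel:

import tensorflow as tf


def masked_sparse_ce(y_true, y_pred):
    """Sparse categorical crossentropy that ignores positions labeled -1.

    Hypothetical helper: averages the loss over only the ~15% of
    positions that actually carry an MLM label.
    """
    # Positions labeled -1 contribute nothing to the loss.
    mask = tf.not_equal(y_true, -1)
    # Clip the -1 sentinels to a valid class id before computing the
    # per-token loss; they are zeroed out by the mask afterwards.
    y_true_safe = tf.where(mask, y_true, tf.zeros_like(y_true))
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true_safe, y_pred, from_logits=True
    )
    mask = tf.cast(mask, loss.dtype)
    return tf.reduce_sum(loss * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)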