# Source: https://www.kaggle.com/riblidezso/finetune-xlm-roberta-on-jigsaw-test-data-with-mlm
import numpy as np


def prepare_mlm_input_and_labels(X):
    """Apply BERT-style dynamic masking to a batch of XLM-R token ids."""
    # Select 15% of tokens for masking (the standard BERT masking rate).
    inp_mask = np.random.rand(*X.shape) < 0.15
    # Never mask special tokens (<s>=0, <pad>=1, </s>=2).
    inp_mask[X <= 2] = False
    # Default label is -1, meaning "ignore this position in the loss".
    labels = np.full(X.shape, -1, dtype=int)
    # Masked positions keep their original token id as the label.
    labels[inp_mask] = X[inp_mask]
    # Prepare the input: start from a copy of the original tokens.
    X_mlm = np.copy(X)
    # Replace 90% of the selected tokens with the <mask> token
    # (id 250001, the last entry in the XLM-R vocabulary);
    # the remaining 10% are left unchanged.
    inp_mask_2mask = inp_mask & (np.random.rand(*X.shape) < 0.90)
    X_mlm[inp_mask_2mask] = 250001
    # Overwrite 1/9 of those with a random non-special token, which
    # yields the usual 80% <mask> / 10% random / 10% unchanged split.
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*X.shape) < 1 / 9)
    X_mlm[inp_mask_2random] = np.random.randint(3, 250001, inp_mask_2random.sum())
    return X_mlm, labels
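
A minimal usage sketch: the token ids below are made up for illustration (in the kernel, X comes from encoding the Jigsaw test data with the XLM-R tokenizer into a padded integer array).

import numpy as np

# Toy padded batch of XLM-R token ids (values are illustrative):
# 0 = <s>, 2 = </s>, 1 = <pad>; ids >= 3 are ordinary subwords.
X = np.array([
    [0, 513, 7844, 120, 2, 1],
    [0, 988, 3500,   2, 1, 1],
])

X_mlm, labels = prepare_mlm_input_and_labels(X)

# Special-token positions are never selected, so their labels stay -1;
# each masked position carries its original token id in `labels`.
assert (labels[X <= 2] == -1).all()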
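
The -1 labels only do something if the loss skips them. A minimal sketch of such a loss, assuming a TensorFlow/Keras training setup; the helper name and its exact form are mine, not from the kernel:

import tensorflow as tf


def masked_sparse_ce(y_true, y_pred):
    """Sparse categorical crossentropy that ignores positions labeled -1.

    Hypothetical helper: averages the loss over only the ~15% of
    positions that actually carry an MLM label.
    """
    # Positions labeled -1 contribute nothing to the loss.
    mask = tf.not_equal(y_true, -1)
    # Clip the -1 sentinels to a valid class id before computing the
    # per-token loss; they are zeroed out by the mask afterwards.
    y_true_safe = tf.where(mask, y_true, tf.zeros_like(y_true))
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true_safe, y_pred, from_logits=True
    )
    mask = tf.cast(mask, loss.dtype)
    return tf.reduce_sum(loss * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)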