@negedng · Created October 18, 2020 21:49
Tokenizer from the frequency list
from tokenizers import BertWordPieceTokenizer

# Build the vocab dictionary from the most common words.
# vocabulary_counter is assumed to be a collections.Counter of word frequencies.
# Ids 0-4 are reserved for the five special tokens below, so regular words
# start at id 5 and the 20,000-entry budget leaves room for all of them.
vocab_dict = {word: i + 5
              for i, (word, _) in enumerate(vocabulary_counter.most_common(20000 - 5))}
# Add the special tokens
vocab_dict["[PAD]"] = 0
vocab_dict["[UNK]"] = 1
vocab_dict["[CLS]"] = 2
vocab_dict["[SEP]"] = 3
vocab_dict["[MASK]"] = 4
tokenizer_2 = BertWordPieceTokenizer(vocab_dict)
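
A quick sanity check, assuming the `tokenizers` library is installed and `vocabulary_counter` was built from your corpus: encoding a sentence should frame it with [CLS]/[SEP], and words the vocabulary cannot cover should fall back to [UNK] (id 1). The sentence below is only an illustration.

# Sanity check: special tokens frame the sequence, uncovered words map to [UNK]
encoded = tokenizer_2.encode("an example sentence")
print(encoded.tokens)  # e.g. ['[CLS]', 'an', 'example', 'sentence', '[SEP]']
print(encoded.ids)     # the corresponding ids from vocab_dict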