@VibhuJawa
Last active September 23, 2022 18:56
import cudf
from cudf.utils.hash_vocab_utils import hash_vocab
from cudf.core.subword_tokenizer import SubwordTokenizer

# Build the perfect-hash vocabulary file from the raw BERT vocab
hash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt')

cudf_tokenizer = SubwordTokenizer('voc_hash.txt',
                                  do_lower_case=True)

str_series = cudf.Series(['This is the', 'best book'])
tokenizer_output = cudf_tokenizer(str_series,
                                  max_length=8,
                                  max_num_rows=len(str_series),
                                  padding='max_length',
                                  return_tensors='pt',
                                  truncation=True)
tokenizer_output['input_ids']
tensor([[ 101, 1142, 1110, 1103,  102,    0,    0,    0],
        [ 101, 1436, 1520,  102,    0,    0,    0,    0]],
       device='cuda:0', dtype=torch.int32)

tokenizer_output['attention_mask']
tensor([[1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0]],
       device='cuda:0', dtype=torch.int32)

tokenizer_output['metadata']
tensor([[0, 1, 3],
        [1, 1, 2]], device='cuda:0', dtype=torch.int32)
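For readers checking the output by hand, the attention_mask rows above can be decoded without a GPU: each 1 marks a real token (including the [CLS]/[SEP] specials added by the tokenizer) and each 0 marks padding. A minimal pure-Python sketch using the mask values printed above:

```python
# attention_mask rows copied from the output above: 1 = real token, 0 = padding
attention_mask = [
    [1, 1, 1, 1, 1, 0, 0, 0],  # "This is the" -> [CLS] this is the [SEP]
    [1, 1, 1, 1, 0, 0, 0, 0],  # "best book"   -> [CLS] best book [SEP]
]

# Unpadded sequence lengths are just the row sums
lengths = [sum(row) for row in attention_mask]
print(lengths)  # [5, 4]
```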
@RaeWallace10

Can you give a demo of your file before and after using perfect_hash, please?

@RaeWallace10

Thank you. I am also looking for more documentation on using BERT models in TF with the CUDA libraries. Do you have any repos showing the next steps after building the cuDF tokenizer?

@VibhuJawa
Author

I think you can follow the HuggingFace notebooks for that. You just need to switch the tokenizer to the RAPIDS one; everything else should remain the same.

https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
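A minimal sketch of why the swap is drop-in (plain Python, no GPU required; the dict values below are invented toy numbers, not real tokenizer output): both the HuggingFace tokenizer and cuDF's SubwordTokenizer return a dict keyed by input_ids and attention_mask, so downstream model code that reads those keys is unchanged. cuDF adds an extra metadata key, which downstream code simply ignores:

```python
# Toy stand-ins for the two tokenizers' outputs (values are illustrative only)
hf_output = {"input_ids": [[101, 1142, 102]],
             "attention_mask": [[1, 1, 1]]}
cudf_output = {"input_ids": [[101, 1142, 102]],
               "attention_mask": [[1, 1, 1]],
               "metadata": [[0, 1, 1]]}  # extra key; downstream code ignores it

def run_model(batch):
    # A model forward pass only touches input_ids and attention_mask
    return batch["input_ids"], batch["attention_mask"]

# The same downstream call works for both tokenizers' outputs
assert run_model(hf_output) == run_model(cudf_output)
print("drop-in swap OK")
```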

@RaeWallace10

RaeWallace10 commented Oct 15, 2021

The tokenizer gives this for the input IDs:

cudf_tokenizer = SubwordTokenizer('voc_hash.txt',
                                   do_lower_case=True)

Test_tokenizer_output = cudf_tokenizer(x_test1,
                                  max_length=seq_len,
                                  max_num_rows=len(x_test1),
                                  padding='max_length',
                                  return_tensors='pt',
                                  truncation=True)
print(Test_tokenizer_output['input_ids'])

tensor([[ 101, 6479, 1132,  ...,    0,    0,    0],
        [ 101, 1103, 8152,  ...,    0,    0,    0],
        [ 101, 1176, 1103,  ...,    0,    0,    0],
        ...,
        [ 101, 1169,  181,  ...,    0,    0,    0],
        [ 101,  170, 1647,  ...,    0,    0,    0],
        [ 101, 1107,  170,  ...,    0,    0,    0]], device='cuda:0',
       dtype=torch.int32)

It's in PyTorch; I want it in TensorFlow:

import tensorflow as tf
Test_tokenizer_output = cudf_tokenizer(x_test1,
                                  max_length=seq_len,
                                  max_num_rows=len(x_test1),
                                  padding='max_length',
                                  return_tensors='tf',
                                  truncation=True)
print(Test_tokenizer_output)

ModuleNotFoundError                       Traceback (most recent call last)
/tmp/ipykernel_4735/629598511.py in <module>
      5                                   padding='max_length',
      6                                   return_tensors='tf',
----> 7                                   truncation=True)
      8 print(Test_tokenizer_output)

~/miniconda3/envs/rtd/lib/python3.7/site-packages/cudf/core/subword_tokenizer.py in __call__(self, text, max_length, max_num_rows, add_special_tokens, padding, truncation, stride, return_tensors, return_token_type_ids)
    233         tokenizer_output = {
    234             k: _cast_to_appropriate_type(v, return_tensors)
--> 235             for k, v in tokenizer_output.items()
    236         }
    237 

~/miniconda3/envs/rtd/lib/python3.7/site-packages/cudf/core/subword_tokenizer.py in <dictcomp>(.0)
    233         tokenizer_output = {
    234             k: _cast_to_appropriate_type(v, return_tensors)
--> 235             for k, v in tokenizer_output.items()
    236         }
    237 

~/miniconda3/envs/rtd/lib/python3.7/site-packages/cudf/core/subword_tokenizer.py in _cast_to_appropriate_type(ar, cast_type)
     22 
     23     elif cast_type == "tf":
---> 24         from tf.experimental.dlpack import from_dlpack
     25 
     26     return from_dlpack(ar.astype("int32").toDlpack())

ModuleNotFoundError: No module named 'tf'

Weird
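The traceback actually explains the failure: the cuDF code shown above imports from a module literally named tf, but tf is only the conventional alias created by `import tensorflow as tf`; the TensorFlow wheel installs a package named tensorflow, so the import fails no matter which TensorFlow version is installed. A pure-Python sketch of the failure mode (no TensorFlow needed):

```python
# The failing line inside cudf's subword_tokenizer.py is effectively:
#     from tf.experimental.dlpack import from_dlpack
# "tf" is a conventional alias (import tensorflow as tf), not a real package
# name, so Python raises ModuleNotFoundError regardless of which TensorFlow
# version is installed.
try:
    from tf.experimental.dlpack import from_dlpack  # same import cudf attempts
except ModuleNotFoundError as exc:
    print(exc)  # No module named 'tf'
```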

@RaeWallace10

I tried multiple TensorFlow versions, and no luck.

@VibhuJawa
Author

Just replied here:
rapidsai/cudf#9447 (comment)
