T = 12
padded_tokens = tokens + ['[PAD]' for _ in range(T - len(tokens))]
print(padded_tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print(attn_mask)
# Out: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
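#With the padded tokens and attention mask ready, a likely next step (not
#shown in the original snippet) is to map the tokens to vocabulary IDs and
#wrap everything in tensors; a minimal sketch, assuming `tokenizer` is the
#bert-base-uncased tokenizer loaded in the snippet below:
import torch
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
input_ids = torch.tensor(token_ids).unsqueeze(0)       #shape: (1, 12)
attention_mask = torch.tensor(attn_mask).unsqueeze(0)  #shape: (1, 12)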
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]']
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#For single sequence input
sentence = 'I really enjoyed this movie a lot.'
tokens = tokenizer.tokenize(sentence)
print(tokens)
# Out: ['i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.']
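#The manual [CLS]/[SEP] insertion, padding and attention-mask steps can also
#be delegated to the tokenizer itself; a sketch using the call interface of
#recent transformers versions (max_length=12 just mirrors the manual example):
encoded = tokenizer(
    sentence,
    padding='max_length',
    max_length=12,
    truncation=True,
    return_tensors='pt',
)
print(encoded['input_ids'])       #IDs including [CLS], [SEP] and [PAD]
print(encoded['attention_mask'])  #1 for real tokens, 0 for padding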
class CustomIterableDatasetv2(IterableDataset):
    def __init__(self, filename_en, filename_gm):
        #Store the filenames in object's memory
        self.filename_en = filename_en
        self.filename_gm = filename_gm
        #And that's it, we no longer need to store the contents in the memory
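
    #The rest of CustomIterableDatasetv2 is not shown in this fragment; a
    #minimal sketch of what __iter__ could look like, pairing the English and
    #German files line by line (the parallel-corpus reading is an assumption):
    def __iter__(self):
        #Open both files lazily; nothing is loaded into memory up front
        file_itr_en = open(self.filename_en)
        file_itr_gm = open(self.filename_gm)
        #Pair up corresponding lines from the two files
        return zip(file_itr_en, file_itr_gm)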
itr1 = range(0,5)
itr2 = range(1,6)
itr3 = zip(itr1, itr2)
for i in itr3:
    print(i)
'''
Prints
(0, 1)
(1, 2)
(2, 3)
(3, 4)
(4, 5)
'''
from torch.utils.data import DataLoader

dataset = CustomIterableDatasetv1('path_to/somefile')
dataloader = DataLoader(dataset, batch_size = 64)

for X, y in dataloader:
    print(len(X))  # 64
    print(y.shape) # (64,)

    ### Do something with X and y
    ###
class CustomIterableDatasetv1(IterableDataset):
    def __init__(self, filename):
        #Store the filename in object's memory
        self.filename = filename
        #And that's it, we no longer need to store the contents in the memory

    def preprocess(self, text):
        #Do whatever text cleanup is needed here, e.g. lowercase and strip
        return text.lower().strip()
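
    #The remaining methods of CustomIterableDatasetv1 are not shown above; a
    #possible sketch of how __iter__ could stream the file and apply
    #preprocess lazily (the 'text,label' line format is an assumption):
    def line_mapper(self, line):
        #Split a raw line into text and label, then preprocess the text
        text, label = line.strip().split(',')
        return self.preprocess(text), label

    def __iter__(self):
        #Open the file lazily and map each line on the fly
        file_itr = open(self.filename)
        return map(self.line_mapper, file_itr)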
def square(x):
    return x**2

itr1 = range(5)
for i in itr1:
    print(i)
'''
Prints
0
1
2
3
4
'''
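#square() above is not used in the loop; the same range can be transformed
#lazily with map, which is the kind of lazy-mapping pattern an
#IterableDataset's __iter__ can rely on (a small illustrative sketch,
#not part of the original snippet):
itr2 = map(square, range(5))
for i in itr2:
    print(i)
'''
Prints
0
1
4
9
16
'''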
#Creating the iterable dataset object
dataset = CustomIterableDataset('path_to/somefile')
#Creating the dataloader
dataloader = DataLoader(dataset, batch_size = 64)

for data in dataloader:
    #Data is a list containing 64 (=batch_size) consecutive lines of the file
    print(len(data)) # 64
    #We still need to separate the text and labels from each other and preprocess the text
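    #A possible way to do that separation right here, assuming each raw line
    #has the form 'text,label' (this line format is an assumption):
    texts, labels = [], []
    for line in data:
        text, label = line.strip().split(',')
        texts.append(text.lower())
        labels.append(label)
    #texts and labels now each hold 64 preprocessed strings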
from torch.utils.data import IterableDataset

class CustomIterableDataset(IterableDataset):
    def __init__(self, filename):
        #Store the filename in object's memory
        self.filename = filename
        #And that's it, we no longer need to store the contents in the memory
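
    #The __iter__ method is not shown in this fragment; judging from the
    #usage above (each batch element is a raw line of the file), a minimal
    #sketch could simply return the file's own line iterator:
    def __iter__(self):
        #Opening the file returns a lazy iterator over its lines, so the
        #whole file is never held in memory at once
        return open(self.filename)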