This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pad the token list with '[PAD]' entries up to length T.
T = 12
# A non-positive pad count yields an empty list, so a token list that already
# has T or more entries passes through unchanged (it is not truncated).
padded_tokens = tokens + ['[PAD]'] * (T - len(tokens))
print(padded_tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']

# Attention mask: 1 for every entry that is not '[PAD]', 0 for '[PAD]'.
attn_mask = [0 if token == '[PAD]' else 1 for token in padded_tokens]
print(attn_mask)
# Out: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Put the '[CLS]' marker in front of the token list and '[SEP]' after it.
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# For single sequence input
sentence = 'I really enjoyed this movie a lot.'
tokens = tokenizer.tokenize(sentence)
print(tokens)
# Out: ['i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CustomIterableDatasetv2(IterableDataset):
    """Iterable dataset that is configured with two file names.

    Only the constructor is visible in this excerpt; it records the two
    file names and does not open or read either file.
    """

    def __init__(self, filename_en, filename_gm):
        """Remember the two file names on the instance.

        Args:
            filename_en: First file name, stored as ``self.filename_en``.
            filename_gm: Second file name, stored as ``self.filename_gm``.
        """
        # Store the filenames in object's memory
        self.filename_en = filename_en
        self.filename_gm = filename_gm
        # And that's it, we no longer need to store the contents in the memory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Two ranges of equal length, combined element by element with zip().
itr1 = range(0, 5)
itr2 = range(1, 6)
itr3 = zip(itr1, itr2)

# Each item produced by zip() is a tuple holding one value from each range.
for i in itr3:
    print(i)
''' | |
Prints | |
(0,1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the dataset from a file path and wrap it in a DataLoader that is
# configured with batch_size = 64.
dataset = CustomIterableDatasetv1('path_to/somefile')
dataloader = DataLoader(dataset, batch_size=64)

# Every item yielded by the dataloader is unpacked into a pair (X, y).
for X, y in dataloader:
    print(len(X))  # 64
    print(y.shape)  # (64,)
    ### Do something with X and y
    ###
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class CustomIterableDatasetv1(IterableDataset): | |
def __init__(self, filename): | |
#Store the filename in object's memory | |
self.filename = filename | |
#And that's it, we no longer need to store the contents in the memory | |
def preprocess(self, text): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def square(x):
    """Return ``x`` raised to the power of two."""
    return x ** 2
# Print every value produced by range(5), one per line.
itr1 = range(5)
for i in itr1:
    print(i)
''' | |
Prints | |
0 | |
1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Creating the iterable dataset object
dataset = CustomIterableDataset('path_to/somefile')

# Creating the dataloader
dataloader = DataLoader(dataset, batch_size=64)

for data in dataloader:
    # Data is a list containing 64 (=batch_size) consecutive lines of the file
    print(len(data))  #[64,]
    # We still need to separate the text and labels from each other and preprocess the text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from torch.utils.data import IterableDataset


class CustomIterableDataset(IterableDataset):
    """Iterable dataset that is configured with a single file name.

    Only the constructor is visible in this excerpt; it records the file
    name and does not open or read the file.
    """

    def __init__(self, filename):
        """Remember the file name on the instance.

        Args:
            filename: File name, stored as ``self.filename``.
        """
        # Store the filename in object's memory
        self.filename = filename
        # And that's it, we no longer need to store the contents in the memory