T = 12
padded_tokens = tokens + ['[PAD]' for _ in range(T - len(tokens))]
print(padded_tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print(attn_mask)
# Out: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
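#With the padded tokens and attention mask ready, a likely next step (not
#shown in the original snippet) is to map the tokens to vocabulary IDs and
#wrap everything in tensors; a minimal sketch, assuming `tokenizer` is the
#bert-base-uncased tokenizer loaded in the snippet below:
import torch
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
input_ids = torch.tensor(token_ids).unsqueeze(0)       #shape: (1, 12)
attention_mask = torch.tensor(attn_mask).unsqueeze(0)  #shape: (1, 12)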
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]']
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#For single sequence input
sentence = 'I really enjoyed this movie a lot.'
tokens = tokenizer.tokenize(sentence)
print(tokens)
# Out: ['i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.']
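#The manual [CLS]/[SEP] insertion, padding and attention-mask steps can also
#be delegated to the tokenizer itself; a sketch using the call interface of
#recent transformers versions (max_length=12 just mirrors the manual example):
encoded = tokenizer(
    sentence,
    padding='max_length',
    max_length=12,
    truncation=True,
    return_tensors='pt',
)
print(encoded['input_ids'])       #IDs including [CLS], [SEP] and [PAD]
print(encoded['attention_mask'])  #1 for real tokens, 0 for padding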
class CustomIterableDatasetv2(IterableDataset):
    def __init__(self, filename_en, filename_gm):
        #Store the filenames in object's memory
        self.filename_en = filename_en
        self.filename_gm = filename_gm
        #And that's it, we no longer need to store the contents in the memory
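
    #The rest of CustomIterableDatasetv2 is not shown in this fragment; a
    #minimal sketch of what __iter__ could look like, pairing the English and
    #German files line by line (the parallel-corpus reading is an assumption):
    def __iter__(self):
        #Open both files lazily; nothing is loaded into memory up front
        file_itr_en = open(self.filename_en)
        file_itr_gm = open(self.filename_gm)
        #Pair up corresponding lines from the two files
        return zip(file_itr_en, file_itr_gm)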
itr1 = range(0,5)
itr2 = range(1,6)
itr3 = zip(itr1, itr2)
for i in itr3:
    print(i)
'''
Prints
(0, 1)
(1, 2)
(2, 3)
(3, 4)
(4, 5)
'''
from torch.utils.data import DataLoader

dataset = CustomIterableDatasetv1('path_to/somefile')
dataloader = DataLoader(dataset, batch_size = 64)

for X, y in dataloader:
    print(len(X))  # 64
    print(y.shape) # (64,)

    ### Do something with X and y
    ###
class CustomIterableDatasetv1(IterableDataset):
    def __init__(self, filename):
        #Store the filename in object's memory
        self.filename = filename
        #And that's it, we no longer need to store the contents in the memory

    def preprocess(self, text):
        #Do whatever text cleanup is needed here, e.g. lowercase and strip
        return text.lower().strip()
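
    #The remaining methods of CustomIterableDatasetv1 are not shown above; a
    #possible sketch of how __iter__ could stream the file and apply
    #preprocess lazily (the 'text,label' line format is an assumption):
    def line_mapper(self, line):
        #Split a raw line into text and label, then preprocess the text
        text, label = line.strip().split(',')
        return self.preprocess(text), label

    def __iter__(self):
        #Open the file lazily and map each line on the fly
        file_itr = open(self.filename)
        return map(self.line_mapper, file_itr)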
def square(x):
    return x**2

itr1 = range(5)
for i in itr1:
    print(i)
'''
Prints
0
1
2
3
4
'''
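#square() above is not used in the loop; the same range can be transformed
#lazily with map, which is the kind of lazy-mapping pattern an
#IterableDataset's __iter__ can rely on (a small illustrative sketch,
#not part of the original snippet):
itr2 = map(square, range(5))
for i in itr2:
    print(i)
'''
Prints
0
1
4
9
16
'''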
#Creating the iterable dataset object
dataset = CustomIterableDataset('path_to/somefile')
#Creating the dataloader
dataloader = DataLoader(dataset, batch_size = 64)

for data in dataloader:
    #Data is a list containing 64 (=batch_size) consecutive lines of the file
    print(len(data)) # 64
    #We still need to separate the text and labels from each other and preprocess the text
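    #A possible way to do that separation right here, assuming each raw line
    #has the form 'text,label' (this line format is an assumption):
    texts, labels = [], []
    for line in data:
        text, label = line.strip().split(',')
        texts.append(text.lower())
        labels.append(label)
    #texts and labels now each hold 64 preprocessed strings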
from torch.utils.data import IterableDataset

class CustomIterableDataset(IterableDataset):
    def __init__(self, filename):
        #Store the filename in object's memory
        self.filename = filename
        #And that's it, we no longer need to store the contents in the memory
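
    #The __iter__ method is not shown in this fragment; judging from the
    #usage above (each batch element is a raw line of the file), a minimal
    #sketch could simply return the file's own line iterator:
    def __iter__(self):
        #Opening the file returns a lazy iterator over its lines, so the
        #whole file is never held in memory at once
        return open(self.filename)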