# kabirahuja2431/iterable_dataset_v2.py

## iterable_dataset_v2.py
class CustomIterableDatasetv2(IterableDataset):
    """Stream two text files in lockstep as pairs of token lists.

    Only the two file paths are kept on the instance; the file contents are
    read lazily, one line at a time, every time the dataset is iterated.
    """

    def __init__(self, filename_en, filename_gm):
        """Remember the paths of the two files to stream.

        Args:
            filename_en: Path of the first text file.
            filename_gm: Path of the second text file; its lines are paired
                one-to-one, in order, with the lines of ``filename_en``.
        """
        super().__init__()

        # Store only the filenames; the contents are never held in memory.
        self.filename_en = filename_en
        self.filename_gm = filename_gm

    def preprocess(self, text: str) -> list:
        """Split ``text`` on runs of whitespace and return the pieces.

        Leading/trailing whitespace (including the trailing newline of a
        line read from a file) produces no empty tokens.
        """
        return text.split()

    def line_mapper(self, line: str) -> list:
        """Turn one raw line from a file into its preprocessed form.

        Each line contains only text in this dataset, so the whole line is
        handed to :meth:`preprocess`.
        """
        return self.preprocess(line)

    def __iter__(self):
        """Yield ``(en_tokens, gm_tokens)`` for each pair of lines.

        Iteration ends as soon as either file runs out of lines, because the
        two line streams are combined with ``zip``.
        """
        # NOTE(review): no per-worker sharding happens here; with a
        # multi-worker DataLoader each worker would presumably read both
        # files in full -- confirm against the DataLoader documentation.

        # The context manager closes both files once iteration finishes (or
        # the generator is closed early) instead of leaking the handles.
        with open(self.filename_en) as en_file, open(self.filename_gm) as gm_file:
            yield from zip(
                map(self.line_mapper, en_file),
                map(self.line_mapper, gm_file),
            )
	class CustomIterableDatasetv2(IterableDataset):
	    """Stream two text files in lockstep as pairs of token lists.

	    Only the two file paths are kept on the instance; the file contents are
	    read lazily, one line at a time, every time the dataset is iterated.
	    """

	    def __init__(self, filename_en, filename_gm):
	        """Remember the paths of the two files to stream.

	        Args:
	            filename_en: Path of the first text file.
	            filename_gm: Path of the second text file; its lines are paired
	                one-to-one, in order, with the lines of ``filename_en``.
	        """
	        super().__init__()

	        # Store only the filenames; the contents are never held in memory.
	        self.filename_en = filename_en
	        self.filename_gm = filename_gm

	    def preprocess(self, text: str) -> list:
	        """Split ``text`` on runs of whitespace and return the pieces.

	        Leading/trailing whitespace (including the trailing newline of a
	        line read from a file) produces no empty tokens.
	        """
	        return text.split()

	    def line_mapper(self, line: str) -> list:
	        """Turn one raw line from a file into its preprocessed form.

	        Each line contains only text in this dataset, so the whole line is
	        handed to :meth:`preprocess`.
	        """
	        return self.preprocess(line)

	    def __iter__(self):
	        """Yield ``(en_tokens, gm_tokens)`` for each pair of lines.

	        Iteration ends as soon as either file runs out of lines, because the
	        two line streams are combined with ``zip``.
	        """
	        # NOTE(review): no per-worker sharding happens here; with a
	        # multi-worker DataLoader each worker would presumably read both
	        # files in full -- confirm against the DataLoader documentation.

	        # The context manager closes both files once iteration finishes (or
	        # the generator is closed early) instead of leaking the handles.
	        with open(self.filename_en) as en_file, open(self.filename_gm) as gm_file:
	            yield from zip(
	                map(self.line_mapper, en_file),
	                map(self.line_mapper, gm_file),
	            )