# Sirsirious/final_pipelines.py

## final_pipelines.py
# Build batched train/eval input streams for the 'imdb_reviews' dataset.
# NOTE(review): `trax` is not imported in this snippet; it is assumed to be
# imported earlier (e.g. `import trax`) -- confirm before running standalone.

# First we get the streams from TFDS. Each example is requested as a
# (text, label) pair, in the order given by `keys`; calling the object
# returned by TFDS(...) is what produces the stream itself.
train_stream = trax.data.TFDS(
    'imdb_reviews', keys=('text', 'label'), train=True)()
eval_stream = trax.data.TFDS(
    'imdb_reviews', keys=('text', 'label'), train=False)()

# Next, we build the pipeline. Index 0 refers to the 'text' element of each
# example in every `keys=` / `length_keys=` argument below.
data_pipeline = trax.data.Serial(
    trax.data.Tokenize(vocab_file='en_8k.subword', keys=[0]),
    trax.data.Shuffle(),
    trax.data.FilterByLength(max_length=2048, length_keys=[0]),
    # `batch_sizes` deliberately has one more entry than `boundaries`:
    # presumably one batch size per length bucket plus a final one for
    # examples beyond the last boundary -- verify against the Trax docs.
    trax.data.BucketByLength(
        boundaries=[32, 128, 512, 2048],
        batch_sizes=[512, 128, 32, 8, 1],
        length_keys=[0],
    ),
    trax.data.AddLossWeights(),
)

# Finally, we get the generators. The same pipeline (including the Shuffle
# step) is applied to both the training and the evaluation stream.
train_batches_stream = data_pipeline(train_stream)
eval_batches_stream = data_pipeline(eval_stream)