import numpy as np
import tensorflow as tf

# NOTE: `training_files`, `validation_files`, `NO_OF_FEATURES` and
# `BATCH_SIZE` are assumed to be defined in earlier snippets of this gist.


# Class that handles the scaling operation.
class Scaler:
    def __init__(self, scaler):
        self.scaler = scaler
        # For backward compatibility with earlier versions of scikit-learn.
        self.scaler.clip = False

    def transform_using_scaler(self, data_x):
        # Convert from tensor to numpy.
        data_x = data_x.numpy()
        # Because we batch before scaling, the input is
        # (BATCH_SIZE, 1, NO_OF_FEATURES); the squeeze below reduces it to
        # (BATCH_SIZE, NO_OF_FEATURES).
        data_x = np.squeeze(data_x, axis=1)
        # Finally transform the data using the fitted scaler.
        # Cast to float32 to match the dtype declared in tf.py_function below.
        scaled_x = self.scaler.transform(data_x).astype(np.float32)
        return scaled_x
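# `min_max_scaler_train` is expected to come from an earlier snippet of this
# gist, where it is fitted on the training data. If it is not available, a
# minimal sketch (an assumption, not the original code) is to fit it
# incrementally over the training .npy files:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler_train = MinMaxScaler()
for npy_path in training_files:
    # Assumes every file stores rows of NO_OF_FEATURES float64 values.
    train_chunk = np.load(npy_path).reshape(-1, NO_OF_FEATURES)
    min_max_scaler_train.partial_fit(train_chunk)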
# Initialize the scaling object.
scale_obj = Scaler(min_max_scaler_train)
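# `npy_header_offset` is expected to come from an earlier snippet of this
# gist. If it is not available, a minimal sketch (an assumption based on the
# .npy file format, not the original helper) is:
def npy_header_offset(npy_path):
    with open(str(npy_path), 'rb') as f:
        if f.read(6) != b'\x93NUMPY':
            raise ValueError('Not a valid .npy file: %s' % npy_path)
        version_major, _ = f.read(2)
        # Version 1.x stores the header length in 2 bytes, 2.x+ in 4 bytes.
        header_len_size = 2 if version_major == 1 else 4
        header_len = int.from_bytes(f.read(header_len_size), 'little')
        f.read(header_len)  # Skip over the header dictionary itself.
        return f.tell()     # Byte offset where the raw array data begins.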
# Get the number of header bytes to skip in every numpy file.
npy_file = training_files[0]
dtype = tf.float64
header_offset = npy_header_offset(npy_file)
# We will use the header offset in the code below.
dataset_train = tf.data.FixedLengthRecordDataset(training_files,
                                                 NO_OF_FEATURES * dtype.size,
                                                 header_bytes=header_offset)
# Convert each raw binary record into a tensor of dimension (1, NO_OF_FEATURES).
dataset_train = dataset_train.map(lambda s: tf.reshape(
    tf.io.decode_raw(s, dtype), (1, NO_OF_FEATURES)))
# Make batches of these tensors.
dataset_train = dataset_train.batch(BATCH_SIZE)
# The result is a tensor of dimension (BATCH_SIZE, 1, NO_OF_FEATURES).
# TensorFlow has no built-in min-max scaling that reuses an already fitted
# scikit-learn scaler, so we wrap our Python function in tf.py_function.
# Notice that the input is an array and the output is also an array.
dataset_train = dataset_train.map(lambda x: tf.py_function(
    scale_obj.transform_using_scaler,
    [x],
    [tf.float32]))
# Cache the scaled batches so the expensive py_function work runs only once.
dataset_train = dataset_train.cache()
# For an autoencoder there is no separate label y: the target is the
# input itself, so we map every batch X to the tuple (X, X).
# This is a memory-consuming step, which is why caching is applied
# before it rather than after it.
dataset_train = dataset_train.map(lambda x: (x, x))
# Prefetch so the next batch is already prepared while the current
# batch is being processed.
dataset_train = dataset_train.prefetch(1)
# Repeat the dataset indefinitely so training does not run out of data
# after a single pass over the files.
dataset_train = dataset_train.repeat()
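# Quick sanity check (not part of the original gist): inspect the element
# structure and pull a single batch to confirm the scaled shape, which
# should come out as (BATCH_SIZE, NO_OF_FEATURES).
print(dataset_train.element_spec)
for batch in dataset_train.take(1):
    x_batch = tf.nest.flatten(batch)[0]
    print(x_batch.shape)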
# Repeat the whole process for validation.
npy_file = validation_files[0]
dtype = tf.float64
header_offset = npy_header_offset(npy_file)
dataset_valid = tf.data.FixedLengthRecordDataset(validation_files,
                                                 NO_OF_FEATURES * dtype.size,
                                                 header_bytes=header_offset)
dataset_valid = dataset_valid.map(lambda s: tf.reshape(
    tf.io.decode_raw(s, dtype), (1, NO_OF_FEATURES)))
dataset_valid = dataset_valid.batch(BATCH_SIZE)
dataset_valid = dataset_valid.map(lambda x: tf.py_function(
    scale_obj.transform_using_scaler,
    [x],
    [tf.float32]))
dataset_valid = dataset_valid.cache()
dataset_valid = dataset_valid.map(lambda x: (x, x))
dataset_valid = dataset_valid.prefetch(1)
dataset_valid = dataset_valid.repeat()
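# With repeat() applied to both datasets, Keras needs explicit step counts.
# The sketch below is an assumption, not part of the original gist: a tiny
# dense autoencoder and placeholder step counts, only to show how the
# datasets plug into model.fit.
autoencoder = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu',
                          input_shape=(NO_OF_FEATURES,)),
    tf.keras.layers.Dense(NO_OF_FEATURES, activation='sigmoid'),
])
autoencoder.compile(optimizer='adam', loss='mse')

steps_per_epoch = 100   # hypothetical; use total_train_records // BATCH_SIZE
validation_steps = 20   # hypothetical; use total_valid_records // BATCH_SIZE
autoencoder.fit(dataset_train,
                validation_data=dataset_valid,
                epochs=5,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps)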