import numpy as np
# make a minibatch of time, batch, features
# time length 7
# batch size 2
# feature dimension 4:
# 1:4, 11:14, 21:24, 31:34, etc for first minibatch element
# 6:9, 16:19, etc for second minibatch element
n_features = 4
n_timesteps = 7
base_mb1_features = np.arange(n_features) + 1
time_mb1_features = 10 * np.arange(n_timesteps)[:, None] + base_mb1_features[None]
base_mb2_features = np.arange(n_features) + 5 + 1
time_mb2_features = 10 * np.arange(n_timesteps)[:, None] + base_mb2_features[None]
data = np.concatenate((time_mb1_features[:, None], time_mb2_features[:, None]), axis=1)
time_len = data.shape[0]
minibatch_size = data.shape[1]
features = data.shape[2]
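# quick illustrative check of the layout built above
print("data shape (time, minibatch, features):", data.shape)  # (7, 2, 4)
print(data[0])  # -> [[1, 2, 3, 4], [6, 7, 8, 9]]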
# for every feature vector at each (time, minibatch) position, we will make an autoregressive mask and equivalent targets for each step
# the new assumption is that the "feature" dimension is the one for autoregression
# this should be more natural compared to the previous example
# 1, 2, 3, 4 -> in: 0, 0, 0, 0, ; target: 1
# 1, 2, 3, 4 -> in: 1, 0, 0, 0, ; target: 2
# 1, 2, 3, 4 -> in: 1, 2, 0, 0, ; target: 3
# 1, 2, 3, 4 -> in: 1, 2, 3, 0, ; target: 4
# accomplished using np.triu with k=1
# np.triu(np.ones((3, 3)), k=1) ->
# [[0, 1, 1],
#  [0, 0, 1],
#  [0, 0, 0]]
# column j of the resulting mask marks which features are visible at autoregressive step j
mask_array = np.triu(np.ones((features, features)), k=1)
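# purely illustrative: print each column of the mask, i.e. the per-step visibility pattern described above
for j in range(n_features):
    print("step", j, "visibility mask:", mask_array[:, j])
# step 0 -> [0, 0, 0, 0], step 1 -> [1, 0, 0, 0], step 2 -> [1, 1, 0, 0], step 3 -> [1, 1, 1, 0]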
# now we have a (4, 4) mask that we want to broadcast-multiply against the (7, 2, 4) data
# the result is (4, 4, 7, 2), which is basically features, "feature time" (autoregressive step), time, minibatch
masked_and_copied = mask_array[:, :, None, None] * data.transpose(2, 0, 1)[:, None]
# there are still extra 0s at the end (the last feature never appears in any input) but we leave them alone for now
# now we transpose it to
# "feature_time", time, minibatch, features
masked_and_copied = masked_and_copied.transpose(1, 2, 3, 0)
# 0th "timestep", 0th element, looping over the "autoregressive" axis we see
# masked_and_copied[0, 0, 0] = [0, 0, 0, 0]
# masked_and_copied[0, 0, 1] = [1, 0, 0, 0]
# masked_and_copied[0, 0, 2] = [1, 2, 0, 0]
# masked_and_copied[0, 0, 3] = [1, 2, 3, 0]
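# lightweight self-check of the values claimed above
assert np.allclose(masked_and_copied[0, 0, 0], [0, 0, 0, 0])
assert np.allclose(masked_and_copied[1, 0, 0], [1, 0, 0, 0])
assert np.allclose(masked_and_copied[2, 0, 0], [1, 2, 0, 0])
assert np.allclose(masked_and_copied[3, 0, 0], [1, 2, 3, 0])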
ar_data = masked_and_copied.copy()
ar_data_shape = ar_data.shape
# make the targets
ar_targets = data.transpose(2, 0, 1)[..., None]
ar_targets_shape = ar_targets.shape
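# sanity check: at autoregressive step j, the target is the j-th feature of the original data
assert ar_targets.shape == (n_features, n_timesteps, minibatch_size, 1)
assert ar_targets[1, 0, 0, 0] == data[0, 0, 1]  # step 1, t=0, mb=0 -> feature value 2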
# we rearrange (and un-rearrange) with this function to get the "normal" training scheme of minibatch, features
def ar_minibatch_conversion(arr, original_shape=None, inverse=False):
    # expects
    # "feature_time", time, minibatch, features
    # open question: skip the totally blank input at step 0?
    if not inverse:
        # ar, t, mb, f -> t, mb * ar, f
        # in particular, we want the masked groups for each minibatch element contiguous, so that we can do a reshape/structured sum to average them
        shp = arr.shape
        arr = arr.transpose(1, 2, 0, 3)
        arr = arr.reshape(shp[1], shp[2] * shp[0], shp[3])
        # now we have t, mb * ar, f
        # *each* timestep can be fed through a network, effectively running all the timesteps in parallel (assuming no hidden state is passed between them)
        return arr
    else:
        # invert the forward procedure; original_shape is REQUIRED here
        shp = original_shape
        arr = arr.reshape(shp[1], shp[2], shp[0], shp[3])
        arr = arr.transpose(2, 0, 1, 3)
        return arr
flat_data = ar_minibatch_conversion(ar_data)
orig_data = ar_minibatch_conversion(flat_data, ar_data_shape, inverse=True)
flat_targets = ar_minibatch_conversion(ar_targets)
orig_targets = ar_minibatch_conversion(flat_targets, ar_targets_shape, inverse=True)
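# round-trip sanity check: the inverse conversion should exactly recover the original arrays
assert np.allclose(orig_data, ar_data)
assert np.allclose(orig_targets, ar_targets)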
# normally you would do something like step_preds = f(flat_data)
# per_step_loss = (step_preds - flat_targets) ** 2
# loss = per_step_loss.sum() or loss = per_step_loss.mean()
# loss.backward()
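# a minimal sketch of the training step described above, assuming f is just a random
# linear map from features to a single prediction (a stand-in for a real network)
rng = np.random.RandomState(0)
W = rng.randn(features, 1)
step_preds = flat_data @ W  # (t, mb * ar, 1), all timesteps and steps in parallel
per_step_loss = (step_preds - flat_targets) ** 2
loss = per_step_loss.mean()
print("example mean squared error:", loss)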
for i in range(n_features):
    print("flat")
    # show that it is chunkwise blocked into the minibatch: indices 0:4 along axis 1 are the 4 autoregressive steps of the first minibatch element
    print(flat_data[:, i])
    print(flat_targets[:, i])
# this should match the previous printout
# this setup is available if you wanted to do more structured losses than just an average per step
for i in range(n_features):
    print("orig")
    print(orig_data[i, :, 0])
    print(orig_targets[i, :, 0])
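# a sketch of a more structured loss over the recovered (ar, t, mb, f) layout, reusing
# step_preds from the linear-map sketch above; the weights here are arbitrary, purely
# for illustration (e.g. weighting later autoregressive steps more heavily)
orig_preds = ar_minibatch_conversion(step_preds, ar_targets_shape, inverse=True)
ar_weights = np.linspace(0.5, 1.0, n_features)[:, None, None, None]
structured_loss = (ar_weights * (orig_preds - orig_targets) ** 2).mean()
print("example ar-weighted loss:", structured_loss)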