Last active: July 18, 2023 10:49
Revisions
- aravindsrinivas revised this gist on Jan 29, 2021 (1 changed file with 0 additions and 11 deletions).
- aravindsrinivas created this gist on Jan 22, 2021.
"""
MIT License

Copyright (c) 2021 Aravind Srinivas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import functools

import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu_function

BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5


def Activation(inputs, activation='relu'):
  """Only supports ReLU and SiLU/Swish."""
  assert activation in ['relu', 'silu']
  if activation == 'relu':
    return tf.nn.relu(inputs)
  else:
    return tf.nn.swish(inputs)


def BNReLU(inputs, is_training, nonlinearity=True, init_zero=False,
           activation='relu'):
  """Performs a batch normalization followed by a ReLU."""
  if init_zero:
    gamma_initializer = tf.zeros_initializer()
  else:
    gamma_initializer = tf.ones_initializer()
  inputs = tf.layers.batch_normalization(
      inputs=inputs,
      axis=3,
      momentum=BATCH_NORM_DECAY,
      epsilon=BATCH_NORM_EPSILON,
      center=True,
      scale=True,
      training=is_training,
      fused=True,
      gamma_initializer=gamma_initializer)
  if nonlinearity:
    inputs = Activation(inputs, activation=activation)
  return inputs


def fixed_padding(inputs, kernel_size):
  """Pads the input along the spatial dimensions independently of input size."""
  pad_total = kernel_size - 1
  pad_beg = pad_total // 2
  pad_end = pad_total - pad_beg
  padded_inputs = tf.pad(
      inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
  return padded_inputs


def Conv2D(inputs, *, filters, kernel_size, strides=1):
  """Strided 2-D convolution with explicit padding."""
  if strides > 1:
    inputs = fixed_padding(inputs, kernel_size)
  return tf.layers.conv2d(
      inputs=inputs,
      filters=filters,
      kernel_size=kernel_size,
      strides=strides,
      padding=('SAME' if strides == 1 else 'VALID'),
      use_bias=False,
      kernel_initializer=tf.variance_scaling_initializer(
          scale=2., mode='fan_in', distribution='untruncated_normal'))


# Functions `rel_to_abs`, `relative_logits_1d`, `relative_logits`
# and `relpos_self_attention` are fully based on
# https://github.com/tensorflow/tensor2tensor/blob/21dba2c1bdcc7ab582a2bfd8c0885c217963bb4f/tensor2tensor/layers/common_attention.py#L2225.


def rel_to_abs(x):
  """Converts relative indexing to absolute.

  Input: [bs, heads, length, 2*length - 1]
  Output: [bs, heads, length, length]
  """
  bs, heads, length, _ = x.shape
  col_pad = tf.zeros((bs, heads, length, 1), dtype=x.dtype)
  x = tf.concat([x, col_pad], axis=3)
  flat_x = tf.reshape(x, [bs, heads, -1])
  flat_pad = tf.zeros((bs, heads, length - 1), dtype=x.dtype)
  flat_x_padded = tf.concat([flat_x, flat_pad], axis=2)
  final_x = tf.reshape(
      flat_x_padded, [bs, heads, length + 1, 2 * length - 1])
  final_x = final_x[:, :, :length, length - 1:]
  return final_x
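
# Illustrative sketch (not part of the original gist): a pure-NumPy mirror of
# the rel_to_abs padding/reshape trick, with a brute-force check that
# output[b, h, i, j] equals input[b, h, i, j - i + length - 1], i.e. the logit
# of query i for relative offset j - i. The helper name is ours.
def _rel_to_abs_np_check(bs=2, heads=3, length=5, seed=0):
  """NumPy sanity check of the relative-to-absolute reindexing trick."""
  rng = np.random.RandomState(seed)
  x = rng.randn(bs, heads, length, 2 * length - 1)
  # Same steps as rel_to_abs above, in NumPy: pad one column, flatten,
  # pad length-1 entries, reshape to [length + 1, 2*length - 1], then slice.
  padded = np.concatenate([x, np.zeros((bs, heads, length, 1))], axis=3)
  flat = padded.reshape(bs, heads, -1)
  flat = np.concatenate([flat, np.zeros((bs, heads, length - 1))], axis=2)
  out = flat.reshape(bs, heads, length + 1, 2 * length - 1)
  out = out[:, :, :length, length - 1:]
  # Reference: absolute key position j sits at relative offset j - i, which is
  # stored in column (j - i) + (length - 1) of the input.
  ref = np.empty((bs, heads, length, length))
  for i in range(length):
    for j in range(length):
      ref[:, :, i, j] = x[:, :, i, j - i + length - 1]
  np.testing.assert_allclose(out, ref)
  return out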

def relative_logits_1d(*, q, rel_k, transpose_mask):
  """Computes relative logits along one dimension.

  `q`: [bs, heads, height, width, dim]
  `rel_k`: [2*width - 1, dim]
  """
  bs, heads, h, w, dim = q.shape
  rel_logits = tf.einsum('bhxyd,md->bhxym', q, rel_k)
  rel_logits = tf.reshape(rel_logits, [-1, heads * h, w, 2 * w - 1])
  rel_logits = rel_to_abs(rel_logits)
  rel_logits = tf.reshape(rel_logits, [-1, heads, h, w, w])
  rel_logits = tf.expand_dims(rel_logits, axis=3)
  rel_logits = tf.tile(rel_logits, [1, 1, 1, h, 1, 1])
  rel_logits = tf.transpose(rel_logits, transpose_mask)
  return rel_logits


def relative_logits(q):
  """Compute relative position enc logits."""
  with tf.variable_scope('relative', reuse=tf.AUTO_REUSE):
    bs, heads, h, w, dim = q.shape
    int_dim = dim.value
    # Note: the gist as originally released passed int_dim**-0.5 positionally
    # (i.e. as the mean); the intended initializer is stddev=int_dim**-0.5.
    # Relative logits in width dimension.
    rel_emb_w = tf.get_variable(
        'r_width', shape=(2 * w - 1, dim),
        dtype=q.dtype,
        initializer=tf.random_normal_initializer(stddev=int_dim**-0.5))
    rel_logits_w = relative_logits_1d(
        q=q, rel_k=rel_emb_w,
        transpose_mask=[0, 1, 2, 4, 3, 5])
    # Relative logits in height dimension.
    rel_emb_h = tf.get_variable(
        'r_height', shape=(2 * h - 1, dim),
        dtype=q.dtype,
        initializer=tf.random_normal_initializer(stddev=int_dim**-0.5))
    rel_logits_h = relative_logits_1d(
        q=tf.transpose(q, [0, 1, 3, 2, 4]),
        rel_k=rel_emb_h,
        transpose_mask=[0, 1, 4, 2, 5, 3])
    return rel_logits_h + rel_logits_w


def relpos_self_attention(*, q, k, v, relative=True, fold_heads=False):
  """2D self-attention with rel-pos. Add option to fold heads."""
  bs, heads, h, w, dim = q.shape
  int_dim = dim.value
  q = q * (int_dim ** -0.5)  # scaled dot-product
  logits = tf.einsum('bhHWd,bhPQd->bhHWPQ', q, k)
  if relative:
    logits += relative_logits(q)
  weights = tf.reshape(logits, [-1, heads, h, w, h * w])
  weights = tf.nn.softmax(weights)
  weights = tf.reshape(weights, [-1, heads, h, w, h, w])
  attn_out = tf.einsum('bhHWPQ,bhPQd->bHWhd', weights, v)
  if fold_heads:
    attn_out = tf.reshape(attn_out, [-1, h, w, heads * int_dim])
  return attn_out
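
# Illustrative shape check (not part of the original gist): run
# relpos_self_attention on toy tensors, assuming the TF 1.x graph-mode runtime
# the rest of this file targets (it relies on `Dimension.value`). Here q/k/v
# stand in for the per-head projections produced by `group_pointwise` below.
def _relpos_self_attention_shape_check():
  q = tf.zeros([2, 4, 7, 7, 32])  # [bs, heads, H, W, d_head]
  k = tf.zeros([2, 4, 7, 7, 32])
  v = tf.zeros([2, 4, 7, 7, 32])
  out = relpos_self_attention(q=q, k=k, v=v, relative=True, fold_heads=True)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Expected: (2, 7, 7, 128), i.e. (bs, H, W, heads * d_head).
    print(sess.run(out).shape)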

def absolute_logits(q):
  """Compute absolute position enc logits."""
  with tf.variable_scope('absolute', reuse=tf.AUTO_REUSE):
    bs, heads, H, W, dkh = q.shape
    int_dkh = dkh.value
    emb_w = tf.get_variable(
        'r_width', shape=(W, dkh),
        dtype=q.dtype,
        initializer=tf.random_normal_initializer(stddev=int_dkh**-0.5))
    emb_h = tf.get_variable(
        'r_height', shape=(H, dkh),
        dtype=q.dtype,
        initializer=tf.random_normal_initializer(stddev=int_dkh**-0.5))
    emb_h = emb_h[:, None, :]
    emb_w = emb_w[None, :, :]
    emb = emb_h + emb_w
    abs_logits = tf.einsum('bhxyd,pqd->bhxypq', q, emb)
    return abs_logits


def abspos_self_attention(*, q, k, v, absolute=True, fold_heads=False):
  """2D self-attention with abs-pos. Add option to fold heads."""
  bs, heads, h, w, dim = q.shape
  int_dim = dim.value
  q = q * (int_dim ** -0.5)  # scaled dot-product
  logits = tf.einsum('bhHWd,bhPQd->bhHWPQ', q, k)
  abs_logits = absolute_logits(q)
  if absolute:
    logits += abs_logits
  weights = tf.reshape(logits, [-1, heads, h, w, h * w])
  weights = tf.nn.softmax(weights)
  weights = tf.reshape(weights, [-1, heads, h, w, h, w])
  attn_out = tf.einsum('bhHWPQ,bhPQd->bHWhd', weights, v)
  if fold_heads:
    attn_out = tf.reshape(attn_out, [-1, h, w, heads * int_dim])
  return attn_out


def group_pointwise(featuremap, proj_factor=1, name='grouppoint', heads=4,
                    target_dimension=None):
  """1x1 conv with heads."""
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    in_channels = featuremap.shape[-1]
    if target_dimension is not None:
      proj_channels = target_dimension // proj_factor
    else:
      proj_channels = in_channels // proj_factor
    w = tf.get_variable(
        'w', [in_channels, heads, proj_channels // heads],
        dtype=featuremap.dtype,
        initializer=tf.random_normal_initializer(stddev=0.01))
    out = tf.einsum('bHWD,Dhd->bhHWd', featuremap, w)
    return out


def MHSA(featuremap, *, heads, bottleneck_dimension, pos_enc_type='relative',
         use_pos=True):
  """Multi-Head Self-Attention."""
  q = group_pointwise(
      featuremap, proj_factor=1, name='q_proj', heads=heads,
      target_dimension=bottleneck_dimension)
  k = group_pointwise(
      featuremap, proj_factor=1, name='k_proj', heads=heads,
      target_dimension=bottleneck_dimension)
  v = group_pointwise(
      featuremap, proj_factor=1, name='v_proj', heads=heads,
      target_dimension=bottleneck_dimension)
  assert pos_enc_type in ['relative', 'absolute']
  if pos_enc_type == 'relative':
    o = relpos_self_attention(
        q=q, k=k, v=v, relative=use_pos, fold_heads=True)
  else:
    o = abspos_self_attention(
        q=q, k=k, v=v, absolute=use_pos, fold_heads=True)
  return o


def BoT_Block(featuremap, is_training=False, heads=4, proj_factor=4,
              activation='relu', pos_enc_type='relative', name='all2all',
              strides=1, target_dimension=2048):
  """Bottleneck Transformer (BoT) Block."""
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    shortcut = featuremap
    in_dimension = featuremap.shape[-1]
    # Projection shortcut when the resolution or channel count changes.
    if strides != 1 or in_dimension != target_dimension:
      shortcut = Conv2D(
          shortcut, filters=target_dimension, kernel_size=1, strides=strides)
      shortcut = BNReLU(
          shortcut, is_training, activation=activation, nonlinearity=True)

    # 1x1 conv down to the bottleneck width.
    bottleneck_dimension = target_dimension // proj_factor
    featuremap = Conv2D(
        featuremap, filters=bottleneck_dimension, kernel_size=1, strides=1)
    featuremap = BNReLU(
        featuremap, is_training, activation=activation, nonlinearity=True)
    # All-to-all self-attention in place of the ResNet 3x3 convolution.
    featuremap = MHSA(
        featuremap, heads=heads, bottleneck_dimension=bottleneck_dimension,
        pos_enc_type=pos_enc_type)
    if strides != 1:
      assert strides == 2
      featuremap = tf.keras.layers.AveragePooling2D(
          pool_size=(2, 2), strides=(2, 2), padding='same')(featuremap)
    featuremap = BNReLU(
        featuremap, is_training, activation=activation, nonlinearity=True)
    # 1x1 conv back up to target_dimension; final BN is zero-initialized.
    featuremap = Conv2D(
        featuremap, filters=target_dimension, kernel_size=1, strides=1)
    featuremap = BNReLU(
        featuremap, is_training, nonlinearity=False, init_zero=True)
    return Activation(shortcut + featuremap, activation=activation)

def BoT_Stack(featuremap, *, blocks_so_far, total_blocks, is_training=False,
              heads=4, proj_factor=4, activation='relu',
              pos_enc_type='relative', name='all2all_stack', strides=2,
              num_layers=3, target_dimension=2048):
  """c5 Blockgroup of BoT Blocks."""
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    for i in range(num_layers):
      featuremap = BoT_Block(
          featuremap,
          is_training=is_training,
          heads=heads,
          proj_factor=proj_factor,
          activation=activation,
          pos_enc_type=pos_enc_type,
          strides=strides if i == 0 else 1,
          target_dimension=target_dimension,
          name='all2all_layer_{}'.format(i))
    return featuremap
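
# Usage sketch (not part of the original gist): build the c5 BoT blockgroup on
# a hypothetical [N, 14, 14, 1024] c4 feature map, assuming a TF 1.x graph-mode
# runtime. The first block downsamples by 2 and projects to target_dimension,
# so the expected output shape is [N, 7, 7, 2048].
if __name__ == '__main__':
  c4 = tf.placeholder(tf.float32, [2, 14, 14, 1024])
  c5 = BoT_Stack(
      c4,
      blocks_so_far=0, total_blocks=3,  # bookkeeping args, unused above
      is_training=False, heads=4, proj_factor=4,
      activation='relu', pos_enc_type='relative',
      strides=2, num_layers=3, target_dimension=2048)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(c5, {c4: np.zeros((2, 14, 14, 1024), np.float32)})
    print(out.shape)  # expected: (2, 7, 7, 2048)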