Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import functools
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu_function
# Passed as `momentum` to `tf.layers.batch_normalization` in `BNReLU`.
BATCH_NORM_DECAY = 0.9
# Passed as `epsilon` to `tf.layers.batch_normalization` in `BNReLU`.
BATCH_NORM_EPSILON = 1e-5
def Activation(inputs, activation='relu'):
    """Apply the named non-linearity to `inputs`.

    Args:
      inputs: tensor to activate.
      activation: 'relu' for ReLU, or 'silu' for SiLU/Swish.

    Returns:
      The activated tensor.

    Raises:
      AssertionError: if `activation` is not one of the two supported names.
    """
    assert activation in ['relu', 'silu']
    use_relu = activation == 'relu'
    return tf.nn.relu(inputs) if use_relu else tf.nn.swish(inputs)
def BNReLU(
    inputs, is_training, nonlinearity=True,
    init_zero=False, activation='relu'):
    """Batch-normalize `inputs`, then optionally apply an activation.

    Args:
      inputs: tensor to normalize; normalization runs over axis 3
        (presumably the channel axis of an NHWC tensor -- confirm with
        callers).
      is_training: forwarded as `training` to the batch-norm layer.
      nonlinearity: when False, the normalized tensor is returned without
        any activation.
      init_zero: when True, gamma (the scale) is zero-initialized instead
        of one-initialized.
      activation: name forwarded to `Activation` ('relu' or 'silu').

    Returns:
      The normalized (and optionally activated) tensor.
    """
    gamma_initializer = (
        tf.zeros_initializer() if init_zero else tf.ones_initializer())
    normalized = tf.layers.batch_normalization(
        inputs=inputs,
        axis=3,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        center=True,
        scale=True,
        training=is_training,
        fused=True,
        gamma_initializer=gamma_initializer)
    if not nonlinearity:
        return normalized
    return Activation(normalized, activation=activation)
def fixed_padding(inputs, kernel_size):
    """Zero-pad axes 1 and 2 of a rank-4 tensor by an amount set by `kernel_size`.

    The padding depends only on `kernel_size`, never on the input's spatial
    size. A total of `kernel_size - 1` is added per padded axis; when that
    total is odd, the extra element goes at the end.

    Args:
      inputs: rank-4 tensor; axes 1 and 2 are padded (presumably height and
        width of an NHWC tensor -- confirm with callers).
      kernel_size: integer size of the convolution kernel that will follow.

    Returns:
      The padded tensor; axes 0 and 3 are left untouched.
    """
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    # One [before, after] pair per axis; both padded axes use the same pair.
    paddings = [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]
    return tf.pad(inputs, paddings)
def Conv2D(inputs, *, filters, kernel_size, strides=1):
    """Bias-free 2-D convolution that pads explicitly when striding.

    For `strides > 1` the input is first padded with `fixed_padding`. The
    convolution itself uses 'SAME' padding only when `strides == 1` and
    'VALID' otherwise.

    Args:
      inputs: input tensor for `tf.layers.conv2d`.
      filters: number of output filters.
      kernel_size: convolution kernel size.
      strides: convolution stride.

    Returns:
      The convolved tensor.
    """
    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size)
    padding_mode = 'VALID' if strides != 1 else 'SAME'
    kernel_init = tf.variance_scaling_initializer(
        scale=2., mode='fan_in', distribution='untruncated_normal')
    return tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding_mode,
        use_bias=False,
        kernel_initializer=kernel_init)
# Functions `rel_to_abs`, `relative_logits_1d`, `relative_logits`
# and `relpos_self_attention` are fully based on
# https://github.com/tensorflow/tensor2tensor/blob/21dba2c1bdcc7ab582a2bfd8c0885c217963bb4f/tensor2tensor/layers/common_attention.py#L2225.
def rel_to_abs(x):
    """Convert relative-position indexing to absolute indexing.

    Input:  [bs, heads, length, 2*length - 1]
    Output: [bs, heads, length, length]

    Implemented with zero-padding and reshapes only.
    """
    batch, num_heads, length, _ = x.shape
    # Append one zero column so every row holds 2*length entries.
    zero_col = tf.zeros((batch, num_heads, length, 1), dtype=x.dtype)
    widened = tf.concat([x, zero_col], axis=3)
    # Flatten the rows and append length-1 zeros, giving exactly
    # (length + 1) * (2*length - 1) entries per head.
    flat = tf.reshape(widened, [batch, num_heads, -1])
    zero_tail = tf.zeros((batch, num_heads, length - 1), dtype=x.dtype)
    flat = tf.concat([flat, zero_tail], axis=2)
    # Re-reading the buffer with rows of 2*length - 1 entries shifts each
    # row by one relative to the previous one.
    skewed = tf.reshape(
        flat, [batch, num_heads, length + 1, 2 * length - 1])
    return skewed[:, :, :length, length - 1:]
def relative_logits_1d(*, q, rel_k, transpose_mask):
    """Compute relative logits along one dimension.

    `q`: [bs, heads, height, width, dim]
    `rel_k`: [2*width - 1, dim]
    `transpose_mask`: permutation applied to the final rank-6 tensor.
    """
    _, heads, h, w, _ = q.shape
    logits = tf.einsum('bhxyd,md->bhxym', q, rel_k)
    # Fold height into the heads axis so rel_to_abs sees a rank-4 tensor.
    logits = tf.reshape(logits, [-1, heads * h, w, 2*w-1])
    logits = rel_to_abs(logits)
    logits = tf.reshape(logits, [-1, heads, h, w, w])
    # Insert a new axis at position 3 and repeat the logits h times along it.
    logits = tf.tile(tf.expand_dims(logits, axis=3), [1, 1, 1, h, 1, 1])
    return tf.transpose(logits, transpose_mask)
def relative_logits(q):
    """Compute relative position encoding logits for a 2-D query grid.

    Creates (or reuses) two variables under the 'relative' scope,
    'r_width' of shape (2*w - 1, dim) and 'r_height' of shape
    (2*h - 1, dim), and returns the sum of the width-wise and height-wise
    relative logits.

    `q`: [bs, heads, height, width, dim]
    """
    with tf.variable_scope('relative', reuse=tf.AUTO_REUSE):
        _, _, h, w, dim = q.shape
        init_value = dim.value ** -0.5

        # Note: below, the value meant as stddev is passed positionally and
        # so becomes the initializer's mean. The code is provided as is,
        # with this small error.
        # Right way: normal_initializer(stddev=int_dim**-0.5)

        # Relative logits in width dimension.
        width_emb = tf.get_variable(
            'r_width', shape=(2*w - 1, dim),
            dtype=q.dtype,
            initializer=tf.random_normal_initializer(init_value))
        width_logits = relative_logits_1d(
            q=q, rel_k=width_emb,
            transpose_mask=[0, 1, 2, 4, 3, 5])

        # Relative logits in height dimension: swap the two spatial axes of
        # q so the same 1-D routine runs along height.
        height_emb = tf.get_variable(
            'r_height', shape=(2*h - 1, dim),
            dtype=q.dtype,
            initializer=tf.random_normal_initializer(init_value))
        height_logits = relative_logits_1d(
            q=tf.transpose(q, [0, 1, 3, 2, 4]),
            rel_k=height_emb,
            transpose_mask=[0, 1, 4, 2, 5, 3])
        return height_logits + width_logits
def relpos_self_attention(
    *, q, k, v, relative=True, fold_heads=False):
    """2D self-attention with relative position logits; can fold heads.

    Args:
      q, k, v: tensors indexed as [bs, heads, height, width, dim].
      relative: when True, `relative_logits(q)` is added to the
        content-content logits.
      fold_heads: when True, the heads and dim axes of the output are
        merged into one.

    Returns:
      [bs, height, width, heads, dim], or [bs, height, width, heads * dim]
      when `fold_heads` is True.
    """
    _, heads, h, w, dim = q.shape
    # Use the plain integer for the scale: the Dimension object held in
    # `dim` does not support the power operator.
    int_dim = dim.value
    q = q * (int_dim ** -0.5)  # scaled dot-product
    logits = tf.einsum('bhHWd,bhPQd->bhHWPQ', q, k)
    if relative:
        logits += relative_logits(q)
    # Softmax jointly over all key positions: flatten (P, Q), then restore.
    weights = tf.reshape(logits, [-1, heads, h, w, h * w])
    weights = tf.nn.softmax(weights)
    weights = tf.reshape(weights, [-1, heads, h, w, h, w])
    attn_out = tf.einsum('bhHWPQ,bhPQd->bHWhd', weights, v)
    if fold_heads:
        attn_out = tf.reshape(attn_out, [-1, h, w, heads * dim])
    return attn_out
def absolute_logits(q):
    """Compute absolute position encoding logits for a 2-D query grid.

    Creates (or reuses) two variables under the 'absolute' scope,
    'r_width' of shape (w, dim) and 'r_height' of shape (h, dim). Their
    broadcast sum is one embedding per grid position, which is contracted
    with `q` over the feature axis.

    `q`: [bs, heads, height, width, dim]
    Returns: [bs, heads, height, width, height, width]
    """
    with tf.variable_scope('absolute', reuse=tf.AUTO_REUSE):
        # Grid size and feature width come from the query itself.
        _, _, h, w, dim = q.shape
        int_dim = dim.value
        # NOTE(review): as in `relative_logits`, the scale is passed
        # positionally and therefore sets the initializer's mean, not its
        # stddev. Kept unchanged here -- confirm before altering.
        emb_w = tf.get_variable(
            'r_width', shape=(w, dim),
            dtype=q.dtype,
            initializer=tf.random_normal_initializer(int_dim**-0.5))
        emb_h = tf.get_variable(
            'r_height', shape=(h, dim),
            dtype=q.dtype,
            initializer=tf.random_normal_initializer(int_dim**-0.5))
        # Broadcast-add to one embedding per (row, column): [h, w, dim].
        emb_h = emb_h[:, None, :]
        emb_w = emb_w[None, :, :]
        emb = emb_h + emb_w
        abs_logits = tf.einsum('bhxyd,pqd->bhxypq', q, emb)
        return abs_logits
def abspos_self_attention(*, q, k, v, absolute=True, fold_heads=False):
    """2D self-attention with absolute position logits; can fold heads.

    Args:
      q, k, v: tensors indexed as [bs, heads, height, width, dim].
      absolute: when True, `absolute_logits(q)` is added to the
        content-content logits. The position variables are only created
        in that case.
      fold_heads: when True, the heads and dim axes of the output are
        merged into one.

    Returns:
      [bs, height, width, heads, dim], or [bs, height, width, heads * dim]
      when `fold_heads` is True.
    """
    _, heads, h, w, dim = q.shape
    # Use the plain integer for the scale: the Dimension object held in
    # `dim` does not support the power operator.
    int_dim = dim.value
    q = q * (int_dim ** -0.5)  # scaled dot-product
    logits = tf.einsum('bhHWd,bhPQd->bhHWPQ', q, k)
    if absolute:
        logits += absolute_logits(q)
    # Softmax jointly over all key positions: flatten (P, Q), then restore.
    weights = tf.reshape(logits, [-1, heads, h, w, h * w])
    weights = tf.nn.softmax(weights)
    weights = tf.reshape(weights, [-1, heads, h, w, h, w])
    attn_out = tf.einsum('bhHWPQ,bhPQd->bHWhd', weights, v)
    if fold_heads:
        attn_out = tf.reshape(attn_out, [-1, h, w, heads * dim])
    return attn_out
def group_pointwise(
    featuremap, proj_factor=1, name='grouppoint',
    heads=4, target_dimension=None):
    """1x1 convolution that splits its output channels across heads.

    Args:
      featuremap: tensor indexed as [bs, height, width, channels].
      proj_factor: integer divisor applied to the projected channel count.
      name: variable scope holding the projection weight 'w'.
      heads: number of heads the projected channels are split across.
      target_dimension: channel count to project to before dividing by
        `proj_factor`; when None, the input channel count is used.

    Returns:
      Tensor indexed as [bs, heads, height, width, channels_per_head].
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        in_channels = featuremap.shape[-1]
        base_channels = (
            in_channels if target_dimension is None else target_dimension)
        proj_channels = base_channels // proj_factor
        kernel = tf.get_variable(
            'w',
            [in_channels, heads, proj_channels // heads],
            dtype=featuremap.dtype,
            initializer=tf.random_normal_initializer(stddev=0.01))
        return tf.einsum('bHWD,Dhd->bhHWd', featuremap, kernel)
def MHSA(featuremap, pos_enc_type='relative', use_pos=True,
         heads=4, bottleneck_dimension=None):
    """Multi-Head Self-Attention over a 2-D feature map.

    Args:
      featuremap: tensor indexed as [bs, height, width, channels].
      pos_enc_type: 'relative' or 'absolute' position encoding.
      use_pos: whether position logits are added to the attention logits.
      heads: number of attention heads.
      bottleneck_dimension: total channel count of each q/k/v projection;
        when None, the input channel count of `featuremap` is used.

    Returns:
      The attention output with heads folded into the channel axis.

    Raises:
      AssertionError: if `pos_enc_type` is not a supported name.
    """
    assert pos_enc_type in ['relative', 'absolute']
    q = group_pointwise(
        featuremap, proj_factor=1, name='q_proj', heads=heads,
        target_dimension=bottleneck_dimension)
    k = group_pointwise(
        featuremap, proj_factor=1, name='k_proj', heads=heads,
        target_dimension=bottleneck_dimension)
    v = group_pointwise(
        featuremap, proj_factor=1, name='v_proj', heads=heads,
        target_dimension=bottleneck_dimension)
    if pos_enc_type == 'relative':
        o = relpos_self_attention(
            q=q, k=k, v=v, relative=use_pos, fold_heads=True)
    else:
        o = abspos_self_attention(
            q=q, k=k, v=v, absolute=use_pos, fold_heads=True)
    return o


def BoT_Block(
    featuremap, is_training=False,
    heads=4, proj_factor=4,
    activation='relu',
    pos_enc_type='relative',
    name='all2all', strides=1,
    target_dimension=2048):
    """Bottleneck Transformer (BoT) Block.

    Structure: 1x1 conv down to `target_dimension // proj_factor` channels,
    multi-head self-attention, optional 2x2 average pooling (stride 2),
    1x1 conv back up to `target_dimension`, and a residual add with the
    shortcut branch followed by the activation.

    Args:
      featuremap: input tensor; its last axis is treated as channels.
      is_training: forwarded to every batch-norm layer.
      heads: number of attention heads.
      proj_factor: bottleneck reduction factor.
      activation: 'relu' or 'silu'.
      pos_enc_type: 'relative' or 'absolute'.
      name: variable scope for the block.
      strides: 1 or 2; 2 downsamples via average pooling after attention.
      target_dimension: number of output channels.

    Returns:
      The block output tensor.

    Raises:
      AssertionError: if `strides` is neither 1 nor 2.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        shortcut = featuremap
        in_dimension = featuremap.shape[-1]
        # Project the shortcut only when its shape would not match the
        # main branch's output.
        if strides != 1 or in_dimension != target_dimension:
            shortcut = Conv2D(
                shortcut, filters=target_dimension, kernel_size=1,
                strides=strides)
            shortcut = BNReLU(
                shortcut, is_training, activation=activation,
                nonlinearity=True)

        bottleneck_dimension = target_dimension // proj_factor
        featuremap = Conv2D(
            featuremap, filters=bottleneck_dimension, kernel_size=1,
            strides=1)
        featuremap = BNReLU(
            featuremap, is_training, activation=activation,
            nonlinearity=True)

        # Forward the head count and bottleneck width explicitly so the
        # attention layer matches this block's configuration.
        featuremap = MHSA(
            featuremap, pos_enc_type=pos_enc_type,
            heads=heads, bottleneck_dimension=bottleneck_dimension)
        if strides != 1:
            assert strides == 2
            featuremap = tf.keras.layers.AveragePooling2D(
                pool_size=(2, 2), strides=(2, 2), padding='same')(featuremap)
        featuremap = BNReLU(
            featuremap, is_training, activation=activation,
            nonlinearity=True)

        featuremap = Conv2D(
            featuremap, filters=target_dimension,
            kernel_size=1, strides=1)
        # Zero-initialized gamma and no activation on the last batch norm
        # of the main branch.
        featuremap = BNReLU(
            featuremap, is_training, nonlinearity=False, init_zero=True)

        return Activation(shortcut + featuremap, activation=activation)
def BoT_Stack(
    featuremap, *,
    blocks_so_far,
    total_blocks,
    is_training=False,
    heads=4, proj_factor=4,
    activation='relu',
    pos_enc_type='relative',
    name='all2all_stack',
    strides=2, num_layers=3,
    target_dimension=2048):
    """c5 block group: `num_layers` BoT blocks applied in sequence.

    Only the first block uses `strides`; every later block uses stride 1.
    Each block lives in its own 'all2all_layer_<i>' scope under `name`.

    Args:
      featuremap: input tensor.
      blocks_so_far: required keyword; not read by this function.
      total_blocks: required keyword; not read by this function.
      is_training, heads, proj_factor, activation, pos_enc_type,
        target_dimension: forwarded unchanged to every `BoT_Block`.
      name: variable scope for the whole stack.
      strides: stride of the first block.
      num_layers: number of blocks in the stack.

    Returns:
      The output tensor of the last block.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_layers):
            block_strides = strides if layer_idx == 0 else 1
            block_name = 'all2all_layer_{}'.format(layer_idx)
            featuremap = BoT_Block(
                featuremap,
                is_training=is_training,
                heads=heads,
                proj_factor=proj_factor,
                activation=activation,
                pos_enc_type=pos_enc_type,
                strides=block_strides,
                target_dimension=target_dimension,
                name=block_name)
        return featuremap
@lartpang

This comment has been minimized.

Copy link

@lartpang lartpang commented Jan 29, 2021

This is a very solid work.
But I am very confused about the way the relative position encoding logits are used here. Why are they set up like this?
Can you give a more detailed explanation?

@ShoufaChen

This comment has been minimized.

Copy link

@ShoufaChen ShoufaChen commented Jan 31, 2021

A PyTorch implementation of botnet.py.

@leondgarse

This comment has been minimized.

Copy link

@leondgarse leondgarse commented Feb 3, 2021

A Keras version on tensorflow 2.4.0 botnet.py

  • A layers.Layer class MHSAWithRelativePosition implemented based on keras.layers.MultiHeadAttention.
  • bot_block based on keras.applications.ResNet50.
  • BotNet50 / BotNet101 / BotNet152 based on keras.applications.ResNet50 / ResNet101 / ResNet152.
  • Only relative position.
@Cyril9227

This comment has been minimized.

Copy link

@Cyril9227 Cyril9227 commented Feb 8, 2021

A Keras version on tensorflow 2.4.0 botnet.py

  • A layers.Layer class MHSAWithRelativePosition implemented based on keras.layers.MultiHeadAttention.
  • bot_block based on keras.applications.ResNet50.
  • BotNet50 / BotNet101 / BotNet152 based on keras.applications.ResNet50 / ResNet101 / ResNet152.
  • Only relative position.

Great work, thanks for sharing. Do you have any pretrained models available? What training strategy do you recommend? Should we use Swish or ReLU activation for optimal accuracy in image classification?

@leondgarse

This comment has been minimized.

Copy link

@leondgarse leondgarse commented Feb 8, 2021

@Cyril9227 Sorry, but I don't have pretrained models either. All my understanding comes from the article. I think the article shows that strides=1, activation="swish" works better. Other strategies, like using the SGD optimizer with weight_decay, are also recommended in the article.

@axhiao

This comment has been minimized.

Copy link

@axhiao axhiao commented Feb 19, 2021

A PyTorch implementation of botnet.py.

Thanks!

@BIGBALLON

This comment has been minimized.

Copy link

@BIGBALLON BIGBALLON commented Mar 16, 2021

A pytorch version: https://github.com/BIGBALLON/distribuuuu/blob/master/distribuuuu/models/botnet.py,

The results (the model trained by distribuuuu) :

model epoch total batch lr policy base lr Acc@1 Acc@5 model / config
resnet18 100 256 (32*8GPUs) cos 0.2 70.902 89.894 Google Drive / cfg
resnet18 100 1024 (128*8GPUs) cos 0.8 70.994 89.892
resnet18 100 8192 (128*64GPUs) cos 6.4 70.165 89.374
resnet18 100 16384 (256*64GPUs) cos 12.8 68.766 88.381
resnet50 100 256 (32*8GPUs) cos 0.2 77.252 93.430 Google Drive / cfg
botnet50 100 256 (32*8GPUs) cos 0.2 77.604 93.682 Google Drive / cfg

Training log: https://gist.github.com/BIGBALLON/3d53c81b2b11ea5dd66417c2a985cd89

@mickvdspoel

This comment has been minimized.

Copy link

@mickvdspoel mickvdspoel commented May 1, 2021

When will you release the pretrained model?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment