Scaled dot product attention for Transformer
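The function computes the attention mechanism defined in "Attention Is All You Need": Attention(Q, K, V) = softmax(Q·Kᵀ / √d_k)·V, where d_k is the key dimension.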
import tensorflow as tf

def scaled_dot_product_attention(queries, keys, values, mask):
    # Calculate the dot product QK^T
    product = tf.matmul(queries, keys, transpose_b=True)
    # Get the scale factor: the key dimension d_k
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    # Scale the dot product by sqrt(d_k)
    scaled_product = product / tf.math.sqrt(keys_dim)
    # Apply masking when required: positions where mask == 1 receive a large
    # negative score so that softmax assigns them near-zero weight
    if mask is not None:
        scaled_product += (mask * -1e9)
    # Softmax over the key axis, then take the weighted sum of the values
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    return attention
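A minimal usage sketch; the tensor shapes and the look-ahead mask below are illustrative assumptions, not part of the original gist:

# Batch of 2 sequences, 5 tokens each, model dimension 64 (illustrative shapes)
queries = tf.random.normal((2, 5, 64))
keys = tf.random.normal((2, 5, 64))
values = tf.random.normal((2, 5, 64))

# Look-ahead mask: 1 marks future positions to hide, matching the mask * -1e9 convention
look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((5, 5)), -1, 0)

attention = scaled_dot_product_attention(queries, keys, values, look_ahead_mask)
print(attention.shape)  # (2, 5, 64)

Passing mask=None skips the masking step, which is what encoder self-attention without padding would use.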
Where does tf come from?

tf is the conventional alias for TensorFlow; it is typically imported in Python as:
import tensorflow as tf