Generate and visualize positional encodings
import math
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set()
def plot_pe(PE):
    """Show the positional encoding matrix as an image (positions x dimensions)."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.imshow(PE)
    ax.grid(False)
    ax.set_xlabel('Embedding dimension')
    ax.set_ylabel('Sequence position')
emb_dim = 300
max_len = 128

# Sinusoidal positional encoding: even dimensions use sine, odd dimensions use
# cosine, with a wavelength that grows geometrically with the dimension index.
PE = np.zeros((max_len, emb_dim))
for pos_i in range(max_len):
    for dim_j in range(emb_dim):
        if dim_j % 2 == 0:
            PE[pos_i][dim_j] = math.sin(pos_i / (10000 ** (2 * (dim_j // 2) / emb_dim)))
        else:
            PE[pos_i][dim_j] = math.cos(pos_i / (10000 ** (2 * (dim_j // 2) / emb_dim)))

plot_pe(PE)
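# The same matrix can also be built without Python loops. A minimal vectorized
# sketch (the names `positions`, `div_term`, `angles`, and `PE_vec` are
# illustrative, not part of the original gist); np.allclose just checks that
# both constructions agree.
positions = np.arange(max_len)[:, np.newaxis]                   # (max_len, 1)
div_term = 10000 ** (2 * (np.arange(emb_dim) // 2) / emb_dim)   # (emb_dim,)
angles = positions / div_term                                   # (max_len, emb_dim)
PE_vec = np.zeros((max_len, emb_dim))
PE_vec[:, 0::2] = np.sin(angles[:, 0::2])   # even dimensions: sine
PE_vec[:, 1::2] = np.cos(angles[:, 1::2])   # odd dimensions: cosine
assert np.allclose(PE, PE_vec)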
# Different dimensions describe different relations across the input sequence.
# The final dimensions (e.g., dimension 280) give a value that increases monotonically
# with the input position, which encodes the relation that nearby words get more
# similar encodings than far-away words.
# The earlier dimensions (e.g., dimension 10), however, describe a more complicated,
# periodic relation: words that are roughly ~10 positions apart get similar encodings.
# So different dimensions of the encoding describe different relations between word
# positions, allowing the model to take a word's position in the sequence into account.
# Convince yourself of the above by plotting the positional encoding for different dimensions:
plt.plot(PE[:, 280], label='dimension 280')
plt.plot(PE[:, 10], label='dimension 10')
plt.xlabel('Sequence position')
plt.legend()
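# To see that nearby positions get more similar encodings overall, compare one
# position's full encoding against every other position's via a dot product
# (a sketch; the reference position 64 is an arbitrary illustrative choice).
ref_pos = 64
similarity = PE @ PE[ref_pos]   # dot product of every row of PE with row `ref_pos`
plt.figure()
plt.plot(similarity)
plt.xlabel('Sequence position')
plt.ylabel(f'Dot product with position {ref_pos}')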