When looking for the most degenerate word in a corpus tagged with classes, this simple script shows that minimizing the coefficient of variation of the per-class counts is equivalent to maximizing the entropy of the class distribution given by the maximum-likelihood approximation of the Multinomial/Dirichlet conjugate scheme.
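A sketch of why the two criteria agree (my own notation, not from the gist): with k = 4 classes and p_i = N_i / N_tot,

\[ H(p) = -\sum_{i=1}^{k} p_i \log_2 p_i \le \log_2 k, \quad \text{with equality iff } p_i = \tfrac{1}{k} \]
\[ \sum_{i=1}^{k} \left( \frac{N_i - \bar{N}}{\bar{N}} \right)^2 \ge 0, \quad \text{with equality iff } N_i = \bar{N} \]

Since p_i = 1/k holds exactly when every N_i equals the mean \bar{N}, both criteria are extremized by the same (most uniform) count vector.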
r"""
Simple code to show how minimizing the coefficient of variation (i.e. a quantity proportional
to the standard deviation over the mean) is equivalent to maximizing the entropy of the
probability distribution given by each class count over the total, i.e. the Dirichlet
distribution for multinomial variables approximated by the maximum likelihood :-)

Mathematical equivalence to render in LaTeX/MathJax:
\arg\max_x \left( -\sum_{i=1}^{4} \frac{N_i(x)}{N_{tot}(x)} \log\frac{N_i(x)}{N_{tot}(x)} \right)
  = \arg\min_x \left( \sum_{i=1}^{4} \left( \frac{N_i(x) - \bar{N}(x)}{\bar{N}(x)} \right)^2 \right)

@Date: 25th Jan 2022
@Author: https://gastonmazzei.github.io/
"""
import numpy as np
# Generate 10 thousand samples of counts for 4 different classes,
# e.g. "word_goes_here" : {"VERB":10, "NOUN":5, "PRONOUN":17, "ANOTHER_CLASS":0}
#
v = np.random.randint(0,300,(10000,4)).astype('float32')
# Define the logarithmic cost function, i.e. the entropy
def f_log(x):
    r"""
    Cost function: -\sum_i p_i \log(p_i)
    Using the model for the probability under which
        p_i = N_counts_i / N_total_counts
    For the example above, this is
        P_VERB          = 10 / (10 + 5 + 17 + 0)
        P_NOUN          =  5 / (10 + 5 + 17 + 0)
        P_PRONOUN       = 17 / (10 + 5 + 17 + 0)
        P_ANOTHER_CLASS =  0 / (10 + 5 + 17 + 0)
    """
    EPS = 1e-3
    # Guard against empty rows: replace a zero total with 1 to avoid dividing by zero
    NTOTS = np.where(np.sum(x, 1) > 0, np.sum(x, 1), 1)
    x[:, 0] /= NTOTS
    x[:, 1] /= NTOTS
    x[:, 2] /= NTOTS
    x[:, 3] /= NTOTS
    # Convention: 0 * log(0) = 0, hence the EPS threshold below
    return -(
        np.where(np.abs(x[:, 0]) > EPS, x[:, 0] * np.log2(x[:, 0]), 0) +
        np.where(np.abs(x[:, 1]) > EPS, x[:, 1] * np.log2(x[:, 1]), 0) +
        np.where(np.abs(x[:, 2]) > EPS, x[:, 2] * np.log2(x[:, 2]), 0) +
        np.where(np.abs(x[:, 3]) > EPS, x[:, 3] * np.log2(x[:, 3]), 0)
    )
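# Quick sanity check (an illustrative addition, not part of the original gist):
# a uniform count vector attains the maximum entropy log2(4) = 2 bits, while a
# fully concentrated one attains 0 bits. Note that np.where still evaluates
# log2(0), so NumPy may emit RuntimeWarnings here; the results are unaffected.
assert np.isclose(f_log(np.array([[1., 1., 1., 1.]]))[0], 2.0)
assert np.isclose(f_log(np.array([[4., 0., 0., 0.]]))[0], 0.0)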
# Define the coefficient-of-variation cost function
def f_deviation(x):
    r"""
    Cost function: \sum_i (N_counts_i - N_mean)^2 / N_mean^2
    Strictly speaking the coefficient of variation is proportional to the square
    root of this, but the square root is monotonic so the minimizer is the same.
    """
    MEAN = np.mean(x, 1)
    x[:, 0] -= MEAN
    x[:, 1] -= MEAN
    x[:, 2] -= MEAN
    x[:, 3] -= MEAN
    return np.sum(x**2, 1) / MEAN**2
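# Mirror sanity check for the deviation cost (an illustrative addition):
# a uniform count vector yields 0, a fully concentrated one a strictly
# positive value, so the minimizer is again the most uniform row.
assert np.isclose(f_deviation(np.array([[1., 1., 1., 1.]]))[0], 0.0)
assert f_deviation(np.array([[4., 0., 0., 0.]]))[0] > 0.0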
# Compute the entropy cost function 'for each word'; note that both cost
# functions modify their input in place, hence the .copy()
A = f_log(v.copy())
# Compute the coefficient-of-variation cost function 'for each word'
B = f_deviation(v.copy())
# Display results on the console
print(f'According to the entropy maximization, the most degenerate word is {v[np.argmax(A)]}, with array index {np.argmax(A)}')
print(f'According to the coefficient of variation minimization, the most degenerate word is {v[np.argmin(B)]}, with array index {np.argmin(B)}')
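# Optional consistency check (an illustrative addition): both criteria should
# typically select the same row, namely the most uniform count vector.
print('Both criteria picked the same word:', np.argmax(A) == np.argmin(B))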