#!/usr/bin/python
"""
I have written this script to see if the optimal policy solution obtained by Linear Programming is same as the one that would be obtained by a Gradient based method
The Gradient based approach is implemented with Tensorflow and LP with a scipy library.
I found that n of the nk constraints (greater than or equal to) for the solution become equalities for the solution obtained by LinProg and that is not so in the case of Gradient-based approach. This shows that the constraints do not sufficiently specify the solution, but works in case of LP because of the way it finds the solution.
TF - Tensorflow's Gradient Descent
Run: python <script> [LP|TF]
"""
import tensorflow as tf
import numpy as np
import sys
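# Problem setup (built just below): a random MDP with n states and k actions.
#   T[i][a][j] : probability of reaching state j after taking action a in state i
#   R[i][a][j] : reward collected on that transition
#   g          : discount factor
# The optimal values V*(i) satisfy, for every state i and action a,
#     V*(i) >= sum_j T[i][a][j] * (R[i][a][j] + g * V*(j)),
# with equality for the action that is optimal in state i.  Both branches below
# encode these n*k inequalities over the variables v_0, ..., v_{n-1}: the LP branch
# hands them to scipy's linprog, while the TF branch folds any violation into the
# objective as a large penalty.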
n, k = 3, 3  # number of states, number of actions
np.random.seed(2)
T = np.reshape([np.random.sample() for i in range(n*k*n)], [n, k, n])  # transition probabilities T[i][a][j]
R = np.random.standard_normal([n, k, n])                               # rewards R[i][a][j]
T = T/np.expand_dims(np.sum(T, axis=2), axis=2)  # normalise so that each T[i][a] sums to 1
assert np.all(np.sum(T, axis=2) > 0.99), "T array does not have valid probabilities"
g = .7  # discount factor
alg = sys.argv[1] if len(sys.argv) > 1 else ''
algs = {'LP', 'TF'}
solve_by = alg if alg in algs else 'LP'  # default to the LP solver
if solve_by == 'TF':
    # One scalar variable per state value v_i.
    vs = [tf.get_variable("v%d" % i, shape=[], initializer=tf.random_normal_initializer)
          for i in range(n)]
    obj = sum(vs)
    BIG = 1000  # penalty weight for violated constraints
    sms = []
    for i in range(n):
        for a in range(k):
            # sm = sum_j T[i][a][j]*(R[i][a][j] + g*v_j) - v_i; the constraint is sm <= 0.
            sm = 0
            for j in range(n):
                sm += T[i][a][j]*(R[i][a][j] + g*vs[j])
            sm -= vs[i]
            sms.append(sm)
            obj -= tf.maximum(0., BIG*sm)  # penalise violated constraints (sm > 0)
    lr = 0.1
    opt = tf.train.GradientDescentOptimizer(lr)
    op = opt.minimize(-obj)  # gradient ascent on obj = sum_i v_i minus the penalties
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    for it in range(100):
        x = sess.run([op, obj] + sms)
        if it % 10 == 0:
            print(x)
    print(sess.run(vs))
elif solve_by == 'LP':
    from scipy.optimize import linprog as lp
    # One constraint row per (state, action) pair:
    #   sum_j g*T[i][a][j]*v_j - v_i <= -sum_j T[i][a][j]*R[i][a][j]
    # i.e. v_i >= sum_j T[i][a][j]*(R[i][a][j] + g*v_j).
    A, b = [], []
    c = [-1]*n  # linprog minimises c @ v
    for i in range(n):
        for a in range(k):
            a_coeffs, b_sum = [0]*n, 0
            for j in range(n):
                a_coeffs[j] = T[i][a][j]*g
                b_sum -= T[i][a][j]*R[i][a][j]
            a_coeffs[i] -= 1
            A.append(a_coeffs)
            b.append(b_sum)
    res = lp(c, A_ub=A, b_ub=b)
    print(res)
    x = res.x
    # Constraint residuals A[i] @ x - b[i]: rows s*k .. s*k+k-1 are the k action
    # constraints for state s; a residual of ~0 means that constraint is tight.
    for s in range(n):
        print([sum([A[i][j]*x[j] for j in range(n)]) - b[i] for i in range(s*k, s*k + k)])
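    # Optional extra (a quick sketch, using the arrays defined above): the greedy
    # policy implied by the returned values, i.e. for each state the action with
    # the largest sum_j T[i][a][j]*(R[i][a][j] + g*v[j]).
    Q = np.sum(T * (R + g * np.asarray(x)), axis=2)  # Q[i][a]
    print("greedy action per state: %s" % np.argmax(Q, axis=1))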