import gym
import numpy as np
import cv2, math
import logging
import os
import scipy
from numpy import linalg as LA
from matplotlib import pyplot as plt
#%matplotlib inline # IPython/Jupyter magic; leave commented out when running as a plain script
from poleCart_RL import EpisodicAgent #get the RL agent
from poleCart_manual import expertFeatures
from cvxopt import matrix
from cvxopt import solvers
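
# Inverse reinforcement learning on CartPole (in the spirit of Abbeel & Ng's apprenticeship-learning
# projection method): starting from a random policy's feature expectations, repeatedly solve a
# max-margin QP for reward weights W, train an RL agent under W, and record the learned policy's
# feature expectations, until some learned policy comes within epsilon of the expert's.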
class irlAgent:
    def __init__(self, gymEnv, rlEpisodes, rlMaxSteps): # constructor
        self.env = gymEnv
        self.episodesRL = rlEpisodes
        self.maxStepsRL = rlMaxSteps
        self.randomPolicy = [6.33159868, 22.18457058, 24.07697606, 64.68426447, 15.92186349] # feature expectations of a random policy
        self.expertPolicy = [1.40327044, 12.06541251, 1.39011785, 15.70455323, 19.99994606] # human-generated expert: keep the pole straight
        #self.expertPolicy = [24.90898925, 66.21544503, 28.48223649, 80.00899435, 19.67509372] # human-generated expert: maximize displacement
        #self.expertPolicy = [1.38076217, 3.6306461, 0.79024451, 3.27657669, 20.99918233] # machine-generated expert: keep the pole straight
        self.epsilon = 1.0
        # dict mapping distance-to-expert -> feature expectations, seeded with the random policy
        self.policiesFE = {np.linalg.norm(np.asarray(self.expertPolicy) - np.asarray(self.randomPolicy)): self.randomPolicy}

    def getRLAgentFE(self, W): # get the feature expectations of a new policy using the RL agent
        agent = EpisodicAgent(self.env.action_space)
        return agent.reinforce(self.env, W, self.episodesRL, self.maxStepsRL) # return feature expectations

    def policyListUpdater(self, W): # update the policiesFE dict and its distance keys when a new weight vector (policy) arrives
        for i in list(self.policiesFE.keys()): # copy the keys so entries can be re-keyed while iterating
            temp = np.abs(np.dot(W, (np.asarray(self.expertPolicy) - np.asarray(self.policiesFE[i]))))
            if temp != i:
                self.policiesFE[temp] = self.policiesFE[i]
                del self.policiesFE[i]
        tempFE = self.getRLAgentFE(W)
        self.policiesFE[np.abs(np.dot(W, (np.asarray(self.expertPolicy) - np.asarray(tempFE))))] = tempFE
        #self.policiesFE[np.linalg.norm(np.asarray(self.expertPolicy) - np.asarray(tempFE))] = tempFE
        #self.policiesFE[np.dot(W, (np.asarray(self.expertPolicy) - np.asarray(tempFE)))] = tempFE

    def optimalWeightFinder(self):
        t_prev = 0
        while True:
            # optimize against the policy whose feature expectations are currently closest to the expert's
            W = self.optimization(np.matrix(self.expertPolicy) - np.matrix(self.policiesFE[min(self.policiesFE)]))
            print " Minimum distance : ", min(self.policiesFE)
            print " Closest feature expectations : ", np.matrix(self.policiesFE[min(self.policiesFE)])
            #t = np.abs(np.dot(W, np.asarray(self.expertPolicy) - np.asarray(self.policiesFE[min(self.policiesFE)])))
            t = np.dot(W, np.asarray(self.expertPolicy) - np.asarray(self.policiesFE[min(self.policiesFE)]))
            #print np.squeeze(np.asarray(np.matrix(self.expertPolicy) - np.matrix(self.policiesFE[min(self.policiesFE)])))
            if np.abs(t) <= 1.0 + self.epsilon: # stop once the margin falls below the threshold
                break
            #if np.abs(t - t_prev) < self.epsilon:
                #break
            self.policyListUpdater(W)
            t_prev = t
            print " the t value :: ", np.abs(t)
            print " the keys :: ", self.policiesFE.keys()
            print " weights ", W
        return W

    def optimization(self, difference): # max-margin QP: minimize ||w||^2 subject to w . (expertFE - policyFE) >= 1
        P = matrix(2.0 * np.eye(5), tc='d')
        q = matrix(np.zeros(5), tc='d')
        #G = matrix((np.matrix(self.expertPolicy) - np.matrix(self.randomPolicy)), tc='d')
        G = matrix(-difference, tc='d')
        h = matrix(np.array([-1]), tc='d')
        sol = solvers.qp(P, q, G, h)
        #print sol['status']
        #return sol['x']
        weights = np.squeeze(np.asarray(sol['x']))
        norm = np.linalg.norm(weights)
        weights = weights / norm # normalize the weight vector
        return weights
if __name__ == '__main__':
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    rlEpisodes = 100
    rlMaxSteps = 250
    #W = [-0.9, -0.9, -0.9, -0.9, 1]
    env = gym.make('CartPole-v0')
    irlearner = irlAgent(env, rlEpisodes, rlMaxSteps)
    #print irlearner.policiesFE
    #irlearner.policyListUpdater(W)
    #print irlearner.rlAgentFeatureExpecs(W)
    #print irlearner.expertFeatureExpecs()
    print irlearner.optimalWeightFinder()
    #print irlearner.optimization(20)
    #np.squeeze(np.asarray(M))