SARSA controller for MLE+
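The callback below is an MLE+ controller: on 'init' it loads per-zone Q matrices, the discretised state grid, and the reward vector, and on every 'normal' step it picks a setpoint nudge per zone with an epsilon-greedy policy, applies a TD update, and persists everything through userdata and .mat files. As a compact illustration of that per-zone core, here is a sketch only (the helper name rl_step is hypothetical and not part of the gist; it assumes the same (setpoint, temperature) state grid and reward vector used below):

function [Q, tsp, epsilon] = rl_step(Q, R, states, actions, z, z_new, tsp, ...
                                     epsilon, learnRate, discount, epsilonDecay)
% One per-zone step: nearest-state lookup, epsilon-greedy action, TD update.
[~, s]  = min(sum((states - repmat(z,     [size(states,1),1])).^2, 2)); % current state index
[~, s2] = min(sum((states - repmat(z_new, [size(states,1),1])).^2, 2)); % observed next state index
if rand() > epsilon
    [~, a] = max(Q(s,:));           % greedy: best known action in this state
else
    a = randi(length(actions), 1);  % explore: random action
end
% TD update towards reward plus discounted best next-state value.
Q(s,a) = Q(s,a) + learnRate * (R(s2) + discount*max(Q(s2,:)) - Q(s,a));
tsp     = tsp + actions(a);         % nudge the temperature setpoint
epsilon = epsilon * epsilonDecay;   % decay exploration over time
end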
function [eplus_in_curr, userdata] = RadiantControlFileBaseline(cmd,eplus_out_prev, eplus_in_prev, time, stepNumber, userdata)
if strcmp(cmd,'init')
addpath('./RL_lib')
epsilon1 = 0.7; % Initial value
epsilon2 = 0.7; % Initial value
epsilon3 = 0.7; % Initial value
discount = 0.8;
learnRate = 0.99;
successRate = 1;
% Temperature setpoint and actual temperature state-space definition
tsps = 15:0.2:26;         % candidate temperature setpoints (deg C)
temps = tsps;             % same discretisation for measured zone temperatures
actions = [0, -0.1, 0.1]; % hold, lower, or raise the setpoint by 0.1 deg C
% [states, R, Q] = RL_setup(tsps, temps, actions);
load Q1.mat;
load Q2.mat;
load Q3.mat;
load states.mat
load R.mat
% Q3 = Q;
% Q2 = Q;
% Q1 = Q;
% Initial (setpoint, temperature) guess for each zone
z3 = [20, 15];
z2 = [20, 15];
z1 = [20, 15];
[~, state_index3] = min(sum(abs(states - repmat(z3,[size(states,1),1])).^2, 2)); % nearest discrete state to z3
if (rand()>epsilon3) && rand()<=successRate
[~,action_idx3] = max(Q3(state_index3,:)); % Pick the action the Q matrix thinks is best!
else
action_idx3 = randi(length(actions),1); % Random action!
end
action_idx3 % no trailing semicolon: display the chosen action index for debugging
act = actions(action_idx3); % Taking chosen action
eplus_in_prev % no trailing semicolon: display the previous inputs for debugging
%% Update Q matrix (z1 has to be updated at this step)
z3_new = [20, eplus_out_prev.temp3(end)];
z2_new = [20, eplus_out_prev.temp2(end)];
z1_new = [20, eplus_out_prev.temp1(end)];
[~, new_state_index3] = min(sum(abs(states - repmat(z3_new,[size(states,1),1])).^2, 2)); % Interpolate again to find the new state the system is closest to.
%OLD_one = Q(next_state_index, action_idx)
%% Updating the action-value functions (TD target uses max over next-state actions, i.e. a Q-learning-style target); note that at initialisation all three zones are indexed with zone 3's state.
Q3(state_index3, action_idx3) = Q3(state_index3,action_idx3) + learnRate * ( R(new_state_index3) ...
+ discount*max(Q3(new_state_index3,:)) - Q3(state_index3,action_idx3));
Q2(state_index3, action_idx3) = Q2(state_index3,action_idx3) + learnRate * ( R(new_state_index3) ...
+ discount*max(Q2(new_state_index3,:)) - Q2(state_index3,action_idx3));
Q1(state_index3, action_idx3) = Q1(state_index3,action_idx3) + learnRate * ( R(new_state_index3) ...
+ discount*max(Q1(new_state_index3,:)) - Q1(state_index3,action_idx3));
userdata.currState3 = z3_new;
userdata.currState2 = z2_new;
userdata.currState1 = z1_new;
eplus_in_curr.tsp1 = 20+act;
eplus_in_curr.tsp2 = 20+act;
eplus_in_curr.tsp3 = 20+act;
userdata.old_tsp3 = eplus_in_curr.tsp3;
userdata.old_tsp2 = eplus_in_curr.tsp2;
userdata.old_tsp1 = eplus_in_curr.tsp1;
userdata.Q1 = Q1;
userdata.Q2 = Q2;
userdata.Q3 = Q3;
userdata.states = states;
userdata.R = R;
save('./RL_lib/Q1.mat','Q1');
save('./RL_lib/Q2.mat','Q2');
save('./RL_lib/Q3.mat','Q3');
save('./RL_lib/states.mat','states');
save('./RL_lib/R.mat','R');
epsilonDecay = 0.98; % Decay factor per iteration.
epsilon1 = epsilon1*epsilonDecay;
epsilon2 = epsilon2*epsilonDecay;
epsilon3 = epsilon3*epsilonDecay;
userdata.epsilon1 = epsilon1;
userdata.epsilon2 = epsilon2;
userdata.epsilon3 = epsilon3;
elseif strcmp(cmd,'normal')
z3_new = [eplus_in_prev.tsp3(end), eplus_out_prev.temp3(end)];
z2_new = [eplus_in_prev.tsp2(end), eplus_out_prev.temp2(end)];
z1_new = [eplus_in_prev.tsp1(end), eplus_out_prev.temp1(end)];
temperature = [z1_new(2),z2_new(2),z3_new(2)] % no trailing semicolon: display zone temperatures for debugging
Q3 = userdata.Q3;
Q2 = userdata.Q2;
Q1 = userdata.Q1;
addpath('./RL_lib')
%load Q.mat;
R = userdata.R;
actions = [ 0, -0.1, 0.1];
states = userdata.states;
%%
successRate = 1;
epsilon1 = userdata.epsilon1; % Restored from the previous step
epsilon2 = userdata.epsilon2; % Restored from the previous step
epsilon3 = userdata.epsilon3; % Restored from the previous step
epsilonDecay = 0.98; % Decay factor per iteration.
discount = 0.8;
learnRate = 0.99;
%% Current per-zone state (carried over from the previous step)
z3 = userdata.currState3;
z2 = userdata.currState2;
z1 = userdata.currState1;
[~, state_index3] = min(sum(abs(states - repmat(z3,[size(states,1),1])).^2, 2));
[~, state_index2] = min(sum(abs(states - repmat(z2,[size(states,1),1])).^2, 2));
[~, state_index1] = min(sum(abs(states - repmat(z1,[size(states,1),1])).^2, 2));
if (rand()>epsilon3) && rand()<=successRate
[~,action_idx3] = max(Q3(state_index3,:)); % Pick the action the Q matrix thinks is best!
else
action_idx3 = randi(length(actions),1); % Random action!
end
if (rand()>epsilon2) && rand()<=successRate
[~,action_idx2] = max(Q2(state_index2,:)); % Pick the action the Q matrix thinks is best!
else
action_idx2 = randi(length(actions),1); % Random action!
end
if (rand()>epsilon1) && rand()<=successRate
[~,action_idx1] = max(Q1(state_index1,:)); % Pick the action the Q matrix thinks is best!
else
action_idx1 = randi(length(actions),1); % Random action!
end
act3 = actions(action_idx3); % Taking chosen action (which way change TSP)
act2 = actions(action_idx2);
act1 = actions(action_idx1);
%% New state acquired
[~, new_state_index3] = min(sum(abs(states - repmat(z3_new,[size(states,1),1])).^2, 2)); % Interpolate again to find the new state the system is closest to.
OLD_Q3= Q3(state_index3, action_idx3);
Q3(state_index3, action_idx3) = Q3(state_index3,action_idx3) + learnRate * ( R(new_state_index3)...
+ discount*max(Q3(new_state_index3,:)) - Q3(state_index3,action_idx3));
Change3 = R(new_state_index3) + discount*max(Q3(new_state_index3,:)) - Q3(state_index3,action_idx3); % residual TD error after the update (likewise Change2/Change1 below)
%--------------------------------------------------------------------
[~, new_state_index2] = min(sum(abs(states - repmat(z2_new,[size(states,1),1])).^2, 2)); % Interpolate again to find the new state the system is closest to.
OLD_Q2= Q2(state_index2, action_idx2);
Q2(state_index2, action_idx2) = Q2(state_index2,action_idx2) + learnRate * ( R(new_state_index2)...
+ discount*max(Q2(new_state_index2,:)) - Q2(state_index2,action_idx2));
Change2= R(new_state_index2) + discount*max(Q2(new_state_index2,:)) - Q2(state_index2,action_idx2) ;
%--------------------------------------------------------------------
[~, new_state_index1] = min(sum(abs(states - repmat(z1_new,[size(states,1),1])).^2, 2)); % Interpolate again to find the new state the system is closest to.
OLD_Q1= Q1(state_index1, action_idx1);
Q1(state_index1, action_idx1) = Q1(state_index1,action_idx1) + learnRate * ( R(new_state_index1)...
+ discount*max(Q1(new_state_index1,:)) - Q1(state_index1,action_idx1));
Change1= R(new_state_index1) + discount*max(Q1(new_state_index1,:)) - Q1(state_index1,action_idx1) ;
% Output
indexes = [state_index1, new_state_index1; state_index2, new_state_index2; state_index3, new_state_index3] % no trailing semicolon: display old/new state indices for debugging
% epsilon
% Best_Q = discount*max(Q3(new_state_index3,:));
% ---------------------------------------------------------------------------------------------
userdata.currState3 = z3_new;
userdata.currState2 = z2_new;
userdata.currState1 = z1_new;
eplus_in_curr.tsp1 = userdata.old_tsp1 + act1;
eplus_in_curr.tsp2 = userdata.old_tsp2 + act2;
eplus_in_curr.tsp3 = userdata.old_tsp3 + act3;
userdata.old_tsp3 = eplus_in_curr.tsp3;
userdata.old_tsp2 = eplus_in_curr.tsp2;
userdata.old_tsp1 = eplus_in_curr.tsp1;
userdata.Q3 = Q3;
userdata.Q2 = Q2;
userdata.Q1 = Q1;
TSPs = [userdata.old_tsp1, userdata.old_tsp2, userdata.old_tsp3] % no trailing semicolon: display the updated setpoints for debugging
save('./RL_lib/Q1.mat','Q1');
save('./RL_lib/Q2.mat','Q2');
save('./RL_lib/Q3.mat','Q3');
epsilon1 = epsilon1*epsilonDecay;
epsilon2 = epsilon2*epsilonDecay;
epsilon3 = epsilon3*epsilonDecay;
userdata.epsilon1 = epsilon1;
userdata.epsilon2 = epsilon2;
userdata.epsilon3 = epsilon3;
end
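The Q1/Q2/Q3, states, and R matrices are loaded from .mat files produced earlier; the commented-out call [states, R, Q] = RL_setup(tsps, temps, actions) suggests how they were built. A plausible sketch of such a setup routine, assuming a grid of (setpoint, temperature) pairs and a reward that penalises deviation from a 20 deg C comfort target (the actual RL_lib implementation is not included in this gist):

function [states, R, Q] = RL_setup(tsps, temps, actions)
% Hypothetical setup: build the discrete state grid, reward vector, and Q matrix.
[TSP, T] = meshgrid(tsps, temps);           % all (setpoint, temperature) combinations
states   = [TSP(:), T(:)];                  % one row per discrete state
target   = 20;                              % assumed comfort temperature (deg C)
R        = -abs(states(:,2) - target);      % reward: negative absolute temperature error
Q        = zeros(size(states,1), length(actions)); % action values start at zero
end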