import math
import copy
import time
import random
import threading
import matplotlib
import numpy as np
from envHandler import Env
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import DQN
from utils import ReplayMemory
from utils import optimize_model
from utils import *
from vars import *
import torch.multiprocessing as mp
from torch.multiprocessing import Process, Lock, Manager, Semaphore
from multiprocessing.managers import BaseManager, NamespaceProxy
useCuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if useCuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if useCuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if useCuda else torch.ByteTensor
Tensor = FloatTensor
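
# Multi-agent DQN with action advising: each agent runs in its own process,
# trains a DQN on its copy of the environment, and can ask the other agents
# for advice. Replies are combined by a trust-weighted vote (weightedVote from
# utils), and the per-adviser trust model is updated with an eligibility trace.
# Agents communicate through a Manager-hosted shared object (message boards)
# guarded by locks; per-agent semaphores signal pending ask/give requests.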
class Agent(object):
    def __init__(self, agentType, agentNumber, shared, semGive, semAsk, locks):
        self.agentType = agentType
        self.agentNumber = agentNumber
        self.env = Env()
        self.shared = shared
        self.semGive = semGive
        self.semAsk = semAsk
        self.locks = locks
        self.model = DQN()
        self.model.train(False)
        self.trustModel = Trust()
        self.trustModel.train(False)
        self.target = copy.deepcopy(self.model)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learningRate)
        self.optimizerTrust = optim.Adam(self.trustModel.parameters(), lr=learningRateTrust)
        self.memory = ReplayMemory(10000)
        # self.adviceTaken = []
        self.stepsDone = 0
        self.totalReward = [0 for i in range(int(numEpisodes / testGap))]
        if useCuda:
            self.model.cuda()
            self.trustModel.cuda()
        temp = int(math.ceil(float(size) / tileSize)) + 1
        self.tiles = np.zeros((numTiles, temp, temp))
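
    # Epsilon-greedy action selection: exploit the DQN's Q-values with
    # probability 1 - epsThreshold (always when testing), otherwise pick one
    # of the four actions uniformly at random.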
    def chooseAction(self, state, testing):
        sample = random.random()
        epsThreshold = epsEnd + (epsStart - epsEnd) * \
            math.exp(-1. * self.stepsDone / epsDecay)
        if sample > epsThreshold or testing:
            return self.model(
                Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            return LongTensor([[random.randrange(4)]])
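
    # The shared message boards live behind a Manager proxy, so in-place
    # mutation of nested lists does not propagate; each helper copies the
    # board under a lock, edits it, and writes the whole list back.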
    def lockMsgAsk(self, i, j, val):
        self.locks["ask"].acquire()
        mAsk = copy.deepcopy(self.shared.msgAsk)
        mAsk[i][j] = val
        self.shared.msgAsk = mAsk
        self.locks["ask"].release()

    def lockMsgGive(self, i, val):
        self.locks["give"].acquire()
        msgGive = copy.deepcopy(self.shared.msgGive)
        if val == "empty":
            msgGive[i] = []
        else:
            msgGive[i].append((self.agentNumber, val))
        self.shared.msgGive = msgGive
        self.locks["give"].release()
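
    # Ask every agent that is in its advice phase for an action suggestion,
    # wait until each of them has answered, then combine the replies with a
    # trust-weighted vote. When usable advice comes back, the trust model's
    # output is accumulated into the eligibility trace.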
    def askAdvice(self, state):
        askedFrom = 0
        self.lockMsgGive(self.agentNumber, "empty")
        for i in range(numAgents):
            if self.shared.advicePhase[i] == 1 and i != self.agentNumber:
                askedFrom += 1
                self.lockMsgAsk(i, self.agentNumber, state)
                self.semGive[i].release()
        for i in range(askedFrom):
            self.semAsk[self.agentNumber].acquire()
        out = self.trustModel(
            Variable(self.convertStateType(state), volatile=False).type(FloatTensor))
        action = weightedVote(self.shared.msgGive[self.agentNumber], out.data.cpu().numpy()[0])
        if action[0] != '':
            self.lastAdviceTime = 0
            # Credit only the advisers whose suggestion matched the chosen
            # action; everyone else contributes nothing to the trace.
            multiplier = np.zeros((1, numAgents))
            for advice in self.shared.msgGive[self.agentNumber]:
                if action[0] == advice[1]:
                    multiplier[0][advice[0]] = 1.0
            multiplier = Variable(torch.from_numpy(multiplier)).type(FloatTensor)
            out = out.mul(multiplier)
            self.etrace = self.etrace + out
        return action
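
    # Answer every pending request on this agent's ask board. The first
    # request was already signalled through constGive's acquire; each further
    # request waits on the give semaphore again. Advice is given with
    # probability pGive(visit count), otherwise -1 signals a refusal.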
    def giveAdvice(self):
        c = 0
        for i in range(numAgents):
            if i == self.agentNumber or self.shared.msgAsk[self.agentNumber][i] == 0:
                continue
            if c == 1:
                self.semGive[self.agentNumber].acquire()
            c = 1
            state = self.shared.msgAsk[self.agentNumber][i]
            stateCount = getStateCount(state, self.tiles)
            pgive = pGive(stateCount)
            sample = random.random()
            if sample > pgive:
                self.lockMsgGive(i, -1)
            else:
                action = self.chooseAction(self.convertStateType(state), True)
                self.lockMsgGive(i, action[0, 0])
            self.semAsk[i].release()
            self.lockMsgAsk(self.agentNumber, i, 0)
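
    # Ask for advice with probability pAsk(visit count); if no usable advice
    # comes back (or we do not ask), fall back to the agent's own policy.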
    def getAction(self, state):
        stateCount = getStateCount(state, self.tiles)
        pask = pAsk(stateCount)
        sample = random.random()
        if sample <= pask:
            advice = self.askAdvice(state)
            if advice[0] != '':
                return LongTensor([[advice[0]]])
            else:
                return self.chooseAction(self.convertStateType(state), False)
        else:
            return self.chooseAction(self.convertStateType(state), False)
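
    # Background thread: keep serving advice requests until every agent has
    # finished training.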
    def constGive(self):
        while True:
            if self.shared.numDone == numAgents:
                break
            self.semGive[self.agentNumber].acquire()
            self.giveAdvice()
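
    # Normalize a 2-D state to roughly [-0.5, 0.5] and wrap it in a 1x2 float
    # tensor for the networks; terminal states (None) pass through unchanged.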
    def convertStateType(self, state):
        if state is None:
            return state
        state1 = [0, 0]
        state1[0] = (state[0] - (size / 2.0)) / float(size)
        state1[1] = (state[1] - (size / 2.0)) / float(size)
        state1 = np.ascontiguousarray(state1, dtype=np.float32)
        state1 = torch.from_numpy(state1).type(Tensor)
        return state1.view(1, 2)
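
    # Main training loop: start the advice-serving thread, then for every
    # episode act (own policy or advice), store transitions, train the DQN,
    # and update the trust model from the discounted return observed within
    # trustTraceSize steps of taking advice. Every testGap episodes an
    # evaluation episode is run (greedy for its first 25 steps) and its
    # return is recorded.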
    def run(self):
        if useCuda:
            torch.cuda.set_device(1)
        t = threading.Thread(target=self.constGive)
        t.start()
        for i_episode in range(numEpisodes):
            print("agent", self.agentNumber, "episode", i_episode)
            self.etrace = Variable(torch.zeros(1, numAgents)).type(FloatTensor)
            self.lastAdviceTime = 10000
            state = [0, 0]
            c = 0
            while True:
                self.lastAdviceTime += 1
                if self.stepsDone % 100 == 0:
                    self.target = copy.deepcopy(self.model)
                self.stepsDone += 1
                if self.stepsDone > noAdvice:
                    self.shared.advicePhase[self.agentNumber] = 1
                action = self.getAction(state)
                reward, nextState = self.env.getReward(state, action[0, 0], self.agentType)
                futureReward = reward
                reward = Tensor([reward])
                self.memory.push(self.convertStateType(state), action, self.convertStateType(nextState), reward)
                c += 1
                updateStateCount(state, self.tiles)
                state = nextState
                # print(state, self.agentNumber)
                optimize_model(self.model, self.memory, self.optimizer, self.target)
                if self.lastAdviceTime <= trustTraceSize:
                    if state is not None:
                        R = self.model(
                            Variable(self.convertStateType(state),
                                     volatile=True).type(FloatTensor)).data.max(1)
                        futureReward = futureReward + gamma * R[0].cpu().numpy()[0]
                    optimizeTrust(self.trustModel, self.optimizerTrust, self.etrace, futureReward)
                self.etrace = self.etrace * lambdaFactor
                if state is None:
                    break
            if (i_episode + 1) % testGap == 0:
                c = 0
                state = [0, 0]
                while True:
                    c += 1
                    if c < 25:
                        action = self.chooseAction(self.convertStateType(state), True)
                    else:
                        action = self.chooseAction(self.convertStateType(state), False)
                    reward, nextState = self.env.getReward(state, action.cpu().numpy()[0][0], self.agentType)
                    state = nextState
                    self.totalReward[int(i_episode / testGap)] += reward
                    if state is None:
                        break
                print(self.totalReward)
self.locks["done"].acquire()
self.shared.numDone += 1
self.locks["done"].release()
if self.shared.numDone == numAgents:
for i in range(numAgents):
self.semGive[i].release()
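
# State shared across all agent processes: msgAsk[i][j] holds the state agent j
# is asking agent i about (0 when empty), msgGive[i] collects (adviser, action)
# replies for agent i, advicePhase flags who may be asked, and numDone counts
# agents that have finished training. Note advicePhase starts at 1, so agents
# can be asked from the very first step.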
class MySharedClass(object):
    def __init__(self):
        self.msgAsk = [[0 for i in range(0, numAgents)] for j in range(0, numAgents)]
        self.msgGive = [[] for i in range(0, numAgents)]
        self.agentDone = [0 for i in range(0, numAgents)]
        self.advicePhase = [1 for i in range(0, numAgents)]
        self.numDone = 0


class MyManager(BaseManager):
    pass


class MyProxy(NamespaceProxy):
    _exposed_ = ('__getattribute__', '__setattr__', '__delattr__')
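
# Entry point: the 'spawn' start method is required so CUDA works in child
# processes. Each agent gets its own process; per-agent give/ask semaphores
# and a manager-hosted MySharedClass instance (exposed through MyProxy) tie
# them together.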
if __name__ == '__main__':
    mp.set_start_method('spawn')
    semGive = [Semaphore(0) for i in range(0, numAgents)]
    semAsk = [Semaphore(0) for i in range(0, numAgents)]
    MyManager.register('MySharedClass', MySharedClass, MyProxy)
    manager = MyManager()
    manager.start()
    Processes = []
    shared = manager.MySharedClass()
    locks = {
        "ask": Lock(),
        "give": Lock(),
        "done": Lock(),
    }
    agents = [Agent(1, j, shared, semGive, semAsk, locks) for j in range(numAgents)]
    for j in range(numAgents):
        t = mp.Process(target=agents[j].run, args=())
        Processes.append(t)
        t.start()
    for p in Processes:
        p.join()