@KarenWest
Created April 25, 2013 19:02
def computeQValueFromValues(self, state, action):
    """
    Compute the Q-value of action in state from the
    value function stored in self.values.
    """
    "*** YOUR CODE HERE ***"
    # For this state and action, get the list of (nextState, probability)
    # pairs for the legal transitions out of this state.
    # If the action is 'north' or 'south', this returns (northState, 1 - self.noise)
    # followed by (westState, self.noise / 2.0) and (eastState, self.noise / 2.0);
    # similarly for 'east' or 'west'. If a transition is blocked, the returned
    # next state is the same state that was passed in.
    if self.mdp.isTerminal(state):
        print "terminal state - returning None from computeQValueFromValues()"
        return None
    qVal = 0.0
    transitionStatesAndProbs = self.mdp.getTransitionStatesAndProbs(state, action)
    for newState, prob in transitionStatesAndProbs:
        # Accumulate the expected discounted value over all successor states.
        value = self.values[newState]
        reward = self.mdp.getReward(state, action, newState)
        qVal += prob * (reward + self.discount * value)
    return qVal
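For reference, the loop above is the one-step Bellman backup Q(s, a) = sum over s' of T(s, a, s') * (R(s, a, s') + discount * V(s')). A minimal standalone sketch with made-up numbers (the transition list, values, and discount below are illustrative assumptions, not taken from the project):

# Hypothetical transition list for 'north' with noise = 0.2: the intended
# successor with probability 0.8 and the two perpendicular slips at 0.1 each.
transitions = [((0, 2), 0.8), ((1, 1), 0.1), ((0, 1), 0.1)]
values = {(0, 2): 0.72, (1, 1): 0.43, (0, 1): 0.51}  # assumed V(s') values
discount = 0.9
reward = 0.0  # assumed living reward of 0, as in the default gridworld

qVal = sum(prob * (reward + discount * values[s]) for s, prob in transitions)
print qVal  # 0.8*0.648 + 0.1*0.387 + 0.1*0.459 = 0.603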
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state
    according to the values currently stored in self.values.
    You may break ties any way you see fit. Note that if
    there are no legal actions, which is the case at the
    terminal state, you should return None.
    """
    "*** YOUR CODE HERE ***"
    # ValueDict[iteration] = (bestAction, bestQVal)
    ValueDict = {}
    totalIter = self.iterations
    for i in range(totalIter):
        ValueDict[i] = (' ', 0)
        # List of possible actions from the current state.
        # NOTE: perhaps do a few iterations on paper!
        actions = self.mdp.getPossibleActions(state)
        if actions != None:
            # Build a dictionary of Q-values for taking each action from the
            # current state, then keep the action with the largest Q-value.
            qValDict = {}
            maxQvalAction = 0
            bestAction = 'north'
            for a in actions:
                # self.getQValue(state, a) delegates to computeQValueFromValues(state, a).
                qVal = self.getQValue(state, a)
                if qVal != None:
                    qValDict[a] = qVal
            for keyAction, qValForAction in qValDict.items():
                if qValForAction > maxQvalAction:
                    bestAction = keyAction
                    maxQvalAction = qValForAction
            self.values[state] = maxQvalAction
            ValueDict[i] = (bestAction, maxQvalAction)
        else:  # actions for this state came back empty
            return None
    bestAction, stateValue = ValueDict[self.iterations - 1]
    return bestAction
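The autograder's KeyError: -1 below is the last line above failing: when self.iterations is 0, the for loop never runs, ValueDict stays empty, and ValueDict[self.iterations - 1] looks up the missing key -1. More fundamentally, computeActionFromValues() is meant to be a pure one-step argmax over the current self.values; it should not loop over self.iterations and should not write to self.values (the iteration loop belongs in the agent's constructor). Also note that getPossibleActions() appears to return an empty tuple, not None, at the terminal state, so the `if actions != None` branch never catches it. A sketch of the intended shape, assuming the standard project API (untested against the grader):

def computeActionFromValues(self, state):
    # No legal actions (e.g. at the terminal state): return None.
    actions = self.mdp.getPossibleActions(state)
    if not actions:
        return None
    # Pure one-step lookahead: argmax over Q(state, a) under current values.
    # Start from -infinity so negative Q-values can still win, and never
    # fall back to a hardcoded action that may be illegal in this state.
    bestAction, bestQVal = None, float('-inf')
    for a in actions:
        qVal = self.computeQValueFromValues(state, a)
        if qVal > bestQVal:
            bestAction, bestQVal = a, qVal
    return bestAction

Initializing maxQvalAction to 0 and bestAction to 'north' in the version above also means any state whose best Q-value is not positive returns 'north' even where 'north' is illegal, which is the likely trigger for the "Illegal action!" crash further down.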
'''
python gridworld.py -a value -i 5
output:
manual mode??
False
RUNNING 1 EPISODES
BEGINNING EPISODE: 1
current state - environment.getCurrentState()
Started in state: (0, 0)
Took action: north
Ended in state: (0, 1)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (0, 1)
Took action: north
Ended in state: (0, 2)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (0, 2)
Took action: east
Ended in state: (1, 2)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (1, 2)
Took action: east
Ended in state: (2, 2)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (2, 2)
Took action: east
Ended in state: (2, 1)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (2, 1)
Took action: north
Ended in state: (2, 2)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (2, 2)
Took action: east
Ended in state: (3, 2)
Got reward: 0.0
current state - environment.getCurrentState()
Started in state: (3, 2)
Took action: exit
Ended in state: TERMINAL_STATE
Got reward: 1
current state - environment.getCurrentState()
EPISODE 1 COMPLETE: RETURN WAS 0.4782969
AVERAGE RETURNS FROM START STATE: 0.4782969
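(Note: 0.4782969 = 0.9**7, i.e. the exit reward of 1 discounted once per step over the seven moves taken before the exit action, assuming the default discount of 0.9.)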
python autograder.py -q q1
output:
Starting on 4-25 at 14:33:52
Question q1
===========
*** FAIL: Exception raised: -1
***
*** Traceback (most recent call last):
*** File "/home/karen/dev/ai/reinforcement/grading.py", line 71, in grade
*** util.TimeoutFunction(getattr(gradingModule, q),300)(self) # Call the question's function
*** File "/home/karen/dev/ai/reinforcement/util.py", line 608, in __call__
*** result = self.function(*args, **keyArgs)
*** File "autograder.py", line 268, in <lambda>
*** return lambda grades: question.execute(grades)
*** File "/home/karen/dev/ai/reinforcement/testClasses.py", line 49, in execute
*** if not f(grades):
*** File "autograder.py", line 263, in <lambda>
*** return lambda grades: testCase.execute(grades, moduleDict, solutionDict)
*** File "reinforcementTestClasses.py", line 55, in execute
*** testPass, stdOutString, fileOutString = self.executeNIterations(grades, moduleDict, solutionDict, n, checkPolicy)
*** File "reinforcementTestClasses.py", line 68, in executeNIterations
*** valuesPretty, qValuesPretty, actions, policyPretty = self.runAgent(moduleDict, n)
*** File "reinforcementTestClasses.py", line 127, in runAgent
*** policy[state] = agent.computeActionFromValues(state)
*** File "valueIterationAgents.py", line 145, in computeActionFromValues
*** bestAction,stateValue = ValueDict[self.iterations - 1]
*** KeyError: -1
***
### Question q1: 0/6 ###
Finished at 14:33:52
Provisional grades
==================
Question q1: 0/6
------------------
Total: 0/6
Your grades are NOT yet registered. To register your grades you must
submit your files to the edX website. The grades obtained through the
edX website are your final grades unless your submission was not in
the spirit of the course, such as if your submission simply hardcoded
the answers to the tests. We will screen for this after the deadline.
*If you worked with a partner, you must both submit separately.*
python gridworld.py -a value -i 100 -k 10
error output:
karen@scary:~/dev/ai/reinforcement> python gridworld.py -a value -i 100 -k 10 > gridword100iters10episodes.out
Traceback (most recent call last):
File "gridworld.py", line 593, in <module>
returns += runEpisode(a, env, opts.discount, decisionCallback, displayCallback, messageCallback, pauseCallback, episode)
File "gridworld.py", line 363, in runEpisode
nextState, reward = environment.doAction(action)
File "/home/karen/dev/ai/reinforcement/gridworld.py", line 189, in doAction
(nextState, reward) = self.getRandomNextState(state, action)
File "/home/karen/dev/ai/reinforcement/gridworld.py", line 200, in getRandomNextState
successors = self.gridWorld.getTransitionStatesAndProbs(state, action)
File "/home/karen/dev/ai/reinforcement/gridworld.py", line 119, in getTransitionStatesAndProbs
raise "Illegal action!"
TypeError: exceptions must be old-style classes or derived from BaseException, not str
karen@scary:~/dev/ai/reinforcement>
'''
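The final TypeError is Python 2.6+ rejecting the string exception `raise "Illegal action!"` in the project's gridworld.py; the underlying message is that the agent returned an action that is illegal in the current state (plausibly the hardcoded 'north' default noted above). A minimal sketch of that raise statement rewritten as a proper exception object, assuming nothing else about gridworld.py:

# Sketch: replace the string exception at gridworld.py line 119.
# Old (fails on Python 2.6+):
#     raise "Illegal action!"
# New: raise an exception object so the message actually surfaces.
raise Exception("Illegal action: %r in state %r" % (action, state))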
@KarenWest (Author) commented:
Rest of the 100-iteration, 10-episode output:

manual mode??
False

RUNNING 10 EPISODES

BEGINNING EPISODE: 1

current state - environment.getCurrentState()
Started in state: (0, 0)
Took action: north
Ended in state: (1, 0)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (1, 0)
Took action: north
Ended in state: (1, 0)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (1, 0)
Took action: east
Ended in state: (1, 0)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (1, 0)
Took action: east
Ended in state: (2, 0)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (2, 0)
Took action: north
Ended in state: (3, 0)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (3, 0)
Took action: west
Ended in state: (2, 0)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (2, 0)
Took action: north
Ended in state: (2, 1)
Got reward: 0.0

current state - environment.getCurrentState()
Started in state: (2, 1)
Took action: north
Ended in state: (3, 1)
Got reward: 0.0

current state - environment.getCurrentState()

(Note: Previously this ran to completion with no crash; the answer was 0.484071118658 for returns.)
