@@ -1,9 +1,9 @@
 """Markov Decision Processes (Chapter 17)
 
 First we define an MDP, and the special case of a GridMDP, in which
-states are laid out in a 2-dimensional grid. We also represent a policy
+states are laid out in a 2-dimensional grid. We also represent a policy
 as a dictionary of {state:action} pairs, and a Utility function as a
-dictionary of {state:number} pairs. We then define the value_iteration
+dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""
 
 from utils import argmax, vector_add
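The module docstring above represents a policy as a {state: action} dictionary and a utility function as a {state: number} dictionary. As a minimal illustration of those two representations (the states, action, and numbers below are hypothetical, not taken from this commit):

    # Hypothetical two-state illustration of the representations described above.
    pi = {(0, 0): (1, 0),   # in state (0, 0) the policy chooses the "move east" action
          (1, 0): None}     # terminal state: no action
    U = {(0, 0): 0.7,       # estimated utility of state (0, 0)
         (1, 0): 1.0}       # utility of the terminal state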
@@ -17,32 +17,37 @@ class MDP:
     """A Markov Decision Process, defined by an initial state, transition model,
     and reward function. We also keep track of a gamma value, for use by
     algorithms. The transition model is represented somewhat differently from
-    the text. Instead of P(s' | s, a) being a probability number for each
+    the text. Instead of P(s' | s, a) being a probability number for each
     state/state/action triplet, we instead have T(s, a) return a
-    list of (p, s') pairs. We also keep track of the possible states,
+    list of (p, s') pairs. We also keep track of the possible states,
     terminal states, and actions for each state. [page 646]"""
 
-    def __init__(self, init, actlist, terminals, gamma=.9):
+    def __init__(self, init, actlist, terminals, transitions={}, states=set(), gamma=.9):
+        if not (0 <= gamma < 1):
+            raise ValueError("An MDP must have 0 <= gamma < 1")
+
         self.init = init
         self.actlist = actlist
         self.terminals = terminals
-        if not (0 <= gamma < 1):
-            raise ValueError("An MDP must have 0 <= gamma < 1")
+        self.transitions = transitions
+        self.states = states
         self.gamma = gamma
-        self.states = set()
         self.reward = {}
 
     def R(self, state):
-        "Return a numeric reward for this state."
+        """Return a numeric reward for this state."""
         return self.reward[state]
 
     def T(self, state, action):
-        """Transition model. From a state and an action, return a list
+        """Transition model. From a state and an action, return a list
         of (probability, result-state) pairs."""
-        raise NotImplementedError
+        if (self.transitions == {}):
+            raise ValueError("Transition model is missing")
+        else:
+            return self.transitions[state][action]
 
     def actions(self, state):
-        """Set of actions that can be performed in this state. By default, a
+        """Set of actions that can be performed in this state. By default, a
         fixed list of actions, except for terminal states. Override this
         method if you need to specialize by state."""
         if state in self.terminals:
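With the changes above, an MDP can be constructed from an explicit transition table of the form {state: {action: [(p, s'), ...]}}, and T(s, a) simply looks the pairs up instead of raising NotImplementedError. The interface consumed by the algorithms named in the module docstring is just states, actions, T, R and gamma; below is a rough sketch of the textbook Bellman-update loop written against that interface (the module's actual value_iteration may differ in detail):

    def value_iteration(mdp, epsilon=0.001):
        """Approximate the utility of every state by iterating the Bellman update.
        Uses only mdp.states, mdp.actions, mdp.T, mdp.R and mdp.gamma."""
        U1 = {s: 0 for s in mdp.states}
        while True:
            U, delta = U1.copy(), 0
            for s in mdp.states:
                # Bellman update: immediate reward plus discounted best expected utility.
                U1[s] = mdp.R(s) + mdp.gamma * max(
                    sum(p * U[s1] for (p, s1) in mdp.T(s, a)) for a in mdp.actions(s))
                delta = max(delta, abs(U1[s] - U[s]))
            if delta < epsilon * (1 - mdp.gamma) / mdp.gamma:
                return U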
@@ -53,9 +58,9 @@ def actions(self, state):
 
 class GridMDP(MDP):
 
-    """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
+    """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
     specify the grid as a list of lists of rewards; use None for an obstacle
-    (unreachable state). Also, you should specify the terminal states.
+    (unreachable state). Also, you should specify the terminal states.
     An action is an (x, y) unit vector; e.g. (1, 0) means move east."""
 
     def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
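The GridMDP interface described above is usually exercised with the 4x3 world of [Figure 17.1]. A usage sketch, assuming the constructor signature shown in this diff (the reward values, obstacle position, and terminal coordinates are the textbook's standard example, not part of this commit):

    # The 4x3 grid of [Figure 17.1]: -0.04 living reward, None for the obstacle,
    # and terminal states at the +1 and -1 cells in the rightmost column.
    grid_17_1 = GridMDP([[-0.04, -0.04, -0.04, +1],
                         [-0.04, None,  -0.04, -1],
                         [-0.04, -0.04, -0.04, -0.04]],
                        terminals=[(3, 2), (3, 1)])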