diff --git a/mdp.py b/mdp.py
index 902582b19..24bbb2a8d 100644
--- a/mdp.py
+++ b/mdp.py
@@ -1,9 +1,9 @@
 """Markov Decision Processes (Chapter 17)

 First we define an MDP, and the special case of a GridMDP, in which
-states are laid out in a 2-dimensional grid.  We also represent a policy
+states are laid out in a 2-dimensional grid. We also represent a policy
 as a dictionary of {state:action} pairs, and a Utility function as a
-dictionary of {state:number} pairs.  We then define the value_iteration
+dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""

 from utils import argmax, vector_add, print_table  # noqa
@@ -17,32 +17,37 @@ class MDP:
     """A Markov Decision Process, defined by an initial state, transition model,
     and reward function. We also keep track of a gamma value, for use by
     algorithms. The transition model is represented somewhat differently from
-    the text.  Instead of P(s' | s, a) being a probability number for each
+    the text. Instead of P(s' | s, a) being a probability number for each
     state/state/action triplet, we instead have T(s, a) return a
-    list of (p, s') pairs.  We also keep track of the possible states,
+    list of (p, s') pairs. We also keep track of the possible states,
     terminal states, and actions for each state. [page 646]"""

-    def __init__(self, init, actlist, terminals, gamma=.9):
+    def __init__(self, init, actlist, terminals, transitions={}, states=set(), gamma=.9):
+        if not (0 <= gamma < 1):
+            raise ValueError("An MDP must have 0 <= gamma < 1")
+
         self.init = init
         self.actlist = actlist
         self.terminals = terminals
-        if not (0 <= gamma < 1):
-            raise ValueError("An MDP must have 0 <= gamma < 1")
+        self.transitions = transitions
+        self.states = states
         self.gamma = gamma
-        self.states = set()
         self.reward = {}

     def R(self, state):
-        "Return a numeric reward for this state."
+        """Return a numeric reward for this state."""
         return self.reward[state]

     def T(self, state, action):
-        """Transition model.  From a state and an action, return a list
+        """Transition model. From a state and an action, return a list
         of (probability, result-state) pairs."""
-        raise NotImplementedError
+        if(self.transitions == {}):
+            raise ValueError("Transition model is missing")
+        else:
+            return self.transitions[state][action]

     def actions(self, state):
-        """Set of actions that can be performed in this state.  By default, a
+        """Set of actions that can be performed in this state. By default, a
         fixed list of actions, except for terminal states. Override this
         method if you need to specialize by state."""
         if state in self.terminals:
@@ -53,9 +58,9 @@ def actions(self, state):

 class GridMDP(MDP):

-    """A two-dimensional grid MDP, as in [Figure 17.1].  All you have to do is
+    """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
     specify the grid as a list of lists of rewards; use None for an obstacle
-    (unreachable state).  Also, you should specify the terminal states.
+    (unreachable state). Also, you should specify the terminal states.
     An action is an (x, y) unit vector; e.g. (1, 0) means move east."""

     def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
diff --git a/tests/test_mdp.py b/tests/test_mdp.py
index f5cb40510..e992d263c 100644
--- a/tests/test_mdp.py
+++ b/tests/test_mdp.py
@@ -25,3 +25,17 @@ def test_best_policy():
     assert sequential_decision_environment.to_arrows(pi) == [['>', '>', '>', '.'],
                                                              ['^', None, '^', '.'],
                                                              ['^', '>', '^', '<']]
+
+
+def test_transition_model():
+    transition_model = {
+        "A": {"a1": (0.3, "B"), "a2": (0.7, "C")},
+        "B": {"a1": (0.5, "B"), "a2": (0.5, "A")},
+        "C": {"a1": (0.9, "A"), "a2": (0.1, "B")},
+    }
+
+    mdp = MDP(init="A", actlist={"a1","a2"}, terminals={"C"}, states={"A","B","C"}, transitions=transition_model)
+
+    assert mdp.T("A","a1") == (0.3, "B")
+    assert mdp.T("B","a2") == (0.5, "A")
+    assert mdp.T("C","a1") == (0.9, "A")
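For context, a minimal usage sketch of the API this patch introduces. The two-state chain below is hypothetical (it is not part of the patch or its tests, and assumes mdp.py is importable), and it follows the class docstring's convention that each T(s, a) entry is a list of (probability, result-state) pairs; note that the new test above instead stores a single pair per action, which T() returns unchanged.

# Usage sketch for the new `transitions` parameter (hypothetical example).
from mdp import MDP

# T(s, a) entries as lists of (probability, result-state) pairs,
# per the MDP class docstring.
transitions = {
    "s0": {"stay": [(1.0, "s0")],
           "go": [(0.8, "s1"), (0.2, "s0")]},
    "s1": {"stay": [(1.0, "s1")],
           "go": [(1.0, "s1")]},
}

mdp = MDP(init="s0", actlist={"stay", "go"}, terminals={"s1"},
          transitions=transitions, states={"s0", "s1"}, gamma=0.9)

print(mdp.T("s0", "go"))  # [(0.8, 's1'), (0.2, 's0')]

# Without a transition model, T() now raises ValueError
# rather than the old NotImplementedError:
try:
    MDP(init="s0", actlist={"stay"}, terminals=set()).T("s0", "stay")
except ValueError as err:
    print(err)  # Transition model is missing

One caveat on the patch itself: transitions={} and states=set() are mutable default arguments, evaluated once at function definition time. They are harmless here because __init__ never mutates them, but defaulting to None and substituting an empty container inside the body would be the more defensive idiom; similarly, if(self.transitions == {}): would read more idiomatically as if not self.transitions:.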