@@ -1,9 +1,9 @@
 """Markov Decision Processes (Chapter 17)
 
 First we define an MDP, and the special case of a GridMDP, in which
-states are laid out in a 2-dimensional grid. We also represent a policy
+states are laid out in a 2-dimensional grid. We also represent a policy
 as a dictionary of {state:action} pairs, and a Utility function as a
-dictionary of {state:number} pairs. We then define the value_iteration
+dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""
 
 from utils import argmax, vector_add
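The module docstring above represents a policy as a {state: action} dictionary and a utility function as a {state: number} dictionary. As a minimal illustration of those two representations (the states, action, and numbers below are hypothetical, not taken from this commit):

    # Hypothetical two-state illustration of the representations described above.
    pi = {(0, 0): (1, 0),   # in state (0, 0) the policy chooses the "move east" action
          (1, 0): None}     # terminal state: no action
    U = {(0, 0): 0.7,       # estimated utility of state (0, 0)
         (1, 0): 1.0}       # utility of the terminal state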
@@ -17,32 +17,37 @@ class MDP:
     """A Markov Decision Process, defined by an initial state, transition model,
     and reward function. We also keep track of a gamma value, for use by
     algorithms. The transition model is represented somewhat differently from
-    the text. Instead of P(s' | s, a) being a probability number for each
+    the text. Instead of P(s' | s, a) being a probability number for each
     state/state/action triplet, we instead have T(s, a) return a
-    list of (p, s') pairs. We also keep track of the possible states,
+    list of (p, s') pairs. We also keep track of the possible states,
     terminal states, and actions for each state. [page 646]"""
 
-    def __init__(self, init, actlist, terminals, gamma=.9):
+    def __init__(self, init, actlist, terminals, transitions={}, states=set(), gamma=.9):
+        if not (0 <= gamma < 1):
+            raise ValueError("An MDP must have 0 <= gamma < 1")
+
         self.init = init
         self.actlist = actlist
         self.terminals = terminals
-        if not (0 <= gamma < 1):
-            raise ValueError("An MDP must have 0 <= gamma < 1")
+        self.transitions = transitions
+        self.states = states
         self.gamma = gamma
-        self.states = set()
         self.reward = {}
 
     def R(self, state):
-        "Return a numeric reward for this state."
+        """Return a numeric reward for this state."""
         return self.reward[state]
 
     def T(self, state, action):
-        """Transition model. From a state and an action, return a list
+        """Transition model. From a state and an action, return a list
         of (probability, result-state) pairs."""
-        raise NotImplementedError
+        if (self.transitions == {}):
+            raise ValueError("Transition model is missing")
+        else:
+            return self.transitions[state][action]
 
     def actions(self, state):
-        """Set of actions that can be performed in this state. By default, a
+        """Set of actions that can be performed in this state. By default, a
         fixed list of actions, except for terminal states. Override this
         method if you need to specialize by state."""
         if state in self.terminals:
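With the changes above, an MDP can be constructed from an explicit transition table of the form {state: {action: [(p, s'), ...]}}, and T(s, a) simply looks the pairs up instead of raising NotImplementedError. The interface consumed by the algorithms named in the module docstring is just states, actions, T, R and gamma; below is a rough sketch of the textbook Bellman-update loop written against that interface (the module's actual value_iteration may differ in detail):

    def value_iteration(mdp, epsilon=0.001):
        """Approximate the utility of every state by iterating the Bellman update.
        Uses only mdp.states, mdp.actions, mdp.T, mdp.R and mdp.gamma."""
        U1 = {s: 0 for s in mdp.states}
        while True:
            U, delta = U1.copy(), 0
            for s in mdp.states:
                # Bellman update: immediate reward plus discounted best expected utility.
                U1[s] = mdp.R(s) + mdp.gamma * max(
                    sum(p * U[s1] for (p, s1) in mdp.T(s, a)) for a in mdp.actions(s))
                delta = max(delta, abs(U1[s] - U[s]))
            if delta < epsilon * (1 - mdp.gamma) / mdp.gamma:
                return U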
@@ -53,9 +58,9 @@ def actions(self, state):
 
 class GridMDP(MDP):
 
-    """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
+    """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
     specify the grid as a list of lists of rewards; use None for an obstacle
-    (unreachable state). Also, you should specify the terminal states.
+    (unreachable state). Also, you should specify the terminal states.
     An action is an (x, y) unit vector; e.g. (1, 0) means move east."""
 
     def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
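The GridMDP interface described above is usually exercised with the 4x3 world of [Figure 17.1]. A usage sketch, assuming the constructor signature shown in this diff (the reward values, obstacle position, and terminal coordinates are the textbook's standard example, not part of this commit):

    # The 4x3 grid of [Figure 17.1]: -0.04 living reward, None for the obstacle,
    # and terminal states at the +1 and -1 cells in the rightmost column.
    grid_17_1 = GridMDP([[-0.04, -0.04, -0.04, +1],
                         [-0.04, None,  -0.04, -1],
                         [-0.04, -0.04, -0.04, -0.04]],
                        terminals=[(3, 2), (3, 1)])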