diff --git a/mdp.ipynb b/mdp.ipynb
index 910b49040..4c44ff9d8 100644
--- a/mdp.ipynb
+++ b/mdp.ipynb
@@ -1,7 +1,7 @@
 {
 "cells": [
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "# Markov decision processes (MDPs)\n",
@@ -10,19 +10,24 @@
 ]
 },
 {
- "cell_type": "code",
- "execution_count": 1,
+<<<<<<< HEAD
+ "cell_type": "raw",
 "metadata": {
 "collapsed": true
 },
+=======
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
 "outputs": [],
+>>>>>>> 3fed6614295b7270ca1226415beff7305e387eeb
 "source": [
 "from mdp import *\n",
 "from notebook import psource, pseudocode"
 ]
 },
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "## CONTENTS\n",
@@ -36,7 +41,7 @@
 ]
 },
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "## OVERVIEW\n",
@@ -56,7 +61,7 @@
 ]
 },
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "## MDP\n",
@@ -65,162 +70,21 @@
 ]
 },
 {
+<<<<<<< HEAD
+ "cell_type": "raw",
+ "metadata": {},
+=======
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
 "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
\n", - "class MDP:\n",
- "\n",
- " """A Markov Decision Process, defined by an initial state, transition model,\n",
- " and reward function. We also keep track of a gamma value, for use by\n",
- " algorithms. The transition model is represented somewhat differently from\n",
- " the text. Instead of P(s' | s, a) being a probability number for each\n",
- " state/state/action triplet, we instead have T(s, a) return a\n",
- " list of (p, s') pairs. We also keep track of the possible states,\n",
- " terminal states, and actions for each state. [page 646]"""\n",
- "\n",
- " def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=.9):\n",
- " if not (0 < gamma <= 1):\n",
- " raise ValueError("An MDP must have 0 < gamma <= 1")\n",
- "\n",
- " if states:\n",
- " self.states = states\n",
- " else:\n",
- " self.states = set()\n",
- " self.init = init\n",
- " self.actlist = actlist\n",
- " self.terminals = terminals\n",
- " self.transitions = transitions\n",
- " self.gamma = gamma\n",
- " self.reward = {}\n",
- "\n",
- " def R(self, state):\n",
- " """Return a numeric reward for this state."""\n",
- " return self.reward[state]\n",
- "\n",
- " def T(self, state, action):\n",
- " """Transition model. From a state and an action, return a list\n",
- " of (probability, result-state) pairs."""\n",
- " if(self.transitions == {}):\n",
- " raise ValueError("Transition model is missing")\n",
- " else:\n",
- " return self.transitions[state][action]\n",
- "\n",
- " def actions(self, state):\n",
- " """Set of actions that can be performed in this state. By default, a\n",
- " fixed list of actions, except for terminal states. Override this\n",
- " method if you need to specialize by state."""\n",
- " if state in self.terminals:\n",
- " return [None]\n",
- " else:\n",
- " return self.actlist\n",
- "
class GridMDP(MDP):\n",
- "\n",
- " """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is\n",
- " specify the grid as a list of lists of rewards; use None for an obstacle\n",
- " (unreachable state). Also, you should specify the terminal states.\n",
- " An action is an (x, y) unit vector; e.g. (1, 0) means move east."""\n",
- "\n",
- " def __init__(self, grid, terminals, init=(0, 0), gamma=.9):\n",
- " grid.reverse() # because we want row 0 on bottom, not on top\n",
- " MDP.__init__(self, init, actlist=orientations,\n",
- " terminals=terminals, gamma=gamma)\n",
- " self.grid = grid\n",
- " self.rows = len(grid)\n",
- " self.cols = len(grid[0])\n",
- " for x in range(self.cols):\n",
- " for y in range(self.rows):\n",
- " self.reward[x, y] = grid[y][x]\n",
- " if grid[y][x] is not None:\n",
- " self.states.add((x, y))\n",
- "\n",
- " def T(self, state, action):\n",
- " if action is None:\n",
- " return [(0.0, state)]\n",
- " else:\n",
- " return [(0.8, self.go(state, action)),\n",
- " (0.1, self.go(state, turn_right(action))),\n",
- " (0.1, self.go(state, turn_left(action)))]\n",
- "\n",
- " def go(self, state, direction):\n",
- " """Return the state that results from going in this direction."""\n",
- " state1 = vector_add(state, direction)\n",
- " return state1 if state1 in self.states else state\n",
- "\n",
- " def to_grid(self, mapping):\n",
- " """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""\n",
- " return list(reversed([[mapping.get((x, y), None)\n",
- " for x in range(self.cols)]\n",
- " for y in range(self.rows)]))\n",
- "\n",
- " def to_arrows(self, policy):\n",
- " chars = {\n",
- " (1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'}\n",
- " return self.to_grid({s: chars[a] for (s, a) in policy.items()})\n",
- "
def value_iteration(mdp, epsilon=0.001):\n",
- " """Solving an MDP by value iteration. [Figure 17.4]"""\n",
- " U1 = {s: 0 for s in mdp.states}\n",
- " R, T, gamma = mdp.R, mdp.T, mdp.gamma\n",
- " while True:\n",
- " U = U1.copy()\n",
- " delta = 0\n",
- " for s in mdp.states:\n",
- " U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])\n",
- " for a in mdp.actions(s)])\n",
- " delta = max(delta, abs(U1[s] - U[s]))\n",
- " if delta < epsilon * (1 - gamma) / gamma:\n",
- " return U\n",
- "
def expected_utility(a, s, U, mdp):\n",
- " """The expected utility of doing a in state s, according to the MDP and U."""\n",
- " return sum([p * U[s1] for (p, s1) in mdp.T(s, a)])\n",
- "
def policy_iteration(mdp):\n",
- " """Solve an MDP by policy iteration [Figure 17.7]"""\n",
- " U = {s: 0 for s in mdp.states}\n",
- " pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}\n",
- " while True:\n",
- " U = policy_evaluation(pi, U, mdp)\n",
- " unchanged = True\n",
- " for s in mdp.states:\n",
- " a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))\n",
- " if a != pi[s]:\n",
- " pi[s] = a\n",
- " unchanged = False\n",
- " if unchanged:\n",
- " return pi\n",
- "
def policy_evaluation(pi, U, mdp, k=20):\n",
- " """Return an updated utility mapping U from each state in the MDP to its\n",
- " utility, using an approximation (modified policy iteration)."""\n",
- " R, T, gamma = mdp.R, mdp.T, mdp.gamma\n",
- " for i in range(k):\n",
- " for s in mdp.states:\n",
- " U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])])\n",
- " return U\n",
- "
def T(self, state, action):\n",
- " if action is None:\n",
- " return [(0.0, state)]\n",
- " else:\n",
- " return [(0.8, self.go(state, action)),\n",
- " (0.1, self.go(state, turn_right(action))),\n",
- " (0.1, self.go(state, turn_left(action)))]\n",
- "
def to_arrows(self, policy):\n",
- " chars = {\n",
- " (1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'}\n",
- " return self.to_grid({s: chars[a] for (s, a) in policy.items()})\n",
- "
def to_grid(self, mapping):\n",
- " """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""\n",
- " return list(reversed([[mapping.get((x, y), None)\n",
- " for x in range(self.cols)]\n",
- " for y in range(self.rows)]))\n",
- "