
from collections import defaultdict
from utils import argmax
+from mdp import MDP, policy_evaluation

-import agents
import random


-class PassiveADPAgent(agents.Agent):
+class PassiveADPAgent:

    """Passive (non-learning) agent that uses adaptive dynamic programming
    on a given MDP and policy. [Figure 21.2]"""
-    NotImplemented
+
+    class ModelMDP(MDP):
+        """Class implementing a modified version of the input MDP with
+        an editable transition model P and a custom function T."""
+        def __init__(self, init, actlist, terminals, gamma, states):
+            super().__init__(init, actlist, terminals, gamma)
+            nested_dict = lambda: defaultdict(nested_dict)
+            # StackOverflow:whats-the-best-way-to-initialize-a-dict-of-dicts-in-python
+            self.P = nested_dict()
+
+        def T(self, s, a):
+            """Return a list of (probability, result-state) tuples for taking
+            action a in state s, based on the learnt model P."""
+            return [(prob, res) for (res, prob) in self.P[(s, a)].items()]
+
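+    # Illustration of the learnt model: if following the policy from state s with
+    # action a reached state t1 four times and t2 once, then P[(s, a)] holds
+    # {t1: 0.8, t2: 0.2}, and T(s, a) yields [(0.8, t1), (0.2, t2)].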
+    def __init__(self, pi, mdp):
+        self.pi = pi
+        self.mdp = PassiveADPAgent.ModelMDP(mdp.init, mdp.actlist,
+                                            mdp.terminals, mdp.gamma, mdp.states)
+        self.U = {}
+        self.Nsa = defaultdict(int)      # N[s, a]: visit counts for state-action pairs
+        self.Ns1_sa = defaultdict(int)   # N[s', s, a]: counts of observed outcomes
+        self.s = None
+        self.a = None
+
+    def __call__(self, percept):
+        s1, r1 = self.update_state(percept)
+        self.mdp.states.add(s1)  # Model keeps track of visited states.
+        R, P, mdp, pi = self.mdp.reward, self.mdp.P, self.mdp, self.pi
+        s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U
+
+        if s1 not in R:  # Reward is only known for visited states.
+            U[s1] = R[s1] = r1
+        if s is not None:
+            Nsa[(s, a)] += 1
+            Ns1_sa[(s1, s, a)] += 1
+            # for each t such that Ns′|sa[t, s, a] is nonzero
+            for t in [res for (res, state, act), freq in Ns1_sa.items()
+                      if (state, act) == (s, a) and freq != 0]:
+                P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]
+
+        self.U = policy_evaluation(pi, U, mdp)
+        if s1 in mdp.terminals:
+            self.s = self.a = None
+        else:
+            self.s, self.a = s1, self.pi[s1]
+        return self.a
+
+    def update_state(self, percept):
+        """To be overridden in most cases. The default case
+        assumes the percept to be of type (state, reward)."""
+        return percept
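The snippet below is a usage sketch for the class added above, not part of this change: it assumes aima-python's `sequential_decision_environment` (the 4x3 grid world defined in `mdp.py`) with its standard `MDP.R`/`MDP.T` interface, and the trial runner and fixed policy here are illustrative stand-ins written for this note rather than helpers provided by this module.

import random
from mdp import sequential_decision_environment

def run_single_trial(agent_program, mdp):
    """Feed (state, reward) percepts to the agent program for one trial."""
    def take_single_action(mdp, s, a):
        # Sample the successor state from the environment's true model T(s, a).
        x, cumulative = random.uniform(0, 1), 0.0
        for probability, state in mdp.T(s, a):
            cumulative += probability
            if cumulative > x:
                return state
        return state

    current_state = mdp.init
    while True:
        percept = (current_state, mdp.R(current_state))
        next_action = agent_program(percept)
        if next_action is None:  # terminal state reached
            break
        current_state = take_single_action(mdp, current_state, next_action)

# A fixed policy for the 4x3 world; actions are direction vectors, terminals map to None.
north, west, east = (0, 1), (-1, 0), (1, 0)
policy = {(0, 2): east, (1, 2): east, (2, 2): east, (3, 2): None,
          (0, 1): north, (2, 1): north, (3, 1): None,
          (0, 0): north, (1, 0): west, (2, 0): west, (3, 0): west}

agent = PassiveADPAgent(policy, sequential_decision_environment)
for _ in range(200):
    run_single_trial(agent, sequential_decision_environment)
print(agent.U)  # utility estimates for the states visited while following the policy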


class PassiveTDAgent: