Skip to content

Commit ba9dc72

Browse files
Implemented Passive ADP Agent
1 parent c541d31 commit ba9dc72

File tree

1 file changed

+54
-3
lines changed

1 file changed

+54
-3
lines changed

rl.py

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,67 @@
33

44
from collections import defaultdict
55
from utils import argmax
6+
from mdp import MDP, policy_evaluation
67

7-
import agents
88
import random
99

1010

11-
class PassiveADPAgent(agents.Agent):
11+
class PassiveADPAgent:
    """Passive (non-learning) agent that uses adaptive dynamic programming
    on a given MDP and policy. [Figure 21.2]"""

    class ModelMDP(MDP):
        """Class implementing a modified version of the input MDP with
        an editable transition model P and a custom function T."""

        def __init__(self, init, actlist, terminals, gamma, states):
            super().__init__(init, actlist, terminals, gamma)
            nested_dict = lambda: defaultdict(nested_dict)
            # StackOverflow:whats-the-best-way-to-initialize-a-dict-of-dicts-in-python
            self.P = nested_dict()

        def T(self, s, a):
            """Return a list of (probability, result-state) tuples for taking
            action a in state s, based on the learnt model P."""
            return [(prob, res) for (res, prob) in self.P[(s, a)].items()]

    def __init__(self, pi, mdp):
        """pi: fixed policy mapping state -> action, followed by the agent.
        mdp: the environment MDP whose parameters seed the learnt model."""
        self.pi = pi
        self.mdp = PassiveADPAgent.ModelMDP(mdp.init, mdp.actlist,
                                            mdp.terminals, mdp.gamma, mdp.states)
        self.U = {}                      # learnt utilities, keyed by state
        self.Nsa = defaultdict(int)      # visit counts for (state, action)
        self.Ns1_sa = defaultdict(int)   # transition counts for (s', s, a)
        self.s = None                    # previous state (None before first step)
        self.a = None                    # previous action (None before first step)

    def __call__(self, percept):
        """Process one (state, reward) percept: update the learnt transition
        model and utilities, then return the policy's action (None at a
        terminal state)."""
        s1, r1 = percept
        self.mdp.states.add(s1)  # Model keeps track of visited states.
        R, P, mdp, pi = self.mdp.reward, self.mdp.P, self.mdp, self.pi
        s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U

        if s1 not in R:  # Reward is only available for visited state.
            U[s1] = R[s1] = r1
        if s is not None:
            Nsa[(s, a)] += 1
            Ns1_sa[(s1, s, a)] += 1
            # For each t such that Ns'|sa[t, s, a] is nonzero, re-estimate
            # the transition probability from the observed frequencies.
            for t in [res for (res, state, act), freq in Ns1_sa.items()
                      if (state, act) == (s, a) and freq != 0]:
                P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]

        U = policy_evaluation(pi, U, mdp)
        if s1 in mdp.terminals:
            self.s = self.a = None
        else:
            self.s, self.a = s1, self.pi[s1]
        return self.a

    def update_state(self, percept):
        """To be overridden in most cases. The default case
        assumes the percept to be of type (state, reward)."""
        return percept
1667

1768

1869
class PassiveTDAgent:

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy