-
Notifications
You must be signed in to change notification settings - Fork 0
/
lake-monte-carlo.py
53 lines (39 loc) · 1.34 KB
/
lake-monte-carlo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""Every-visit Monte Carlo state-value estimation on a custom FrozenLake map.

Runs 1000 episodes under a uniform-random policy, averages the discounted
return observed from each visited state to estimate V(s), and tracks the
best-scoring episode seen.
"""
import gymnasium as gym

env = gym.make(
    "FrozenLake-v1",
    desc=["FFFF", "SHFH", "FFFH", "HFFG"],
    is_slippery=False,
    render_mode="human",
)

observation, info = env.reset()
# Observations are flat indices into the 4x4 grid; recover (row, col).
current_row, current_col = divmod(observation, 4)
print(f"{observation=}")
print(f"{current_row=}")
print(f"{current_col=}")
env.render()

state_values = dict()    # V(s): running mean of all returns observed from s
state_returns = dict()   # every discounted return G observed from each state
best_episode = None
best_episode_G = None

for _ in range(1000):
    episode = []
    while True:
        action = env.action_space.sample()
        # BUG FIX: attribute the transition to the state the action was taken
        # *from* (S_t). The original stored the successor state returned by
        # step(), so returns were credited to S_{t+1} and the start state
        # never received a value estimate.
        state = observation
        observation, reward, terminated, truncated, info = env.step(action)
        episode.append({"state": state, "action": action, "reward": reward})
        if terminated or truncated:
            observation, info = env.reset()
            break

    # Walk the episode backwards, accumulating the discounted return
    # G_t = r_{t+1} + y * G_{t+1}, and fold each G into the per-state average.
    G = 0
    y = 0.9  # discount rate: 0 = immediate rewards only, 1 = weight all equally
    for step in reversed(episode):
        G = y * G + step["reward"]
        state_returns.setdefault(step["state"], []).append(G)
        returns = state_returns[step["state"]]
        state_values[step["state"]] = sum(returns) / len(returns)

    # After the backward pass, G holds the full discounted return of the
    # episode from its first step; keep the best one seen so far.
    if best_episode_G is None or best_episode_G < G:
        best_episode = episode
        best_episode_G = G

best_episode = [step["state"] for step in best_episode]
print(f"{best_episode=}")