main.py

import random
import numpy as np
import gym
import yaml
from gym_qRacing.envs.functions import Helper

# temporarily suppress library warnings to keep the console output readable
import warnings
warnings.filterwarnings("ignore")

# initializing global logging lists
log_episodes = []
log_loss = []
log_reward = []
log_result = []


def simulate():
    #* loading global config from yaml file
    with open('racesim_config.yaml') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
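
    # A minimal sketch of the config structure this function reads from racesim_config.yaml;
    # the keys mirror those accessed below, and the values shown are placeholders only:
    #
    #   QLEARNING:
    #     ENV_EPISODES: 100        # number of training episodes
    #     ENV_MAXTRY: 200          # maximum steps per episode
    #     ENV_EPSILON: 1.0         # initial exploration rate
    #     ENV_EPSILONDECAY: 0.99   # multiplicative epsilon decay per episode
    #     ENV_LEARNINGRATE: 0.1    # learning rate (alpha) in the Q-update
    #     ENV_GAMMA: 0.9           # discount factor (gamma)
    #   LOGGING:
    #     EPISODE_INTERVAL: 10     # log every n-th episode
    #     AGENT:
    #       RESULTS: true          # print the agent's position and reward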

    #* initializing variables for Q-Learning
    episode_counter = 0
    env = gym.make("qRacing-base-v0", config=config)

    # derive the discretised observation-space size from the upper bounds of the first two
    # observation dimensions (plus one, so the bounds themselves are valid indices), then
    # allocate one Q-value per (state, action) pair
    num_box = tuple(((env.observation_space.high[0], env.observation_space.high[1]) + np.ones(env.observation_space.shape)).astype(int))
    q_table = np.zeros(num_box + (env.action_space.n,))
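
    # Note: the Q-table is indexed directly with the environment's state, which assumes the
    # observations returned by the environment are integer-valued tuples within those bounds.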

    #! The actual simulation loop, iterating over the configured number of episodes.
    for episode in range(config["QLEARNING"]["ENV_EPISODES"]):
        # logging
        Helper.global_logging(config["LOGGING"], "ENVIRONMENT", "\n[bold blue]Starting episode #{}[/bold blue]".format(episode + 1))

        # initialize environment
        state, info = env.reset()
        total_reward = 0
        t_loss = 0

        # the agent tries up to config["QLEARNING"]["ENV_MAXTRY"] steps per episode
        for t in range(config["QLEARNING"]["ENV_MAXTRY"]):
            # epsilon-greedy action selection: explore with probability epsilon,
            # otherwise exploit the best known action for the current state
            if random.uniform(0, 1) < config["QLEARNING"]["ENV_EPSILON"]:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            # execute the action and observe the result
            next_state, reward, done, _info = env.step(action)
            total_reward += reward

            # get the corresponding Q-value of the (state, action) pair
            q_value = q_table[state][action]
            best_q = np.max(q_table[next_state])

            # accumulate a simple per-episode loss measure (old Q-value minus observed reward)
            t_loss += (q_value - reward)

            # Q(state, action) <- (1 - alpha) * Q(state, action) + alpha * (reward + gamma * max Q(next_state, all actions))
            q_table[state][action] = (1 - config["QLEARNING"]["ENV_LEARNINGRATE"]) * q_value + config["QLEARNING"]["ENV_LEARNINGRATE"] * (reward + config["QLEARNING"]["ENV_GAMMA"] * best_q)

            # move to the next state for the following step
            state = next_state

            # check if the episode is finished
            if done or t >= config["QLEARNING"]["ENV_MAXTRY"] - 1:
                # only log every EPISODE_INTERVAL-th episode
                if episode_counter % config["LOGGING"]["EPISODE_INTERVAL"] == 0:
                    # log results to output
                    if config["LOGGING"]["AGENT"]["RESULTS"]:
                        Helper.global_logging(config["LOGGING"], "ENVIRONMENT", "\n[bold blue]Agent results of episode #{}[/bold blue]".format(episode + 1))
                        print("Position: %i \nReward: %f\n" % (_info["agent"]["position"], total_reward))

                # save results to the logging lists
                log_loss.append([episode_counter, t_loss])
                log_reward.append([episode_counter, total_reward])
                #log_result.append([episode_counter, info[0]])
                episode_counter += 1
                break

        # exploration-rate decay
        if config["QLEARNING"]["ENV_EPSILON"] >= 0.005:
            config["QLEARNING"]["ENV_EPSILON"] *= config["QLEARNING"]["ENV_EPSILONDECAY"]

if __name__ == "__main__":
    simulate()