AC_Agent.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 9 16:53:08 2022
@author: Abhilash
"""
import random
from collections import deque
import numpy as np
import tensorflow as tf
from tensorflow.keras.activations import softmax
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from utils import Portfolio
# TensorFlow GPU configuration: allow GPU memory growth and run in TF1 graph mode (eager execution disabled below)
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)
tf.compat.v1.disable_eager_execution()
# DDPG (Lillicrap et al., 2015): https://arxiv.org/pdf/1509.02971.pdf
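#
# For reference, a sketch of the DDPG-style updates the classes below implement
# (notation: mu = actor/policy, Q = critic, primed symbols denote target networks):
#   critic target : y = r + gamma * Q'(s', mu'(s'))
#   critic loss   : L = mean((Q(s, a) - y)^2)                    -> train_on_batch with MSE
#   actor update  : grad_theta J ~ mean(dQ(s, a)/da |_{a=mu(s)} * dmu(s)/dtheta)
#                                                                -> tf.gradients with the critic's action gradients
#   target update : theta' <- tau * theta + (1 - tau) * theta'   (soft update in train_target)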
class ActorNetwork:
    """Policy network mu(s): maps a state to a probability distribution over the actions."""

    def __init__(self, sess, state_size, action_dim, buffer_size, tau, learning_rate, is_eval=False, model_name=""):
        self.sess = sess
        self.tau = tau
        self.learning_rate = learning_rate
        self.action_dim = action_dim
        self.model, self.states = self.create_actor_network(state_size, action_dim)
        self.model_target, self.target_state = self.create_actor_network(state_size, action_dim)
        self.model_target.set_weights(self.model.get_weights())

        # Placeholder for dQ(s, a)/da supplied by the critic.
        self.action_gradient = tf.compat.v1.placeholder(tf.float32, [None, action_dim])
        # Chain rule: da/dtheta * dQ(s, a)/da (action_gradients); the minus sign turns gradient
        # ascent on Q into gradient descent, and 1/buffer_size averages over the sampled batch.
        self.sampled_policy_grad = tf.gradients(self.model.output / buffer_size,
                                                self.model.trainable_weights,
                                                -self.action_gradient)
        self.update_actor_policy = Adam(learning_rate=learning_rate).apply_gradients(
            zip(self.sampled_policy_grad, self.model.trainable_weights))

    def train(self, states_batch, action_grads_batch):
        self.sess.run(self.update_actor_policy,
                      feed_dict={self.states: states_batch, self.action_gradient: action_grads_batch})

    def train_target(self):
        # Soft update: theta_target <- tau * theta + (1 - tau) * theta_target
        actor_weights = self.model.get_weights()
        actor_target_weights = self.model_target.get_weights()
        for i in range(len(actor_weights)):
            actor_target_weights[i] = self.tau * actor_weights[i] + (1 - self.tau) * actor_target_weights[i]
        self.model_target.set_weights(actor_target_weights)

    def create_actor_network(self, state_size, action_dim):
        states = Input(shape=[state_size])
        h0 = Dense(24, activation='relu')(states)
        h1 = Dense(48, activation='relu')(h0)
        h2 = Dense(24, activation='relu')(h1)
        actions = Dense(self.action_dim, activation='softmax')(h2)
        model = Model(inputs=states, outputs=actions)
        return model, states
class CriticNetwork:
    """Q-network Q(s, a): scores state-action pairs and provides dQ/da for the actor update."""

    def __init__(self, sess, state_size, action_dim, tau, learning_rate, is_eval=False, model_name=""):
        self.sess = sess
        self.tau = tau
        self.learning_rate = learning_rate
        self.action_dim = action_dim
        self.model, self.actions, self.states = self.create_critic_network(state_size, action_dim)
        self.model_target, self.target_action, self.target_state = self.create_critic_network(state_size, action_dim)
        # Start the target critic from the same weights as the online critic (mirrors the actor above).
        self.model_target.set_weights(self.model.get_weights())
        # Gradient of the critic's output with respect to the action input, consumed by the actor update.
        self.action_grads = tf.gradients(self.model.output, self.actions)

    def gradients(self, states_batch, actions_batch):
        return self.sess.run(self.action_grads,
                             feed_dict={self.states: states_batch, self.actions: actions_batch})[0]

    def train_target(self):
        # Soft update: theta_target <- tau * theta + (1 - tau) * theta_target
        critic_weights = self.model.get_weights()
        critic_target_weights = self.model_target.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.tau * critic_weights[i] + (1 - self.tau) * critic_target_weights[i]
        self.model_target.set_weights(critic_target_weights)

    def create_critic_network(self, state_size, action_dim):
        states = Input(shape=[state_size])
        actions = Input(shape=[action_dim])
        h0 = Concatenate()([states, actions])
        h1 = Dense(24, activation='relu')(h0)
        h2 = Dense(48, activation='relu')(h1)
        h3 = Dense(24, activation='relu')(h2)
        Q = Dense(action_dim, activation='relu')(h3)  # one Q estimate per action head
        model = Model(inputs=[states, actions], outputs=Q)
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate, decay=1e-6))
        return model, actions, states
class AC_Agent(Portfolio):
    def __init__(self, state_dim, balance, is_eval=False):
        super().__init__(balance=balance)
        self.model_type = 'AC'
        self.state_dim = state_dim
        self.action_dim = 3  # hold, buy, sell
        self.memory = deque(maxlen=100)
        self.buffer_size = 90
        self.gamma = 0.95  # discount factor
        self.is_eval = is_eval
        tau = 0.001  # soft-update rate for the target networks
        learning_rate_actor = 0.001  # learning rate for the actor network
        learning_rate_critic = 0.001  # learning rate for the critic network
        model_name = "AC"
        self.actor = ActorNetwork(sess, state_dim, self.action_dim, self.buffer_size, tau, learning_rate_actor, is_eval, model_name)
        self.critic = CriticNetwork(sess, state_dim, self.action_dim, tau, learning_rate_critic)
        self.tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs/AC_tensorboard', update_freq=90)
        self.tensorboard.set_model(self.critic.model)

    def reset(self):
        self.reset_portfolio()

    def remember(self, state, actions, reward, next_state, done):
        self.memory.append((state, actions, reward, next_state, done))

    def act(self, state, t):
        # The actor outputs a probability distribution over (hold, buy, sell).
        actions = self.actor.model.predict(state)[0]
        return actions

    def experience_replay(self):
        # Sample a random minibatch of buffer_size transitions from memory.
        mini_batch = random.sample(self.memory, self.buffer_size)
        y_batch = []
        for state, actions, reward, next_state, done in mini_batch:
            if not done:
                # Bootstrapped target: y = r + gamma * Q'(s', mu'(s'))
                Q_target_value = self.critic.model_target.predict([next_state, self.actor.model_target.predict(next_state)])
                y = reward + self.gamma * Q_target_value
            else:
                y = reward * np.ones((1, self.action_dim))
            y_batch.append(y)
        y_batch = np.vstack(y_batch)
        states_batch = np.vstack([tup[0] for tup in mini_batch])   # batch_size * state_dim
        actions_batch = np.vstack([tup[1] for tup in mini_batch])  # batch_size * action_dim
        # Update the critic by minimizing the MSE loss against the targets.
        loss = self.critic.model.train_on_batch([states_batch, actions_batch], y_batch)
        # Update the actor using the sampled policy gradients dQ/da.
        action_grads_batch = self.critic.gradients(states_batch, self.actor.model.predict(states_batch))  # batch_size * action_dim
        self.actor.train(states_batch, action_grads_batch)
        # Soft-update both target networks.
        self.actor.train_target()
        self.critic.train_target()
        return loss
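

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal
# training loop on a synthetic price series, assuming states are row vectors
# of shape (1, state_dim) and that the argmax of the actor's output maps to
# (hold, buy, sell). `make_state` and the toy reward below are hypothetical
# placeholders for whatever featurization and reward the training script uses.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    state_dim = 10
    agent = AC_Agent(state_dim=state_dim, balance=50000)

    def make_state(prices, t, window=state_dim):
        # Hypothetical featurizer: a window of recent prices normalized by the latest price.
        recent = prices[max(0, t - window + 1):t + 1]
        padded = np.pad(recent, (window - len(recent), 0), mode='edge')
        return (padded / padded[-1]).reshape(1, -1)

    prices = 100.0 + np.cumsum(np.random.randn(200))  # synthetic price series
    state = make_state(prices, 0)
    for t in range(1, len(prices)):
        actions = agent.act(state, t)  # action probabilities for (hold, buy, sell)
        # Toy reward: +1/-1 for buying before a price rise/fall, 0 otherwise.
        reward = float(np.sign(prices[t] - prices[t - 1])) * float(np.argmax(actions) == 1)
        next_state = make_state(prices, t)
        done = (t == len(prices) - 1)
        agent.remember(state, actions, reward, next_state, done)
        if len(agent.memory) >= agent.buffer_size:
            loss = agent.experience_replay()  # update critic, actor, and target networks
        state = next_state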