Commit

Merge pull request #118 from kengz/policy-gradient
ActorCritic and DDPG
kengz committed Apr 19, 2017
2 parents 80eff0e + 0b50a69 commit 29bd213
Showing 16 changed files with 899 additions and 309 deletions.
139 changes: 139 additions & 0 deletions rl/agent/actor_critic.py
@@ -0,0 +1,139 @@
import numpy as np
from rl.agent.dqn import DQN
from rl.util import logger


class ActorCritic(DQN):

'''
    Actor-Critic algorithm. The actor's policy is adjusted in the
    direction that leads to better actions, as judged by the critic.
    Implementation adapted from
    http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html
    Assumes one of the policies in actor_critic.py is being used.
'''

def __init__(self, env_spec,
train_per_n_new_exp=1,
gamma=0.95, lr=0.1,
epi_change_lr=None,
batch_size=16, n_epoch=5, hidden_layers=None,
hidden_layers_activation='sigmoid',
output_layer_activation='linear',
auto_architecture=False,
num_hidden_layers=3,
first_hidden_layer_size=256,
num_initial_channels=16,
**kwargs): # absorb generic param without breaking
# import only when needed to contain side-effects
from keras.layers.core import Dense
from keras.models import Sequential, load_model
self.Dense = Dense
self.Sequential = Sequential
self.load_model = load_model

super(ActorCritic, self).__init__(env_spec,
train_per_n_new_exp,
gamma, lr,
epi_change_lr,
batch_size, n_epoch, hidden_layers,
hidden_layers_activation,
output_layer_activation,
auto_architecture,
num_hidden_layers,
first_hidden_layer_size,
num_initial_channels,
**kwargs)

def build_model(self):
self.build_actor()
self.build_critic()
logger.info("Actor and critic models built")

def build_actor(self):
actor = self.Sequential()
super(ActorCritic, self).build_hidden_layers(actor)
actor.add(self.Dense(self.env_spec['action_dim'],
init='lecun_uniform',
activation=self.output_layer_activation))
logger.info("Actor summary")
actor.summary()
self.actor = actor

def build_critic(self):
critic = self.Sequential()
super(ActorCritic, self).build_hidden_layers(critic)
critic.add(self.Dense(1,
init='lecun_uniform',
activation=self.output_layer_activation))
logger.info("Critic summary")
critic.summary()
self.critic = critic

def compile_model(self):
self.actor.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
self.critic.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
logger.info("Actor and critic compiled")

def recompile_model(self, sys_vars):
'''
        Option to change the model optimizer settings.
        Currently only used for changing the learning rate.
        Recompiling does not affect the model weights.
'''
if self.epi_change_lr is not None:
if (sys_vars['epi'] == self.epi_change_lr and
sys_vars['t'] == 0):
self.lr = self.lr / 10.0
self.optimizer.change_optim_param(**{'lr': self.lr})
self.actor.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
self.critic.compile(
loss='mse',
optimizer=self.optimizer.keras_optimizer)
logger.info(
'Actor and critic models recompiled with new settings: '
'Learning rate: {}'.format(self.lr))

def train_critic(self, minibatch):
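        # clip current- and next-state value estimates to [-clip_val, clip_val] to keep targets bounded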
Q_vals = np.clip(self.critic.predict(minibatch['states']),
-self.clip_val, self.clip_val)
Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']),
-self.clip_val, self.clip_val)
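        # one-step TD target: r + gamma * (1 - terminal) * V(s')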
Q_targets = minibatch['rewards'] + self.gamma * \
(1 - minibatch['terminals']) * Q_next_vals.squeeze()
Q_targets = np.expand_dims(Q_targets, axis=1)

        # the actor's learning signal: change in predicted value from state to next state
        actor_delta = Q_next_vals - Q_vals
loss = self.critic.train_on_batch(minibatch['states'], Q_targets)

        # feed absolute target errors back to the replay memory
        errors = abs(np.sum(Q_vals - Q_targets, axis=1))
self.memory.update(errors)
return loss, actor_delta

def train_actor(self, minibatch, actor_delta):
old_vals = self.actor.predict(minibatch['states'])
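        # continuous actions: every output dimension is targeted at the critic's delta;
        # discrete actions (assumed one-hot): the taken action is targeted at the delta,
        # while the other outputs keep their current values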
if self.env_spec['actions'] == 'continuous':
A_targets = np.zeros(
(actor_delta.shape[0], self.env_spec['action_dim']))
for j in range(A_targets.shape[1]):
A_targets[:, j] = actor_delta.squeeze()
else:
A_targets = minibatch['actions'] * actor_delta + \
(1 - minibatch['actions']) * old_vals

loss = self.actor.train_on_batch(minibatch['states'], A_targets)
return loss

def train_an_epoch(self):
minibatch = self.memory.rand_minibatch(self.batch_size)
critic_loss, actor_delta = self.train_critic(minibatch)
actor_loss = self.train_actor(minibatch, actor_delta)
return critic_loss + actor_loss
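
For context, the target construction in train_critic() and train_actor() can be traced with a short standalone numpy sketch. The shapes and values below are placeholders, and the one-hot action encoding is an assumption suggested by the (1 - actions) masking in train_actor(), not something this diff confirms.

import numpy as np

batch_size, action_dim = 4, 2
gamma = 0.95

rewards = np.array([1.0, 0.0, 1.0, 0.0])
terminals = np.array([0.0, 0.0, 0.0, 1.0])
V_s = np.random.rand(batch_size, 1)        # critic estimates for the states
V_next = np.random.rand(batch_size, 1)     # critic estimates for the next states

# critic target: one-step TD target, as in train_critic()
Q_targets = rewards + gamma * (1 - terminals) * V_next.squeeze()
Q_targets = np.expand_dims(Q_targets, axis=1)              # shape (4, 1)

# actor targets for discrete (assumed one-hot) actions, as in train_actor()
actor_delta = V_next - V_s                                 # shape (4, 1)
actions = np.eye(action_dim)[[0, 1, 1, 0]]                 # one-hot taken actions
old_vals = np.random.rand(batch_size, action_dim)          # current actor outputs
A_targets = actions * actor_delta + (1 - actions) * old_vals

print(Q_targets.shape, A_targets.shape)                    # (4, 1) (4, 2)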