
Commit

gail
shixiaowen03 committed Dec 30, 2018
1 parent 9937c07 commit e21e1c0
Showing 12 changed files with 8,798 additions and 202 deletions.
374 changes: 172 additions & 202 deletions .idea/workspace.xml

Large diffs are not rendered by default.

115 changes: 115 additions & 0 deletions RL/Basic-GAIL-Demo/algo/ppo.py
@@ -0,0 +1,115 @@
import tensorflow as tf
import copy


class PPOTrain:
    def __init__(self, Policy, Old_Policy, gamma=0.95, clip_value=0.2, c_1=1, c_2=0.01):
        """
        :param Policy: current policy network
        :param Old_Policy: policy network holding the parameters from the previous update
        :param gamma: discount factor
        :param clip_value: PPO clipping parameter epsilon
        :param c_1: coefficient for the value-function loss
        :param c_2: coefficient for the entropy bonus
        """

        self.Policy = Policy
        self.Old_Policy = Old_Policy
        self.gamma = gamma

        pi_trainable = self.Policy.get_trainable_variables()
        old_pi_trainable = self.Old_Policy.get_trainable_variables()

        # assign ops that copy the current policy parameters into the old policy
        with tf.variable_scope('assign_op'):
            self.assign_ops = []
            for v_old, v in zip(old_pi_trainable, pi_trainable):
                self.assign_ops.append(tf.assign(v_old, v))

        # inputs for train_op
        with tf.variable_scope('train_inp'):
            self.actions = tf.placeholder(dtype=tf.int32, shape=[None], name='actions')
            self.rewards = tf.placeholder(dtype=tf.float32, shape=[None], name='rewards')
            self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next')
            self.gaes = tf.placeholder(dtype=tf.float32, shape=[None], name='gaes')

        act_probs = self.Policy.act_probs
        act_probs_old = self.Old_Policy.act_probs

        # probability of the action the agent took under the current policy
        act_probs = act_probs * tf.one_hot(indices=self.actions, depth=act_probs.shape[1])
        act_probs = tf.reduce_sum(act_probs, axis=1)

        # probability of the action the agent took under the old policy
        act_probs_old = act_probs_old * tf.one_hot(indices=self.actions, depth=act_probs_old.shape[1])
        act_probs_old = tf.reduce_sum(act_probs_old, axis=1)

        with tf.variable_scope('loss'):
            # construct computation graph for loss_clip
            # ratios = tf.divide(act_probs, act_probs_old)
            ratios = tf.exp(tf.log(tf.clip_by_value(act_probs, 1e-10, 1.0))
                            - tf.log(tf.clip_by_value(act_probs_old, 1e-10, 1.0)))
            clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - clip_value, clip_value_max=1 + clip_value)
            loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios))
            loss_clip = tf.reduce_mean(loss_clip)
            tf.summary.scalar('loss_clip', loss_clip)

            # construct computation graph for the entropy bonus
            entropy = -tf.reduce_sum(self.Policy.act_probs *
                                     tf.log(tf.clip_by_value(self.Policy.act_probs, 1e-10, 1.0)), axis=1)
            entropy = tf.reduce_mean(entropy, axis=0)  # mean entropy of pi(.|obs)
            tf.summary.scalar('entropy', entropy)

            # construct computation graph for the value-function loss
            v_preds = self.Policy.v_preds
            loss_vf = tf.squared_difference(self.rewards + self.gamma * self.v_preds_next, v_preds)
            loss_vf = tf.reduce_mean(loss_vf)
            tf.summary.scalar('value_difference', loss_vf)

            # total objective: maximize loss_clip - c_1 * loss_vf + c_2 * entropy
            loss = loss_clip - c_1 * loss_vf + c_2 * entropy

            # minimizing -loss is equivalent to maximizing loss
            loss = -loss
            tf.summary.scalar('total', loss)

        self.merged = tf.summary.merge_all()
        optimizer = tf.train.AdamOptimizer(learning_rate=5e-5, epsilon=1e-5)
        self.gradients = optimizer.compute_gradients(loss, var_list=pi_trainable)
        self.train_op = optimizer.minimize(loss, var_list=pi_trainable)

    def train(self, obs, actions, gaes, rewards, v_preds_next):
        tf.get_default_session().run(self.train_op, feed_dict={self.Policy.obs: obs,
                                                               self.Old_Policy.obs: obs,
                                                               self.actions: actions,
                                                               self.rewards: rewards,
                                                               self.v_preds_next: v_preds_next,
                                                               self.gaes: gaes})

    def get_summary(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.merged, feed_dict={self.Policy.obs: obs,
                                                                    self.Old_Policy.obs: obs,
                                                                    self.actions: actions,
                                                                    self.rewards: rewards,
                                                                    self.v_preds_next: v_preds_next,
                                                                    self.gaes: gaes})

    def assign_policy_parameters(self):
        # copy the current policy parameters into the old policy
        return tf.get_default_session().run(self.assign_ops)

    def get_gaes(self, rewards, v_preds, v_preds_next):
        deltas = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)]
        # generalized advantage estimation with lambda = 1, see the PPO paper, eq. (11)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):  # accumulate backwards from t = T-2, where T is the episode length
            gaes[t] = gaes[t] + self.gamma * gaes[t + 1]
        return gaes

    def get_grad(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.gradients, feed_dict={self.Policy.obs: obs,
                                                                       self.Old_Policy.obs: obs,
                                                                       self.actions: actions,
                                                                       self.rewards: rewards,
                                                                       self.v_preds_next: v_preds_next,
                                                                       self.gaes: gaes})
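
A quick reference for get_gaes: with lambda = 1 the generalized advantage estimate reduces to a discounted sum of TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). Below is a minimal standalone sketch of that recursion with made-up numbers (plain Python, gamma = 0.95 as in the constructor default; not part of this commit).

gamma = 0.95
rewards = [1.0, 1.0, 1.0]          # toy rewards, e.g. discriminator log D values
v_preds = [0.5, 0.6, 0.7]          # V(s_t) from the value head
v_preds_next = [0.6, 0.7, 0.0]     # V(s_{t+1}); 0 for the terminal state

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
deltas = [r + gamma * v_next - v for r, v_next, v in zip(rewards, v_preds_next, v_preds)]

# backward accumulation, identical to PPOTrain.get_gaes with lambda = 1
gaes = list(deltas)
for t in reversed(range(len(gaes) - 1)):
    gaes[t] = gaes[t] + gamma * gaes[t + 1]

print([round(g, 3) for g in gaes])  # advantage estimates fed to the clipped surrogate loss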
Binary file not shown.
Binary file not shown.
74 changes: 74 additions & 0 deletions RL/Basic-GAIL-Demo/network_models/discriminator.py
@@ -0,0 +1,74 @@
import tensorflow as tf


class Discriminator:
    def __init__(self, env):
        """
        :param env: gym env
        The output of this discriminator is a reward for the learning agent, not a cost,
        because the discriminator predicts P(expert|s,a) = 1 - P(agent|s,a).
        """

        with tf.variable_scope('discriminator'):
            self.scope = tf.get_variable_scope().name
            self.expert_s = tf.placeholder(dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
            self.expert_a = tf.placeholder(dtype=tf.int32, shape=[None])
            expert_a_one_hot = tf.one_hot(self.expert_a, depth=env.action_space.n)
            # add noise to stabilise training
            expert_a_one_hot += tf.random_normal(tf.shape(expert_a_one_hot), mean=0.2, stddev=0.1, dtype=tf.float32) / 1.2
            expert_s_a = tf.concat([self.expert_s, expert_a_one_hot], axis=1)  # concatenate the expert's state and action

            self.agent_s = tf.placeholder(dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
            self.agent_a = tf.placeholder(dtype=tf.int32, shape=[None])
            agent_a_one_hot = tf.one_hot(self.agent_a, depth=env.action_space.n)
            # add noise to stabilise training
            agent_a_one_hot += tf.random_normal(tf.shape(agent_a_one_hot), mean=0.2, stddev=0.1, dtype=tf.float32) / 1.2
            agent_s_a = tf.concat([self.agent_s, agent_a_one_hot], axis=1)  # concatenate the agent's state and action

            with tf.variable_scope('network') as network_scope:
                prob_1 = self.construct_network(input=expert_s_a)
                network_scope.reuse_variables()  # share parameters
                prob_2 = self.construct_network(input=agent_s_a)

            with tf.variable_scope('loss'):
                # log loss that trains the discriminator to separate expert behaviour from agent behaviour
                loss_expert = tf.reduce_mean(tf.log(tf.clip_by_value(prob_1, 0.01, 1)))
                loss_agent = tf.reduce_mean(tf.log(tf.clip_by_value(1 - prob_2, 0.01, 1)))
                loss = loss_expert + loss_agent
                loss = -loss
                tf.summary.scalar('discriminator', loss)

            optimizer = tf.train.AdamOptimizer()
            self.train_op = optimizer.minimize(loss)

            self.rewards = tf.log(tf.clip_by_value(prob_2, 1e-10, 1))  # log(P(expert|s,a)); larger is better for the agent

    def construct_network(self, input):
        """
        Returns the probability that a (state, action) pair comes from the expert.
        D wants this to be large for expert pairs and small for agent pairs.
        :param input: concatenated state-action batch
        :return: probability in (0, 1)
        """
        layer_1 = tf.layers.dense(inputs=input, units=20, activation=tf.nn.leaky_relu, name='layer1')
        layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.nn.leaky_relu, name='layer2')
        layer_3 = tf.layers.dense(inputs=layer_2, units=20, activation=tf.nn.leaky_relu, name='layer3')
        prob = tf.layers.dense(inputs=layer_3, units=1, activation=tf.sigmoid, name='prob')
        return prob

    def train(self, expert_s, expert_a, agent_s, agent_a):
        return tf.get_default_session().run(self.train_op, feed_dict={self.expert_s: expert_s,
                                                                      self.expert_a: expert_a,
                                                                      self.agent_s: agent_s,
                                                                      self.agent_a: agent_a})

    def get_rewards(self, agent_s, agent_a):
        """
        Return the rewards given to the agent; the agent wants the probability output by D to be as large as possible.
        :param agent_s: agent states
        :param agent_a: agent actions
        :return: log D(s, a) for each pair
        """
        return tf.get_default_session().run(self.rewards, feed_dict={self.agent_s: agent_s,
                                                                     self.agent_a: agent_a})

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
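
To make the objective above concrete: the discriminator minimises -[log D(expert s, a) + log(1 - D(agent s, a))], and the learning agent is rewarded with log D(agent s, a), so the reward approaches 0 from below as the agent's behaviour becomes indistinguishable from the expert's. A standalone NumPy sketch with hypothetical probabilities (not part of this commit):

import numpy as np

# hypothetical discriminator outputs D(s, a) = P(expert | s, a) for a small batch
prob_expert = np.array([0.9, 0.8, 0.95])   # on expert state-action pairs
prob_agent = np.array([0.2, 0.3, 0.1])     # on agent state-action pairs

# discriminator loss, mirroring the 'loss' scope above (value clipping omitted for clarity)
loss = -(np.mean(np.log(prob_expert)) + np.mean(np.log(1.0 - prob_agent)))

# surrogate reward handed to PPO: log D(s, a); larger means "looks more like the expert"
agent_rewards = np.log(np.clip(prob_agent, 1e-10, 1.0))

print(round(float(loss), 3), np.round(agent_rewards, 3))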

49 changes: 49 additions & 0 deletions RL/Basic-GAIL-Demo/network_models/policy_net.py
@@ -0,0 +1,49 @@
import tensorflow as tf


class Policy_net:
    def __init__(self, name: str, env):
        """
        :param name: string
        :param env: gym env
        """

        ob_space = env.observation_space
        act_space = env.action_space

        with tf.variable_scope(name):
            self.obs = tf.placeholder(dtype=tf.float32, shape=[None] + list(ob_space.shape), name='obs')

            with tf.variable_scope('policy_net'):
                layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh)
                layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh)
                layer_3 = tf.layers.dense(inputs=layer_2, units=act_space.n, activation=tf.tanh)
                self.act_probs = tf.layers.dense(inputs=layer_3, units=act_space.n, activation=tf.nn.softmax)

            with tf.variable_scope('value_net'):
                layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh)
                layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh)
                self.v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None)

            self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1)
            self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1])

            self.act_deterministic = tf.argmax(self.act_probs, axis=1)

            self.scope = tf.get_variable_scope().name

    def act(self, obs, stochastic=True):
        if stochastic:
            return tf.get_default_session().run([self.act_stochastic, self.v_preds], feed_dict={self.obs: obs})
        else:
            return tf.get_default_session().run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs})

    def get_action_prob(self, obs):
        return tf.get_default_session().run(self.act_probs, feed_dict={self.obs: obs})

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
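
A minimal usage sketch for Policy_net, mirroring how run_gail.py below drives it (assumes the same TF1-era stack and old-style gym CartPole API that the repo targets; not part of this commit):

import gym
import numpy as np
import tensorflow as tf
from network_models.policy_net import Policy_net

env = gym.make('CartPole-v0')
Policy = Policy_net('policy', env)

with tf.Session() as sess:               # becomes the default session Policy.act relies on
    sess.run(tf.global_variables_initializer())
    obs = np.stack([env.reset()]).astype(np.float32)      # shape (1, 4) for CartPole
    act, v_pred = Policy.act(obs=obs, stochastic=True)    # sampled action and value estimate
    print(np.asscalar(act), np.asscalar(v_pred))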

128 changes: 128 additions & 0 deletions RL/Basic-GAIL-Demo/run_gail.py
@@ -0,0 +1,128 @@
import argparse
import gym
import numpy as np
import tensorflow as tf
from network_models.policy_net import Policy_net
from network_models.discriminator import Discriminator
from algo.ppo import PPOTrain


def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='log directory', default='log/train/gail')
    parser.add_argument('--savedir', help='save directory', default='trained_models/gail')
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--iteration', type=int, default=int(1e4))
    return parser.parse_args()


def main(args):
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # load the expert's observations and actions
    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0

            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]),
                               iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]),
                               iteration)

            # stop once the episode return reaches 195 for 100 episodes in a row
            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train the discriminator to distinguish expert pairs from agent pairs
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # use log D(s, a) from the discriminator as the reward signal for PPO
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train the policy with PPO on the discriminator rewards
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()


if __name__ == '__main__':
    args = argparser()
    main(args)
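
The script is launched as python run_gail.py, optionally overriding --logdir, --savedir, --gamma and --iteration. It expects expert demonstrations under trajectory/: observations.csv with one state per row and actions.csv with one integer action per line, in whitespace-delimited text that np.genfromtxt parses with its default settings. A hedged sketch of writing files in that format from hypothetical arrays (the zero-filled data is only a placeholder; real demonstrations would come from a trained expert policy):

import os
import numpy as np

os.makedirs('trajectory', exist_ok=True)

# placeholder expert rollout: T steps of 4-dimensional CartPole observations and integer actions
expert_obs = np.zeros((200, 4), dtype=np.float32)
expert_act = np.zeros(200, dtype=np.int32)

# np.savetxt writes whitespace-delimited text, which np.genfromtxt in main() reads back by default
np.savetxt('trajectory/observations.csv', expert_obs)
np.savetxt('trajectory/actions.csv', expert_act, fmt='%d')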
