forked from princewen/tensorflow_practice
Commit
shixiaowen03 committed Dec 30, 2018
1 parent 9937c07 · commit e21e1c0
Showing 12 changed files with 8,798 additions and 202 deletions.
@@ -0,0 +1,115 @@

import tensorflow as tf
import copy


class PPOTrain:
    def __init__(self, Policy, Old_Policy, gamma=0.95, clip_value=0.2, c_1=1, c_2=0.01):
        """
        :param Policy: current policy network
        :param Old_Policy: old policy network used for the probability ratio
        :param gamma: discount factor
        :param clip_value: PPO clipping range epsilon
        :param c_1: coefficient for the value-function loss
        :param c_2: coefficient for the entropy bonus
        """

        self.Policy = Policy
        self.Old_Policy = Old_Policy
        self.gamma = gamma

        pi_trainable = self.Policy.get_trainable_variables()
        old_pi_trainable = self.Old_Policy.get_trainable_variables()

        # assign ops that copy the current policy parameters into the old policy
        with tf.variable_scope('assign_op'):
            self.assign_ops = []
            for v_old, v in zip(old_pi_trainable, pi_trainable):
                self.assign_ops.append(tf.assign(v_old, v))

        # inputs for train_op
        with tf.variable_scope('train_inp'):
            self.actions = tf.placeholder(dtype=tf.int32, shape=[None], name='actions')
            self.rewards = tf.placeholder(dtype=tf.float32, shape=[None], name='rewards')
            self.v_preds_next = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next')
            self.gaes = tf.placeholder(dtype=tf.float32, shape=[None], name='gaes')

        act_probs = self.Policy.act_probs
        act_probs_old = self.Old_Policy.act_probs

        # probabilities of the actions the agent actually took, under the current policy
        act_probs = act_probs * tf.one_hot(indices=self.actions, depth=act_probs.shape[1])
        act_probs = tf.reduce_sum(act_probs, axis=1)

        # probabilities of the same actions under the old policy
        act_probs_old = act_probs_old * tf.one_hot(indices=self.actions, depth=act_probs_old.shape[1])
        act_probs_old = tf.reduce_sum(act_probs_old, axis=1)

        with tf.variable_scope('loss'):
            # clipped surrogate objective (L^CLIP in the PPO paper)
            # ratios = tf.divide(act_probs, act_probs_old)
            ratios = tf.exp(tf.log(tf.clip_by_value(act_probs, 1e-10, 1.0))
                            - tf.log(tf.clip_by_value(act_probs_old, 1e-10, 1.0)))
            clipped_ratios = tf.clip_by_value(ratios, clip_value_min=1 - clip_value, clip_value_max=1 + clip_value)
            loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios))
            loss_clip = tf.reduce_mean(loss_clip)
            tf.summary.scalar('loss_clip', loss_clip)

            # entropy bonus, to encourage exploration
            entropy = -tf.reduce_sum(self.Policy.act_probs *
                                     tf.log(tf.clip_by_value(self.Policy.act_probs, 1e-10, 1.0)), axis=1)
            entropy = tf.reduce_mean(entropy, axis=0)  # mean entropy of pi(obs)
            tf.summary.scalar('entropy', entropy)

            # value-function loss: squared error against the TD target r + gamma * V(s')
            v_preds = self.Policy.v_preds
            loss_vf = tf.squared_difference(self.rewards + self.gamma * self.v_preds_next, v_preds)
            loss_vf = tf.reduce_mean(loss_vf)
            tf.summary.scalar('value_difference', loss_vf)

            # total objective: L^CLIP - c_1 * L^VF + c_2 * entropy
            loss = loss_clip - c_1 * loss_vf + c_2 * entropy

            # minimizing -loss == maximizing the objective
            loss = -loss
            tf.summary.scalar('total', loss)

        self.merged = tf.summary.merge_all()
        optimizer = tf.train.AdamOptimizer(learning_rate=5e-5, epsilon=1e-5)
        self.gradients = optimizer.compute_gradients(loss, var_list=pi_trainable)
        self.train_op = optimizer.minimize(loss, var_list=pi_trainable)

    def train(self, obs, actions, gaes, rewards, v_preds_next):
        tf.get_default_session().run(self.train_op, feed_dict={self.Policy.obs: obs,
                                                               self.Old_Policy.obs: obs,
                                                               self.actions: actions,
                                                               self.rewards: rewards,
                                                               self.v_preds_next: v_preds_next,
                                                               self.gaes: gaes})

    def get_summary(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.merged, feed_dict={self.Policy.obs: obs,
                                                                    self.Old_Policy.obs: obs,
                                                                    self.actions: actions,
                                                                    self.rewards: rewards,
                                                                    self.v_preds_next: v_preds_next,
                                                                    self.gaes: gaes})

    def assign_policy_parameters(self):
        # copy the current policy parameters into the old policy
        return tf.get_default_session().run(self.assign_ops)

    def get_gaes(self, rewards, v_preds, v_preds_next):
        deltas = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)]
        # generalized advantage estimation with lambda = 1, see PPO paper eq. (11)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):  # t runs from T-2 down to 0, where T is the rollout length
            gaes[t] = gaes[t] + self.gamma * gaes[t + 1]
        return gaes

    def get_grad(self, obs, actions, gaes, rewards, v_preds_next):
        return tf.get_default_session().run(self.gradients, feed_dict={self.Policy.obs: obs,
                                                                       self.Old_Policy.obs: obs,
                                                                       self.actions: actions,
                                                                       self.rewards: rewards,
                                                                       self.v_preds_next: v_preds_next,
                                                                       self.gaes: gaes})
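
For reference, get_gaes above implements the generalized advantage estimator with lambda = 1: it computes TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and then backs up their discounted sum. The short NumPy sketch below (illustration only, with made-up toy numbers, not part of the commit) restates the same recursion:

import numpy as np

gamma = 0.95
# toy rollout of length 3 (values are made up for illustration)
rewards      = np.array([1.0, 1.0, 1.0], dtype=np.float32)
v_preds      = np.array([0.5, 0.6, 0.7], dtype=np.float32)
v_preds_next = np.array([0.6, 0.7, 0.0], dtype=np.float32)  # 0.0 after the terminal step

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
deltas = rewards + gamma * v_preds_next - v_preds

# advantage with lambda = 1: discounted sum of future deltas, built back-to-front
gaes = deltas.copy()
for t in reversed(range(len(gaes) - 1)):
    gaes[t] = gaes[t] + gamma * gaes[t + 1]

print(gaes)  # one advantage estimate per time step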
Binary file added (BIN, +1.07 MB, not shown):
...-GAIL-Demo/log/train/gail/events.out.tfevents.1546175604.meituan-sxwdeMacBook-Pro-4.local

Binary file added (BIN, +2.44 MB, not shown):
...c-GAIL-Demo/log/train/ppo/events.out.tfevents.1546172518.meituan-sxwdeMacBook-Pro-4.local
@@ -0,0 +1,74 @@

import tensorflow as tf


class Discriminator:
    def __init__(self, env):
        """
        :param env: gym env
        The output of this Discriminator is a reward for the learning agent, not a cost,
        because the discriminator predicts P(expert|s,a) = 1 - P(agent|s,a).
        """

        with tf.variable_scope('discriminator'):
            self.scope = tf.get_variable_scope().name
            self.expert_s = tf.placeholder(dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
            self.expert_a = tf.placeholder(dtype=tf.int32, shape=[None])
            expert_a_one_hot = tf.one_hot(self.expert_a, depth=env.action_space.n)
            # add noise to stabilise training
            expert_a_one_hot += tf.random_normal(tf.shape(expert_a_one_hot), mean=0.2, stddev=0.1, dtype=tf.float32) / 1.2
            expert_s_a = tf.concat([self.expert_s, expert_a_one_hot], axis=1)  # concatenate the expert's state and action

            self.agent_s = tf.placeholder(dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
            self.agent_a = tf.placeholder(dtype=tf.int32, shape=[None])
            agent_a_one_hot = tf.one_hot(self.agent_a, depth=env.action_space.n)
            # add noise to stabilise training
            agent_a_one_hot += tf.random_normal(tf.shape(agent_a_one_hot), mean=0.2, stddev=0.1, dtype=tf.float32) / 1.2
            agent_s_a = tf.concat([self.agent_s, agent_a_one_hot], axis=1)  # concatenate the agent's state and action

            with tf.variable_scope('network') as network_scope:
                prob_1 = self.construct_network(input=expert_s_a)
                network_scope.reuse_variables()  # share parameters between the two branches
                prob_2 = self.construct_network(input=agent_s_a)

            with tf.variable_scope('loss'):
                # log loss that tries to tell expert behaviour apart from agent behaviour
                loss_expert = tf.reduce_mean(tf.log(tf.clip_by_value(prob_1, 0.01, 1)))
                loss_agent = tf.reduce_mean(tf.log(tf.clip_by_value(1 - prob_2, 0.01, 1)))
                loss = loss_expert + loss_agent
                loss = -loss
                tf.summary.scalar('discriminator', loss)

            optimizer = tf.train.AdamOptimizer()
            self.train_op = optimizer.minimize(loss)

            self.rewards = tf.log(tf.clip_by_value(prob_2, 1e-10, 1))  # log(P(expert|s,a)); larger is better for the agent

    def construct_network(self, input):
        """
        Returns the probability that the input (s, a) pair came from the expert.
        D wants this to be large for expert actions and small for agent actions.
        :param input: concatenated state and one-hot action
        :return: probability in (0, 1)
        """
        layer_1 = tf.layers.dense(inputs=input, units=20, activation=tf.nn.leaky_relu, name='layer1')
        layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.nn.leaky_relu, name='layer2')
        layer_3 = tf.layers.dense(inputs=layer_2, units=20, activation=tf.nn.leaky_relu, name='layer3')
        prob = tf.layers.dense(inputs=layer_3, units=1, activation=tf.sigmoid, name='prob')
        return prob

    def train(self, expert_s, expert_a, agent_s, agent_a):
        return tf.get_default_session().run(self.train_op, feed_dict={self.expert_s: expert_s,
                                                                      self.expert_a: expert_a,
                                                                      self.agent_s: agent_s,
                                                                      self.agent_a: agent_a})

    def get_rewards(self, agent_s, agent_a):
        """
        Returns the reward for the agent; the agent wants D's output probability to be as large as possible.
        :param agent_s: agent observations
        :param agent_a: agent actions
        :return: log D(s, a) for each (s, a) pair
        """
        return tf.get_default_session().run(self.rewards, feed_dict={self.agent_s: agent_s,
                                                                     self.agent_a: agent_a})

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
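
To make the sign conventions above concrete: construct_network outputs D(s, a) = P(expert | s, a); the discriminator is trained to push this toward 1 on expert pairs and toward 0 on agent pairs (the code minimizes the negated log loss), while the agent receives log D(s, a) as its reward, so fooling the discriminator yields a higher reward. A tiny NumPy illustration with made-up probabilities (not part of the commit):

import numpy as np

prob_expert = np.array([0.9, 0.8])  # D's output on expert (s, a) pairs
prob_agent  = np.array([0.3, 0.4])  # D's output on agent (s, a) pairs

# same clipping constants as the TensorFlow graph above
d_loss = -(np.mean(np.log(np.clip(prob_expert, 0.01, 1))) +
           np.mean(np.log(np.clip(1 - prob_agent, 0.01, 1))))

agent_reward = np.log(np.clip(prob_agent, 1e-10, 1))

print(d_loss)        # the quantity the discriminator minimizes
print(agent_reward)  # the per-step reward handed to PPO instead of the env reward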
@@ -0,0 +1,49 @@

import tensorflow as tf


class Policy_net:
    def __init__(self, name: str, env):
        """
        :param name: string
        :param env: gym env
        """

        ob_space = env.observation_space
        act_space = env.action_space

        with tf.variable_scope(name):
            self.obs = tf.placeholder(dtype=tf.float32, shape=[None] + list(ob_space.shape), name='obs')

            with tf.variable_scope('policy_net'):
                layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh)
                layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh)
                layer_3 = tf.layers.dense(inputs=layer_2, units=act_space.n, activation=tf.tanh)
                self.act_probs = tf.layers.dense(inputs=layer_3, units=act_space.n, activation=tf.nn.softmax)

            with tf.variable_scope('value_net'):
                layer_1 = tf.layers.dense(inputs=self.obs, units=20, activation=tf.tanh)
                layer_2 = tf.layers.dense(inputs=layer_1, units=20, activation=tf.tanh)
                self.v_preds = tf.layers.dense(inputs=layer_2, units=1, activation=None)

            self.act_stochastic = tf.multinomial(tf.log(self.act_probs), num_samples=1)
            self.act_stochastic = tf.reshape(self.act_stochastic, shape=[-1])

            self.act_deterministic = tf.argmax(self.act_probs, axis=1)

            self.scope = tf.get_variable_scope().name

    def act(self, obs, stochastic=True):
        if stochastic:
            return tf.get_default_session().run([self.act_stochastic, self.v_preds], feed_dict={self.obs: obs})
        else:
            return tf.get_default_session().run([self.act_deterministic, self.v_preds], feed_dict={self.obs: obs})

    def get_action_prob(self, obs):
        return tf.get_default_session().run(self.act_probs, feed_dict={self.obs: obs})

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
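
A minimal usage sketch for Policy_net, assuming CartPole-v0 and a TF 1.x session as in the training script of this commit; the scope name 'policy_demo' is arbitrary:

import gym
import numpy as np
import tensorflow as tf
from network_models.policy_net import Policy_net

env = gym.make('CartPole-v0')
Policy = Policy_net('policy_demo', env)

with tf.Session() as sess:  # becomes the default session used inside act()
    sess.run(tf.global_variables_initializer())
    obs = np.stack([env.reset()]).astype(np.float32)   # shape [1, obs_dim]
    act, v_pred = Policy.act(obs=obs, stochastic=True)
    print(act, v_pred)  # sampled action and value estimate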
@@ -0,0 +1,128 @@

import argparse
import gym
import numpy as np
import tensorflow as tf
from network_models.policy_net import Policy_net
from network_models.discriminator import Discriminator
from algo.ppo import PPOTrain


def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', help='log directory', default='log/train/gail')
    parser.add_argument('--savedir', help='save directory', default='trained_models/gail')
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--iteration', type=int, default=int(1e4))
    return parser.parse_args()


def main(args):
    env = gym.make('CartPole-v0')
    env.seed(0)
    ob_space = env.observation_space
    Policy = Policy_net('policy', env)
    Old_Policy = Policy_net('old_policy', env)
    PPO = PPOTrain(Policy, Old_Policy, gamma=args.gamma)
    D = Discriminator(env)

    # load the expert's observations and actions
    expert_observations = np.genfromtxt('trajectory/observations.csv')
    expert_actions = np.genfromtxt('trajectory/actions.csv', dtype=np.int32)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(args.logdir, sess.graph)
        sess.run(tf.global_variables_initializer())

        obs = env.reset()
        success_num = 0

        for iteration in range(args.iteration):
            observations = []
            actions = []
            rewards = []
            v_preds = []
            run_policy_steps = 0

            # roll out one episode with the current policy
            while True:
                run_policy_steps += 1
                obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                act, v_pred = Policy.act(obs=obs, stochastic=True)

                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_length', simple_value=run_policy_steps)]),
                               iteration)
            writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='episode_reward', simple_value=sum(rewards))]),
                               iteration)

            if sum(rewards) >= 195:
                success_num += 1
                if success_num >= 100:
                    saver.save(sess, args.savedir + '/model.ckpt')
                    print('Clear!! Model saved.')
                    break
            else:
                success_num = 0

            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # train the discriminator to tell expert (s, a) pairs from agent (s, a) pairs
            for i in range(2):
                D.train(expert_s=expert_observations,
                        expert_a=expert_actions,
                        agent_s=observations,
                        agent_a=actions)

            # use the discriminator's output, not the environment reward, as the PPO reward
            d_rewards = D.get_rewards(agent_s=observations, agent_a=actions)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds, v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # train the policy with PPO on minibatches sampled from the rollout
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            PPO.assign_policy_parameters()
            for epoch in range(6):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=32)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            summary = PPO.get_summary(obs=inp[0],
                                      actions=inp[1],
                                      gaes=inp[2],
                                      rewards=inp[3],
                                      v_preds_next=inp[4])

            writer.add_summary(summary, iteration)
        writer.close()


if __name__ == '__main__':
    args = argparser()
    main(args)
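
This training script reads expert demonstrations from trajectory/observations.csv and trajectory/actions.csv via np.genfromtxt. Producing those files is outside this diff; the sketch below is only a hypothetical example of writing files in a compatible whitespace-delimited format, with a random policy standing in for a real expert policy:

import os
import gym
import numpy as np

env = gym.make('CartPole-v0')
os.makedirs('trajectory', exist_ok=True)
observations, actions = [], []

obs = env.reset()
for _ in range(5000):
    act = env.action_space.sample()   # stand-in: replace with a trained expert policy
    observations.append(obs)
    actions.append(act)
    obs, _, done, _ = env.step(act)
    if done:
        obs = env.reset()

np.savetxt('trajectory/observations.csv', np.array(observations))                  # one row per step
np.savetxt('trajectory/actions.csv', np.array(actions, dtype=np.int32), fmt='%d')  # one action per line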