From 9937c077100df2735aa3967691315a45e958d4df Mon Sep 17 00:00:00 2001
From: shixiaowen03
Date: Sun, 23 Dec 2018 13:46:34 +0800
Subject: [PATCH] rainbow
---
 .idea/workspace.xml | 333 +++++++++++++++----------
 RL/Basic-DisRL-Demo/Categorical_DQN.py | 6 +-
 RL/Basic-Rainbow-Net/agent.py | 294 ++++++++++++++++++++++
 RL/Basic-Rainbow-Net/batchEnv.py | 157 ++++++++++++
 RL/Basic-Rainbow-Net/huberLoss.py | 71 ++++++
 RL/Basic-Rainbow-Net/main.py | 162 ++++++++++++
 RL/Basic-Rainbow-Net/model.py | 199 +++++++++++++++
 RL/Basic-Rainbow-Net/preprocessor.py | 126 ++++++++++
 RL/Basic-Rainbow-Net/replayMemory.py | 85 +++++++
 RL/Basic-Rainbow-Net/sumTree.py | 56 +++++
 10 files changed, 1351 insertions(+), 138 deletions(-)
 create mode 100644 RL/Basic-Rainbow-Net/agent.py
 create mode 100644 RL/Basic-Rainbow-Net/batchEnv.py
 create mode 100644 RL/Basic-Rainbow-Net/huberLoss.py
 create mode 100644 RL/Basic-Rainbow-Net/main.py
 create mode 100644 RL/Basic-Rainbow-Net/model.py
 create mode 100644 RL/Basic-Rainbow-Net/preprocessor.py
 create mode 100644 RL/Basic-Rainbow-Net/replayMemory.py
 create mode 100644 RL/Basic-Rainbow-Net/sumTree.py

diff --git a/RL/Basic-DisRL-Demo/Categorical_DQN.py b/RL/Basic-DisRL-Demo/Categorical_DQN.py
index 1f98aeb1..35646218 100644
--- a/RL/Basic-DisRL-Demo/Categorical_DQN.py
+++ b/RL/Basic-DisRL-Demo/Categorical_DQN.py
@@ -13,7 +13,7 @@ def __init__(self,env,config): self.config = config self.v_max = self.config.v_max self.v_min = self.config.v_min - self.atoms = self.config.atoms + self.atoms = self.config.atoms # 价值采样点的个数 self.epsilon = self.config.INITIAL_EPSILON self.state_shape = env.observation_space.shape
@@ -23,10 +23,8 @@ def __init__(self,env,config): target_state_shape = [1] target_state_shape.extend(self.state_shape) - self.state_input = tf.placeholder(tf.float32,target_state_shape) self.action_input = tf.placeholder(tf.int32,[1,1]) - self.m_input = tf.placeholder(tf.float32,[self.atoms]) self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
@@ -80,7 +78,6 @@ def build_cate_dqn_net(self): self.q_target = tf.reduce_sum(self.z_target * self.z) self.cross_entropy_loss = -tf.reduce_sum(self.m_input * tf.log(self.z_eval)) - self.optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE).minimize(self.cross_entropy_loss) eval_params = tf.get_collection("eval_net_params")
@@ -97,6 +94,7 @@ def train(self,s,r,action,s_,gamma): a_ = tf.argmax(list_q_).eval() m = np.zeros(self.atoms) p = self.sess.run(self.z_target,feed_dict = {self.state_input:[s_],self.action_input:[[a_]]})[0] + m = np.zeros(self.atoms) for j in
range(self.atoms): Tz = min(self.v_max,max(self.v_min,r+gamma * self.z[j])) bj = (Tz - self.v_min) / self.delta_z # 分在第几个块里 diff --git a/RL/Basic-Rainbow-Net/agent.py b/RL/Basic-Rainbow-Net/agent.py new file mode 100644 index 00000000..176e3d97 --- /dev/null +++ b/RL/Basic-Rainbow-Net/agent.py @@ -0,0 +1,294 @@ +"""Main DQN agent.""" + +import numpy as np +import tensorflow as tf +from PIL import Image +import random +from huberLoss import mean_huber_loss, weighted_huber_loss + +EPSILON_BEGIN = 1.0 +EPSILON_END = 0.1 +BETA_BEGIN = 0.5 +BETA_END = 1.0 + +class DQNAgent(): + def __init__(self, + eval_model, + target_model, + memory, + num_actions, + gamma, + update_freq, + target_update_freq, + update_target_params_ops, + batch_size, + is_double_dqn, + is_per, + is_distributional, + num_step, + is_noisy, + learning_rate, + rmsp_decay, + rmsp_momentum, + rmsp_epsilon): + + self._eval_model = eval_model + self._target_model = target_model + self._memory = memory + self._num_actions = num_actions + self._gamma = gamma + self._update_freq = update_freq + self._target_update_freq = target_update_freq + self._update_target_params_ops = update_target_params_ops + self._batch_size = batch_size + self._is_double_dqn = is_double_dqn + self._is_per = is_per + self._is_distributional = is_distributional + self._num_step = num_step + self._is_noisy = is_noisy + self._learning_rate = learning_rate + self._rmsp_decay = rmsp_decay + self._rmsp_momentum = rmsp_momentum + self._rmsp_epsilon = rmsp_epsilon + self._update_times = 0 + self._beta = EPSILON_BEGIN + self._beta_increment = (EPSILON_END-BETA_BEGIN)/2000000.0 + self._epsilon = EPSILON_BEGIN if is_noisy else 0. + self._epsilon_increment = (EPSILON_END - EPSILON_BEGIN)/2000000.0 if is_noisy==0 else 0. + self._action_ph = tf.placeholder(tf.int32,[None,2],'action_ph') + self._reward_ph = tf.placeholder(tf.float32,name='reward_ph') + self._is_terminal_ph = tf.placeholder(tf.float32,name='is_terminal_ph') + self._action_chosen_by_eval_ph = tf.placeholder(tf.int32,[None,2],'action_chosen_by_eval_ph') + self._loss_weight_ph = tf.placeholder(tf.float32,name='loss_weight_ph') + self._error_op,self._train_op = self._get_error_and_train_op(self._reward_ph,self._is_terminal_ph, + self._action_ph,self._action_chosen_by_eval_ph, + self._loss_weight_ph) + + + def _get_error_and_train_op(self,reward_ph, + is_terminal_ph, + action_ph, + action_chosen_by_eval_ph, + loss_weight_ph): + + if self._is_distributional == 0: + q_values_target = self._target_model['q_values'] + q_values_eval = self._eval_model['q_values'] + + if self._is_double_dqn: + max_q = tf.gather_nd(q_values_target,action_chosen_by_eval_ph) # 如果是double-dqn,动作由eval-net选出,q值由target-net得到 + else: + max_q = tf.reduce_max(q_values_target,axis=1) + + target = reward_ph + (1.0 - is_terminal_ph) * (self._gamma ** self._num_step) * max_q # 这里是多步的dqn + gathered_outputs = tf.gather_nd(q_values_eval,action_ph,name='gathered_outputs') + + if self._is_per == 1: + loss = weighted_huber_loss(target,gathered_outputs,loss_weight_ph) + else: + loss = mean_huber_loss(target,gathered_outputs) + train_op = tf.train.RMSPropOptimizer(self._learning_rate,decay=self._rmsp_decay, + momentum=self._rmsp_momentum,epsilon=self._rmsp_epsilon).minimize(loss) + + error_op = tf.abs(gathered_outputs - target,name='abs_error') + return train_op,error_op + + else: + N_atoms = 51 + V_Max = 20.0 + V_Min = 0.0 + Delta_z = (V_Max - V_Min) / (N_atoms - 1) + z_list = tf.constant([V_Min + i * Delta_z for i in range(N_atoms)], dtype=tf.float32) + + 
q_distributional_values_target = self._target_model['q_distributional_network'] # batch_size * num_actions * N_atoms + tmp_batch_size = tf.shape(q_distributional_values_target)[0] # batch_size + + if self._is_double_dqn: + q_distributional_chosen_by_action_target = tf.gather_nd(q_distributional_values_target,action_chosen_by_eval_ph) + else: + action_chosen_by_target_q = tf.cast(tf.argmax(self._target_model['q_values'], axis=1), tf.int32) + q_distributional_chosen_by_action_target = tf.gather_nd(q_distributional_values_target, + tf.concat([tf.reshape(tf.range(tmp_batch_size),[-1,1]), + tf.reshape(action_chosen_by_target_q,[-1,1])],axis=1)) + + + target = tf.tile(tf.reshape(reward_ph,[-1,1]),[1,N_atoms]) + \ + (self._gamma * self._num_step) * \ + tf.multiply(tf.reshape(z_list,[1,N_atoms]),(1.0 - tf.tile(tf.reshape(is_terminal_ph,[-1,1]),[1,N_atoms]))) + + target = tf.clip_by_value(target,V_Min,V_Max) + + b = (target - V_Min) / Delta_z + + u,l = tf.ceil(b),tf.floor(b) + + u_id,l_id = tf.cast(u,tf.int32),tf.cast(l,tf.int32) + + u_minus_b,b_minus_l = u - b,b - l + q_distributional_values_eval = self._eval_model['q_distributional_network'] + + q_distributional_chosen_by_action_eval = tf.gather_nd(q_distributional_values_eval,action_ph) + + index_help = tf.tile(tf.reshape(tf.range(tmp_batch_size),[-1,1]),[1,N_atoms]) + + index_help = tf.expand_dims(index_help,-1) # batch * N_atoms * 1 + u_id = tf.concat([index_help,tf.expand_dims(u_id,-1)],axis=2) + l_id = tf.concat([index_help,tf.expand_dims(l_id,-1)],axis=2) + + error = q_distributional_chosen_by_action_target * u_minus_b * \ + tf.log(tf.gather_nd(q_distributional_chosen_by_action_eval, l_id)) \ + + q_distributional_chosen_by_action_target * b_minus_l * \ + tf.log(tf.gather_nd(q_distributional_chosen_by_action_eval, u_id)) + error = tf.reduce_sum(error, axis=1) + + if self._is_per == 1: + loss = tf.negative(error * loss_weight_ph) + else: + loss = tf.negative(error) + + train_op = tf.train.RMSPropOptimizer(self._learning_rate, + decay=self._rmsp_decay, momentum=self._rmsp_momentum, + epsilon=self._rmsp_epsilon).minimize(loss) + error_op = tf.abs(error, name='abs_error') + return error_op, train_op + + def select_action(self,sess,state,epsilon,model): + batch_size = len(state) + if np.random.rand() < epsilon: + action = np.random.randint(0,self._num_actions,size=(batch_size,)) + else: + state = state.astype(np.float32) / 255.0 + feed_dict = {model['input_frames'] :state} + action = sess.run(model['action'],feed_dict=feed_dict) + return action + + def get_multi_step_sample(self,env,sess,num_step,epsilon): + old_state,action,reward,new_state,is_terminal = env.get_state() + total_reward = np.sign(reward) + total_is_terminal = is_terminal + + next_action = self.select_action(sess,new_state,epsilon,self._eval_model) + env.take_action(next_action) + + for i in range(1,num_step): + _,_,reward,new_state,is_terminal = env.get_state() + total_reward += self._gamma ** i * np.sign(reward) + total_is_terminal += is_terminal + next_action = self.select_action(sess,new_state,epsilon,self._eval_model) + env.take_action(next_action) + + return old_state,action,total_reward,new_state,np.sign(total_is_terminal) + + def fit(self,sess,env,num_iterations,do_train=True): + + num_environment = env.num_process + env.reset() + + for t in range(0,num_iterations,num_environment): + # 准备数据 + old_state,action,reward,new_state,is_terminal = self.get_multi_step_sample(env,sess,self._num_step,self._epsilon) + self._memory.append(old_state,action,reward,new_state,is_terminal) # 
插入数据 + if self._epsilon > EPSILON_END: + self._epsilon += num_environment * self._epsilon_increment + if do_train: + num_update = sum([1 if i % self._update_freq == 0 else 0 for i in range(t, t + num_environment)]) + # 抽取数据 + for _ in range(num_update): + if self._is_per == 1: + (old_state_list, action_list, reward_list, new_state_list, is_terminal_list), \ + idx_list, p_list, sum_p, count = self._memory.sample(self._batch_size) + else: + old_state_list, action_list, reward_list, new_state_list, is_terminal_list \ + = self._memory.sample(self._batch_size) + + feed_dict = {self._target_model['input_frames']: new_state_list.astype(np.float32) / 255.0, + self._eval_model['input_frames']: old_state_list.astype(np.float32) / 255.0, + self._action_ph: list(enumerate(action_list)), + self._reward_ph: np.array(reward_list).astype(np.float32), + self._is_terminal_ph: np.array(is_terminal_list).astype(np.float32), + } + + if self._is_double_dqn: + action_chosen_by_online = sess.run(self._eval_model['action'], feed_dict={ + self._eval_model['input_frames']: new_state_list.astype(np.float32)/255.0}) + feed_dict[self._action_chosen_by_eval_ph] = list(enumerate(action_chosen_by_online)) + + if self._is_per == 1: + # Annealing weight beta + feed_dict[self._loss_weight_ph] = (np.array(p_list)*count/sum_p)**(-self._beta) + error, _ = sess.run([self._error_op, self._train_op], feed_dict=feed_dict) + self._memory.update(idx_list, error) + else: + sess.run(self._train_op, feed_dict=feed_dict) + + self._update_times += 1 + if self._beta < BETA_END: + self._beta += self._beta_increment + + if self._update_times%self._target_update_freq == 0: + sess.run(self._update_target_params_ops) + + + def _get_error(self, sess, old_state, action, reward, new_state, is_terminal): + ''' + Get TD error for Prioritized Experience Replay + ''' + feed_dict = {self._target_model['input_frames']: new_state.astype(np.float32)/255.0, + self._eval_model['input_frames']: old_state.astype(np.float32)/255.0, + self._action_ph: list(enumerate(action)), + self._reward_ph: np.array(reward).astype(np.float32), + self._is_terminal_ph: np.array(is_terminal).astype(np.float32), + } + + if self._is_double_dqn: + action_chosen_by_online = sess.run(self._eval_model['action'], feed_dict={ + self._eval_model['input_frames']: new_state.astype(np.float32)/255.0}) + feed_dict[self._action_chosen_by_eval_ph] = list(enumerate(action_chosen_by_online)) + + error = sess.run(self._error_op, feed_dict=feed_dict) + return error + + def get_mean_max_Q(self, sess, samples): + mean_max = [] + INCREMENT = 1000 + for i in range(0, len(samples), INCREMENT): + feed_dict = {self._eval_model['input_frames']: + samples[i: i + INCREMENT].astype(np.float32)/255.0} + mean_max.append(sess.run(self._eval_model['mean_max_Q'], + feed_dict = feed_dict)) + return np.mean(mean_max) + + + def evaluate(self, sess, env, num_episode): + """Evaluate num_episode games by online model. + Parameters + ---------- + sess: tf.Session + env: batchEnv.BatchEnvironment + This is your paralleled Atari environment. 
+ num_episode: int + This is the number of episode of games to evaluate + Returns + ------- + reward list for each episode + """ + num_environment = env.num_process + env.reset() + reward_of_each_environment = np.zeros(num_environment) + rewards_list = [] + + num_finished_episode = 0 + + while num_finished_episode < num_episode: + old_state, action, reward, new_state, is_terminal = env.get_state() + action = self.select_action(sess, new_state, 0, self._eval_model) + env.take_action(action) + for i, r, is_t in zip(range(num_environment), reward, is_terminal): + if not is_t: + reward_of_each_environment[i] += r + else: + rewards_list.append(reward_of_each_environment[i]) + reward_of_each_environment[i] = 0 + num_finished_episode += 1 + return np.mean(rewards_list), np.std(rewards_list) + diff --git a/RL/Basic-Rainbow-Net/batchEnv.py b/RL/Basic-Rainbow-Net/batchEnv.py new file mode 100644 index 00000000..3caccc9f --- /dev/null +++ b/RL/Basic-Rainbow-Net/batchEnv.py @@ -0,0 +1,157 @@ +import multiprocessing as mp +from multiprocessing import Pool, sharedctypes +import numpy as np +import gym +import sys +from preprocessor import Preprocessor +import time + +SKIP_START_FRAME_NUM = 42 + +class Atari_ParallelWorker(mp.Process): + def __init__(self, + i, + env_name, + pipe, + window_size, + input_shape, + num_frame_per_action, + max_episode_length, + state, + lock): + super(Atari_ParallelWorker,self).__init__() + self.id = i + self.pipe = pipe + self.window_size = window_size + self.num_frame_per_action = num_frame_per_action + self.max_episode_length = max_episode_length + self.state = state + self.lock = lock + self.env = gym.make(env_name) + self.env.seed(np.random.randint(222)) + self.preprocessor = Preprocessor(window_size,input_shape) + self.time = 0 + self._reset() + + def _take_action(self,action): + reward = 0 + old_state = self.preprocessor.get_state() # 得到4帧的画面 + for _ in range(self.num_frame_per_action): + self.time += 1 + state,intermediate_reward,is_terminal,_ = self.env.step(action) + reward += intermediate_reward + self.preprocessor.process_state_for_memory(state) # 将每一帧的画面进行压缩,同时将HistoryPreprocessor中存放的历史记录往前覆盖 + if is_terminal: + self._reset() + break + new_state = self.preprocessor.get_state() # 拿到新的state,这里的新的state是old_state后的四帧动画,reward是这四帧动画的奖励和 + if self.time > self.max_episode_length: + is_terminal = True + self._reset() + + # write 'sara' into mp.Array + np.ctypeslib.as_array(self.state['old_state'])[self.id] = old_state + np.ctypeslib.as_array(self.state['action'])[self.id] = action + np.ctypeslib.as_array(self.state['reward'])[self.id] = reward + np.ctypeslib.as_array(self.state['new_state'])[self.id] = new_state + np.ctypeslib.as_array(self.state['is_terminal'])[self.id] = is_terminal + + def run(self): + print('Environment worker %d: run'%(self.id,)) + # This lock to ensure all the process prepared before take actions + self.lock.release() + while True: + command, context = self.pipe.recv() + if command == 'CLOSE': + self.env.close() + self.pipe.close() + break + elif command == 'ACTION': + self._take_action(action=context) + self.lock.release() + elif command == 'RESET': + self._reset() + self.lock.release() + else: + raise NotImplementedError() + + + def _reset(self): + self.env.reset() + self.preprocessor.reset() + self.time = 0 + for _ in range(SKIP_START_FRAME_NUM - self.window_size): + self.env.step(0) + for _ in range(self.window_size): + state,_,_,_ = self.env.step(0) + self.preprocessor.process_state_for_memory(state) + + + + +class BatchEnvironment(): + def 
__init__(self,env_name, + num_process, + window_size, + input_shape, + num_frame_per_action, + max_episode_length): + self.num_process = num_process # 并行的游戏数量 + self.env_name = env_name + self.workers = [] + self.pipes = [] + self.locks = [] + + def get_multiprocess_numpy(dtype,shape): + tmp = np.ctypeslib.as_ctypes(np.zeros(shape,dtype=dtype)) + return sharedctypes.Array(tmp._type_,tmp,lock=False) + + self.state = { + 'old_state':get_multiprocess_numpy(np.uint8,shape=(num_process,input_shape[0],input_shape[1],window_size)), + 'action':get_multiprocess_numpy(np.uint8,shape=(num_process,)), + 'reward':get_multiprocess_numpy(np.int16,shape=(num_process,)), + 'new_state':get_multiprocess_numpy(np.uint8,shape=(num_process,input_shape[0],input_shape[1],window_size)), + 'is_terminal':get_multiprocess_numpy(np.uint8,shape=(num_process,)) + } + + for i in range(num_process): + parent_pipe,child_pipe = mp.Pipe() + lock = mp.Lock() + self.pipes.append(parent_pipe) + self.locks.append(lock) + + lock.acquire() + worker = Atari_ParallelWorker(i, env_name, child_pipe, window_size, + input_shape, num_frame_per_action, max_episode_length, self.state, lock) + + worker.start() + self.workers.append(worker) + + def take_action(self, action_list): + assert len(action_list) == self.num_process + for pipe, action, lock in zip(self.pipes, action_list, self.locks): + lock.acquire() + pipe.send(('ACTION', action)) + + def reset(self): + for pipe, lock in zip(self.pipes, self.locks): + lock.acquire() + pipe.send(('RESET', None)) + + def get_state(self): + for lock in self.locks: + lock.acquire() + + old_state = np.ctypeslib.as_array(self.state['old_state']) + action = np.ctypeslib.as_array(self.state['action']) + reward = np.ctypeslib.as_array(self.state['reward']) + new_state = np.ctypeslib.as_array(self.state['new_state']) + is_terminal = np.ctypeslib.as_array(self.state['is_terminal']) + + for lock in self.locks: + lock.release() + return np.copy(old_state), action, reward, np.copy(new_state), is_terminal + + def close(self): + for worker in self.workers: + worker.terminate() \ No newline at end of file diff --git a/RL/Basic-Rainbow-Net/huberLoss.py b/RL/Basic-Rainbow-Net/huberLoss.py new file mode 100644 index 00000000..12a1b1a7 --- /dev/null +++ b/RL/Basic-Rainbow-Net/huberLoss.py @@ -0,0 +1,71 @@ +"""Loss functions.""" + +import tensorflow as tf + + +def huber_loss(y_true, y_pred, max_grad=1.): + """Calculate the huber loss. + See https://en.wikipedia.org/wiki/Huber_loss + Parameters + ---------- + y_true: np.array, tf.Tensor + Target value. + y_pred: np.array, tf.Tensor + Predicted value. + max_grad: float, optional + Positive floating point value. Represents the maximum possible + gradient magnitude. + Returns + ------- + tf.Tensor + The huber loss. + """ + a = tf.abs(y_true - y_pred) + less_than_max = 0.5 * tf.square(a) + greater_than_max = max_grad * (a - 0.5 * max_grad) + return tf.where(a <= max_grad, x=less_than_max, y=greater_than_max) + + + +def mean_huber_loss(y_true, y_pred, max_grad=1.): + """Return mean huber loss. + Same as huber_loss, but takes the mean over all values in the + output tensor. + Parameters + ---------- + y_true: np.array, tf.Tensor + Target value. + y_pred: np.array, tf.Tensor + Predicted value. + max_grad: float, optional + Positive floating point value. Represents the maximum possible + gradient magnitude. + Returns + ------- + tf.Tensor + The mean huber loss. 
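+        Examples
+        --------
+        With max_grad=1., an element-wise error of 0.5 falls on the quadratic
+        branch of huber_loss (0.5 * 0.5**2 = 0.125), while an error of 2.0
+        falls on the linear branch (1. * (2.0 - 0.5) = 1.5); the returned
+        scalar is the mean of these element-wise losses over the whole tensor.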
+ """ + return tf.reduce_mean(huber_loss(y_true, y_pred, max_grad=max_grad)) + + +def weighted_huber_loss(y_true, y_pred, weights, max_grad=1.): + """Return mean huber loss. + Same as huber_loss, but takes the mean over all values in the + output tensor. + Parameters + ---------- + y_true: np.array, tf.Tensor + Target value. + y_pred: np.array, tf.Tensor + Predicted value. + weights: np.array, tf.Tensor + weights value. + max_grad: float, optional + Positive floating point value. Represents the maximum possible + gradient magnitude. + Returns + ------- + tf.Tensor + The mean huber loss. + """ + return tf.reduce_mean(weights*huber_loss(y_true, y_pred, max_grad=max_grad)) \ No newline at end of file diff --git a/RL/Basic-Rainbow-Net/main.py b/RL/Basic-Rainbow-Net/main.py new file mode 100644 index 00000000..6162475c --- /dev/null +++ b/RL/Basic-Rainbow-Net/main.py @@ -0,0 +1,162 @@ +import argparse +import gym + +import numpy as np +import random +import tensorflow as tf +import PIL + +from batchEnv import BatchEnvironment +from replayMemory import ReplayMemory, PriorityExperienceReplay +from model import create_deep_q_network, create_duel_q_network, create_model, create_distributional_model +from agent import DQNAgent + + +NUM_FRAME_PER_ACTION = 4 +UPDATE_FREQUENCY = 4 # do one batch update when UPDATE_FREQUENCY number of new samples come +TARGET_UPDATE_FREQENCY = 10000 # target-net更新频率 +REPLAYMEMORY_SIZE = 500000 # 经验池的大小 +MAX_EPISODE_LENGTH = 100000 # 最大的序列长度 +RMSP_EPSILON = 0.01 +RMSP_DECAY = 0.95 +RMSP_MOMENTUM =0.95 +NUM_FIXED_SAMPLES = 10000 +NUM_BURN_IN = 50000 +LINEAR_DECAY_LENGTH = 4000000 +NUM_EVALUATE_EPSIODE = 20 + +def get_fixed_samples(env,num_actions,num_samples): + fixed_samples = [] + num_environment = env.num_process + env.reset() + + for _ in range(0,num_samples,num_environment): + old_state,action,reward,new_state,is_terminal = env.get_state() + action = np.random.randint(0,num_actions,size = (num_environment,)) + env.take_action(action) + for state in new_state: + fixed_samples.append(state) + return np.array(fixed_samples) + + +def main(): + parser = argparse.ArgumentParser(description='Run DQN on Atari Space Invaders') + parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name') + parser.add_argument('--seed', default=10703, type=int, help='Random seed') + parser.add_argument('--input_shape', default=(84,84), help='Input shape') + parser.add_argument('--gamma', default=0.99, help='Discount factor') + parser.add_argument('--epsilon', default=0.1, help='Exploration probability in epsilon-greedy') + parser.add_argument('--learning_rate', default=0.00025, help='Training learning rate.') + parser.add_argument('--window_size', default=4, type = int, help= + 'Number of frames to feed to the Q-network') + parser.add_argument('--batch_size', default=32, type = int, help= + 'Batch size of the training part') + parser.add_argument('--num_process', default=3, type = int, help= + 'Number of parallel environment') + parser.add_argument('--num_iteration', default=20000000, type = int, help= + 'number of iterations to train') + parser.add_argument('--eval_every', default=0.001, type = float, help= + 'What fraction of num_iteration to run between evaluations.') + parser.add_argument('--is_duel', default=1, type = int, help= + 'Whether use duel DQN, 0 means no, 1 means yes.') + parser.add_argument('--is_double', default=1, type = int, help= + 'Whether use double DQN, 0 means no, 1 means yes.') + parser.add_argument('--is_per', default=1, type = int, help= + 
'Whether use PriorityExperienceReplay, 0 means no, 1 means yes.') + parser.add_argument('--is_distributional', default=1, type = int, help= + 'Whether use distributional DQN, 0 means no, 1 means yes.') + parser.add_argument('--num_step', default=1, type = int, help= + 'Num Step for multi-step DQN, 3 is recommended') + parser.add_argument('--is_noisy', default=1, type = int, help= + 'Whether use NoisyNet, 0 means no, 1 means yes.') + + + args = parser.parse_args() + args.input_shape = tuple(args.input_shape) + print('Environment : %s.' % (args.env,)) + + env = gym.make(args.env) + num_actions = env.action_space.n + print('number actions %d' % (num_actions,)) + + env.close() + + random.seed(args.seed) + np.random.seed(args.seed) + tf.set_random_seed(args.seed) + + batch_environment = BatchEnvironment(args.env, args.num_process, + args.window_size, args.input_shape, NUM_FRAME_PER_ACTION, MAX_EPISODE_LENGTH) + + # 是否使用优先经验回放 + if args.is_per == 1: + replay_memory = PriorityExperienceReplay(REPLAYMEMORY_SIZE,args.window_size,args.input_shape) + else: + replay_memory = ReplayMemory(REPLAYMEMORY_SIZE,args.window_size,args.input_shape) + + create_network_fn = create_deep_q_network if args.is_duel == 0 else create_duel_q_network + + create_model_fn = create_model if args.is_distributional == 0 else create_distributional_model + + noisy = True if args.is_noisy == 1 else False + + eval_model,eval_params = create_model_fn(args.window_size,args.input_shape,num_actions, + 'eval_model',create_network_fn,trainable=True,noisy=noisy) + target_model,target_params = create_model_fn(args.window_size,args.input_shape,num_actions, + 'target_model',create_network_fn,trainable=False,noisy=noisy) + + update_target_params_ops = [t.assign(s) for s,t in zip(eval_params,target_params)] + + agent = DQNAgent(eval_model, + target_model, + replay_memory, + num_actions, + args.gamma, + UPDATE_FREQUENCY, + TARGET_UPDATE_FREQENCY, + update_target_params_ops, + args.batch_size, + args.is_double, + args.is_per, + args.is_distributional, + args.num_step, + args.is_noisy, + args.learning_rate, + RMSP_DECAY, + RMSP_MOMENTUM, + RMSP_EPSILON) + + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + + sess.run(update_target_params_ops) + + print('prepare fixed samples for mean max q') + + fixed_samples = get_fixed_samples(batch_environment, num_actions, NUM_FIXED_SAMPLES) + + agent.fit(sess,batch_environment,NUM_BURN_IN,do_train=False) + + # Begin to train: + fit_iteration = int(args.num_iteration * args.eval_every) + + for i in range(0, args.num_iteration, fit_iteration): + # Evaluate: + reward_mean, reward_var = agent.evaluate(sess, batch_environment, NUM_EVALUATE_EPSIODE) + mean_max_Q = agent.get_mean_max_Q(sess, fixed_samples) + print("%d, %f, %f, %f" % (i, mean_max_Q, reward_mean, reward_var)) + # Train: + agent.fit(sess, batch_environment, fit_iteration, do_train=True) + + batch_environment.close() + + + + + + + +if __name__ == '__main__': + main() + diff --git a/RL/Basic-Rainbow-Net/model.py b/RL/Basic-Rainbow-Net/model.py new file mode 100644 index 00000000..0d12176c --- /dev/null +++ b/RL/Basic-Rainbow-Net/model.py @@ -0,0 +1,199 @@ +import tensorflow as tf +import numpy as np + +def create_conv_network(input_frames,trainable): + conv1_W = tf.get_variable(shape=[8,8,4,16],name='conv1_W', + trainable=trainable,initializer=tf.contrib.layers.xavier_initializer()) + conv1_b = tf.Variable(tf.zeros([16], dtype=tf.float32), + name='conv1_b', trainable=trainable) + conv1 = tf.nn.conv2d(input_frames, conv1_W, 
strides=[1, 4, 4, 1], + padding='VALID', name='conv1') + # (batch size, 20, 20, 16) + output1 = tf.nn.relu(conv1 + conv1_b, name='output1') + + conv2_W = tf.get_variable(shape=[4, 4, 16, 32], name='conv2_W', + trainable=trainable, initializer=tf.contrib.layers.xavier_initializer()) + conv2_b = tf.Variable(tf.zeros([32], dtype=tf.float32), name='conv2_b', + dtype=tf.float32, trainable=trainable) + conv2 = tf.nn.conv2d(output1, conv2_W, strides=[1, 2, 2, 1], + padding='VALID', name='conv2') + # (batch size, 9, 9, 32) + output2 = tf.nn.relu(conv2 + conv2_b, name='output2') + + flat_output2_size = 9 * 9 * 32 + flat_output2 = tf.reshape(output2, [-1, flat_output2_size], name='flat_output2') + + return flat_output2, flat_output2_size, [conv1_W, conv1_b, conv2_W, conv2_b] + + + +def create_deep_q_network(input_frames,num_actions,trainable,noisy): + flat_output,flat_output_size,parameter_list = create_conv_network(input_frames,trainable) + + if noisy == False: + fc1_W = tf.get_variable(shape=[flat_output_size,256],name='fc1_W', + trainable=trainable,initializer=tf.contrib.layers.xavier_initializer()) + + fc1_b = tf.Variable(tf.zeros([256],dtype=tf.float32),name='fc1_b', + trainable=trainable) + + output3 = tf.nn.relu(tf.matmul(flat_output,fc1_W)+fc1_b,name='output3') + + fc2_W = tf.get_variable(shape=[256,num_actions],name='fc2_W',trainable=trainable, + initializer=tf.contrib.layers.xavier_initializer()) + + fc2_b = tf.Variable(tf.zeros([num_actions],dtype=tf.float32),name='fc2_b',trainable=trainable) + + q_network = tf.nn.relu(tf.matmul(output3,fc2_W) + fc2_b,name='q_network') + + parameter_list += [fc1_W,fc1_b,fc2_W,fc2_b] + + else: + + output3, parameter_list_output3 = noisy_dense(flat_output, name='noisy_fc1', + input_size=flat_output_size, output_size=256, + activation_fn=tf.nn.relu, trainable=trainable) + q_network, parameter_list_q_network = noisy_dense(output3, name='noisy_fc2', + input_size=256, output_size=num_actions, trainable=trainable) + parameter_list += parameter_list_output3 + parameter_list_q_network + return q_network, parameter_list + + +def create_duel_q_network(input_frames,num_actions,trainable,noisy): + flat_output, flat_output_size, parameter_list = create_conv_network(input_frames, trainable) + + if noisy == False: + fcV_W = tf.get_variable(shape=[flat_output_size, 512], name='fcV_W', + trainable=trainable, initializer=tf.contrib.layers.xavier_initializer()) + fcV_b = tf.Variable(tf.zeros([512], dtype=tf.float32), name='fcV_b', + dtype=tf.float32, trainable=trainable) + outputV = tf.nn.relu(tf.matmul(flat_output, fcV_W) + fcV_b, name='outputV') + + fcV2_W = tf.get_variable(shape=[512, 1], name='fcV2_W', + trainable=trainable, initializer=tf.contrib.layers.xavier_initializer()) + fcV2_b = tf.Variable(tf.zeros([1], dtype=tf.float32), name='fcV2_b', + trainable=trainable) + outputV2 = tf.matmul(outputV, fcV2_W) + fcV2_b # V + + + fcA_W = tf.get_variable(shape=[flat_output_size, 512], name='fcA_W', + trainable=trainable, initializer=tf.contrib.layers.xavier_initializer()) + fcA_b = tf.Variable(tf.zeros([512], dtype=tf.float32), name='fcA_b', + trainable=trainable) + outputA = tf.nn.relu(tf.matmul(flat_output, fcA_W) + fcA_b, name='outputA') + + fcA2_W = tf.get_variable(shape=[512, num_actions], name='fcA2_W', + trainable=trainable, initializer=tf.contrib.layers.xavier_initializer()) + fcA2_b = tf.Variable(tf.zeros([num_actions], dtype=tf.float32), name='fcA2_b', + trainable=trainable) + outputA2 = tf.matmul(outputA, fcA2_W) + fcA2_b # 优势函数 + + parameter_list += [fcV_W, fcV_b, 
fcV2_W, fcV2_b, fcA_W, fcA_b, fcA2_W, fcA2_b] + else: + outputV, parameter_list_outputV = noisy_dense(flat_output, name='fcV', + input_size=flat_output_size, output_size=512, trainable=trainable, + activation_fn=tf.nn.relu) + outputV2, parameter_list_outputV2 = noisy_dense(outputV, name='fcV2', + input_size=512, output_size=1, trainable=trainable) + ouputA, parameter_list_outputA = noisy_dense(flat_output, name='fcA', + input_size=flat_output_size, output_size=512, trainable=trainable, + activation_fn=tf.nn.relu) + outputA2, parameter_list_outputA2 = noisy_dense(ouputA, name='fcA2', + input_size=512, output_size=num_actions, trainable=trainable) + parameter_list += parameter_list_outputA + parameter_list_outputA2 + \ + parameter_list_outputV + parameter_list_outputV2 + + q_network = tf.nn.relu(outputV2 + outputA2 - tf.reduce_mean(outputA2), name='q_network') + + return q_network, parameter_list + + +def create_model(window,input_shape,num_actions,model_name,create_network_fn,trainable,noisy): + """创建Q网络""" + with tf.variable_scope(model_name): + input_frames = tf.placeholder(tf.float32,[None,input_shape[0],input_shape[1],window],name='input_frames') + q_network,parameter_list = create_network_fn(input_frames,num_actions,trainable,noisy) + + mean_max_q = tf.reduce_mean(tf.reduce_max(q_network,axis=[1]),name='mean_max_q') + action = tf.argmax(q_network,axis=1) + + model = { + 'q_values':q_network, + 'input_frames':input_frames, + 'mean_max_q':mean_max_q, + 'action':action, + } + + return model,parameter_list + + +def create_distributional_model(window,input_shape,num_actions,model_name,create_network_fn,trainable,noisy): + N_atoms = 51 + V_Max = 20.0 + V_Min = 0.0 + Delta_z = (V_Max - V_Min) / (N_atoms - 1) + z_list = tf.constant([V_Min + i * Delta_z for i in range(N_atoms)],dtype=tf.float32) + z_list_broadcasted = tf.tile(tf.reshape(z_list,[1,N_atoms]),[num_actions,1]) # batch * num_actions * N_atoms + + with tf.variable_scope(model_name): + input_frames = tf.placeholder(tf.float32,[None,input_shape[0],input_shape[1],window],name='input_frames') + q_distributional_network,parameter_list = create_network_fn(input_frames,num_actions * N_atoms,trainable,noisy) + + q_distributional_network = tf.reshape(q_distributional_network,[-1,num_actions,N_atoms]) + + q_distributional_network = tf.nn.softmax(q_distributional_network,dim=2) + # 防止NAN + q_distributional_network = tf.clip_by_value(q_distributional_network, 1e-8, 1.0 - 1e-8) + q_network = tf.multiply(q_distributional_network ,z_list_broadcasted) + q_network = tf.reduce_sum(q_network,axis=2,name='q_values') + + mean_max_q = tf.reduce_mean(tf.reduce_max(q_network,axis=[1]),name='mean_max_q') + action = tf.argmax(q_network,axis=1) + + model = { + 'q_distributional_network' : q_distributional_network, + 'q_values':q_network, + 'input_frames':input_frames, + 'mean_max_q':mean_max_q, + 'action':action + } + + return model,parameter_list + + +def noisy_dense(x,input_size,output_size,name,trainable,activation_fn=tf.identity): + + def f(x): + return tf.multiply(tf.sign(x),tf.pow(tf.abs(x),0.5)) + + mu_init = tf.random_uniform_initializer(minval=-1*1/np.power(input_size, 0.5), + maxval=1*1/np.power(input_size, 0.5)) + + sigma_init = tf.constant_initializer(0.4 / np.power(input_size,0.5)) + + p = tf.random_normal([input_size,1]) + q = tf.random_normal([1,output_size]) + + f_p = f(p) + f_q = f(q) + + w_epsilon = f_p * f_q + b_epsilon = tf.squeeze(f_q) + + w_mu = tf.get_variable(name + "/w_mu", [input_size, output_size], + initializer=mu_init, 
trainable=trainable) + w_sigma = tf.get_variable(name + "/w_sigma", [input_size, output_size], + initializer=sigma_init, trainable=trainable) + + w = w_mu + tf.multiply(w_sigma, w_epsilon) + ret = tf.matmul(x,w) + + b_mu = tf.get_variable(name + "/b_mu", [output_size], + initializer=mu_init, trainable=trainable) + b_sigma = tf.get_variable(name + "/b_sigma", [output_size], + initializer=sigma_init, trainable=trainable) + b = b_mu + tf.multiply(b_sigma, b_epsilon) + return activation_fn(ret + b), [w_mu, w_sigma, b_mu, b_sigma] + + + diff --git a/RL/Basic-Rainbow-Net/preprocessor.py b/RL/Basic-Rainbow-Net/preprocessor.py new file mode 100644 index 00000000..ef280c7d --- /dev/null +++ b/RL/Basic-Rainbow-Net/preprocessor.py @@ -0,0 +1,126 @@ +"""Preprocessors for Atari pixel output.""" + +import numpy as np +from PIL import Image + +class HistoryPreprocessor: + """Keeps the last k states. + Useful for domains where you need velocities, but the state + contains only positions. + When the environment starts, this will just fill the initial + sequence values with zeros k times. + Parameters + ---------- + history_length: int + Number of previous states to prepend to state being processed. + """ + + def __init__(self, input_shape, history_length=1): + self._WIDTH = input_shape[0] + self._HEIGHT = input_shape[1] + self.history = np.zeros(shape=(1, self._WIDTH, self._HEIGHT, history_length+1), dtype = np.uint8) + self.history_length = history_length + + + def process_state_for_memory(self, state): + """You only want history when you're deciding the current action to take. + Returns last history_length processed states, where each is the max of + the raw state and the previous raw state. + """ + self.history[0,:,:,1:]=self.history[0,:,:,:self.history_length] + self.history[0,:,:,0]=state + + def reset(self): + """Reset the history sequence. + Useful when you start a new episode. + """ + self.history = np.zeros(shape=(1, self._WIDTH, self._HEIGHT, self.history_length+1), dtype = np.uint8) + + def get_state(self): + result = np.zeros(shape=(1, self._WIDTH, self._HEIGHT, self.history_length), dtype = np.uint8) + + for i in range(self.history_length): + result[0,:,:,i]=np.maximum(self.history[0,:,:,i], self.history[0,:,:,i+1]) + return result + + + +class AtariPreprocessor: + """Converts images to greyscale and downscales. + Based on the preprocessing step described in: + @article{mnih15_human_level_contr_throug_deep_reinf_learn, + author = {Volodymyr Mnih and Koray Kavukcuoglu and David + Silver and Andrei A. Rusu and Joel Veness and Marc + G. Bellemare and Alex Graves and Martin Riedmiller + and Andreas K. Fidjeland and Georg Ostrovski and + Stig Petersen and Charles Beattie and Amir Sadik and + Ioannis Antonoglou and Helen King and Dharshan + Kumaran and Daan Wierstra and Shane Legg and Demis + Hassabis}, + title = {Human-Level Control Through Deep Reinforcement + Learning}, + journal = {Nature}, + volume = 518, + number = 7540, + pages = {529-533}, + year = 2015, + doi = {10.1038/nature14236}, + url = {http://dx.doi.org/10.1038/nature14236}, + } + You may also want to max over frames to remove flickering. Some + games require this (based on animations and the limited sprite + drawing capabilities of the original Atari). + Parameters + ---------- + new_size: 2 element tuple + The size that each image in the state should be scaled to. e.g + (84, 84) will make each image in the output have shape (84, 84). 
+ """ + + def __init__(self, input_shape): + self._WIDTH = input_shape[0] + self._HEIGHT = input_shape[1] + + + def process_state_for_memory(self, state): + """Scale, convert to greyscale and store as uint8. + We don't want to save floating point numbers in the replay + memory. We get the same resolution as uint8, but use a quarter + to an eigth of the bytes (depending on float32 or float64) + """ + image = Image.fromarray(state) + image = image.convert('L') + image = image.resize((self._WIDTH, self._HEIGHT), Image.LANCZOS) + #image.show() + image = np.array(image,dtype=np.uint8) + return image.reshape((1, self._WIDTH, self._HEIGHT)) + + +class Preprocessor: + """Combination of both an Atari preprocessor and history preprocessor.""" + def __init__(self, window_size, input_shape): + self._atari_preprocessor = AtariPreprocessor(input_shape) + self._history_preprocessor = HistoryPreprocessor(input_shape, history_length=window_size) + + def process_state_for_memory(self, state): + if state.shape == (210, 160, 3): + state = self._atari_preprocessor.process_state_for_memory(state) + state = self._history_preprocessor.process_state_for_memory(state) + else: + raise Exception("Shape Error in preprocessor"+str(state.shape)) + return state + + def reset(self): + self._history_preprocessor.reset() + + def process_reward(self, reward): + """Get sign of reward: -1, 0 or 1.""" + return np.sign(reward) + + def get_state(self): + return self._history_preprocessor.get_state() + + def state2float(self, state): + if state.dtype != np.uint8: + raise Exception("Error, State should be in np.unit8") + return state.astype(np.float16) / 255.0 \ No newline at end of file diff --git a/RL/Basic-Rainbow-Net/replayMemory.py b/RL/Basic-Rainbow-Net/replayMemory.py new file mode 100644 index 00000000..3a3e5432 --- /dev/null +++ b/RL/Basic-Rainbow-Net/replayMemory.py @@ -0,0 +1,85 @@ +import numpy as np +import random +from sumTree import SumTree + +class ReplayMemory(): + + def __init__(self,max_size,window_size,input_shape): + + self._max_size = max_size + self._window_size = window_size + self._WIDTH = input_shape[0] + self._HEIGHT = input_shape[1] + self._memory = [] + + + def append(self,old_state,action,reward,new_state,is_terminal): + + num_sample = len(old_state) + if len(self._memory) >= self._max_size: + del(self._memory[0:num_sample]) + + for o_s,a,r,n_s,i_t in zip(old_state,action,reward,new_state,is_terminal): + self._memory.append((o_s,a,r,n_s,i_t)) + + def sample(self,batch_size,indexes=None): + + samples = random.sample(self._memory,min(batch_size,len(self._memory))) + zipped = list(zip(*samples)) + zipped[0] = np.reshape(zipped[0],(-1,self._WIDTH,self._HEIGHT,self._window_size)) + zipped[3] = np.reshape(zipped[3],(-1,self._WIDTH,self._HEIGHT,self._window_size)) + return zipped + + +class PriorityExperienceReplay(): + + def __init__(self,max_size, + window_size, + input_shape): + self.tree = SumTree(max_size) + self._max_size = max_size + self._window_size = window_size + self._WIDTH = input_shape[0] + self._HEIGHT = input_shape[1] + self.e = 0.01 + self.a = 0.6 + + + def _getPriority(self,error): + return (error + self.e) ** self.a + + def append(self,old_state,action,reward,new_state,is_terminal): + for o_s,a,r,n_s,i_t in zip(old_state,action,reward,new_state,is_terminal): + p = self._getPriority(0.5) + self.tree.add(p,data=(o_s,a,r,n_s,i_t)) + + def sample(self,batch_size,indexes=None): + data_batch = [] + idx_batch = [] + p_batch = [] + segment = self.tree.total_and_count()[0] / batch_size + + for i in 
range(batch_size): + a = segment * i + b = segment * (i + 1) + + s = random.uniform(a,b) + (idx,p,data) = self.tree.get(s) + data_batch.append(data) + idx_batch.append(idx) + p_batch.append(p) + + zipped = list(zip(*data_batch)) + zipped[0] = np.reshape(zipped[0], (-1, self._WIDTH, self._HEIGHT, self._window_size)) + zipped[3] = np.reshape(zipped[3], (-1, self._WIDTH, self._HEIGHT, self._window_size)) + + sum_p,count = self.tree.total_and_count() + + return zipped,idx_batch,p_batch,sum_p,count + + def update(self,idx_list,error_list): + for idx,error in zip(idx_list,error_list): + p = self._getPriority(error) + self.tree.update(idx,p) + + diff --git a/RL/Basic-Rainbow-Net/sumTree.py b/RL/Basic-Rainbow-Net/sumTree.py new file mode 100644 index 00000000..7919b9a4 --- /dev/null +++ b/RL/Basic-Rainbow-Net/sumTree.py @@ -0,0 +1,56 @@ +import numpy + +class SumTree(): + write = 0 + count = 0 + + def __init__(self,capacity): + self.capacity = capacity + self.tree = numpy.zeros( 2 * capacity - 1) + self.data = numpy.zeros( capacity ,dtype = object) + + + def _propagate(self,idx,change): + parent = (idx - 1) // 2 + self.tree[parent] += change + + if parent!=0: + self._propagate(parent,change) + + def _retrieve(self, idx, s): + left = 2 * idx + 1 + right = left + 1 + + if left >= len(self.tree): + return idx + + if s <= self.tree[left]: + return self._retrieve(left, s) + else: + return self._retrieve(right, s-self.tree[left]) + + def total_and_count(self): + return self.tree[0], self.count + + + def update(self,idx,p): + change = p - self.tree[idx] # 得到变化的数值, + self.tree[idx] = p + self._propagate(idx,change) # 根据叶子结点的变化,不断修改父节点的值 + + def add(self,p,data): + idx = self.write + self.capacity - 1 # 得到叶子结点在树中的位置 + self.data[self.write] = data + self.update(idx,p) + + self.write += 1 + if self.write >= self.capacity: + self.write = 0 + if self.count < self.capacity: + self.count += 1 + + def get(self, s): + idx = self._retrieve(0, s) + dataIdx = idx - self.capacity + 1 + + return (idx, self.tree[idx], self.data[dataIdx]) \ No newline at end of file
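
A minimal usage sketch of the SumTree above, showing the proportional sampling that PriorityExperienceReplay.sample performs: priorities are accumulated in the tree, the total mass is split into equal-width segments, and one leaf is drawn per segment. This assumes sumTree.py from RL/Basic-Rainbow-Net is importable from the working directory; the transitions and TD errors below are dummy values for illustration only.

import random
from sumTree import SumTree  # assumes RL/Basic-Rainbow-Net is on the Python path

tree = SumTree(capacity=8)

# Store dummy transitions with priority p = (|TD error| + e) ** a,
# mirroring PriorityExperienceReplay._getPriority with e=0.01, a=0.6.
for i, td_error in enumerate([0.5, 0.1, 2.0, 0.05]):
    tree.add((td_error + 0.01) ** 0.6, data='transition-%d' % i)

total_p, count = tree.total_and_count()

# Draw one leaf per equal-width segment of the cumulative priority mass,
# exactly as PriorityExperienceReplay.sample does for a batch.
batch_size = 2
segment = total_p / batch_size
for i in range(batch_size):
    s = random.uniform(segment * i, segment * (i + 1))
    idx, p, data = tree.get(s)
    print(idx, p, data)
    # After a training step, the stored priority would be refreshed with
    # tree.update(idx, new_priority) using the fresh TD error.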