Merge pull request #49 from RLBots/copy_trainer

Added regression code and added some simple visualizations.
SaltieRL · Jan 20, 2018 · 7b0c01b · 7b0c01b
2 parents 11a8bb0 + 891c0d0
commit 7b0c01b
Show file tree

Hide file tree

Showing 24 changed files with 585 additions and 121 deletions.
diff --git a/conversions/input/input_formatter.py b/conversions/input/input_formatter.py
@@ -77,7 +77,13 @@ def split_teams(self, game_tick_packet):
         return player_car, team_members, enemies, own_team_score, enemy_team_score
 
     def create_result_array(self, array):
-        return np.array(array, dtype=np.float32)
+        np_version = np.array(array, dtype=np.float32)
+        output = np.argwhere(np.isnan(np_version))
+        if len(output) > 0:
+            print('nan indexes', output)
+            for index in output:
+                np_version[index[0]] = 0
+        return np_version
 
     def get_player_goals(self, game_tick_packet, index):
         return game_tick_packet.gamecars[index].Score.Goals

diff --git a/modelHelpers/actions/action_factory.py b/modelHelpers/actions/action_factory.py
@@ -1,11 +1,34 @@
 from modelHelpers.actions.action_handler import ActionHandler
-from modelHelpers.actions.dynamic_action_handler import DynamicActionHandler
+from modelHelpers.actions.dynamic_action_handler import DynamicActionHandler, LOSS_SQUARE_MEAN, LOSS_SPARSE_CROSS, \
+    LOSS_ABSOLUTE_DIFFERENCE
 from modelHelpers.actions.split_action_handler import SplitActionHandler
 
 default_scheme = [[('steer', (-1, 1.5, .5)), ('pitch', (-1, 1.5, .5)), ('roll', (-1, 1.5, .5))],
                   [('throttle', (-1, 2, 1)), ('jump', (0, 2, 1)), ('boost', (0, 2, 1)), ('handbrake', (0, 2, 1))],
                   [('yaw', 'steer')]]
 
+super_split_scheme = [[('throttle', (-1, 1.5, .5)), ('steer', (-1, 1.5, .5)),
+                       ('yaw', (-1, 1.5, .5)), ('pitch', (-1, 1.5, .5)), ('roll', (-1, 1.5, .5))],
+                      [('jump', (0, 2, 1)), ('boost', (0, 2, 1)), ('handbrake', (0, 2, 1))],
+                      []]
+
+only_steer_split_scheme = [[('steer', (-1, 1.5, .5))],
+                           [('throttle', (-1, 2, 1)), ('jump', (0, 2, 1)), ('boost', (0, 2, 1)),
+                            ('handbrake', (0, 2, 1)), ('yaw', (-1, 2, 1)),
+                            ('pitch', (-1, 2, 1)), ('roll', (-1, 2, 1))],
+                           []]
+
+regression_controls = [[('throttle', (-1, 1.5, .5), LOSS_SQUARE_MEAN), ('steer', (-1, 1.5, .5), LOSS_SQUARE_MEAN),
+                        ('yaw', (-1, 1.5, .5), LOSS_SQUARE_MEAN), ('pitch', (-1, 1.5, .5), LOSS_SQUARE_MEAN),
+                        ('roll', (-1, 1.5, .5), LOSS_SQUARE_MEAN)],
+                       [('jump', (0, 2, 1)), ('boost', (0, 2, 1)), ('handbrake', (0, 2, 1))],
+                       []]
+
+mixed_controls = [[('throttle', (-1, 1.5, .5), LOSS_SPARSE_CROSS), ('steer', (-1, 1.5, .5), LOSS_ABSOLUTE_DIFFERENCE),
+                        ('yaw', (-1, 1.5, .5), LOSS_ABSOLUTE_DIFFERENCE), ('pitch', (-1, 1.5, .5), LOSS_ABSOLUTE_DIFFERENCE),
+                        ('roll', (-1, 1.5, .5), LOSS_ABSOLUTE_DIFFERENCE)],
+                       [('jump', (0, 2, 1)), ('boost', (0, 2, 1)), ('handbrake', (0, 2, 1))],
+                       []]
 
 def get_handler(split_mode=True, control_scheme=default_scheme):
     """

diff --git a/modelHelpers/actions/action_handler.py b/modelHelpers/actions/action_handler.py
@@ -92,13 +92,12 @@ def create_controller_from_selection(self, action_selection):
     def create_tensorflow_controller_from_selection(self, action_selection, batch_size=1, should_stack=True):
         combo_actions = self.actions
         indexer = tf.constant(1, dtype=tf.int32)
-        action_selection = tf.cast(action_selection, tf.int32)
         if batch_size > 1:
             multiplier = tf.constant([int(batch_size), 1, 1])
             combo_actions = tf.tile(tf.expand_dims(combo_actions, 0), multiplier)
             indexer = tf.constant(np.arange(0, batch_size, 1), dtype=tf.int32)
 
-        button_combo = tf.gather_nd(combo_actions, tf.stack([indexer, action_selection[3]], axis=1))
+        button_combo = tf.gather_nd(combo_actions, tf.stack([indexer, tf.cast(action_selection[3], tf.int32)], axis=1))
         new_shape = [self.get_logit_size(), batch_size]
         button_combo = tf.reshape(button_combo, new_shape)
         controller_option = button_combo
@@ -195,17 +194,6 @@ def optionally_split_numpy_arrays(self, numpy_array, split_func, is_already_spli
         """
         return split_func(numpy_array)
 
-    def get_cross_entropy_with_logits(self, labels, logits, name):
-        """
-        :param tf:
-        :param labels:
-        :param logits:
-        :param name:
-        :return:
-        """
-        return tf.nn.softmax_cross_entropy_with_logits(
-            labels=labels, logits=logits, name=name + 'ns')
-
     def _find_closet_real_number_graph(self, number):
         pure_number = tf.round(number * 2.0) / 2.0
         comparison = tf.Variable(np.array([-1.0, -0.5, 0.0, 0.5, 1.0]), dtype=tf.float32)
@@ -243,3 +231,28 @@ def create_action_indexes_graph(self, real_action, batch_size=None):
             combo_list.append(bucketed_control)
 
         return self._create_combo_index_graph(combo_list)
+
+    def get_action_loss_from_logits(self, logits, labels, index):
+        """
+        :param logits: A tensorflow logit
+        :param labels: A label of what occurred
+        :param index: The index of the control in the actions list this maps to
+        :return: The loss for this particular action
+        """
+        return tf.nn.softmax_cross_entropy_with_logits(
+            labels=labels, logits=logits, name=str(index) + 'ns')
+
+    def get_last_layer_activation_function(self, func, index):
+        return func
+
+    def scale_layer(self, layer, index):
+        """
+        Scales the layer if required
+        :param layer: the output layer of the model
+        :param index: The index regarding this specific action
+        :return: A scaled layer
+        """
+        return layer
+
+    def get_loss_type(self, index):
+        return 'softmax'
diff --git a/modelHelpers/actions/dynamic_action_handler.py b/modelHelpers/actions/dynamic_action_handler.py
@@ -2,21 +2,16 @@
 
 import numpy as np
 import tensorflow as tf
+from tensorflow.python.ops.losses.losses_impl import Reduction
 
 from modelHelpers.actions.action_handler import ActionHandler, ActionMap
 from modelHelpers.actions.split_action_handler import SplitActionHandler
 
 
 COMBO = 'combo'
-
-super_split_scheme = [[('throttle', (-1, 1.5, .5)), ('steer', (-1, 1.5, .5)),
-                       ('yaw', (-1, 1.5, .5)), ('pitch', (-1, 1.5, .5)), ('roll', (-1, 1.5, .5))],
-                      [('jump', (0, 2, 1)), ('boost', (0, 2, 1)), ('handbrake', (0, 2, 1))],
-                      []]
-
-
-
-
+LOSS_SPARSE_CROSS = 'sparse_loss'
+LOSS_SQUARE_MEAN = 'square_mean'
+LOSS_ABSOLUTE_DIFFERENCE = 'abs_diff'
 
 
 class DynamicActionHandler(SplitActionHandler):
@@ -37,6 +32,7 @@ class DynamicActionHandler(SplitActionHandler):
     combo_name_list = []
     dodge_suppressor_list = [['jump'], ['steer', 'pitch', 'roll', 'yaw']]
     should_suppress_dodge = False
+    action_loss_type_map = {}
 
     def __init__(self, control_scheme):
         self.control_scheme = control_scheme
@@ -55,8 +51,14 @@ def reset(self):
         self.combo_list = []
         self.button_combo = []
         self.combo_name_list = []
+        self.action_loss_type_map = {}
+
+    def is_classification(self, index):
+        return self.action_loss_type_map[index] == LOSS_SPARSE_CROSS
 
     def create_range_action(self, item):
+        if len(item) > 2 and (item[2] == LOSS_SQUARE_MEAN or item[2] == LOSS_ABSOLUTE_DIFFERENCE):
+            return np.array([0])
         action_data = np.arange(*item[1])
         return action_data
 
@@ -74,6 +76,10 @@ def create_actions(self):
             action = self.create_range_action(item)
             self.action_sizes.append(len(action))
             self.action_name_index_map[item[0]] = len(self.action_list_names)
+            if len(item) > 2:
+                self.action_loss_type_map[len(self.action_list_names)] = item[2]
+            else:
+                self.action_loss_type_map[len(self.action_list_names)] = LOSS_SPARSE_CROSS
             self.action_list_names.append(item[0])
             self.actions.append(action)
 
@@ -91,6 +97,7 @@ def create_actions(self):
         self.button_combo = list(itertools.product(*self.combo_list))
         self.action_sizes.append(len(self.button_combo))
         self.action_name_index_map[COMBO] = len(self.action_list_names)
+        self.action_loss_type_map[len(self.action_list_names)] = LOSS_SPARSE_CROSS
         self.action_list_names.append(COMBO)
         self.actions.append(self.button_combo)
 
@@ -111,9 +118,12 @@ def create_controller_from_selection(self, action_selection):
             index = self.action_name_index_map[control]
             if index == COMBO:
                 true_index = self.combo_name_index_map[control]
-                controller_output.append(self.actions[combo_index][action_selection[combo_index]][true_index])
+                controller_output.append(self.actions[combo_index][int(action_selection[combo_index])][true_index])
                 continue
-            controller_output.append(self.actions[index][action_selection[index]])
+            if self.is_classification(index):
+                controller_output.append(self.actions[index][int(action_selection[index])])
+            else:
+                controller_output.append(action_selection[index])
 
         # print(controller_output)
         return controller_output
@@ -123,7 +133,6 @@ def create_tensorflow_controller_from_selection(self, action_selection, batch_si
 
         ranged_actions = []
         combo_actions = tf.constant(np.transpose(np.array(self.button_combo)))
-        action_selection = tf.cast(action_selection, tf.int32)
 
         # handle ranged actions
         multiplier = tf.constant([int(batch_size), 1])
@@ -153,12 +162,16 @@ def create_tensorflow_controller_from_selection(self, action_selection, batch_si
                 true_index = self.combo_name_index_map[control]
                 single_element = combo_actions[true_index]
                 controller_output.append(
-                    tf.gather_nd(single_element, tf.stack([indexer, action_selection[combo_index]], axis=1)))
+                    tf.gather_nd(single_element,
+                                 tf.stack([indexer, tf.cast(action_selection[combo_index], tf.int32)], axis=1)))
                 continue
-            ranged_action = ranged_actions[index]
             selection = action_selection[index]
-            output = tf.gather_nd(ranged_action, tf.stack([indexer, selection], axis=1))
-            controller_output.append(output)
+            if self.is_classification(index):
+                ranged_action = ranged_actions[index]
+                output = tf.gather_nd(ranged_action, tf.stack([indexer, tf.cast(selection, tf.int32)], axis=1))
+                controller_output.append(output)
+            else:
+                controller_output.append(selection)
 
         # make sure everything is the same type
         controller_output = [tf.cast(option, tf.float32) for option in controller_output]
@@ -193,8 +206,10 @@ def create_action_index(self, real_action):
                 bucketed_control = self.round_action(real_control, action_size)
                 combo_list[real_index] = bucketed_control
             else:
-                if indexes[action_index] is None:
+                if indexes[action_index] is None and self.is_classification(action_index):
                     indexes[action_index] = (self._find_closet_real_number(real_control))
+                elif indexes[action_index] is None:
+                    indexes[action_index] = real_control
 
         indexes[self.action_name_index_map[COMBO]] = self._create_combo_index(real_action, combo_list)
 
@@ -239,13 +254,48 @@ def create_action_indexes_graph(self, real_action, batch_size=None):
                     bucketed_control = self.round_action_graph(real_control, action_size)
                 combo_list[real_index] = bucketed_control
             else:
-                if indexes[action_index] is None:
+                if indexes[action_index] is None and self.is_classification(action_index):
                     indexes[action_index] = self._find_closet_real_number_graph(real_control)
+                elif indexes[action_index] is None:
+                    indexes[action_index] = tf.squeeze(real_control, axis=1)
+
         combo_action = self._create_combo_index_graph(combo_list, real_action)
-        if batch_size is not None and batch_size == 1:
-            indexes[self.action_name_index_map[COMBO]] = tf.reshape(combo_action, [1])
-        else:
-            indexes[self.action_name_index_map[COMBO]] = tf.squeeze(combo_action)
+        indexes[self.action_name_index_map[COMBO]] = tf.squeeze(combo_action, axis=1)
 
         result = tf.stack(indexes, axis=1)
         return result
+
+    def get_action_loss_from_logits(self, logits, labels, index):
+        """
+        :param logits: A tensorflow logit
+        :param labels: A label of what occurred
+        :param index: The index of the control in the actions list this maps to
+        :return: The loss for this particular action
+        """
+        if self.action_loss_type_map[index] == LOSS_SPARSE_CROSS:
+            return tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=tf.cast(labels, tf.int32), logits=logits, name=LOSS_SPARSE_CROSS)
+        if self.action_loss_type_map[index] == LOSS_SQUARE_MEAN:
+            return tf.losses.mean_squared_error(labels, tf.squeeze(logits), reduction=Reduction.NONE)
+        if self.action_loss_type_map[index] == LOSS_ABSOLUTE_DIFFERENCE:
+            return tf.losses.absolute_difference(labels, tf.squeeze(logits), reduction=Reduction.NONE)
+
+    def get_last_layer_activation_function(self, func, index):
+        if self.is_classification(index):
+            return func
+        return None
+
+    def scale_layer(self, layer, index):
+        """
+        Scales the layer if required
+        :param layer: the output layer of the model
+        :param index: The index regarding this specific action
+        :return: A scaled layer
+        """
+        if self.is_classification(index):
+            return layer
+        else:
+            return layer  # * 2.0 - 1.0
+
+    def get_loss_type(self, index):
+        return self.action_loss_type_map[index]
diff --git a/modelHelpers/actions/split_action_handler.py b/modelHelpers/actions/split_action_handler.py
@@ -248,19 +248,6 @@ def optionally_split_numpy_arrays(self, numpy_array, split_func, is_already_spli
 
         return result
 
-    def get_cross_entropy_with_logits(self, labels, logits, name):
-        """
-        In split mode there can be more than one class at a time.
-        This is so that
-        :param tf:
-        :param labels:
-        :param logits:
-        :param name:
-        :return:
-        """
-        return tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.cast(labels, tf.float32), logits=logits, name=name+'s')
-
     def create_action_indexes_graph(self, real_action, batch_size=None):
         #slice each index
         throttle = tf.slice(real_action, [0, 0], [-1, 1])

diff --git a/models/actor_critic/base_actor_critic.py b/models/actor_critic/base_actor_critic.py
@@ -58,24 +58,33 @@ def load_config_file(self):
         super().load_config_file()
         try:
             self.num_layers = self.config_file.getint(base_model.MODEL_CONFIGURATION_HEADER,
-                                             'num_layers')
+                                                      'num_layers')
         except:
             print('unable to load num_layers')
 
         try:
             self.network_size = self.config_file.getint(base_model.MODEL_CONFIGURATION_HEADER,
-                                                      'num_width')
+                                                        'num_width')
         except:
             print('unable to load the width of each layer')
 
 
         try:
             self.forced_frame_action = self.config_file.getint(base_model.MODEL_CONFIGURATION_HEADER,
-                                                         'exploration_factor')
+                                                               'exploration_factor')
         except:
             print('unable to load exploration_factor')
 
-    def smart_argmax(self, input_tensor):
+        try:
+            self.keep_prob = self.config_file.getfloat(base_model.MODEL_CONFIGURATION_HEADER,
+                                                     'keep_probability')
+        except:
+            print('unable to load keep_probability')
+
+    def smart_argmax(self, input_tensor, index):
+        if not self.action_handler.is_classification(index):
+            # input_tensor = tf.Print(input_tensor, [input_tensor], str(index))
+            return tf.squeeze(input_tensor, axis=1)
         argmax_index = tf.cast(tf.argmax(input_tensor, axis=1), tf.int32)
         indexer = tf.range(0, self.mini_batch_size)
         slicer_data = tf.stack([indexer, argmax_index], axis=1)
@@ -124,7 +133,8 @@ def _create_model(self, model_input):
                                                                      lambda input_tensor: tf.argmax(
                                                                          tf.nn.softmax(input_tensor), axis=1),
                                                                      return_as_list=True)
-        self.smart_max = self.action_handler.run_func_on_split_tensors(self.policy_outputs,
+        indexes = np.arange(0, self.action_handler.get_number_actions(), 1).tolist()
+        self.smart_max = self.action_handler.run_func_on_split_tensors([self.policy_outputs, indexes],
                                                                        self.smart_argmax,
                                                                        return_as_list=True)
         return self.predicted_actions, self.action_scores
@@ -142,7 +152,7 @@ def create_copy_training_model(self, model_input=None, taken_actions=None):
             batched_input, batched_taken_actions = self.iterator.get_next()
         else:
             batched_input = converted_input
-            batched_taken_actions = self.taken_actions
+            batched_taken_actions = actions_input
         with tf.name_scope("training_network"):
             self.discounted_rewards = tf.constant(0.0)
             with tf.variable_scope("actor_network", reuse=True):
@@ -214,7 +224,10 @@ def create_layer(self, activation_function, input, layer_number, input_size, out
                              initializer=tf.random_normal_initializer())
         b = tf.get_variable(bias_name, [output_size],
                              initializer=tf.random_normal_initializer())
-        layer_output = activation_function(tf.matmul(input, W) + b)
+        if activation_function is not None:
+            layer_output = activation_function(tf.matmul(input, W) + b)
+        else:
+            layer_output = tf.matmul(input, W) + b
         if variable_list is not None:
             variable_list.append(W)
             variable_list.append(b)
@@ -301,10 +314,15 @@ def create_last_layer(self, activation_function, inner_layer, network_size, num_
 
             self.actor_last_row_layer = []
             for i, item in enumerate(self.action_handler.get_action_sizes()):
-                with tf.variable_scope(str(self.action_handler.action_list_names[i])):
-                    self.actor_last_row_layer.append(self.create_layer(activation_function, inner_layer[i], last_layer_name,
+                variable_name = str(self.action_handler.action_list_names[i])
+                with tf.variable_scope(variable_name):
+                    fixed_activation = self.action_handler.get_last_layer_activation_function(activation_function, i)
+                    layer = self.create_layer(fixed_activation, inner_layer[i], last_layer_name,
                                                                        network_size, item, network_prefix,
-                                                                       variable_list=last_layer_list[i], dropout=False)[0])
+                                                                       variable_list=last_layer_list[i], dropout=False)[0]
+                    scaled_layer = self.action_handler.scale_layer(layer, i)
+                    self.actor_last_row_layer.append(scaled_layer)
+                    # tf.summary.histogram(variable_name + '_output', scaled_layer)
 
             return tf.concat(self.actor_last_row_layer, 1)