implement first version of vlm predicate classifier #284

Open · wants to merge 29 commits into base: master

Changes from all commits · 29 commits
0824ae7
try to visualize sokoban - may need a neater way
lf-zhao Apr 23, 2024
bf4000d
add assert for fast downward planner in running once func
lf-zhao Apr 23, 2024
19d9d53
try to use `rich` package for more structured console output!
lf-zhao Apr 23, 2024
feaa264
upload a naive way to store images
lf-zhao Apr 25, 2024
dd1fbf6
debug
lf-zhao Apr 25, 2024
32a06a3
upload - manual copy from Nishanth's VLM interface in LIS predicators…
lf-zhao Apr 25, 2024
ee3f634
add OpenAI vlm - in progress
lf-zhao Apr 25, 2024
a1a67fd
update config setting for using vlm
lf-zhao Apr 25, 2024
5d9f12e
add package
lf-zhao Apr 26, 2024
cf3dbf7
another missed one
lf-zhao Apr 26, 2024
3001f32
manually add Nishanth's new pretrained model interface for now, see L…
lf-zhao Apr 29, 2024
4d47211
add new OpenAI VLM class, add example to use
lf-zhao Apr 29, 2024
e45a0e9
add a flag for caching
lf-zhao Apr 29, 2024
8786dcd
now the example working - fix requesting vision messages, update test
lf-zhao Apr 29, 2024
71d0b3e
update; add choosing img detail quality
lf-zhao Apr 29, 2024
112b690
include the test image I used for now, not sure what I should do with…
lf-zhao Apr 29, 2024
f884df7
remove original vlm interface, already merged into latest pretrained …
lf-zhao Apr 29, 2024
8c17c42
Merge branch 'refs/heads/master' into lis-spot/implement-vlm-predicat…
lf-zhao Apr 30, 2024
aec70de
found a way to use VLM to evaluate; add current images and also visib…
lf-zhao Apr 30, 2024
94e6a4c
found a way to use VLM to evaluate; check if visible in current scene…
lf-zhao Apr 30, 2024
3ee2ba9
update State struct; adding to Spot specific subclass doesn't work, n…
lf-zhao Apr 30, 2024
1abf488
add detail option
lf-zhao May 1, 2024
1c82c44
working; implement On predicate with VLM classifier pipeline! add cal…
lf-zhao May 1, 2024
eeb1583
make a separate function for vlm predicate classifier evaluation
lf-zhao May 1, 2024
acbdb0a
add test
lf-zhao May 3, 2024
01498f6
update example, move to test, move img
lf-zhao May 3, 2024
0774354
remove
lf-zhao May 3, 2024
68ef57d
format
lf-zhao May 4, 2024
6560a94
update
lf-zhao May 4, 2024
1 change: 1 addition & 0 deletions predicators/args.py
@@ -43,6 +43,7 @@ def create_arg_parser(env_required: bool = True,
parser.add_argument("--experiment_id", default="", type=str)
parser.add_argument("--load_experiment_id", default="", type=str)
parser.add_argument("--log_file", default="", type=str)
parser.add_argument("--log_rich", default="true", type=str)
parser.add_argument("--use_gui", action="store_true")
parser.add_argument('--debug',
action="store_const",
200 changes: 162 additions & 38 deletions predicators/envs/spot_env.py
@@ -9,6 +9,7 @@
from typing import Callable, ClassVar, Collection, Dict, Iterator, List, \
Optional, Sequence, Set, Tuple

import PIL.Image
import matplotlib
import numpy as np
import pbrspot
@@ -21,6 +22,7 @@

from predicators import utils
from predicators.envs import BaseEnv
from predicators.pretrained_model_interface import OpenAIVLM
from predicators.settings import CFG
from predicators.spot_utils.perception.object_detection import \
AprilTagObjectDetectionID, KnownStaticObjectDetectionID, \
@@ -94,6 +96,8 @@ class _PartialPerceptionState(State):
in the classifier definitions for the dummy predicates
"""

# obs_images: Optional[Dict[str, RGBDImageWithContext]] = None

@property
def _simulator_state_predicates(self) -> Set[Predicate]:
assert isinstance(self.simulator_state, Dict)
@@ -121,7 +125,8 @@ def copy(self) -> State:
"atoms": self._simulator_state_atoms.copy()
}
return _PartialPerceptionState(state_copy,
simulator_state=sim_state_copy)
simulator_state=sim_state_copy,
camera_images=self.camera_images)


def _create_dummy_predicate_classifier(
@@ -298,7 +303,7 @@ def percept_predicates(self) -> Set[Predicate]:
def action_space(self) -> Box:
# The action space is effectively empty because only the extra info
# part of actions are used.
return Box(0, 1, (0, ))
return Box(0, 1, (0,))

@abc.abstractmethod
def _get_dry_task(self, train_or_test: str,
@@ -336,7 +341,7 @@ def _get_next_dry_observation(
nonpercept_atoms)

if action_name in [
"MoveToReachObject", "MoveToReadySweep", "MoveToBodyViewObject"
"MoveToReachObject", "MoveToReadySweep", "MoveToBodyViewObject"
]:
robot_rel_se2_pose = action_args[1]
return _dry_simulate_move_to_reach_obj(obs, robot_rel_se2_pose,
@@ -703,7 +708,7 @@ def _build_realworld_observation(
for swept_object in swept_objects:
if swept_object not in all_objects_in_view:
if container is not None and container in \
all_objects_in_view:
all_objects_in_view:
while True:
msg = (
f"\nATTENTION! The {swept_object.name} was not "
@@ -988,7 +993,7 @@ def _actively_construct_initial_object_views(
return obj_to_se3_pose

def _run_init_search_for_objects(
self, detection_ids: Set[ObjectDetectionID]
self, detection_ids: Set[ObjectDetectionID]
) -> Dict[ObjectDetectionID, math_helpers.SE3Pose]:
"""Have the hand look down from high up at first."""
assert self._robot is not None
@@ -1027,6 +1032,69 @@ def _generate_goal_description(self) -> GoalDescription:
"""For now, we assume that there's only one goal per environment."""


###############################################################################
# VLM Predicate Evaluation Related #
###############################################################################

# Initialize VLM
vlm = OpenAIVLM(model_name="gpt-4-turbo", detail="auto")

# Engineer the prompt for VLM
vlm_predicate_eval_prompt_prefix = """
Your goal is to answer questions related to object relationships in the
given image(s).
We will use the following predicate-style descriptions to ask questions:
Inside(object1, container)
Blocking(object1, object2)
On(object, surface)

Examples:
Does this predicate hold in the following image?
Inside(apple, bowl)
Answer (in a single word): Yes/No

Actual question:
Does this predicate hold in the following image?
{question}
Answer (in a single word):
"""

# Provide some visual examples when needed
vlm_predicate_eval_prompt_example = ""
# TODO: Next, try to include visual hints via segmentation ("Set of Masks")


def vlm_predicate_classify(question: str, state: State) -> bool:
"""Use VLM to evaluate (classify) a predicate in a given state."""
full_prompt = vlm_predicate_eval_prompt_prefix.format(question=question)
images_dict: Dict[str, RGBDImageWithContext] = state.camera_images
images = [
PIL.Image.fromarray(v.rotated_rgb) for _, v in images_dict.items()
]

logging.info(f"VLM predicate evaluation for: {question}")
logging.info(f"Prompt: {full_prompt}")

vlm_responses = vlm.sample_completions(
prompt=full_prompt,
imgs=images,
temperature=0.2,
seed=int(time.time()),
num_completions=1,
)
logging.info(f"VLM response 0: {vlm_responses[0]}")

vlm_response = vlm_responses[0].strip().lower()
if vlm_response == "yes":
return True
elif vlm_response == "no":
return False
else:
logging.error(
f"VLM response not understood: {vlm_response}. Treat as False.")
return False


###############################################################################
# Shared Types, Predicates, Operators #
###############################################################################
@@ -1090,8 +1158,8 @@ def _object_in_xy_classifier(state: State,

spot, = state.get_objects(_robot_type)
if obj1.is_instance(_movable_object_type) and \
_is_placeable_classifier(state, [obj1]) and \
_holding_classifier(state, [spot, obj1]):
_is_placeable_classifier(state, [obj1]) and \
_holding_classifier(state, [spot, obj1]):
return False

# Check that the center of the object is contained within the surface in
@@ -1108,17 +1176,34 @@
def _on_classifier(state: State, objects: Sequence[Object]) -> bool:
obj_on, obj_surface = objects

# Check that the bottom of the object is close to the top of the surface.
expect = state.get(obj_surface, "z") + state.get(obj_surface, "height") / 2
actual = state.get(obj_on, "z") - state.get(obj_on, "height") / 2
classification_val = abs(actual - expect) < _ONTOP_Z_THRESHOLD
currently_visible = all([o in state.visible_objects for o in objects])
# If the objects are not all currently visible and the VLM flag is set,
# then fall back to the predicate values from the previous time step.
if CFG.spot_vlm_eval_predicate and not currently_visible:
# TODO: add all previous atoms to the state
raise NotImplementedError

# Call VLM to evaluate predicate value
elif CFG.spot_vlm_eval_predicate and currently_visible:
predicate_str = f"""
On({obj_on}, {obj_surface})
(Whether {obj_on} is on {obj_surface} in the image?)
"""
return vlm_predicate_classify(predicate_str, state)

# If so, check that the object is within the bounds of the surface.
if not _object_in_xy_classifier(
state, obj_on, obj_surface, buffer=_ONTOP_SURFACE_BUFFER):
return False
else:
# Check that the bottom of the object is close to the top of the surface.
expect = state.get(obj_surface,
"z") + state.get(obj_surface, "height") / 2
actual = state.get(obj_on, "z") - state.get(obj_on, "height") / 2
classification_val = abs(actual - expect) < _ONTOP_Z_THRESHOLD

# If so, check that the object is within the bounds of the surface.
if not _object_in_xy_classifier(
state, obj_on, obj_surface, buffer=_ONTOP_SURFACE_BUFFER):
return False

return classification_val
return classification_val


def _top_above_classifier(state: State, objects: Sequence[Object]) -> bool:
@@ -1133,26 +1218,42 @@ def _top_above_classifier(state: State, objects: Sequence[Object]) -> bool:
def _inside_classifier(state: State, objects: Sequence[Object]) -> bool:
obj_in, obj_container = objects

if not _object_in_xy_classifier(
state, obj_in, obj_container, buffer=_INSIDE_SURFACE_BUFFER):
return False
currently_visible = all([o in state.visible_objects for o in objects])
# If the objects are not all currently visible and the VLM flag is set,
# then fall back to the predicate values from the previous time step.
if CFG.spot_vlm_eval_predicate and not currently_visible:
# TODO: add all previous atoms to the state
raise NotImplementedError

# Call VLM to evaluate predicate value
elif CFG.spot_vlm_eval_predicate and currently_visible:
predicate_str = f"""
Inside({obj_in}, {obj_container})
(Whether {obj_in} is inside {obj_container} in the image?)
"""
return vlm_predicate_classify(predicate_str, state)

obj_z = state.get(obj_in, "z")
obj_half_height = state.get(obj_in, "height") / 2
obj_bottom = obj_z - obj_half_height
obj_top = obj_z + obj_half_height
else:
if not _object_in_xy_classifier(
state, obj_in, obj_container, buffer=_INSIDE_SURFACE_BUFFER):
return False

container_z = state.get(obj_container, "z")
container_half_height = state.get(obj_container, "height") / 2
container_bottom = container_z - container_half_height
container_top = container_z + container_half_height
obj_z = state.get(obj_in, "z")
obj_half_height = state.get(obj_in, "height") / 2
obj_bottom = obj_z - obj_half_height
obj_top = obj_z + obj_half_height

# Check that the bottom is "above" the bottom of the container.
if obj_bottom < container_bottom - _INSIDE_Z_THRESHOLD:
return False
container_z = state.get(obj_container, "z")
container_half_height = state.get(obj_container, "height") / 2
container_bottom = container_z - container_half_height
container_top = container_z + container_half_height

# Check that the bottom is "above" the bottom of the container.
if obj_bottom < container_bottom - _INSIDE_Z_THRESHOLD:
return False

# Check that the top is "below" the top of the container.
return obj_top < container_top + _INSIDE_Z_THRESHOLD
# Check that the top is "below" the top of the container.
return obj_top < container_top + _INSIDE_Z_THRESHOLD


def _not_inside_any_container_classifier(state: State,
@@ -1201,8 +1302,8 @@ def in_general_view_classifier(state: State,
def _obj_reachable_from_spot_pose(spot_pose: math_helpers.SE3Pose,
obj_position: math_helpers.Vec3) -> bool:
is_xy_near = np.sqrt(
(spot_pose.x - obj_position.x)**2 +
(spot_pose.y - obj_position.y)**2) <= _REACHABLE_THRESHOLD
(spot_pose.x - obj_position.x) ** 2 +
(spot_pose.y - obj_position.y) ** 2) <= _REACHABLE_THRESHOLD

# Compute angle between spot's forward direction and the line from
# spot to the object.
@@ -1244,6 +1345,21 @@ def _blocking_classifier(state: State, objects: Sequence[Object]) -> bool:
if blocker_obj == blocked_obj:
return False

currently_visible = all([o in state.visible_objects for o in objects])
# If the objects are not all currently visible and the VLM flag is set,
# then fall back to the predicate values from the previous time step.
if CFG.spot_vlm_eval_predicate and not currently_visible:
# TODO: add all previous atoms to the state
raise NotImplementedError

# Call VLM to evaluate predicate value
elif CFG.spot_vlm_eval_predicate and currently_visible:
predicate_str = f"""
Blocking({blocker_obj}, {blocked_obj})
(Whether {blocker_obj} is blocking {blocked_obj} for further manipulation in the image?)
"""
return vlm_predicate_classify(predicate_str, state)

# Only consider draggable (non-placeable, movable) objects to be blockers.
if not blocker_obj.is_instance(_movable_object_type):
return False
Expand All @@ -1258,7 +1374,7 @@ def _blocking_classifier(state: State, objects: Sequence[Object]) -> bool:

spot, = state.get_objects(_robot_type)
if blocked_obj.is_instance(_movable_object_type) and \
_holding_classifier(state, [spot, blocked_obj]):
_holding_classifier(state, [spot, blocked_obj]):
return False

# Draw a line between blocked and the robot’s current pose.
@@ -1328,8 +1444,8 @@ def _container_adjacent_to_surface_for_sweeping(container: Object,
container_x = state.get(container, "x")
container_y = state.get(container, "y")

dist = np.sqrt((expected_x - container_x)**2 +
(expected_y - container_y)**2)
dist = np.sqrt((expected_x - container_x) ** 2 +
(expected_y - container_y) ** 2)

return dist <= _CONTAINER_SWEEP_READY_BUFFER

@@ -1451,6 +1567,14 @@ def _get_sweeping_surface_for_container(container: Object,
_IsSemanticallyGreaterThan
}
_NONPERCEPT_PREDICATES: Set[Predicate] = set()
# NOTE: We maintain a set of predicates that we evaluate via the VLM classifier.
# NOTE: In the future, we may include an attribute to denote whether a predicate
# is VLM perceptible or not.
# NOTE: candidates: on, inside, door opened, blocking, not blocked, ...
_VLM_EVAL_PREDICATES: Set[Predicate] = {
_On,
_Inside,
}


## Operators (needed in the environment for non-percept atom hack)
@@ -2271,7 +2395,7 @@ def _dry_simulate_sweep_into_container(
x = container_pose.x + dx
y = container_pose.y + dy
z = container_pose.z
dist_to_container = (dx**2 + dy**2)**0.5
dist_to_container = (dx ** 2 + dy ** 2) ** 0.5
assert dist_to_container > (container_radius +
_INSIDE_SURFACE_BUFFER)

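For a sense of how the new helper is meant to be exercised, here is a minimal, illustrative sketch (not part of this diff) that stubs the OpenAI call so `vlm_predicate_classify` can run offline. `_FakeRGBD` and `_FakeState` are hypothetical stand-ins exposing only the fields the helper actually reads (`camera_images` and `rotated_rgb`), and importing `spot_env` assumes the Spot dependencies are installed:

from unittest import mock

import numpy as np

from predicators.envs import spot_env


class _FakeRGBD:
    """Stand-in for RGBDImageWithContext; only `rotated_rgb` is read."""

    def __init__(self) -> None:
        self.rotated_rgb = np.zeros((480, 640, 3), dtype=np.uint8)


class _FakeState:
    """Duck-typed state exposing only `camera_images`."""
    camera_images = {"hand_color": _FakeRGBD()}


# Patch the module-level VLM so no API key or network call is needed.
with mock.patch.object(spot_env.vlm, "sample_completions",
                       return_value=["Yes"]):
    assert spot_env.vlm_predicate_classify("On(cup, table)", _FakeState())
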
10 changes: 8 additions & 2 deletions predicators/main.py
@@ -71,8 +71,14 @@ def main() -> None:
args = utils.parse_args()
utils.update_config(args)
str_args = " ".join(sys.argv)
# Log to stderr.
handlers: List[logging.Handler] = [logging.StreamHandler()]
# Log to stderr or use `rich` package for more structured output.
handlers: List[logging.Handler] = []
if CFG.log_rich.lower() == "true":  # --log_rich is parsed as a string
from rich.logging import RichHandler
handlers.append(RichHandler())
else:
handlers.append(logging.StreamHandler())

if CFG.log_file:
handlers.append(logging.FileHandler(CFG.log_file, mode='w'))
logging.basicConfig(level=CFG.loglevel,
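As a standalone reference for the logging switch above, a minimal sketch follows; `use_rich` is a placeholder for the parsed `--log_rich` flag, and `RichHandler` comes from the third-party `rich` package:

import logging

use_rich = True  # placeholder for the parsed --log_rich flag

if use_rich:
    from rich.logging import RichHandler
    handler: logging.Handler = RichHandler()
else:
    handler = logging.StreamHandler()

# RichHandler does its own formatting, so a bare message format is enough.
logging.basicConfig(level=logging.INFO, format="%(message)s",
                    handlers=[handler])
logging.info("structured console output via rich")
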
14 changes: 13 additions & 1 deletion predicators/perception/spot_perceiver.py
@@ -200,6 +200,12 @@ def _update_state_from_observation(self, observation: Observation) -> None:
for obj in observation.objects_in_view:
self._lost_objects.discard(obj)

# NOTE: This is only used when the VLM is used for predicate evaluation.
# NOTE: The performance cost of storing images every step should be revisited.
if CFG.spot_vlm_eval_predicate:
# Add current Spot images to the state if needed
self._camera_images = observation.images

def _create_state(self) -> State:
if self._waiting_for_observation:
return DefaultState
@@ -281,9 +287,15 @@ def _create_state(self) -> State:
# logging.info("Simulator state:")
# logging.info(simulator_state)

# Prepare the current images from observation
camera_images = self._camera_images if CFG.spot_vlm_eval_predicate else None

# Now finish the state.
state = _PartialPerceptionState(percept_state.data,
simulator_state=simulator_state)
simulator_state=simulator_state,
camera_images=camera_images,
visible_objects=self._objects_in_view)
# TODO: investigate the dataclass field-init warning raised for the new fields

return state

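Regarding the dataclass field-init warning noted above, one pattern for adding optional perception fields to a state dataclass without breaking existing constructor calls is sketched below; the class name and field types are hypothetical stand-ins, not the project's actual `State` definition:

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass(frozen=True)
class _StateWithImages:
    """Hypothetical sketch: new fields get safe defaults so old call sites keep working."""
    data: Dict[str, Any]
    simulator_state: Optional[Any] = None
    camera_images: Optional[Dict[str, Any]] = None  # camera name -> RGBD image
    visible_objects: List[str] = field(default_factory=list)
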
3 changes: 3 additions & 0 deletions predicators/planning.py
@@ -1215,6 +1215,9 @@ def run_task_plan_once(
raise PlanningFailure(
"Skeleton produced by A-star exceeds horizon!")
elif "fd" in CFG.sesame_task_planner: # pragma: no cover
# Run Fast Downward. See the setup instructions in the docstring of
# `_sesame_plan_with_fast_downward`.
assert "FD_EXEC_PATH" in os.environ, \
"Please follow the setup instructions in the docstring of " \
"`_sesame_plan_with_fast_downward`!"
fd_exec_path = os.environ["FD_EXEC_PATH"]
exec_str = os.path.join(fd_exec_path, "fast-downward.py")
timeout_cmd = "gtimeout" if sys.platform == "darwin" \
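The new assertion expects Fast Downward to be reachable via the `FD_EXEC_PATH` environment variable; a minimal sketch of satisfying it before planning (the path below is a placeholder, not a documented default):

import os
import sys

# Point FD_EXEC_PATH at a local Fast Downward checkout (placeholder path).
os.environ.setdefault("FD_EXEC_PATH", "/opt/downward")
exec_str = os.path.join(os.environ["FD_EXEC_PATH"], "fast-downward.py")
# macOS ships `gtimeout` (from coreutils) rather than `timeout`.
timeout_cmd = "gtimeout" if sys.platform == "darwin" else "timeout"
print(timeout_cmd, exec_str)
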