From 79ab8528a870a48088677c05de1d1617080bd50b Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 26 Oct 2023 01:27:03 -0400 Subject: [PATCH 01/46] Revise emotion detector node to be single topic subscriber --- .../base_emotion_detector.py | 25 ++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py index 255000739..5d5d2a07a 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py @@ -8,8 +8,7 @@ from angel_msgs.msg import InterpretedAudioUserEmotion, InterpretedAudioUserIntent from angel_utils import declare_and_get_parameters -IN_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" -IN_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" +IN_USER_INTENT_TOPIC = "user_intent_topic" OUT_INTERP_USER_EMOTION_TOPIC = "user_emotion_topic" # Currently supported emotions. This is tied with the emotions @@ -25,8 +24,8 @@ class BaseEmotionDetector(Node): """ - As of Q22023, emotion detection is derived via VaderSentiment - (https://github.com/cjhutto/vaderSentiment). + This is the base emotion detection node that other emotion detection nodes + should inherit from. """ def __init__(self): @@ -37,26 +36,18 @@ def __init__(self): param_values = declare_and_get_parameters( self, [ - (IN_EXPECT_USER_INTENT_TOPIC,), - (IN_INTERP_USER_INTENT_TOPIC,), + (IN_USER_INTENT_TOPIC,), (OUT_INTERP_USER_EMOTION_TOPIC,), ], ) - self._in_expect_uintent_topic = param_values[IN_EXPECT_USER_INTENT_TOPIC] - self._in_interp_uintent_topic = param_values[IN_INTERP_USER_INTENT_TOPIC] + self._in_uintent_topic = param_values[IN_USER_INTENT_TOPIC] self._out_interp_uemotion_topic = param_values[OUT_INTERP_USER_EMOTION_TOPIC] # Handle subscription/publication topics. 
- self.expect_uintent_subscription = self.create_subscription( + self.uintent_subscription = self.create_subscription( InterpretedAudioUserIntent, - self._in_expect_uintent_topic, - self.intent_detection_callback, - 1, - ) - self.interp_uintent_subscription = self.create_subscription( - InterpretedAudioUserIntent, - self._in_interp_uintent_topic, + self._in_uintent_topic, self.intent_detection_callback, 1, ) @@ -116,7 +107,7 @@ def process_message_queue(self): Constant loop to process received messages. """ while True: - msg = self.message_queue.get() + msg = self.message_queue.get(block=True, timeout=None) self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') classification, confidence_score = self.get_inference(msg) self.publish_detected_emotion( From b50f8446956733e1f8bc413fbf4510f04a612227 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 26 Oct 2023 02:10:26 -0400 Subject: [PATCH 02/46] Add Visual Question Answerer Node --- .../visual_question_answerer.py | 348 ++++++++++++++++++ .../configs/llm_prompts/vis_qa_prompt | 10 + ros/angel_system_nodes/setup.py | 1 + ...al_visual_vocalized_question_answering.yml | 184 +++++++++ 4 files changed, 543 insertions(+) create mode 100644 ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py create mode 100644 ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt create mode 100644 tmux/eval_visual_vocalized_question_answering.yml diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py new file mode 100644 index 000000000..79d208b06 --- /dev/null +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -0,0 +1,348 @@ +import langchain +from langchain.chains import LLMChain +import json +from langchain.chat_models import ChatOpenAI +import openai +from operator import itemgetter +import os +import queue +import rclpy +from rclpy.node import Node +import requests +from 
termcolor import colored +import threading + +from angel_msgs.msg import ( + ActivityDetection, + InterpretedAudioUserEmotion, + ObjectDetection2dSet, + SystemTextResponse, +) +from angel_utils import declare_and_get_parameters + +openai.organization = os.getenv("OPENAI_ORG_ID") +openai.api_key = os.getenv("OPENAI_API_KEY") + +# Below is/are the subscribed topic(s). +IN_EMOTION_TOPIC = "user_emotion_topic" +IN_OBJECT_DETECTION_TOPIC = "object_detections_topic" +IN_ACT_CLFN_TOPIC = "action_classifications_topic" + +# Below is/are the published topic(s). +OUT_QA_TOPIC = "system_text_response_topic" + +# Below are the corresponding model thresholds. +OBJECT_DETECTION_THRESHOLD = "object_detections_threshold" +ACT_CLFN_THRESHOLD = "action_classification_threshold" + +# Below is the recipe paths for the intended task. +RECIPE_PATH = "recipe_path" +# Below is the recipe paths for the prompt template. +PROMPT_TEMPLATE_PATH = "prompt_template_path" + +DEBUG_MODE = "debug_mode" + +# Below is the complete set of prompt instructions. +PROMPT_INSTRUCTIONS = """ +You are given a User Scenario. All the objects in front of and observable to the user are included. +Your task is to use the Action Steps to answer the user's Question. + +Action Steps: {recipe} + +User Scenario: +The User feels {emotion} while doing {action}. The User can see {observables}. 
+ +User Question: {question} +Answer: """ + + +class VisualQuestionAnswerer(Node): + class TimestampedEntity: + def __init__(self, time, entity: str): + self.time = time + self.entity = entity + + def __init__(self): + super().__init__(self.__class__.__name__) + self.log = self.get_logger() + + param_values = declare_and_get_parameters( + self, + [ + (RECIPE_PATH,), + (PROMPT_TEMPLATE_PATH,), + (IN_EMOTION_TOPIC,), + (IN_OBJECT_DETECTION_TOPIC, ""), + (IN_ACT_CLFN_TOPIC, ""), + (OBJECT_DETECTION_THRESHOLD, 0.8), + (ACT_CLFN_THRESHOLD, 0.8), + (OUT_QA_TOPIC,), + (DEBUG_MODE, False), + ], + ) + self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] + self._in_objects_topic = param_values[IN_OBJECT_DETECTION_TOPIC] + self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] + self._out_qa_topic = param_values[OUT_QA_TOPIC] + if param_values[DEBUG_MODE]: + langchain.debug = True + + self._recipe_path = param_values[RECIPE_PATH] + self.recipe = self._configure_recipe(self._recipe_path) + self.log.info(f"Configured recipe to be: ~~~~~~~~~~\n{self.recipe}\n~~~~~~~~~~") + self._prompt_template_path = param_values[PROMPT_TEMPLATE_PATH] + with open(self._prompt_template_path, "r") as file: + self.prompt_template = file.read() + self.log.info( + f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" + ) + + self.object_dtctn_threshold = param_values[OBJECT_DETECTION_THRESHOLD] + self.action_clfn_threshold = param_values[ACT_CLFN_THRESHOLD] + + self.question_queue = queue.Queue() + self.action_classification_queue = queue.Queue() + self.detected_objects_queue = queue.Queue() + self.handler_thread = threading.Thread(target=self.process_question_queue) + self.handler_thread.start() + + # Configure the (necessary) emotional detection enriched utterance subscription. 
+ self.emotion_subscription = self.create_subscription( + InterpretedAudioUserEmotion, + self._in_emotion_topic, + self.question_answer_callback, + 1, + ) + # Configure the optional object detection subscription. + self.objects_subscription = None + if self._in_emotion_topic: + self.objects_subscription = self.create_subscription( + ObjectDetection2dSet, + self._in_objects_topic, + self._add_detected_objects, + 1, + ) + # Configure the optional action classification subscription. + self.action_subscription = None + if self.action_subscription: + self.action_subscription = self.create_subscription( + ActivityDetection, + self._in_actions_topic, + self._add_action_classification, + 1, + ) + # Configure the sole QA output of this node. + self._qa_publisher = self.create_publisher( + SystemTextResponse, self._out_qa_topic, 1 + ) + + # Configure OpenAI API. + self.openai_api_key = self._configure_openai_api_key() + self.openai_org_id = self._configure_openai_org_id() + + # Configure LangChain. + self.chain = self._configure_langchain() + + def _configure_openai_org_id(self): + if not os.getenv("OPENAI_ORG_ID"): + raise ValueError( + "OPENAI_ORG_ID environment variable is unset. " + + f"You should at least set it to garbage output." + ) + return os.getenv("OPENAI_ORG_ID") + + def _configure_openai_api_key(self): + if not os.getenv("OPENAI_API_KEY"): + raise ValueError( + "OPENAI_API_KEY environment variable is unset. " + + f"You should at least set it to garbage output." + ) + return os.getenv("OPENAI_API_KEY") + + def _configure_recipe(self, recipe_path: str): + """ + Reads a recipe from a JSON file. The top-level keys in this file should correspond + to each of the steps for a determined task. The next level should contain an "index" + field to indicate the step number. + """ + f = open(recipe_path) + data = json.load(f) + steps = [None] * len(data.keys()) + for step in data.keys(): + idx = data[step]["index"] + steps[idx] = f"{idx + 1}. 
{step}" + return "\n".join(steps) + + def _configure_langchain(self): + """ + Handles OpenAI API prompting via LangChain. + """ + openai_llm = ChatOpenAI( + model_name="gpt-3.5-turbo", + openai_api_key=self.openai_api_key, + temperature=0.0, + max_tokens=64, + ) + zero_shot_prompt = langchain.PromptTemplate( + input_variables=["recipe", "emotion", "action", "observables", "question"], + template=self.prompt_template, + ) + return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) + + def _get_sec(self, msg) -> int: + return msg.header.stamp.sec + + def _add_action_classification(self, msg: ActivityDetection) -> str: + """ + Stores the action label with the highest confidence in + self.action_classification_queue. + """ + action_classification = max( + zip(msg.label_vec, msg.conf_vec), key=itemgetter(1) + )[0] + te = VisualQuestionAnswerer.TimestampedEntity( + self._get_sec(msg), action_classification + ) + self.action_classification_queue.put(te) + + def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: + """ + Stores all detected objects with a confidence score above IN_OBJECT_DETECTION_THRESHOLD. + """ + detected_objects = set() + for obj, score in zip(msg.label_vec, msg.label_confidences): + if score < self.object_dtctn_threshold: + # Optional threshold filtering + continue + detected_objects.add(obj) + if detected_objects: + te = VisualQuestionAnswerer.TimestampedEntity( + self._get_sec(msg), detected_objects + ) + self.detected_objects_queue.put(te) + + def _get_action_before(self, curr_time: int) -> str: + """ + Returns the latest action classification in self.action_classification_queue + that does not occur before a provided time. 
+ """ + latest_action = "nothing" + while not self.action_classification_queue.empty(): + next = self.action_classification_queue.queue[0] + if next.time < curr_time: + latest_action = next.entity + self.action_classification_queue.get() + else: + break + return latest_action + + def _get_observables_before(self, curr_time: int) -> str: + """ + Returns a comma-delimited list of observed objects per all + entities in self.detected_objects_queue that occurred before a provided time. + """ + observables = set() + while not self.detected_objects_queue.empty(): + next = self.detected_objects_queue.queue[0] + if next.time < curr_time: + observables.update(next.entity) + self.detected_objects_queue.get() + else: + break + if not observables: + return "nothing" + return ", ".join(list(observables)) + + def get_response( + self, msg: InterpretedAudioUserEmotion, action: str, observables: str + ): + """ + Generate a response to the utterance, enriched with the addition of + the user's detected emotion. Inference calls can be added and revised + here. + """ + return_msg = None + try: + self.log.info(f"User emotion: {msg.user_emotion}") + response = self.chain.run( + recipe=self.recipe, + action=action, + observables=observables, + emotion=msg.user_emotion, + question=msg.utterance_text, + ) + return_msg = colored(f"{response}\n", "light_green") + except RuntimeError as err: + self.log.info(err) + colored_apology = colored( + "I'm sorry. I don't know how to answer your statement.", "light_red" + ) + colored_emotion = colored(msg.user_emotion, "light_red") + return_msg = ( + f"{colored_apology} I understand that you feel {colored_emotion}." + ) + return return_msg + + def question_answer_callback(self, msg): + """ + This is the main ROS node listener callback loop that will process + all messages received via subscribed topics. 
+ """ + self.log.debug(f"Received message:\n\n{msg.utterance_text}") + if not self._apply_filter(msg): + return + self.question_queue.put(msg) + + def process_question_queue(self): + """ + Constant loop to process received questions. + """ + while True: + question_msg = self.question_queue.get() + start_time = self._get_sec(question_msg) + + # Get most recently detected action. + action = self._get_action_before(start_time) + self.log.info(f"Latest action: {action}") + + # Get detected objects. + observables = self._get_observables_before(start_time) + self.log.info(f"Observed objects: {observables}") + + # Generate response. + response = self.get_response(question_msg, action, observables) + self.publish_generated_response(question_msg.utterance_text, response) + + def publish_generated_response(self, utterance: str, response: str): + msg = SystemTextResponse() + msg.header.frame_id = "GPT Question Answering" + msg.header.stamp = self.get_clock().now().to_msg() + msg.utterance_text = utterance + msg.response = response + colored_utterance = colored(utterance, "light_blue") + colored_response = colored(response, "light_green") + self.log.info( + f'Responding to utterance:\n>>> "{colored_utterance}"\n>>> with:\n' + + f'>>> "{colored_response}"' + ) + self._qa_publisher.publish(msg) + + def _apply_filter(self, msg): + """ + Abstracts away any filtering to apply on received messages. Return + none if the message should be filtered out. Else, return the incoming + msg if it can be included. 
+ """ + return msg + + +def main(): + rclpy.init() + question_answerer = VisualQuestionAnswerer() + rclpy.spin(question_answerer) + question_answerer.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt new file mode 100644 index 000000000..281cd25a8 --- /dev/null +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt @@ -0,0 +1,10 @@ +You are given a User Scenario. All the objects in front of and observable to the user are included. +Your task is to use the Action Steps to answer the user's Question. + +Action Steps: {recipe} + +User Scenario: +The User feels {emotion}. The User is doing {action}. The User can see {observables}. + +User Question: {question} +Answer: \ No newline at end of file diff --git a/ros/angel_system_nodes/setup.py b/ros/angel_system_nodes/setup.py index 6ecbec3d9..c75bc6664 100644 --- a/ros/angel_system_nodes/setup.py +++ b/ros/angel_system_nodes/setup.py @@ -25,6 +25,7 @@ "base_emotion_detector = angel_system_nodes.base_emotion_detector:main", "gpt_emotion_detector = angel_system_nodes.gpt_emotion_detector:main", "question_answerer = angel_system_nodes.question_answerer:main", + "visual_question_answerer = angel_system_nodes.visual_question_answerer:main", "intent_detector = angel_system_nodes.intent_detector:main", "spatial_mapper = angel_system_nodes.spatial_mapper:main", "feedback_generator = angel_system_nodes.feedback_generator:main", diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml new file mode 100644 index 000000000..613a60010 --- /dev/null +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -0,0 +1,184 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. 
not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.1.101} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. 
+# attach: false + +windows: +# - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ROS_IP:=0.0.0.0 +# - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ip_addr:=${HL2_IP} +# -p image_topic:=PVFramesBGR +# -p image_ts_topic:=disable +# -p hand_pose_topic:=disable +# -p audio_topic:=HeadsetAudioData +# -p head_pose_topic:=HeadsetPoseData +# -p sm_topic:=disable +# -p rm_depth_AHAT:=disable +# -p pv_width:=760 +# -p pv_height:=428 +# -p pv_framerate:=30 +# -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + - ros_bag_play: sleep 5; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesRGB + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p 
det_conf_threshold:=0.1 + # -p cuda_device_id:=0 + + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=3 + -p vad_margin:=0.20 + -p max_accumulation_length:=10 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=1 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - intent_detection: + layout: even-vertical + panes: + - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterances_topic:=utterances_topic + -p expect_user_intent_topic:=expect_user_intent_topic + -p interp_user_intent_topic:=interp_user_intent_topic + - emotion_detection: + layout: even-vertical + 
panes: + - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p user_intent_topic:=interp_user_intent_topic + -p user_emotion_topic:=gpt_emotion_topic + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p user_emotion_topic:=gpt_emotion_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_prompt From 595713acecfb697ea63158c44cd248df991a9044 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 26 Oct 2023 15:31:33 -0400 Subject: [PATCH 03/46] Add new visual question answering prompt (teacher mode) --- .../visual_question_answerer.py | 27 +++++++++++++++---- .../configs/llm_prompts/vis_qa_prompt | 10 ------- .../configs/llm_prompts/vis_qa_teacher_prompt | 11 ++++++++ ...al_visual_vocalized_question_answering.yml | 21 ++++++++------- 4 files changed, 44 insertions(+), 25 deletions(-) delete mode 100644 ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt create mode 100644 ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 79d208b06..0d1da5c8d 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -17,6 +17,7 @@ InterpretedAudioUserEmotion, ObjectDetection2dSet, SystemTextResponse, + TaskUpdate ) from angel_utils import declare_and_get_parameters @@ -27,6 +28,7 @@ IN_EMOTION_TOPIC = "user_emotion_topic" IN_OBJECT_DETECTION_TOPIC = "object_detections_topic" 
IN_ACT_CLFN_TOPIC = "action_classifications_topic" +IN_TASK_UPDATE_TOPIC = "task_update_topic" # Below is/are the published topic(s). OUT_QA_TOPIC = "system_text_response_topic" @@ -72,6 +74,7 @@ def __init__(self): (RECIPE_PATH,), (PROMPT_TEMPLATE_PATH,), (IN_EMOTION_TOPIC,), + (IN_TASK_UPDATE_TOPIC, ""), (IN_OBJECT_DETECTION_TOPIC, ""), (IN_ACT_CLFN_TOPIC, ""), (OBJECT_DETECTION_THRESHOLD, 0.8), @@ -81,6 +84,7 @@ def __init__(self): ], ) self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] + self._in_task_updates_topic = param_values[IN_TASK_UPDATE_TOPIC] self._in_objects_topic = param_values[IN_OBJECT_DETECTION_TOPIC] self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] @@ -101,6 +105,7 @@ def __init__(self): self.action_clfn_threshold = param_values[ACT_CLFN_THRESHOLD] self.question_queue = queue.Queue() + self.step = "Unstarted" self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() self.handler_thread = threading.Thread(target=self.process_question_queue) @@ -113,6 +118,15 @@ def __init__(self): self.question_answer_callback, 1, ) + # Configure the optional task updates subscription. + self.objects_subscription = None + if self._in_emotion_topic: + self.objects_subscription = self.create_subscription( + TaskUpdate, + self._in_task_updates_topic, + self._set_current_step, + 1, + ) # Configure the optional object detection subscription. 
self.objects_subscription = None if self._in_emotion_topic: @@ -184,7 +198,7 @@ def _configure_langchain(self): max_tokens=64, ) zero_shot_prompt = langchain.PromptTemplate( - input_variables=["recipe", "emotion", "action", "observables", "question"], + input_variables=["recipe", "current_step", "emotion", "action", "observables", "question"], template=self.prompt_template, ) return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) @@ -192,6 +206,9 @@ def _configure_langchain(self): def _get_sec(self, msg) -> int: return msg.header.stamp.sec + def _set_current_step(self, msg: TaskUpdate): + self.step = msg.current_step + def _add_action_classification(self, msg: ActivityDetection) -> str: """ Stores the action label with the highest confidence in @@ -253,9 +270,8 @@ def _get_observables_before(self, curr_time: int) -> str: return "nothing" return ", ".join(list(observables)) - def get_response( - self, msg: InterpretedAudioUserEmotion, action: str, observables: str - ): + def get_response(self, msg: InterpretedAudioUserEmotion, + current_step: str, action: str, observables: str): """ Generate a response to the utterance, enriched with the addition of the user's detected emotion. Inference calls can be added and revised @@ -266,6 +282,7 @@ def get_response( self.log.info(f"User emotion: {msg.user_emotion}") response = self.chain.run( recipe=self.recipe, + current_step=current_step, action=action, observables=observables, emotion=msg.user_emotion, @@ -310,7 +327,7 @@ def process_question_queue(self): self.log.info(f"Observed objects: {observables}") # Generate response. 
- response = self.get_response(question_msg, action, observables) + response = self.get_response(question_msg, self.current_step, action, observables) self.publish_generated_response(question_msg.utterance_text, response) def publish_generated_response(self, utterance: str, response: str): diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt deleted file mode 100644 index 281cd25a8..000000000 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_prompt +++ /dev/null @@ -1,10 +0,0 @@ -You are given a User Scenario. All the objects in front of and observable to the user are included. -Your task is to use the Action Steps to answer the user's Question. - -Action Steps: {recipe} - -User Scenario: -The User feels {emotion}. The User is doing {action}. The User can see {observables}. - -User Question: {question} -Answer: \ No newline at end of file diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt new file mode 100644 index 000000000..ccb4d78e6 --- /dev/null +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -0,0 +1,11 @@ +You are a teacher helping me learn how to complete a Task. I will tell you how I am feeling (positive, negative, neutral), all the objects that I can see, and what I am currently doing. I will ask you a question and you will respond with an answer. + +Task Steps: +{recipe} + +My Current Step: {current_step} +My Emotion: {emotion}. +My Current Action: {action}. +In front of me are the following objects: {observables}. 
+My Question: {question} +Your Answer: \ No newline at end of file diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 613a60010..857b10e4b 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -104,17 +104,17 @@ windows: - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p image_topic:=PVFramesBGR - -p det_topic:=ObjectDetections2d + -p det_topic:=BerkeleyObjectDetections2d -p det_conf_threshold:=0.1 -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args - # -r __ns:=${ROS_NAMESPACE} - # -p image_topic:=PVFramesRGB - # -p det_topic:=ObjectDetections2d - # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt - # -p inference_img_size:=1280 - # -p det_conf_threshold:=0.1 - # -p cuda_device_id:=0 + - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p det_topic:=ObjectDetections2d + -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + -p inference_img_size:=1280 + -p det_conf_threshold:=0.1 + -p cuda_device_id:=1 - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args -r __ns:=${ROS_NAMESPACE} @@ -181,4 +181,5 @@ windows: -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json - -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_prompt + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p debug_mode:=True \ No newline at end of file From ed3095880f0aa62af32eb8cea93af57619af40c4 Mon Sep 17 00:00:00 2001 From: Derek 
Ahmed Date: Thu, 26 Oct 2023 17:25:03 -0400 Subject: [PATCH 04/46] Add dialogue history to visual question answering --- .../visual_question_answerer.py | 79 +++++++++++++------ .../configs/llm_prompts/vis_qa_teacher_prompt | 10 ++- ...al_visual_vocalized_question_answering.yml | 11 ++- 3 files changed, 74 insertions(+), 26 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 0d1da5c8d..8b6049637 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -1,3 +1,4 @@ +import itertools import langchain from langchain.chains import LLMChain import json @@ -8,9 +9,9 @@ import queue import rclpy from rclpy.node import Node -import requests from termcolor import colored import threading +from typing import * from angel_msgs.msg import ( ActivityDetection, @@ -28,7 +29,7 @@ IN_EMOTION_TOPIC = "user_emotion_topic" IN_OBJECT_DETECTION_TOPIC = "object_detections_topic" IN_ACT_CLFN_TOPIC = "action_classifications_topic" -IN_TASK_UPDATE_TOPIC = "task_update_topic" +IN_TASK_STATE_TOPIC = "task_state_topic" # Below is/are the published topic(s). OUT_QA_TOPIC = "system_text_response_topic" @@ -42,6 +43,7 @@ # Below is the recipe paths for the prompt template. PROMPT_TEMPLATE_PATH = "prompt_template_path" +CONTEXT_HISTORY_LENGTH = "context_history_length" DEBUG_MODE = "debug_mode" # Below is the complete set of prompt instructions. 
@@ -74,22 +76,26 @@ def __init__(self): (RECIPE_PATH,), (PROMPT_TEMPLATE_PATH,), (IN_EMOTION_TOPIC,), - (IN_TASK_UPDATE_TOPIC, ""), + (IN_TASK_STATE_TOPIC, ""), (IN_OBJECT_DETECTION_TOPIC, ""), (IN_ACT_CLFN_TOPIC, ""), (OBJECT_DETECTION_THRESHOLD, 0.8), (ACT_CLFN_THRESHOLD, 0.8), (OUT_QA_TOPIC,), + (CONTEXT_HISTORY_LENGTH, 3), (DEBUG_MODE, False), ], ) self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] - self._in_task_updates_topic = param_values[IN_TASK_UPDATE_TOPIC] + self._in_task_state_topic = param_values[IN_TASK_STATE_TOPIC] self._in_objects_topic = param_values[IN_OBJECT_DETECTION_TOPIC] self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] + self.dialogue_history_length = param_values[CONTEXT_HISTORY_LENGTH] + self.debug_mode = False if param_values[DEBUG_MODE]: - langchain.debug = True + # langchain.debug = True + self.debug_mode = True self._recipe_path = param_values[RECIPE_PATH] self.recipe = self._configure_recipe(self._recipe_path) @@ -105,7 +111,7 @@ def __init__(self): self.action_clfn_threshold = param_values[ACT_CLFN_THRESHOLD] self.question_queue = queue.Queue() - self.step = "Unstarted" + self.current_step = "Unstarted" self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() self.handler_thread = threading.Thread(target=self.process_question_queue) @@ -119,11 +125,11 @@ def __init__(self): 1, ) # Configure the optional task updates subscription. - self.objects_subscription = None - if self._in_emotion_topic: - self.objects_subscription = self.create_subscription( + self.task_state_subscription = None + if self._in_task_state_topic: + self.task_state_subscription = self.create_subscription( TaskUpdate, - self._in_task_updates_topic, + self._in_task_state_topic, self._set_current_step, 1, ) @@ -157,6 +163,8 @@ def __init__(self): # Configure LangChain. 
self.chain = self._configure_langchain() + self.dialogue_history = [] + def _configure_openai_org_id(self): if not os.getenv("OPENAI_ORG_ID"): raise ValueError( @@ -197,8 +205,11 @@ def _configure_langchain(self): temperature=0.0, max_tokens=64, ) + # TODO (derekahmed) Figure out how to include optional dialogue history zero_shot_prompt = langchain.PromptTemplate( - input_variables=["recipe", "current_step", "emotion", "action", "observables", "question"], + input_variables=["recipe", "chat_history", + "current_step", "emotion", "action", "observables", + "question"], template=self.prompt_template, ) return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) @@ -207,7 +218,15 @@ def _get_sec(self, msg) -> int: return msg.header.stamp.sec def _set_current_step(self, msg: TaskUpdate): - self.step = msg.current_step + self.current_step = msg.current_step + + def _get_current_step(self): + return self.current_step + + def _get_dialogue_history(self): + last_n = min(len(self.dialogue_history), self.dialogue_history_length) + last_n_turns = self.dialogue_history[-1 * last_n:] + return "\n".join(itertools.chain.from_iterable(last_n_turns)) def _add_action_classification(self, msg: ActivityDetection) -> str: """ @@ -238,6 +257,9 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: ) self.detected_objects_queue.put(te) + def _add_dialogue_history(self, question: str, response: str): + self.dialogue_history.append((f"Me: {question}", f"You: {response}")) + def _get_action_before(self, curr_time: int) -> str: """ Returns the latest action classification in self.action_classification_queue @@ -270,8 +292,8 @@ def _get_observables_before(self, curr_time: int) -> str: return "nothing" return ", ".join(list(observables)) - def get_response(self, msg: InterpretedAudioUserEmotion, - current_step: str, action: str, observables: str): + def get_response(self, msg: InterpretedAudioUserEmotion, + chat_history: str, current_step: str, action: str, observables: str): """ 
Generate a response to the utterance, enriched with the addition of the user's detected emotion. Inference calls can be added and revised @@ -280,24 +302,30 @@ def get_response(self, msg: InterpretedAudioUserEmotion, return_msg = None try: self.log.info(f"User emotion: {msg.user_emotion}") - response = self.chain.run( + return_msg = self.chain.run( recipe=self.recipe, + chat_history=chat_history, current_step=current_step, action=action, observables=observables, emotion=msg.user_emotion, question=msg.utterance_text, ) - return_msg = colored(f"{response}\n", "light_green") + if self.debug_mode: + sent_prompt = \ + self.chain.prompt.format_prompt(recipe=self.recipe, + chat_history=chat_history, + current_step=current_step, + action=action, + observables=observables, + emotion=msg.user_emotion, + question=msg.utterance_text,).to_string() + sent_prompt = colored(sent_prompt, "light_red") + self.log.info(f"Prompt sent over:~~~~~~~~~~\n{sent_prompt}\n:~~~~~~~~~~") except RuntimeError as err: self.log.info(err) - colored_apology = colored( - "I'm sorry. I don't know how to answer your statement.", "light_red" - ) - colored_emotion = colored(msg.user_emotion, "light_red") - return_msg = ( - f"{colored_apology} I understand that you feel {colored_emotion}." - ) + return_msg = "I'm sorry. I don't know how to answer your statement. " +\ + f"I understand that you feel {msg.user_emotion}." return return_msg def question_answer_callback(self, msg): @@ -327,8 +355,11 @@ def process_question_queue(self): self.log.info(f"Observed objects: {observables}") # Generate response. 
- response = self.get_response(question_msg, self.current_step, action, observables) + response = self.get_response(question_msg, + self._get_dialogue_history(), + self._get_current_step(), action, observables) self.publish_generated_response(question_msg.utterance_text, response) + self._add_dialogue_history(question_msg.utterance_text, response) def publish_generated_response(self, utterance: str, response: str): msg = SystemTextResponse() diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index ccb4d78e6..2eac012d6 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -3,9 +3,17 @@ You are a teacher helping me learn how to complete a Task. I will tell you how I Task Steps: {recipe} +Chat History: +{chat_history} + My Current Step: {current_step} + My Emotion: {emotion}. + My Current Action: {action}. -In front of me are the following objects: {observables}. + +Objects In Front of Me: {observables}. 
+ My Question: {question} + Your Answer: \ No newline at end of file diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 857b10e4b..e00535a1a 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -106,6 +106,7 @@ windows: -p image_topic:=PVFramesBGR -p det_topic:=BerkeleyObjectDetections2d -p det_conf_threshold:=0.1 + -p cuda_device_id:=1 -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args -r __ns:=${ROS_NAMESPACE} @@ -130,12 +131,19 @@ windows: -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json - -p model_device:=cuda + -p model_device:=cuda:1 -p model_dets_conv_version:=5 -p window_size:=30 -p buffer_max_size_seconds:=5 -p image_pix_width:=1280 -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands - vocal: layout: even-vertical panes: @@ -177,6 +185,7 @@ windows: - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args -r __ns:=${ROS_NAMESPACE} -p user_emotion_topic:=gpt_emotion_topic + -p task_state_topic:=task_state_topic -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic From 00dad55c86fecda5e5d73dec0258c50e3c4bd62f Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Fri, 27 Oct 2023 01:29:42 -0400 
Subject: [PATCH 05/46] Add Centroid Strategy for Detected Objects --- .../visual_question_answerer.py | 211 ++++++++++++++---- ...al_visual_vocalized_question_answering.yml | 21 +- 2 files changed, 185 insertions(+), 47 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 8b6049637..589a1541e 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -1,3 +1,4 @@ +from enum import Enum import itertools import langchain from langchain.chains import LLMChain @@ -9,6 +10,7 @@ import queue import rclpy from rclpy.node import Node +from scipy.spatial import distance from termcolor import colored import threading from typing import * @@ -18,7 +20,7 @@ InterpretedAudioUserEmotion, ObjectDetection2dSet, SystemTextResponse, - TaskUpdate + TaskUpdate, ) from angel_utils import declare_and_get_parameters @@ -34,6 +36,10 @@ # Below is/are the published topic(s). OUT_QA_TOPIC = "system_text_response_topic" +# Below configures the filtering strategy for detected objects. It should correspond to +# VisualQuestionAnswerer.FilterType. +OBJECT_DETECTION_FILTER = "obj_det_filter" + # Below are the corresponding model thresholds. OBJECT_DETECTION_THRESHOLD = "object_detections_threshold" ACT_CLFN_THRESHOLD = "action_classification_threshold" @@ -42,8 +48,12 @@ RECIPE_PATH = "recipe_path" # Below is the recipe paths for the prompt template. PROMPT_TEMPLATE_PATH = "prompt_template_path" - +# Below is how many dialogue turns to keep maintained in the prompt context. CONTEXT_HISTORY_LENGTH = "context_history_length" + +# Below configures the width and height of an image. A typical example would be 1280 * 720. +IMAGE_WIDTH = "pv_width" +IMAGE_HEIGHT = "pv_height" DEBUG_MODE = "debug_mode" # Below is the complete set of prompt instructions. 
@@ -59,17 +69,50 @@ User Question: {question} Answer: """ +# Below are all the variables. These should correspond to the variables defined in the +# PROMPT_TEMPLATE_PATH and will be indicated by surrounding '{' and '}'. +PROMPT_VARIABLES = [ + "recipe", + "chat_history", + "current_step", + "emotion", + "action", + "observables", + "question", +] + class VisualQuestionAnswerer(Node): + class FilterType(Enum): + """ + The following determines which objects to surface in the prompt. + "threshold" selects objects with a confidence score above OBJECT_DETECTION_THRESHOLD. + "center" selects the object closest to the center of the user's field of view. Make sure to + also configure pv_width and pv_height if this is selected. + """ + + THRESHOLD = 1 + CENTER = 2 + + def is_threshold(self): + return self.value == VisualQuestionAnswerer.FilterType.THRESHOLD.value + + def is_center(self): + return self.value == VisualQuestionAnswerer.FilterType.CENTER.value + class TimestampedEntity: - def __init__(self, time, entity: str): + """ + This class is used internally as a container for recorded detections and classifications at + specific instances in time. + """ + + def __init__(self, time, entity): self.time = time self.entity = entity def __init__(self): super().__init__(self.__class__.__name__) self.log = self.get_logger() - param_values = declare_and_get_parameters( self, [ @@ -79,6 +122,12 @@ def __init__(self): (IN_TASK_STATE_TOPIC, ""), (IN_OBJECT_DETECTION_TOPIC, ""), (IN_ACT_CLFN_TOPIC, ""), + (IMAGE_WIDTH, -1), + (IMAGE_HEIGHT, -1), + ( + OBJECT_DETECTION_FILTER, + VisualQuestionAnswerer.FilterType.THRESHOLD.name, + ), (OBJECT_DETECTION_THRESHOLD, 0.8), (ACT_CLFN_THRESHOLD, 0.8), (OUT_QA_TOPIC,), @@ -97,9 +146,21 @@ def __init__(self): # langchain.debug = True self.debug_mode = True + # Used to obtain the center perspective point and how far detected objects + # are from it. 
+ self.pv_width = param_values[IMAGE_WIDTH] + self.pv_height = param_values[IMAGE_HEIGHT] + pv_configured = self.pv_width > 0 and self.pv_height > 0 + self.pv_center_coordinate = ( + [self.pv_width / 2, self.pv_height / 2] if pv_configured else [None, None] + ) + + # Read the configured recipe file. self._recipe_path = param_values[RECIPE_PATH] self.recipe = self._configure_recipe(self._recipe_path) self.log.info(f"Configured recipe to be: ~~~~~~~~~~\n{self.recipe}\n~~~~~~~~~~") + + # Read the configured prompt template. self._prompt_template_path = param_values[PROMPT_TEMPLATE_PATH] with open(self._prompt_template_path, "r") as file: self.prompt_template = file.read() @@ -107,13 +168,27 @@ def __init__(self): f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" ) + # Configure supplemental input detection & classification criteria. + self.object_dtctn_filter = VisualQuestionAnswerer.FilterType[ + param_values[OBJECT_DETECTION_FILTER].upper() + ] + if ( + self.object_dtctn_filter.is_center() + and self.pv_center_coordinate[0] is None + ): + raise ValueError( + f"All {OBJECT_DETECTION_FILTER} and {IMAGE_WIDTH} and {IMAGE_HEIGHT} " + + "must be configured together." + ) self.object_dtctn_threshold = param_values[OBJECT_DETECTION_THRESHOLD] self.action_clfn_threshold = param_values[ACT_CLFN_THRESHOLD] + # Configure supplemental input resources. self.question_queue = queue.Queue() self.current_step = "Unstarted" self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() + self.dialogue_history = [] self.handler_thread = threading.Thread(target=self.process_question_queue) self.handler_thread.start() @@ -163,8 +238,6 @@ def __init__(self): # Configure LangChain. 
self.chain = self._configure_langchain() - self.dialogue_history = [] - def _configure_openai_org_id(self): if not os.getenv("OPENAI_ORG_ID"): raise ValueError( @@ -205,11 +278,9 @@ def _configure_langchain(self): temperature=0.0, max_tokens=64, ) - # TODO (derekahmed) Figure out how to include optional dialogue history + # TODO (derekahmed) Figure out how to include optional dialogue history zero_shot_prompt = langchain.PromptTemplate( - input_variables=["recipe", "chat_history", - "current_step", "emotion", "action", "observables", - "question"], + input_variables=PROMPT_VARIABLES, template=self.prompt_template, ) return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) @@ -224,8 +295,11 @@ def _get_current_step(self): return self.current_step def _get_dialogue_history(self): + """ + Gets a string concatenation of the last self.dialogue_history_length turns of conversation. + """ last_n = min(len(self.dialogue_history), self.dialogue_history_length) - last_n_turns = self.dialogue_history[-1 * last_n:] + last_n_turns = self.dialogue_history[-1 * last_n :] return "\n".join(itertools.chain.from_iterable(last_n_turns)) def _add_action_classification(self, msg: ActivityDetection) -> str: @@ -245,22 +319,64 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: """ Stores all detected objects with a confidence score above IN_OBJECT_DETECTION_THRESHOLD. """ - detected_objects = set() + if self.object_dtctn_filter.is_threshold(): + self._add_detected_objects_above_threshold(msg) + elif self.object_dtctn_filter.is_center(): + # TODO(derekahmed): Maybe these shouldn't be mutually exclusive? + self._add_detected_object_closest_to_center(msg) + else: + raise ValueError( + "VisualQuestionAnswerer Node is misconfigured as " + + self.object_dtctn_filter.value + ) + + def _add_detected_object_closest_to_center(self, msg): + """ + Adds the object that is closest to the configured center coordinate of the user's view. 
+ This center coordinate is indicated by self.pv_center_coordinate. + """ + most_center_obj = None + most_center_dist = max(self.pv_width, self.pv_height) + zipped = zip(msg.label_vec, msg.left, msg.right, msg.top, msg.bottom) + for obj, left, right, top, bottom in zipped: + width_center = left + int((right - left) / 2) + height_center = top + int((bottom - top) / 2) + curr_dist = distance.euclidean( + [width_center, height_center], self.pv_center_coordinate + ) + if curr_dist < most_center_dist: + most_center_obj = obj + most_center_dist = curr_dist + if most_center_obj: + if self.debug_mode: + self.log.info( + f"Added {most_center_obj} to detected objects queue." + + f"Object is {most_center_dist} away from the center." + ) + te = VisualQuestionAnswerer.TimestampedEntity( + self._get_sec(msg), set([most_center_obj]) + ) + self.detected_objects_queue.put(te) + + def _add_detected_objects_above_threshold(self, msg): + """ + Queuse all objects above a configure threshold. + """ + detected_objs = set() for obj, score in zip(msg.label_vec, msg.label_confidences): if score < self.object_dtctn_threshold: - # Optional threshold filtering continue - detected_objects.add(obj) - if detected_objects: + detected_objs.add(obj) + if detected_objs: te = VisualQuestionAnswerer.TimestampedEntity( - self._get_sec(msg), detected_objects + self._get_sec(msg), detected_objs ) self.detected_objects_queue.put(te) def _add_dialogue_history(self, question: str, response: str): - self.dialogue_history.append((f"Me: {question}", f"You: {response}")) + self.dialogue_history.append((f"Me: {question}", f"You: {response}")) - def _get_action_before(self, curr_time: int) -> str: + def _get_latest_action(self, curr_time: int) -> str: """ Returns the latest action classification in self.action_classification_queue that does not occur before a provided time. 
@@ -275,7 +391,7 @@ def _get_action_before(self, curr_time: int) -> str: break return latest_action - def _get_observables_before(self, curr_time: int) -> str: + def _get_latest_observables(self, curr_time: int) -> str: """ Returns a comma-delimited list of observed objects per all entities in self.detected_objects_queue that occurred before a provided time. @@ -292,12 +408,18 @@ def _get_observables_before(self, curr_time: int) -> str: return "nothing" return ", ".join(list(observables)) - def get_response(self, msg: InterpretedAudioUserEmotion, - chat_history: str, current_step: str, action: str, observables: str): + def get_response( + self, + msg: InterpretedAudioUserEmotion, + chat_history: str, + current_step: str, + action: str, + observables: str, + ): """ - Generate a response to the utterance, enriched with the addition of - the user's detected emotion. Inference calls can be added and revised - here. + Generate a response to the utterance, enriched with the addition of + the user's detected emotion, chat history, current step information, action, and + detected objects. Inference calls can be added and revised here. 
""" return_msg = None try: @@ -312,20 +434,25 @@ def get_response(self, msg: InterpretedAudioUserEmotion, question=msg.utterance_text, ) if self.debug_mode: - sent_prompt = \ - self.chain.prompt.format_prompt(recipe=self.recipe, - chat_history=chat_history, - current_step=current_step, - action=action, - observables=observables, - emotion=msg.user_emotion, - question=msg.utterance_text,).to_string() + sent_prompt = self.chain.prompt.format_prompt( + recipe=self.recipe, + chat_history=chat_history, + current_step=current_step, + action=action, + observables=observables, + emotion=msg.user_emotion, + question=msg.utterance_text, + ).to_string() sent_prompt = colored(sent_prompt, "light_red") - self.log.info(f"Prompt sent over:~~~~~~~~~~\n{sent_prompt}\n:~~~~~~~~~~") + self.log.info( + f"Prompt sent over:~~~~~~~~~~\n{sent_prompt}\n:~~~~~~~~~~" + ) except RuntimeError as err: self.log.info(err) - return_msg = "I'm sorry. I don't know how to answer your statement. " +\ - f"I understand that you feel {msg.user_emotion}." + return_msg = ( + "I'm sorry. I don't know how to answer your statement. " + + f"I understand that you feel {msg.user_emotion}." + ) return return_msg def question_answer_callback(self, msg): @@ -347,17 +474,21 @@ def process_question_queue(self): start_time = self._get_sec(question_msg) # Get most recently detected action. - action = self._get_action_before(start_time) + action = self._get_latest_action(start_time) self.log.info(f"Latest action: {action}") # Get detected objects. - observables = self._get_observables_before(start_time) + observables = self._get_latest_observables(start_time) self.log.info(f"Observed objects: {observables}") # Generate response. 
- response = self.get_response(question_msg, - self._get_dialogue_history(), - self._get_current_step(), action, observables) + response = self.get_response( + question_msg, + self._get_dialogue_history(), + self._get_current_step(), + action, + observables, + ) self.publish_generated_response(question_msg.utterance_text, response) self._add_dialogue_history(question_msg.utterance_text, response) diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index e00535a1a..37414b958 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -126,7 +126,7 @@ windows: - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args -r __ns:=${ROS_NAMESPACE} -p image_ts_topic:=PVFramesBGR_TS - -p det_topic:=ObjectDetections2d + -p det_topic:=BerkeleyObjectDetections2d -p act_topic:=ActivityDetections -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt @@ -152,16 +152,16 @@ windows: -p input_audio_topic:=HeadsetAudioData -p output_voice_activity_topic:=DetectedVoiceData -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad - -p vad_cadence:=3 - -p vad_margin:=0.20 - -p max_accumulation_length:=10 + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 -p debug_mode:=True - asr: ros2 run angel_system_nodes asr --ros-args -r __ns:=${ROS_NAMESPACE} -p audio_topic:=DetectedVoiceData -p utterances_topic:=utterances_topic -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr - -p asr_req_segment_duration:=1 + -p asr_req_segment_duration:=2 -p is_sentence_tokenize:=False -p debug_mode:=True - intent_detection: @@ -175,7 +175,11 @@ windows: - emotion_detection: layout: even-vertical panes: - - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + - gpt_exp_emotion_detection: ros2 run 
angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p user_intent_topic:=expect_user_intent_topic + -p user_emotion_topic:=null_topic + - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic -p user_emotion_topic:=gpt_emotion_topic @@ -186,9 +190,12 @@ windows: -r __ns:=${ROS_NAMESPACE} -p user_emotion_topic:=gpt_emotion_topic -p task_state_topic:=task_state_topic - -p object_detections_topic:=ObjectDetections2d + -p object_detections_topic:=BerkeleyObjectDetections2d -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_filter:=CENTER + -p pv_width:=1280 + -p pv_height:=720 -p debug_mode:=True \ No newline at end of file From f399390b0d04482d847ee7586d3d8826581ac915 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Fri, 27 Oct 2023 14:27:30 -0400 Subject: [PATCH 06/46] Add Last n Objects to Visual Question Answering Node --- .../visual_question_answerer.py | 93 +++++++++++-------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 589a1541e..41751db3f 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -38,23 +38,27 @@ # Below configures the filtering strategy for detected objects. It should correspond to # VisualQuestionAnswerer.FilterType. -OBJECT_DETECTION_FILTER = "obj_det_filter" +PARAM_OBJECT_DETECTION_FILTER_STRATEGY = "obj_det_filter" + +# Below indicates how many of the last n detected objects should be surfaced +# in the LLM prompt. 
These objects do NOT have to be unique. +PARAM_OBJECT_LAST_N_OBJECTS = "obj_det_last_n" # Below are the corresponding model thresholds. -OBJECT_DETECTION_THRESHOLD = "object_detections_threshold" -ACT_CLFN_THRESHOLD = "action_classification_threshold" +PARAM_OBJECT_DETECTION_THRESHOLD = "object_det_threshold" +PARAM_ACT_CLFN_THRESHOLD = "action_classification_threshold" # Below is the recipe paths for the intended task. -RECIPE_PATH = "recipe_path" +PARAM_RECIPE_PATH = "recipe_path" # Below is the recipe paths for the prompt template. -PROMPT_TEMPLATE_PATH = "prompt_template_path" +PARAM_PROMPT_TEMPLATE_PATH = "prompt_template_path" # Below is how many dialogue turns to keep maintained in the prompt context. -CONTEXT_HISTORY_LENGTH = "context_history_length" +PARAM_CONTEXT_HISTORY_LENGTH = "context_history_length" # Below configures the width and height of an image. A typical example would be 1280 * 720. -IMAGE_WIDTH = "pv_width" -IMAGE_HEIGHT = "pv_height" -DEBUG_MODE = "debug_mode" +PARAM_IMAGE_WIDTH = "pv_width" +PARAM_IMAGE_HEIGHT = "pv_height" +PARAM_DEBUG_MODE = "debug_mode" # Below is the complete set of prompt instructions. 
PROMPT_INSTRUCTIONS = """ @@ -116,23 +120,24 @@ def __init__(self): param_values = declare_and_get_parameters( self, [ - (RECIPE_PATH,), - (PROMPT_TEMPLATE_PATH,), (IN_EMOTION_TOPIC,), (IN_TASK_STATE_TOPIC, ""), (IN_OBJECT_DETECTION_TOPIC, ""), (IN_ACT_CLFN_TOPIC, ""), - (IMAGE_WIDTH, -1), - (IMAGE_HEIGHT, -1), + (PARAM_RECIPE_PATH,), + (PARAM_PROMPT_TEMPLATE_PATH,), + (PARAM_IMAGE_WIDTH, -1), + (PARAM_IMAGE_HEIGHT, -1), ( - OBJECT_DETECTION_FILTER, + PARAM_OBJECT_DETECTION_FILTER_STRATEGY, VisualQuestionAnswerer.FilterType.THRESHOLD.name, ), - (OBJECT_DETECTION_THRESHOLD, 0.8), - (ACT_CLFN_THRESHOLD, 0.8), + (PARAM_OBJECT_LAST_N_OBJECTS, 10), + (PARAM_OBJECT_DETECTION_THRESHOLD, 0.8), + (PARAM_ACT_CLFN_THRESHOLD, 0.8), (OUT_QA_TOPIC,), - (CONTEXT_HISTORY_LENGTH, 3), - (DEBUG_MODE, False), + (PARAM_CONTEXT_HISTORY_LENGTH, 3), + (PARAM_DEBUG_MODE, False), ], ) self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] @@ -140,48 +145,51 @@ def __init__(self): self._in_objects_topic = param_values[IN_OBJECT_DETECTION_TOPIC] self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] - self.dialogue_history_length = param_values[CONTEXT_HISTORY_LENGTH] + self.dialogue_history_length = param_values[PARAM_CONTEXT_HISTORY_LENGTH] self.debug_mode = False - if param_values[DEBUG_MODE]: + if param_values[PARAM_DEBUG_MODE]: # langchain.debug = True self.debug_mode = True # Used to obtain the center perspective point and how far detected objects # are from it. - self.pv_width = param_values[IMAGE_WIDTH] - self.pv_height = param_values[IMAGE_HEIGHT] + self.pv_width = param_values[PARAM_IMAGE_WIDTH] + self.pv_height = param_values[PARAM_IMAGE_HEIGHT] pv_configured = self.pv_width > 0 and self.pv_height > 0 self.pv_center_coordinate = ( [self.pv_width / 2, self.pv_height / 2] if pv_configured else [None, None] ) # Read the configured recipe file. 
- self._recipe_path = param_values[RECIPE_PATH] + self._recipe_path = param_values[PARAM_RECIPE_PATH] self.recipe = self._configure_recipe(self._recipe_path) self.log.info(f"Configured recipe to be: ~~~~~~~~~~\n{self.recipe}\n~~~~~~~~~~") # Read the configured prompt template. - self._prompt_template_path = param_values[PROMPT_TEMPLATE_PATH] + self._prompt_template_path = param_values[PARAM_PROMPT_TEMPLATE_PATH] with open(self._prompt_template_path, "r") as file: self.prompt_template = file.read() self.log.info( f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" ) - # Configure supplemental input detection & classification criteria. + # Configure supplemental input object detection criteria. self.object_dtctn_filter = VisualQuestionAnswerer.FilterType[ - param_values[OBJECT_DETECTION_FILTER].upper() + param_values[PARAM_OBJECT_DETECTION_FILTER_STRATEGY].upper() ] if ( self.object_dtctn_filter.is_center() and self.pv_center_coordinate[0] is None ): raise ValueError( - f"All {OBJECT_DETECTION_FILTER} and {IMAGE_WIDTH} and {IMAGE_HEIGHT} " + f"All {PARAM_OBJECT_DETECTION_FILTER_STRATEGY} and {PARAM_IMAGE_WIDTH} and {PARAM_IMAGE_HEIGHT} " + "must be configured together." ) - self.object_dtctn_threshold = param_values[OBJECT_DETECTION_THRESHOLD] - self.action_clfn_threshold = param_values[ACT_CLFN_THRESHOLD] + self.object_dtctn_threshold = param_values[PARAM_OBJECT_DETECTION_THRESHOLD] + self.object_dtctn_last_n_objects = param_values[PARAM_OBJECT_LAST_N_OBJECTS] + + # Configure supplemental input action classification criteria. + self.action_clfn_threshold = param_values[PARAM_ACT_CLFN_THRESHOLD] # Configure supplemental input resources. self.question_queue = queue.Queue() @@ -348,11 +356,11 @@ def _add_detected_object_closest_to_center(self, msg): most_center_obj = obj most_center_dist = curr_dist if most_center_obj: - if self.debug_mode: - self.log.info( - f"Added {most_center_obj} to detected objects queue." 
- + f"Object is {most_center_dist} away from the center." - ) + # if self.debug_mode: + # self.log.info( + # f"Added {most_center_obj} to detected objects queue." + # + f"Object is {most_center_dist} away from the center." + # ) te = VisualQuestionAnswerer.TimestampedEntity( self._get_sec(msg), set([most_center_obj]) ) @@ -391,22 +399,27 @@ def _get_latest_action(self, curr_time: int) -> str: break return latest_action - def _get_latest_observables(self, curr_time: int) -> str: + def _get_last_n_observables(self, curr_time: int, n: int) -> str: """ Returns a comma-delimited list of observed objects per all entities in self.detected_objects_queue that occurred before a provided time. + + + :param curr_time: The time for which objects must have been detected before. + :param n: The last n objects. + :return: returns a string-ified list of the latest observables """ - observables = set() + observables = [] while not self.detected_objects_queue.empty(): next = self.detected_objects_queue.queue[0] if next.time < curr_time: - observables.update(next.entity) + observables.extend(next.entity) self.detected_objects_queue.get() else: break if not observables: return "nothing" - return ", ".join(list(observables)) + return ", ".join(set(observables[-n:])) def get_response( self, @@ -478,7 +491,9 @@ def process_question_queue(self): self.log.info(f"Latest action: {action}") # Get detected objects. - observables = self._get_latest_observables(start_time) + observables = self._get_last_n_observables( + start_time, self.object_dtctn_last_n_objects + ) self.log.info(f"Observed objects: {observables}") # Generate response. 
From a1722d5c261e95a0aa61340b15efcbce2a6f23d0 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Fri, 27 Oct 2023 15:02:32 -0400 Subject: [PATCH 07/46] Add Live Visual QA Tmux Config --- ...sual_vocalized_question_answering_live.yml | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 tmux/eval_visual_vocalized_question_answering_live.yml diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml new file mode 100644 index 000000000..8771f2a50 --- /dev/null +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -0,0 +1,201 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.0.23} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. 
Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. +# attach: false + +windows: + - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ROS_IP:=0.0.0.0 + - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ip_addr:=${HL2_IP} + -p image_topic:=PVFramesBGR + -p image_ts_topic:=disable + -p hand_pose_topic:=disable + -p audio_topic:=HeadsetAudioData + -p head_pose_topic:=HeadsetPoseData + -p sm_topic:=disable + -p rm_depth_AHAT:=disable + -p pv_width:=760 + -p pv_height:=428 + -p pv_framerate:=30 + -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + # - ros_bag_play: sleep 5; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args 
${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=BerkeleyObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=1 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p det_topic:=ObjectDetections2d + -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + -p inference_img_size:=1280 + -p det_conf_threshold:=0.1 + -p cuda_device_id:=1 + + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=BerkeleyObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:1 + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p 
sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - intent_detection: + layout: even-vertical + panes: + - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterances_topic:=utterances_topic + -p expect_user_intent_topic:=expect_user_intent_topic + -p interp_user_intent_topic:=interp_user_intent_topic + - emotion_detection: + layout: even-vertical + panes: + - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p user_intent_topic:=expect_user_intent_topic + -p user_emotion_topic:=null_topic + - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p user_intent_topic:=interp_user_intent_topic + -p user_emotion_topic:=gpt_emotion_topic + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p user_emotion_topic:=gpt_emotion_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=BerkeleyObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p 
recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_filter:=CENTER + -p pv_width:=1280 + -p pv_height:=720 + -p debug_mode:=True \ No newline at end of file From a50c777191e616f50cce86020094e72ce8e029f3 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Fri, 27 Oct 2023 16:22:47 -0400 Subject: [PATCH 08/46] Change intent detection to always publish to interpreted topic --- .../angel_system_nodes/base_intent_detector.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py index 4d0afdacf..c89d3e554 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py @@ -134,9 +134,8 @@ def publish_msg(self, utterance, intent, score): intent_msg.confidence = 1.0 self._expected_publisher.publish(intent_msg) published_topic = PARAM_EXPECT_USER_INTENT_TOPIC - else: - self._interp_publisher.publish(intent_msg) - published_topic = PARAM_INTERP_USER_INTENT_TOPIC + self._interp_publisher.publish(intent_msg) + published_topic = PARAM_INTERP_USER_INTENT_TOPIC colored_utterance = colored(utterance, "light_blue") colored_intent = colored(intent_msg.user_intent, "light_green") From f0e61517c4095e0f20a90f6190f4d24b1c6b7319 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Fri, 27 Oct 2023 16:23:43 -0400 Subject: [PATCH 09/46] Remove expected emotion detection publishing --- tmux/eval_visual_vocalized_question_answering_live.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index 8771f2a50..5e6aef527 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ 
-175,10 +175,6 @@ windows: - emotion_detection: layout: even-vertical panes: - - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p user_intent_topic:=expect_user_intent_topic - -p user_emotion_topic:=null_topic - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic From db610a6f2a67aeb39417a637ed75cca7c9a76854 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 28 Oct 2023 13:11:56 -0400 Subject: [PATCH 10/46] Remove legacy intent detector node --- .../angel_system_nodes/intent_detector.py | 133 ------------------ ros/angel_system_nodes/setup.py | 1 - 2 files changed, 134 deletions(-) delete mode 100644 ros/angel_system_nodes/angel_system_nodes/intent_detector.py diff --git a/ros/angel_system_nodes/angel_system_nodes/intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/intent_detector.py deleted file mode 100644 index 618cf02ca..000000000 --- a/ros/angel_system_nodes/angel_system_nodes/intent_detector.py +++ /dev/null @@ -1,133 +0,0 @@ -import rclpy -from rclpy.node import Node - -from angel_msgs.msg import InterpretedAudioUserIntent, Utterance - -# Please refer to labels defined in -# https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA . -NEXT_STEP_KEYPHRASES = ["skip", "next", "next step"] -PREV_STEP_KEYPHRASES = ["previous", "previous step", "last step", "go back"] -OVERRIDE_KEYPHRASES = ["angel", "angel system"] - -# TODO(derekahmed): Please figure out how to keep this sync-ed with -# config/angel_system_cmds/user_intent_to_sys_cmd_v1.yaml. 
-LABELS = ["Go to next step", "Go to previous step"] - - -UTTERANCES_TOPIC = "utterances_topic" -PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" -PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" - - -class IntentDetector(Node): - """ - As of Q12023, intent detection is derived heuristically. This will be shifted - to a model-based approach in the near-future. - """ - - def __init__(self): - super().__init__(self.__class__.__name__) - self.log = self.get_logger() - - parameter_names = [ - UTTERANCES_TOPIC, - PARAM_EXPECT_USER_INTENT_TOPIC, - PARAM_INTERP_USER_INTENT_TOPIC, - ] - set_parameters = self.declare_parameters( - namespace="", - parameters=[(p,) for p in parameter_names], - ) - # Check for not-set parameters - some_not_set = False - for p in set_parameters: - if p.type_ is rclpy.parameter.Parameter.Type.NOT_SET: - some_not_set = True - self.log.error(f"Parameter not set: {p.name}") - if some_not_set: - raise ValueError("Some parameters are not set.") - - self._utterances_topic = self.get_parameter(UTTERANCES_TOPIC).value - self._expect_uintent_topic = self.get_parameter( - PARAM_EXPECT_USER_INTENT_TOPIC - ).value - self._interp_uintent_topic = self.get_parameter( - PARAM_INTERP_USER_INTENT_TOPIC - ).value - self.log.info( - f"Utterances topic: " - f"({type(self._utterances_topic).__name__}) " - f"{self._utterances_topic}" - ) - self.log.info( - f"Expected User Intent topic: " - f"({type(self._expect_uintent_topic).__name__}) " - f"{self._expect_uintent_topic}" - ) - self.log.info( - f"Interpreted User Intent topic: " - f"({type(self._interp_uintent_topic).__name__}) " - f"{self._interp_uintent_topic}" - ) - - # TODO(derekahmed): Add internal queueing to reduce subscriber queue - # size to 1. 
- self.subscription = self.create_subscription( - Utterance, self._utterances_topic, self.listener_callback, 10 - ) - - self._expected_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._expect_uintent_topic, 1 - ) - - self._interp_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._interp_uintent_topic, 1 - ) - - def listener_callback(self, msg): - log = self.get_logger() - intent_msg = InterpretedAudioUserIntent() - intent_msg.utterance_text = msg.value - - lower_utterance = msg.value.lower() - if self.contains_phrase(lower_utterance, NEXT_STEP_KEYPHRASES): - intent_msg.user_intent = LABELS[0] - intent_msg.confidence = 0.5 - elif self.contains_phrase(lower_utterance, PREV_STEP_KEYPHRASES): - intent_msg.user_intent = LABELS[1] - intent_msg.confidence = 0.5 - else: - log.info(f'Detected no intents for "{msg.value}":') - return - - if self.contains_phrase(lower_utterance, OVERRIDE_KEYPHRASES): - intent_msg.confidence = 1.0 - self._expected_publisher.publish(intent_msg) - else: - self._interp_publisher.publish(intent_msg) - - log.info( - f'Detected intents for "{msg.value}":\n' - + f'"{intent_msg.user_intent}": {intent_msg.confidence}' - ) - - def contains_phrase(self, utterance, phrases): - for phrase in phrases: - if phrase in utterance: - return True - return False - - -def main(): - rclpy.init() - - intentDetector = IntentDetector() - - rclpy.spin(intentDetector) - - intentDetector.destroy_node() - rclpy.shutdown() - - -if __name__ == "__main__": - main() diff --git a/ros/angel_system_nodes/setup.py b/ros/angel_system_nodes/setup.py index c75bc6664..13cec4fae 100644 --- a/ros/angel_system_nodes/setup.py +++ b/ros/angel_system_nodes/setup.py @@ -26,7 +26,6 @@ "gpt_emotion_detector = angel_system_nodes.gpt_emotion_detector:main", "question_answerer = angel_system_nodes.question_answerer:main", "visual_question_answerer = angel_system_nodes.visual_question_answerer:main", - "intent_detector = 
angel_system_nodes.intent_detector:main", "spatial_mapper = angel_system_nodes.spatial_mapper:main", "feedback_generator = angel_system_nodes.feedback_generator:main", "annotation_event_monitor = angel_system_nodes.annotation_event_monitor:main", From b00175b5ab3f74d957621ff460e37f03d43cf921 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 28 Oct 2023 15:33:06 -0400 Subject: [PATCH 11/46] Add centroid 2d strategy queue --- .../utils/object_detection_queues/__init__.py | 0 .../centroid_2d_strategy_queue.py | 122 ++++++++ .../centroid_2d_strategy_queue_test.py | 284 ++++++++++++++++++ 3 files changed, 406 insertions(+) create mode 100644 angel_system/utils/object_detection_queues/__init__.py create mode 100644 angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py create mode 100644 angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py diff --git a/angel_system/utils/object_detection_queues/__init__.py b/angel_system/utils/object_detection_queues/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py new file mode 100644 index 000000000..976c02439 --- /dev/null +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py @@ -0,0 +1,122 @@ +import logging +import heapq +from scipy.spatial import distance +import threading +from typing import * + +LOG = logging.getLogger(__name__) + + +class Centroid2DStrategyQueue: + """ + Little class to handle priority queueing of detected object bounding boxes + based on their centroid (center coordinate of the bounding box). + Items are stored in a priority queue based on a timestamp integer. + When items are popped from the queue, the `last_n` items *before* a provided + timestamp are returned. 
+ + + Typical Example Usage: + q = Centroid2DStrategyQueue(n=1, k=2) + q.add(timestamp=1, BoundingBoxes(..., [('obj1', 'obj2', 'obj3')])) + q.add(timestamp=2, BoundingBoxes(..., [('obj1', 'obj2', 'obj3')])) + q.get_n_before(2) + """ + + class BoundingBoxes: + + def __init__(self, left: List[int], right: List[int], top: List[int], bottom: List[int], + item: List[Any]): + """ + Wrapper of bounding boxes and a contained entity corresponding to each bounding box. + The item is intentionally kept ambiguous to provide flexibility (e.g. can pass in + an object label that corresponds to each bounding box or a tuple of an object label and + its confidence score). + """ + self.left = left + self.right = right + self.top = top + self.bottom = bottom + self.item = item + + def __init__(self, n : int, center_x: int, center_y: int, + k: int = 1, log_func: Optional[Callable[..., None]] = None): + """ + Additional arguments are passed to the logging method + :param n: Whenever objects are retrieved, return the last n entries. + :param k: Acquires the top k objects that are the most centered given their centroid. + :param log_func: Optional callable to be invoked to receive the + message. If this is `None`, the local Logger instance to this + module is used. + """ + self._log_func = log_func + + self.n = n + self.k = k + + # This is the main priority queue. Each item should be a Tuple[int, Any] in which + # the elements correspond to (Integer Timestamp, Any Object). An example of the queued + # object's second element could be a Tuple of the top K detected objects. 
+ self.pq = [] + self.center_x = center_x + self.center_y = center_y + self.lock = threading.Lock() + + def get_queue(self): + return self.pq + + def add(self, timestamp: int, item: BoundingBoxes): + self.lock.acquire() + k_most_centered_objects = self._get_k_most_center_objects(item) + heapq.heappush(self.pq, (timestamp, k_most_centered_objects)) + self.lock.release() + + def get_n_before(self, timestamp: int) -> List[Any]: + """ + Gets the self.n items before the provided timestamp. + """ + items = [] + self.lock.acquire() + while self.pq: + next_timestamp, _ = self.pq[0] + if next_timestamp < timestamp: + items.append(heapq.heappop(self.pq)) + else: + break + self.lock.release() + if self._log_func: + self._log_func(f"Read up to {self.n} items from queue" +\ + "; ".join([f"{item} @ Time={time}" for time, item in items])) + return items[-self.n:] if items else items + + def _get_k_most_center_objects(self, bb: BoundingBoxes) -> List[Any]: + """ + Acquires the top k objects with respect to centroid distance from the center pixel. + Returns a list of Tuples of (centroid distance, top k most centered objects) + """ + k_most_centered_objects = [] + + # Sort the bounding boxes in order of distance from centroid to center pixel. + zipped = zip(bb.item, bb.left, bb.right, bb.top, bb.bottom) + for item, left, right, top, bottom in zipped: + centroid_x, centroid_y = self._get_centroid(left, right, top, bottom) + dist = distance.euclidean( + [centroid_x, centroid_y], + [self.center_x, self.center_y] + ) + heapq.heappush(k_most_centered_objects, (dist, item)) + + # Return the top k centered objects based on centroid distance. + result = [] + for _ in range(self.k): + result.append(heapq.heappop(k_most_centered_objects)) + return result + + + def _get_centroid(self, left: int, right: int, top: int, bottom: int) -> Tuple[int, int]: + """ + Calculates the center 2D pixel of a 2D bounding box. 
+ """ + width_center = left + int((right - left) / 2) + height_center = top + int((bottom - top) / 2) + return [width_center, height_center] \ No newline at end of file diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py new file mode 100644 index 000000000..9247f203b --- /dev/null +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py @@ -0,0 +1,284 @@ +import unittest + +from centroid_2d_strategy_queue import Centroid2DStrategyQueue + + + +RESOLUTION_W = 1920 +RESOLUTION_H = 1080 + +class Centroid2DStrategyQueueTest(unittest.TestCase): + + def test_queue_n3_k1_insertion(self): + """ + Tests proper queue insertion when objects are inserted as strings. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + ['mug', 'computer', 'dog'] + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + ['ball', 'butterfly', 'cat'] + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. 
+ third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4 , 2], + ['shoes', 'pencil', 'child'] + ) + q.add(timestamp=1, item=first_objects_detected) + q.add(timestamp=2, item=second_objects_detected) + q.add(timestamp=3, item=third_objects_detected) + + queue_state = q.get_queue() + first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ + queue_state[0], queue_state[1], queue_state[2] + first_top_k, second_top_k, third_top_k = \ + first_timestamped_item[-1], second_timetsamped_item[-1], third_timestamped_item[-1] + # Recall that each object is a List of Tuples of (centroid distance, detected object) + self.assertEqual(first_top_k[0][-1], 'dog') + self.assertEqual(second_top_k[0][-1], 'cat') + self.assertEqual(third_top_k[0][-1], 'shoes') + + def test_queue_n3_k1_insertion_with_confidence_scores(self): + """ + Tests proper queue insertion when objects are inserted as Tuples with confidence scores. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + [('mug', 0.1), ('computer', 0.8), ('dog', 0.5)] + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. 
+ second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + [('ball', 0.9), ('butterfly', 0.3), ('cat', 0.5)] + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. + third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4 , 2], + [('shoes', 0.9), ('pencil', 0.3), ('child', 0.5)] + ) + q.add(timestamp=1, item=first_objects_detected) + q.add(timestamp=2, item=second_objects_detected) + q.add(timestamp=3, item=third_objects_detected) + + queue_state = q.get_queue() + first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ + queue_state[0], queue_state[1], queue_state[2] + first_top_k, second_top_k, third_top_k = \ + first_timestamped_item[-1], second_timetsamped_item[-1], third_timestamped_item[-1] + # Recall that each object is a List of Tuples: + # (centroid distance, (detected object, confidence score)) + _, obj_with_conf_score = first_top_k[0] + self.assertEqual(obj_with_conf_score[0], 'dog') + _, obj_with_conf_score = second_top_k[0] + self.assertEqual(obj_with_conf_score[0], 'cat') + _, obj_with_conf_score = third_top_k[0] + self.assertEqual(obj_with_conf_score[0], 'shoes') + + def test_queue_n3_k2_insertion(self): + """ + Tests proper queue insertion when the top 2 objects are inserted as strings. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. 
+ first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + ['mug', 'computer', 'dog'] + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + ['ball', 'butterfly', 'cat'] + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. + third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4 , 2], + ['shoes', 'pencil', 'child'] + ) + q.add(timestamp=1, item=first_objects_detected) + q.add(timestamp=2, item=second_objects_detected) + q.add(timestamp=3, item=third_objects_detected) + + queue_state = q.get_queue() + first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ + queue_state[0], queue_state[1], queue_state[2] + first_top_k, second_top_k, third_top_k = \ + first_timestamped_item[-1], second_timetsamped_item[-1], third_timestamped_item[-1] + # Recall that each object is a List of Tuples of (centroid distance, detected object) + + first_object_labels = [label for centroid, label in first_top_k] + self.assertEqual(["dog", "computer"], first_object_labels) + + second_object_labels = [label for centroid, label in second_top_k] + self.assertEqual(["cat", "butterfly"], second_object_labels) + + third_object_labels = [label for centroid, label in third_top_k] + self.assertEqual(["shoes", "pencil"], third_object_labels) + + def test_queue_n3_k2_removal(self): + """ + Tests 
proper queueing of the last 3 top 2 objects are inserted as strings. + """ + q = Centroid2DStrategyQueue( + n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + ['mug', 'computer', 'dog'] + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. The cat is in the middle of the screen. + second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + ['ball', 'butterfly', 'cat'] + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. 
+ third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4 , 2], + ['shoes', 'pencil', 'child'] + ) + q.add(timestamp=1, item=first_objects_detected) + q.add(timestamp=2, item=second_objects_detected) + q.add(timestamp=3, item=third_objects_detected) + + no_items = q.get_n_before(timestamp=1) + self.assertEqual([], no_items) + last_n_top_k = q.get_n_before(timestamp=4) + discarded_timestamp, first_top_k_with_centroid_dist = last_n_top_k[0] + first_top_k = [item for discarded_dist, item in first_top_k_with_centroid_dist] + self.assertEqual(["dog", "computer"], first_top_k) + + discarded_timestamp, second_top_k_with_centroid_dist = last_n_top_k[1] + second_top_k = [item for dist, item in second_top_k_with_centroid_dist] + self.assertEqual(["cat", "butterfly"], second_top_k) + + discarded_timestamp, third_top_k_with_centroid_dist = last_n_top_k[2] + third_top_k = [item for dist, item in third_top_k_with_centroid_dist] + self.assertEqual(["shoes", "pencil"], third_top_k) + + def test_queue_n2_k2_removal_with_confidence_scores(self): + """ + Tests proper queue removal of the last 2 top 2 detected objects before a given timestamp + when the top 2 objects are inserted as strings with confidence scores. + """ + q = Centroid2DStrategyQueue( + n=2, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) + + # Dog is in the middle of the screen. Mug is in top left of the screen. + # Computer is near bottom right of screen. + first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], + [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], + [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], + [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], + [('mug', 0.1), ('computer', 0.8), ('dog', 0.5)] + ) + + # Ball is in top left of the screen. The butterfly is bottom right + # of this ball. 
The cat is in the middle of the screen. + second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [1, 2, RESOLUTION_W // 2], + [2, 4, RESOLUTION_W // 2 + 20], + [1, 2, RESOLUTION_H // 2], + [2, 4, RESOLUTION_H // 2 + 10], + [('ball', 0.9), ('butterfly', 0.3), ('cat', 0.5)] + ) + + # Shoes is in bottom right of the screen. The pencil is in the top left + # of the screen. The child is in the top left of the screen. + third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + [RESOLUTION_W - 10, 2, 1], + [RESOLUTION_W, 4, 2], + [RESOLUTION_H - 10, 2, 1], + [RESOLUTION_H, 4 , 2], + [('shoes', 0.9), ('pencil', 0.3), ('child', 0.5)] + ) + q.add(timestamp=1, item=first_objects_detected) + q.add(timestamp=2, item=second_objects_detected) + q.add(timestamp=3, item=third_objects_detected) + + no_items = q.get_n_before(timestamp=1) + self.assertEqual([], no_items) + last_n_top_k = q.get_n_before(timestamp=4) + discarded_timestamp, first_top_k_with_centroid_dist = last_n_top_k[0] + first_scored_top_k = [scored_item for discarded_dist, scored_item in + first_top_k_with_centroid_dist] + first_top_k = [item for item, score in first_scored_top_k] + self.assertEqual(["cat", "butterfly"], first_top_k) + + discarded_timestamp, second_top_k_with_centroid_dist = last_n_top_k[1] + second_scored_top_k = [scored_item for discarded_dist, scored_item in + second_top_k_with_centroid_dist] + second_top_k = [item for item, score in second_scored_top_k] + self.assertEqual(["shoes", "pencil"], second_top_k) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From d9daf47170c780bb703282679ff8ededf202846c Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 2023 14:53:51 -0400 Subject: [PATCH 12/46] Add Centroid 2D Strategy Queue Test for no insertions --- .../centroid_2d_strategy_queue_test.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git 
a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py index 9247f203b..d35ec0a9e 100644 --- a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py @@ -174,7 +174,7 @@ def test_queue_n3_k2_removal(self): Tests proper queueing of the last 3 top 2 objects are inserted as strings. """ q = Centroid2DStrategyQueue( - n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) + n=1, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. @@ -211,16 +211,10 @@ def test_queue_n3_k2_removal(self): no_items = q.get_n_before(timestamp=1) self.assertEqual([], no_items) + # Expects the last n=1 detections before timestamp 4. This should be timestamp 3's + # top k=2 objects. last_n_top_k = q.get_n_before(timestamp=4) - discarded_timestamp, first_top_k_with_centroid_dist = last_n_top_k[0] - first_top_k = [item for discarded_dist, item in first_top_k_with_centroid_dist] - self.assertEqual(["dog", "computer"], first_top_k) - - discarded_timestamp, second_top_k_with_centroid_dist = last_n_top_k[1] - second_top_k = [item for dist, item in second_top_k_with_centroid_dist] - self.assertEqual(["cat", "butterfly"], second_top_k) - - discarded_timestamp, third_top_k_with_centroid_dist = last_n_top_k[2] + discarded_timestamp, third_top_k_with_centroid_dist = last_n_top_k[0] third_top_k = [item for dist, item in third_top_k_with_centroid_dist] self.assertEqual(["shoes", "pencil"], third_top_k) @@ -267,18 +261,28 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): no_items = q.get_n_before(timestamp=1) self.assertEqual([], no_items) + # Expects the last n=2 detections before timestamp 4. This should be timestamp 2 and + # timestamp 3's top k=2 objects. 
last_n_top_k = q.get_n_before(timestamp=4) discarded_timestamp, first_top_k_with_centroid_dist = last_n_top_k[0] first_scored_top_k = [scored_item for discarded_dist, scored_item in first_top_k_with_centroid_dist] first_top_k = [item for item, score in first_scored_top_k] self.assertEqual(["cat", "butterfly"], first_top_k) - discarded_timestamp, second_top_k_with_centroid_dist = last_n_top_k[1] second_scored_top_k = [scored_item for discarded_dist, scored_item in second_top_k_with_centroid_dist] second_top_k = [item for item, score in second_scored_top_k] self.assertEqual(["shoes", "pencil"], second_top_k) + def test_empty_queue(self): + """ + Tests proper get-behavior of an empty queue. + """ + q = Centroid2DStrategyQueue( + n=2, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) + self.assertEqual([], q.get_n_before(timestamp=4)) + + if __name__ == "__main__": unittest.main() \ No newline at end of file From f5cf171c5f888663abb0ca0a4e0b6a7720453a9d Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 2023 15:26:45 -0400 Subject: [PATCH 13/46] Integrate 2d centroid distance queueing into visual question answering --- .../visual_question_answerer.py | 45 +++++++++++++++---- ...al_visual_vocalized_question_answering.yml | 10 ++--- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 41751db3f..d2eac6419 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -23,6 +23,7 @@ TaskUpdate, ) from angel_utils import declare_and_get_parameters +from angel_system.utils.object_detection_queues import centroid_2d_strategy_queue openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") @@ -200,6 +201,11 @@ def __init__(self): self.handler_thread = 
threading.Thread(target=self.process_question_queue) self.handler_thread.start() + self.centroid_object_queue = \ + centroid_2d_strategy_queue.Centroid2DStrategyQueue( + 5, self.pv_center_coordinate[0], self.pv_center_coordinate[1], + k=1) + # Configure the (necessary) emotional detection enriched utterance subscription. self.emotion_subscription = self.create_subscription( InterpretedAudioUserEmotion, @@ -327,6 +333,14 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: """ Stores all detected objects with a confidence score above IN_OBJECT_DETECTION_THRESHOLD. """ + # We will queue timestamped lists of pairs of (detections, confidence scores). + self.centroid_object_queue.add( + self._get_sec(msg), + centroid_2d_strategy_queue.Centroid2DStrategyQueue.BoundingBoxes( + msg.left, msg.right, msg.top, msg.bottom, + item=list(zip(msg.label_vec, msg.label_confidences)) + )) + if self.object_dtctn_filter.is_threshold(): self._add_detected_objects_above_threshold(msg) elif self.object_dtctn_filter.is_center(): @@ -409,17 +423,30 @@ def _get_last_n_observables(self, curr_time: int, n: int) -> str: :param n: The last n objects. :return: returns a string-ified list of the latest observables """ - observables = [] - while not self.detected_objects_queue.empty(): - next = self.detected_objects_queue.queue[0] - if next.time < curr_time: - observables.extend(next.entity) - self.detected_objects_queue.get() - else: - break + observables = set() + timestamped_detections = self.centroid_object_queue.get_n_before(timestamp=curr_time) + if timestamped_detections: + if self.debug_mode: + print(f"Timestamped detections based on centroid distance are:{timestamped_detections}") + # Recall that we passed in timestamped lists of pairs of (detections, confidence scores). 
+ top_k_obj_lists = [j for _, j in timestamped_detections] + for top_k_obj_list in top_k_obj_lists: + for centroid_dist_obj in top_k_obj_list: + centroid, obj_score = centroid_dist_obj + obj, score = obj_score + observables.add(obj) + return ", ".join(observables) + + # while not self.detected_objects_queue.empty(): + # next = self.detected_objects_queue.queue[0] + # if next.time < curr_time: + # observables.extend(next.entity) + # self.detected_objects_queue.get() + # else: + # break if not observables: return "nothing" - return ", ".join(set(observables[-n:])) + # return ", ".join(set(observables[-n:])) def get_response( self, diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 37414b958..5578f4591 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -81,7 +81,7 @@ windows: - sensor_input: layout: even-vertical panes: - - ros_bag_play: sleep 5; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + - ros_bag_play: sleep 2; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 # Old videos were recorded in NV12 #- image_converter: ros2 run angel_datahub ImageConverter --ros-args @@ -175,10 +175,10 @@ windows: - emotion_detection: layout: even-vertical panes: - - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p user_intent_topic:=expect_user_intent_topic - -p user_emotion_topic:=null_topic + # - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p user_intent_topic:=expect_user_intent_topic + # -p user_emotion_topic:=null_topic - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic From 7af13f7b5389c200ff41af1fa3a470587e48936e Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 
2023 16:05:41 -0400 Subject: [PATCH 14/46] Rename CENTER filter type to CENTROID --- .../angel_system_nodes/visual_question_answerer.py | 13 ++++++------- tmux/eval_visual_vocalized_question_answering.yml | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index d2eac6419..1b686cf98 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -97,13 +97,13 @@ class FilterType(Enum): """ THRESHOLD = 1 - CENTER = 2 + CENTROID = 2 def is_threshold(self): return self.value == VisualQuestionAnswerer.FilterType.THRESHOLD.value - def is_center(self): - return self.value == VisualQuestionAnswerer.FilterType.CENTER.value + def is_centroid(self): + return self.value == VisualQuestionAnswerer.FilterType.CENTROID.value class TimestampedEntity: """ @@ -149,7 +149,6 @@ def __init__(self): self.dialogue_history_length = param_values[PARAM_CONTEXT_HISTORY_LENGTH] self.debug_mode = False if param_values[PARAM_DEBUG_MODE]: - # langchain.debug = True self.debug_mode = True # Used to obtain the center perspective point and how far detected objects @@ -179,7 +178,7 @@ def __init__(self): param_values[PARAM_OBJECT_DETECTION_FILTER_STRATEGY].upper() ] if ( - self.object_dtctn_filter.is_center() + self.object_dtctn_filter.is_centroid() and self.pv_center_coordinate[0] is None ): raise ValueError( @@ -204,7 +203,7 @@ def __init__(self): self.centroid_object_queue = \ centroid_2d_strategy_queue.Centroid2DStrategyQueue( 5, self.pv_center_coordinate[0], self.pv_center_coordinate[1], - k=1) + k=3) # Configure the (necessary) emotional detection enriched utterance subscription. 
self.emotion_subscription = self.create_subscription( @@ -343,7 +342,7 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: if self.object_dtctn_filter.is_threshold(): self._add_detected_objects_above_threshold(msg) - elif self.object_dtctn_filter.is_center(): + elif self.object_dtctn_filter.is_centroid(): # TODO(derekahmed): Maybe these shouldn't be mutually exclusive? self._add_detected_object_closest_to_center(msg) else: diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 5578f4591..b887be445 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -195,7 +195,7 @@ windows: -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_filter:=CENTER + -p obj_det_filter:=CENTROID -p pv_width:=1280 -p pv_height:=720 -p debug_mode:=True \ No newline at end of file From 36e2f57de7984ee28e7db13b794631f4c9f3af5d Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 2023 16:08:06 -0400 Subject: [PATCH 15/46] Add live tmux config for Centroid Visual QA --- ...val_visual_vocalized_question_answering_live.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index 5e6aef527..d33673983 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -81,7 +81,6 @@ windows: - sensor_input: layout: even-vertical panes: - # - ros_bag_play: sleep 5; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 # Old videos were recorded in NV12 #- image_converter: ros2 run angel_datahub ImageConverter --ros-args @@ -106,7 +105,7 @@ windows: -p image_topic:=PVFramesBGR -p 
det_topic:=BerkeleyObjectDetections2d -p det_conf_threshold:=0.1 - -p cuda_device_id:=1 + -p cuda_device_id:=0 -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args -r __ns:=${ROS_NAMESPACE} @@ -115,7 +114,7 @@ windows: -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt -p inference_img_size:=1280 -p det_conf_threshold:=0.1 - -p cuda_device_id:=1 + -p cuda_device_id:=0 - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args -r __ns:=${ROS_NAMESPACE} @@ -131,7 +130,7 @@ windows: -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json - -p model_device:=cuda:1 + -p model_device:=cuda:0 -p model_dets_conv_version:=5 -p window_size:=30 -p buffer_max_size_seconds:=5 @@ -175,6 +174,10 @@ windows: - emotion_detection: layout: even-vertical panes: + # - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p user_intent_topic:=expect_user_intent_topic + # -p user_emotion_topic:=null_topic - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic @@ -191,7 +194,7 @@ windows: -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_filter:=CENTER + -p obj_det_filter:=CENTROID -p pv_width:=1280 -p pv_height:=720 -p debug_mode:=True \ No newline at end of file From 8e87f2f3d7a9d70a2332282f4e4e18a2e41f1d54 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 2023 16:17:30 -0400 Subject: [PATCH 16/46] 
Remove deprecated queueing code in visual question answering --- .../visual_question_answerer.py | 98 ++----------------- ...al_visual_vocalized_question_answering.yml | 5 +- ...sual_vocalized_question_answering_live.yml | 5 +- 3 files changed, 13 insertions(+), 95 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 1b686cf98..6c86ef5a3 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -88,22 +88,6 @@ class VisualQuestionAnswerer(Node): - class FilterType(Enum): - """ - The following determines which objects to surface in the prompt. - "threshold" selects objects with a confidence score above OBJECT_DETECTION_THRESHOLD. - "center" selects the object closest to the center of the user's field of view. Make sure to - also configure pv_width and pv_height if this is selected. - """ - - THRESHOLD = 1 - CENTROID = 2 - - def is_threshold(self): - return self.value == VisualQuestionAnswerer.FilterType.THRESHOLD.value - - def is_centroid(self): - return self.value == VisualQuestionAnswerer.FilterType.CENTROID.value class TimestampedEntity: """ @@ -127,12 +111,8 @@ def __init__(self): (IN_ACT_CLFN_TOPIC, ""), (PARAM_RECIPE_PATH,), (PARAM_PROMPT_TEMPLATE_PATH,), - (PARAM_IMAGE_WIDTH, -1), - (PARAM_IMAGE_HEIGHT, -1), - ( - PARAM_OBJECT_DETECTION_FILTER_STRATEGY, - VisualQuestionAnswerer.FilterType.THRESHOLD.name, - ), + (PARAM_IMAGE_WIDTH,), + (PARAM_IMAGE_HEIGHT,), (PARAM_OBJECT_LAST_N_OBJECTS, 10), (PARAM_OBJECT_DETECTION_THRESHOLD, 0.8), (PARAM_ACT_CLFN_THRESHOLD, 0.8), @@ -173,18 +153,6 @@ def __init__(self): f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" ) - # Configure supplemental input object detection criteria. 
- self.object_dtctn_filter = VisualQuestionAnswerer.FilterType[ - param_values[PARAM_OBJECT_DETECTION_FILTER_STRATEGY].upper() - ] - if ( - self.object_dtctn_filter.is_centroid() - and self.pv_center_coordinate[0] is None - ): - raise ValueError( - f"All {PARAM_OBJECT_DETECTION_FILTER_STRATEGY} and {PARAM_IMAGE_WIDTH} and {PARAM_IMAGE_HEIGHT} " - + "must be configured together." - ) self.object_dtctn_threshold = param_values[PARAM_OBJECT_DETECTION_THRESHOLD] self.object_dtctn_last_n_objects = param_values[PARAM_OBJECT_LAST_N_OBJECTS] @@ -196,15 +164,15 @@ def __init__(self): self.current_step = "Unstarted" self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() + self.centroid_object_queue = \ + centroid_2d_strategy_queue.Centroid2DStrategyQueue( + 8, self.pv_center_coordinate[0], self.pv_center_coordinate[1], + k=1) + self.dialogue_history = [] self.handler_thread = threading.Thread(target=self.process_question_queue) self.handler_thread.start() - self.centroid_object_queue = \ - centroid_2d_strategy_queue.Centroid2DStrategyQueue( - 5, self.pv_center_coordinate[0], self.pv_center_coordinate[1], - k=3) - # Configure the (necessary) emotional detection enriched utterance subscription. self.emotion_subscription = self.create_subscription( InterpretedAudioUserEmotion, @@ -340,45 +308,6 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: item=list(zip(msg.label_vec, msg.label_confidences)) )) - if self.object_dtctn_filter.is_threshold(): - self._add_detected_objects_above_threshold(msg) - elif self.object_dtctn_filter.is_centroid(): - # TODO(derekahmed): Maybe these shouldn't be mutually exclusive? 
- self._add_detected_object_closest_to_center(msg) - else: - raise ValueError( - "VisualQuestionAnswerer Node is misconfigured as " - + self.object_dtctn_filter.value - ) - - def _add_detected_object_closest_to_center(self, msg): - """ - Adds the object that is closest to the configured center coordinate of the user's view. - This center coordinate is indicated by self.pv_center_coordinate. - """ - most_center_obj = None - most_center_dist = max(self.pv_width, self.pv_height) - zipped = zip(msg.label_vec, msg.left, msg.right, msg.top, msg.bottom) - for obj, left, right, top, bottom in zipped: - width_center = left + int((right - left) / 2) - height_center = top + int((bottom - top) / 2) - curr_dist = distance.euclidean( - [width_center, height_center], self.pv_center_coordinate - ) - if curr_dist < most_center_dist: - most_center_obj = obj - most_center_dist = curr_dist - if most_center_obj: - # if self.debug_mode: - # self.log.info( - # f"Added {most_center_obj} to detected objects queue." - # + f"Object is {most_center_dist} away from the center." - # ) - te = VisualQuestionAnswerer.TimestampedEntity( - self._get_sec(msg), set([most_center_obj]) - ) - self.detected_objects_queue.put(te) - def _add_detected_objects_above_threshold(self, msg): """ Queuse all objects above a configure threshold. @@ -423,6 +352,7 @@ def _get_last_n_observables(self, curr_time: int, n: int) -> str: :return: returns a string-ified list of the latest observables """ observables = set() + # handle 2D centroid distance queueing. 
timestamped_detections = self.centroid_object_queue.get_n_before(timestamp=curr_time) if timestamped_detections: if self.debug_mode: @@ -435,17 +365,7 @@ def _get_last_n_observables(self, curr_time: int, n: int) -> str: obj, score = obj_score observables.add(obj) return ", ".join(observables) - - # while not self.detected_objects_queue.empty(): - # next = self.detected_objects_queue.queue[0] - # if next.time < curr_time: - # observables.extend(next.entity) - # self.detected_objects_queue.get() - # else: - # break - if not observables: - return "nothing" - # return ", ".join(set(observables[-n:])) + def get_response( self, diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index b887be445..121a3db04 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -196,6 +196,5 @@ windows: -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt -p obj_det_filter:=CENTROID - -p pv_width:=1280 - -p pv_height:=720 - -p debug_mode:=True \ No newline at end of file + -p pv_width:=1920 + -p pv_height:=1080 \ No newline at end of file diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index d33673983..a7dcf953e 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -195,6 +195,5 @@ windows: -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt -p obj_det_filter:=CENTROID - -p pv_width:=1280 - -p pv_height:=720 - -p debug_mode:=True \ No newline at end of file + -p pv_width:=1920 + -p pv_height:=1080 \ No newline at end of file From 6af4b234eed8db5c03254178877aca18aa55775c Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 
30 Oct 2023 16:20:29 -0400 Subject: [PATCH 17/46] Change tmuxinator configs for visual question answering to be debug_mode by default --- tmux/eval_visual_vocalized_question_answering.yml | 3 ++- tmux/eval_visual_vocalized_question_answering_live.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 121a3db04..db2b26f2d 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -197,4 +197,5 @@ windows: -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt -p obj_det_filter:=CENTROID -p pv_width:=1920 - -p pv_height:=1080 \ No newline at end of file + -p pv_height:=1080 + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index a7dcf953e..b97d7eaf6 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -196,4 +196,5 @@ windows: -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt -p obj_det_filter:=CENTROID -p pv_width:=1920 - -p pv_height:=1080 \ No newline at end of file + -p pv_height:=1080 + -p debug_mode:=True \ No newline at end of file From ac83c226cf042f81f6330430716ff4b6993389fd Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 2023 16:57:36 -0400 Subject: [PATCH 18/46] Fix live object detectio overlay tmux configuration --- .../eval_visual_vocalized_question_answering_live.yml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index b97d7eaf6..cbaf36a39 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ 
-107,19 +107,10 @@ windows: -p det_conf_threshold:=0.1 -p cuda_device_id:=0 -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args - -r __ns:=${ROS_NAMESPACE} - -p image_topic:=PVFramesRGB - -p det_topic:=ObjectDetections2d - -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt - -p inference_img_size:=1280 - -p det_conf_threshold:=0.1 - -p cuda_device_id:=0 - - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args -r __ns:=${ROS_NAMESPACE} -p topic_input_images:=PVFramesBGR - -p topic_input_det_2d:=ObjectDetections2d + -p topic_input_det_2d:=BerkeleyObjectDetections2d -p topic_output_images:=pv_image_detections_2d -p filter_top_k:=-1 - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args From c9de08cc0b58772bc84bb6d11b2ca54e4001e9f7 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Mon, 30 Oct 2023 17:06:14 -0400 Subject: [PATCH 19/46] Add tmux config fixes --- .../centroid_2d_strategy_queue.py | 2 + .../base_emotion_detector.py | 7 +- ...sual_vocalized_question_answering_live.yml | 74 ++++++++++--------- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py index 976c02439..dfc75108d 100644 --- a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py @@ -109,6 +109,8 @@ def _get_k_most_center_objects(self, bb: BoundingBoxes) -> List[Any]: # Return the top k centered objects based on centroid distance. 
result = [] for _ in range(self.k): + if not k_most_centered_objects: + break result.append(heapq.heappop(k_most_centered_objects)) return result diff --git a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py index 5d5d2a07a..7733b3395 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py @@ -107,7 +107,7 @@ def process_message_queue(self): Constant loop to process received messages. """ while True: - msg = self.message_queue.get(block=True, timeout=None) + msg = self.message_queue.get() self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') classification, confidence_score = self.get_inference(msg) self.publish_detected_emotion( @@ -144,7 +144,10 @@ def _apply_filter(self, msg): # return msg # else: # return None - return msg + + if "hey angel" in msg.utterance_text.lower(): + return msg + return None def main(): diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index cbaf36a39..db3f45c40 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -61,32 +61,26 @@ tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf # attach: false windows: - - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args - -r __ns:=${ROS_NAMESPACE} - -p ROS_IP:=0.0.0.0 - - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args - -r __ns:=${ROS_NAMESPACE} - -p ip_addr:=${HL2_IP} - -p image_topic:=PVFramesBGR - -p image_ts_topic:=disable - -p hand_pose_topic:=disable - -p audio_topic:=HeadsetAudioData - -p head_pose_topic:=HeadsetPoseData - -p sm_topic:=disable - -p rm_depth_AHAT:=disable - -p pv_width:=760 - -p pv_height:=428 - -p pv_framerate:=30 - -p sm_freq:=5 - sensor_input: layout: even-vertical panes: - 
- # Old videos were recorded in NV12 - #- image_converter: ros2 run angel_datahub ImageConverter --ros-args - # -r __ns:=${ROS_NAMESPACE} - # -p topic_input_images:=PVFramesNV12 - # -p topic_output_images:=PVFramesRGB + - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ROS_IP:=0.0.0.0 + - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ip_addr:=${HL2_IP} + -p image_topic:=PVFramesBGR + -p image_ts_topic:=PVFramesBGR_TS + -p hand_pose_topic:=disable + -p audio_topic:=HeadsetAudioData + -p sm_topic:=disable + -p head_pose_topic:=disable + -p pv_width:=1280 + -p pv_height:=720 + -p pv_framerate:=30 + -p sm_freq:=5 + -p rm_depth_AHAT:=disable - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args -r __ns:=${ROS_NAMESPACE} @@ -97,26 +91,34 @@ windows: - rqt_rgb_images: rqt -s rqt_image_view/ImageView --args ${ROS_NAMESPACE}/PVFramesBGR --ros-args -p _image_transport:=raw + - object_detector: layout: even-vertical panes: - - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + # - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p det_conf_threshold:=0.1 + # -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args -r __ns:=${ROS_NAMESPACE} -p image_topic:=PVFramesBGR - -p det_topic:=BerkeleyObjectDetections2d + -p det_topic:=ObjectDetections2d + -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + -p inference_img_size:=1280 -p det_conf_threshold:=0.1 -p cuda_device_id:=0 - -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - 
simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args -r __ns:=${ROS_NAMESPACE} -p topic_input_images:=PVFramesBGR - -p topic_input_det_2d:=BerkeleyObjectDetections2d + -p topic_input_det_2d:=ObjectDetections2d -p topic_output_images:=pv_image_detections_2d -p filter_top_k:=-1 - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args -r __ns:=${ROS_NAMESPACE} -p image_ts_topic:=PVFramesBGR_TS - -p det_topic:=BerkeleyObjectDetections2d + -p det_topic:=ObjectDetections2d -p act_topic:=ActivityDetections -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt @@ -134,6 +136,14 @@ windows: -p task_error_topic:=TaskErrors -p query_task_graph_topic:=query_task_graph -p sys_cmd_topic:=SystemCommands + - feedback_generator: ros2 run angel_system_nodes feedback_generator --ros-args + -r __ns:=${ROS_NAMESPACE} + -p activity_detector_topic:=ActivityDetections + -p object_detection_topic:=ObjectDetections3d + -p task_monitor_topic:=TaskUpdates + -p arui_update_topic:=AruiUpdates + -p interp_user_intent_topic:=InterpUserIntents + - vocal: layout: even-vertical panes: @@ -165,10 +175,6 @@ windows: - emotion_detection: layout: even-vertical panes: - # - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args - # -r __ns:=${ROS_NAMESPACE} - # -p user_intent_topic:=expect_user_intent_topic - # -p user_emotion_topic:=null_topic - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic @@ -180,7 +186,7 @@ windows: -r __ns:=${ROS_NAMESPACE} -p user_emotion_topic:=gpt_emotion_topic -p task_state_topic:=task_state_topic - -p object_detections_topic:=BerkeleyObjectDetections2d + -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections -p 
system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json @@ -188,4 +194,4 @@ windows: -p obj_det_filter:=CENTROID -p pv_width:=1920 -p pv_height:=1080 - -p debug_mode:=True \ No newline at end of file + -p debug_mode:=True From b4934f267b4abaf553d00ad9f7dec71ccc960072 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Tue, 31 Oct 2023 00:30:46 -0400 Subject: [PATCH 20/46] Fix emotion detector code and remove filtering --- .../base_emotion_detector.py | 22 ++----------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py index 7733b3395..86401f618 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py @@ -48,7 +48,7 @@ def __init__(self): self.uintent_subscription = self.create_subscription( InterpretedAudioUserIntent, self._in_uintent_topic, - self.intent_detection_callback, + self.emotion_detection_callback, 1, ) self._interp_emo_publisher = self.create_publisher( @@ -92,14 +92,12 @@ def get_inference(self, msg): """ return self._get_vader_sentiment_analysis(msg.utterance_text) - def intent_detection_callback(self, msg): + def emotion_detection_callback(self, msg): """ This is the main ROS node listener callback loop that will process all messages received via subscribed topics. """ self.log.debug(f'Received message:\n\n"{msg.utterance_text}"') - if not self._apply_filter(msg): - return self.message_queue.put(msg) def process_message_queue(self): @@ -134,22 +132,6 @@ def publish_detected_emotion( + f'to {self._out_interp_uemotion_topic} for:\n>>> "{colored_utterance}"' ) - def _apply_filter(self, msg): - """ - Abstracts away any filtering to apply on received messages. Return - none if the message should be filtered out. 
Else, return the incoming - msg if it can be included. - """ - # if msg.user_intent.lower() == "user inquiry": - # return msg - # else: - # return None - - if "hey angel" in msg.utterance_text.lower(): - return msg - return None - - def main(): rclpy.init() emotion_detector = BaseEmotionDetector() From 3cd0ffe47ae3c3ebb61f6e3f8501c79dd70f6feb Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Tue, 31 Oct 2023 00:31:12 -0400 Subject: [PATCH 21/46] Add optional target phrase filtering --- .../visual_question_answerer.py | 16 +++++++++++--- ...al_visual_vocalized_question_answering.yml | 21 ++++++++++--------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 6c86ef5a3..369b6226a 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -37,6 +37,11 @@ # Below is/are the published topic(s). OUT_QA_TOPIC = "system_text_response_topic" +# Below is used to filter out incoming questions. Toggle this parameter to True if questions +# are only responded to if they contain the TARGET_PHRASE. +PARAM_MUST_CONTAIN_TARGET_PHRASE = "must_contain_target_phrase" +TARGET_PHRASE = "hey angel" + # Below configures the filtering strategy for detected objects. It should correspond to # VisualQuestionAnswerer.FilterType. 
PARAM_OBJECT_DETECTION_FILTER_STRATEGY = "obj_det_filter" @@ -119,6 +124,7 @@ def __init__(self): (OUT_QA_TOPIC,), (PARAM_CONTEXT_HISTORY_LENGTH, 3), (PARAM_DEBUG_MODE, False), + (PARAM_MUST_CONTAIN_TARGET_PHRASE, False), ], ) self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] @@ -131,6 +137,8 @@ def __init__(self): if param_values[PARAM_DEBUG_MODE]: self.debug_mode = True + self.param_must_contain_target_phrase = param_values[PARAM_MUST_CONTAIN_TARGET_PHRASE] + # Used to obtain the center perspective point and how far detected objects # are from it. self.pv_width = param_values[PARAM_IMAGE_WIDTH] @@ -470,10 +478,12 @@ def publish_generated_response(self, utterance: str, response: str): def _apply_filter(self, msg): """ Abstracts away any filtering to apply on received messages. Return - none if the message should be filtered out. Else, return the incoming - msg if it can be included. + a boolean value indicating if the message passes a filter and should be processed. """ - return msg + if self.param_must_contain_target_phrase: + return TARGET_PHRASE in msg.utterance_text.lower() + else: + return True def main(): diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index db2b26f2d..397f2c51f 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -101,16 +101,16 @@ windows: - object_detector: layout: even-vertical panes: - - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p image_topic:=PVFramesBGR - -p det_topic:=BerkeleyObjectDetections2d - -p det_conf_threshold:=0.1 - -p cuda_device_id:=1 - -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p 
image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p det_conf_threshold:=0.1 + # -p cuda_device_id:=1 + # -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args -r __ns:=${ROS_NAMESPACE} - -p image_topic:=PVFramesRGB + -p image_topic:=PVFramesBGR -p det_topic:=ObjectDetections2d -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt -p inference_img_size:=1280 @@ -126,7 +126,7 @@ windows: - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args -r __ns:=${ROS_NAMESPACE} -p image_ts_topic:=PVFramesBGR_TS - -p det_topic:=BerkeleyObjectDetections2d + -p det_topic:=ObjectDetections2d -p act_topic:=ActivityDetections -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt @@ -190,7 +190,7 @@ windows: -r __ns:=${ROS_NAMESPACE} -p user_emotion_topic:=gpt_emotion_topic -p task_state_topic:=task_state_topic - -p object_detections_topic:=BerkeleyObjectDetections2d + -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json @@ -198,4 +198,5 @@ windows: -p obj_det_filter:=CENTROID -p pv_width:=1920 -p pv_height:=1080 + -p must_contain_target_phrase:=False -p debug_mode:=True \ No newline at end of file From d2fd5952e6dd16487498893bfcbb5ef7319baef8 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Tue, 31 Oct 2023 00:32:57 -0400 Subject: [PATCH 22/46] Remove dialogue history TODO --- .../angel_system_nodes/visual_question_answerer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 
369b6226a..1f224488a 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -267,7 +267,6 @@ def _configure_langchain(self): temperature=0.0, max_tokens=64, ) - # TODO (derekahmed) Figure out how to include optional dialogue history zero_shot_prompt = langchain.PromptTemplate( input_variables=PROMPT_VARIABLES, template=self.prompt_template, From 33cdfe3816d4c03a733c568c9f7df60245c473cd Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Tue, 31 Oct 2023 00:44:51 -0400 Subject: [PATCH 23/46] Change emotion detector to base to reduce GPT queries --- tmux/eval_visual_vocalized_question_answering.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 397f2c51f..27d9594de 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -175,11 +175,7 @@ windows: - emotion_detection: layout: even-vertical panes: - # - gpt_exp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args - # -r __ns:=${ROS_NAMESPACE} - # -p user_intent_topic:=expect_user_intent_topic - # -p user_emotion_topic:=null_topic - - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + - gpt_interp_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic -p user_emotion_topic:=gpt_emotion_topic From 42af572d4ad75f15059febd0e52ad5e5db7ba6dd Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Tue, 31 Oct 2023 01:53:37 -0400 Subject: [PATCH 24/46] Refactor bounding boxes and centroid 2d strategy queueing --- angel_system/data/common/bounding_boxes.py | 17 +++++ .../centroid_2d_strategy_queue.py | 22 ++----- .../centroid_2d_strategy_queue_test.py | 62 +++++++++---------- 
3 files changed, 52 insertions(+), 49 deletions(-) create mode 100644 angel_system/data/common/bounding_boxes.py diff --git a/angel_system/data/common/bounding_boxes.py b/angel_system/data/common/bounding_boxes.py new file mode 100644 index 000000000..207f5962c --- /dev/null +++ b/angel_system/data/common/bounding_boxes.py @@ -0,0 +1,17 @@ +from typing import * + +class BoundingBoxes: + + def __init__(self, left: List[int], right: List[int], top: List[int], bottom: List[int], + item: List[Any]): + """ + Wrapper of bounding boxes and a contained entity corresponding to each bounding box. + The item is intentionally kept ambiguous to provide flexibility (e.g. can pass in + an object label that corresponds to each bounding box or a tuple of an object label and + its confidence score). + """ + self.left = left + self.right = right + self.top = top + self.bottom = bottom + self.item = item \ No newline at end of file diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py index dfc75108d..b9a2a94ef 100644 --- a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py @@ -4,6 +4,8 @@ import threading from typing import * +from angel_system.data.common.bounding_boxes import BoundingBoxes + LOG = logging.getLogger(__name__) @@ -23,22 +25,6 @@ class Centroid2DStrategyQueue: q.get_n_before(2) """ - class BoundingBoxes: - - def __init__(self, left: List[int], right: List[int], top: List[int], bottom: List[int], - item: List[Any]): - """ - Wrapper of bounding boxes and a contained entity corresponding to each bounding box. - The item is intentionally kept ambiguous to provide flexibility (e.g. can pass in - an object label that corresponds to each bounding box or a tuple of an object label and - its confidence score). 
- """ - self.left = left - self.right = right - self.top = top - self.bottom = bottom - self.item = item - def __init__(self, n : int, center_x: int, center_y: int, k: int = 1, log_func: Optional[Callable[..., None]] = None): """ @@ -65,9 +51,9 @@ def __init__(self, n : int, center_x: int, center_y: int, def get_queue(self): return self.pq - def add(self, timestamp: int, item: BoundingBoxes): + def add(self, timestamp: int, bounding_boxed_item: BoundingBoxes): self.lock.acquire() - k_most_centered_objects = self._get_k_most_center_objects(item) + k_most_centered_objects = self._get_k_most_center_objects(bounding_boxed_item) heapq.heappush(self.pq, (timestamp, k_most_centered_objects)) self.lock.release() diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py index d35ec0a9e..b33ad3699 100644 --- a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py @@ -1,7 +1,7 @@ import unittest from centroid_2d_strategy_queue import Centroid2DStrategyQueue - +from angel_system.data.common.bounding_boxes import BoundingBoxes RESOLUTION_W = 1920 @@ -18,7 +18,7 @@ def test_queue_n3_k1_insertion(self): # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. - first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + first_objects_detected = BoundingBoxes( [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], @@ -28,7 +28,7 @@ def test_queue_n3_k1_insertion(self): # Ball is in top left of the screen. The butterfly is bottom right # of this ball. The cat is in the middle of the screen. 
- second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + second_objects_detected = BoundingBoxes( [1, 2, RESOLUTION_W // 2], [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], @@ -38,16 +38,16 @@ def test_queue_n3_k1_insertion(self): # Shoes is in bottom right of the screen. The pencil is in the top left # of the screen. The child is in the top left of the screen. - third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + third_objects_detected = BoundingBoxes( [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], [RESOLUTION_H, 4 , 2], ['shoes', 'pencil', 'child'] ) - q.add(timestamp=1, item=first_objects_detected) - q.add(timestamp=2, item=second_objects_detected) - q.add(timestamp=3, item=third_objects_detected) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) queue_state = q.get_queue() first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ @@ -68,7 +68,7 @@ def test_queue_n3_k1_insertion_with_confidence_scores(self): # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. - first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + first_objects_detected = BoundingBoxes( [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], @@ -78,7 +78,7 @@ def test_queue_n3_k1_insertion_with_confidence_scores(self): # Ball is in top left of the screen. The butterfly is bottom right # of this ball. The cat is in the middle of the screen. 
- second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + second_objects_detected = BoundingBoxes( [1, 2, RESOLUTION_W // 2], [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], @@ -88,16 +88,16 @@ def test_queue_n3_k1_insertion_with_confidence_scores(self): # Shoes is in bottom right of the screen. The pencil is in the top left # of the screen. The child is in the top left of the screen. - third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + third_objects_detected = BoundingBoxes( [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], [RESOLUTION_H, 4 , 2], [('shoes', 0.9), ('pencil', 0.3), ('child', 0.5)] ) - q.add(timestamp=1, item=first_objects_detected) - q.add(timestamp=2, item=second_objects_detected) - q.add(timestamp=3, item=third_objects_detected) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) queue_state = q.get_queue() first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ @@ -122,7 +122,7 @@ def test_queue_n3_k2_insertion(self): # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. - first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + first_objects_detected = BoundingBoxes( [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], @@ -132,7 +132,7 @@ def test_queue_n3_k2_insertion(self): # Ball is in top left of the screen. The butterfly is bottom right # of this ball. The cat is in the middle of the screen. 
- second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + second_objects_detected = BoundingBoxes( [1, 2, RESOLUTION_W // 2], [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], @@ -142,16 +142,16 @@ def test_queue_n3_k2_insertion(self): # Shoes is in bottom right of the screen. The pencil is in the top left # of the screen. The child is in the top left of the screen. - third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + third_objects_detected = BoundingBoxes( [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], [RESOLUTION_H, 4 , 2], ['shoes', 'pencil', 'child'] ) - q.add(timestamp=1, item=first_objects_detected) - q.add(timestamp=2, item=second_objects_detected) - q.add(timestamp=3, item=third_objects_detected) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) queue_state = q.get_queue() first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ @@ -178,7 +178,7 @@ def test_queue_n3_k2_removal(self): # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. - first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + first_objects_detected = BoundingBoxes( [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], @@ -188,7 +188,7 @@ def test_queue_n3_k2_removal(self): # Ball is in top left of the screen. The butterfly is bottom right # of this ball. The cat is in the middle of the screen. - second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + second_objects_detected = BoundingBoxes( [1, 2, RESOLUTION_W // 2], [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], @@ -198,16 +198,16 @@ def test_queue_n3_k2_removal(self): # Shoes is in bottom right of the screen. 
The pencil is in the top left # of the screen. The child is in the top left of the screen. - third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + third_objects_detected = BoundingBoxes( [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], [RESOLUTION_H, 4 , 2], ['shoes', 'pencil', 'child'] ) - q.add(timestamp=1, item=first_objects_detected) - q.add(timestamp=2, item=second_objects_detected) - q.add(timestamp=3, item=third_objects_detected) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) no_items = q.get_n_before(timestamp=1) self.assertEqual([], no_items) @@ -228,7 +228,7 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. - first_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + first_objects_detected = BoundingBoxes( [1, RESOLUTION_W * 3 // 4, RESOLUTION_W // 2], [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], @@ -238,7 +238,7 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): # Ball is in top left of the screen. The butterfly is bottom right # of this ball. The cat is in the middle of the screen. - second_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + second_objects_detected = BoundingBoxes( [1, 2, RESOLUTION_W // 2], [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], @@ -248,16 +248,16 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): # Shoes is in bottom right of the screen. The pencil is in the top left # of the screen. The child is in the top left of the screen. 
- third_objects_detected = Centroid2DStrategyQueue.BoundingBoxes( + third_objects_detected = BoundingBoxes( [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], [RESOLUTION_H, 4 , 2], [('shoes', 0.9), ('pencil', 0.3), ('child', 0.5)] ) - q.add(timestamp=1, item=first_objects_detected) - q.add(timestamp=2, item=second_objects_detected) - q.add(timestamp=3, item=third_objects_detected) + q.add(timestamp=1, bounding_boxed_item=first_objects_detected) + q.add(timestamp=2, bounding_boxed_item=second_objects_detected) + q.add(timestamp=3, bounding_boxed_item=third_objects_detected) no_items = q.get_n_before(timestamp=1) self.assertEqual([], no_items) From 4b35a7d23c8fc9986b161d360f515fb37a9e83b1 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Tue, 31 Oct 2023 01:54:25 -0400 Subject: [PATCH 25/46] Integrate centered and surrounding observables together in visual question answering --- .../visual_question_answerer.py | 95 +++++++++++++------ .../configs/llm_prompts/vis_qa_teacher_prompt | 4 +- ...al_visual_vocalized_question_answering.yml | 33 ++++--- 3 files changed, 85 insertions(+), 47 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 1f224488a..00749b5df 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -10,7 +10,6 @@ import queue import rclpy from rclpy.node import Node -from scipy.spatial import distance from termcolor import colored import threading from typing import * @@ -23,6 +22,7 @@ TaskUpdate, ) from angel_utils import declare_and_get_parameters +from angel_system.data.common import bounding_boxes from angel_system.utils.object_detection_queues import centroid_2d_strategy_queue openai.organization = os.getenv("OPENAI_ORG_ID") @@ -48,7 +48,7 @@ # Below indicates how many of the last n detected objects 
should be surfaced # in the LLM prompt. These objects do NOT have to be unique. -PARAM_OBJECT_LAST_N_OBJECTS = "obj_det_last_n" +PARAM_OBJECT_LAST_N_OBJ_DETECTIONS = "obj_det_last_n" # Below are the corresponding model thresholds. PARAM_OBJECT_DETECTION_THRESHOLD = "object_det_threshold" @@ -87,7 +87,8 @@ "current_step", "emotion", "action", - "observables", + "centered_observables", + "all_observables", "question", ] @@ -118,7 +119,7 @@ def __init__(self): (PARAM_PROMPT_TEMPLATE_PATH,), (PARAM_IMAGE_WIDTH,), (PARAM_IMAGE_HEIGHT,), - (PARAM_OBJECT_LAST_N_OBJECTS, 10), + (PARAM_OBJECT_LAST_N_OBJ_DETECTIONS, 5), (PARAM_OBJECT_DETECTION_THRESHOLD, 0.8), (PARAM_ACT_CLFN_THRESHOLD, 0.8), (OUT_QA_TOPIC,), @@ -162,7 +163,7 @@ def __init__(self): ) self.object_dtctn_threshold = param_values[PARAM_OBJECT_DETECTION_THRESHOLD] - self.object_dtctn_last_n_objects = param_values[PARAM_OBJECT_LAST_N_OBJECTS] + self.object_dtctn_last_n_obj_detections = param_values[PARAM_OBJECT_LAST_N_OBJ_DETECTIONS] # Configure supplemental input action classification criteria. self.action_clfn_threshold = param_values[PARAM_ACT_CLFN_THRESHOLD] @@ -174,8 +175,10 @@ def __init__(self): self.detected_objects_queue = queue.Queue() self.centroid_object_queue = \ centroid_2d_strategy_queue.Centroid2DStrategyQueue( - 8, self.pv_center_coordinate[0], self.pv_center_coordinate[1], - k=1) + self.object_dtctn_last_n_obj_detections, + self.pv_center_coordinate[0], self.pv_center_coordinate[1], + k=1, # the number of top-k objects to obtain from each detection. + ) self.dialogue_history = [] self.handler_thread = threading.Thread(target=self.process_question_queue) @@ -307,17 +310,22 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: """ Stores all detected objects with a confidence score above IN_OBJECT_DETECTION_THRESHOLD. """ - # We will queue timestamped lists of pairs of (detections, confidence scores). 
+ # We queue timestamped lists of pairs of (detections, confidence scores) for centered + # objects based on centroid distance from the middle. self.centroid_object_queue.add( self._get_sec(msg), - centroid_2d_strategy_queue.Centroid2DStrategyQueue.BoundingBoxes( + bounding_boxes.BoundingBoxes( msg.left, msg.right, msg.top, msg.bottom, item=list(zip(msg.label_vec, msg.label_confidences)) )) + + # We queue ALL objects above threshold, regardless if they are centered in the user's + # perspective. + self._add_detected_objects_above_threshold(msg) def _add_detected_objects_above_threshold(self, msg): """ - Queuse all objects above a configure threshold. + Queuse all objects above a configured threshold. """ detected_objs = set() for obj, score in zip(msg.label_vec, msg.label_confidences): @@ -348,12 +356,10 @@ def _get_latest_action(self, curr_time: int) -> str: break return latest_action - def _get_last_n_observables(self, curr_time: int, n: int) -> str: + def _get_latest_centered_observables(self, curr_time: int) -> str: """ - Returns a comma-delimited list of observed objects per all + Returns a comma-delimited list of "centered" objects per all entities in self.detected_objects_queue that occurred before a provided time. - - :param curr_time: The time for which objects must have been detected before. :param n: The last n objects. :return: returns a string-ified list of the latest observables @@ -363,16 +369,39 @@ def _get_last_n_observables(self, curr_time: int, n: int) -> str: timestamped_detections = self.centroid_object_queue.get_n_before(timestamp=curr_time) if timestamped_detections: if self.debug_mode: - print(f"Timestamped detections based on centroid distance are:{timestamped_detections}") - # Recall that we passed in timestamped lists of pairs of (detections, confidence scores). 
- top_k_obj_lists = [j for _, j in timestamped_detections] - for top_k_obj_list in top_k_obj_lists: - for centroid_dist_obj in top_k_obj_list: - centroid, obj_score = centroid_dist_obj + print(f"Timestamped detections based on centroid distance are: " +\ + f"{timestamped_detections}") + # Recall that we passed in timestamped lists of pairs of + # (detection, confidence score). + centered_obj_detections_lists = [j for _, j in timestamped_detections] + for centered_obj_detections in centered_obj_detections_lists: + for centered_obj_detection in centered_obj_detections: + centroid, obj_score = centered_obj_detection obj, score = obj_score observables.add(obj) return ", ".join(observables) + def _get_latest_observables(self, curr_time: int, n: int) -> str: + """ + Returns a comma-delimited list of all observed objects per all + entities in self.detected_objects_queue that occurred before a provided time. + Only refers to the latest n detections. + :param curr_time: The time for which objects must have been detected before. + :param n: The last n objects. 
+ :return: returns a string-ified list of the latest observables + """ + detections = [] + while not self.detected_objects_queue.empty(): + next_detections = self.detected_objects_queue.queue[0] + if next_detections.time < curr_time: + detections.append(self.detected_objects_queue.get()) + else: + break + observables = set() + for detection in detections[-n:]: + for obj in detection.entity: + observables.add(obj) + return ", ".join(observables) def get_response( self, @@ -380,7 +409,8 @@ def get_response( chat_history: str, current_step: str, action: str, - observables: str, + centered_observables: str, + all_observables: str, ): """ Generate a response to the utterance, enriched with the addition of @@ -395,7 +425,8 @@ def get_response( chat_history=chat_history, current_step=current_step, action=action, - observables=observables, + centered_observables=centered_observables, + all_observables=all_observables, emotion=msg.user_emotion, question=msg.utterance_text, ) @@ -405,7 +436,8 @@ def get_response( chat_history=chat_history, current_step=current_step, action=action, - observables=observables, + centered_observables=centered_observables, + all_observables=all_observables, emotion=msg.user_emotion, question=msg.utterance_text, ).to_string() @@ -443,11 +475,15 @@ def process_question_queue(self): action = self._get_latest_action(start_time) self.log.info(f"Latest action: {action}") - # Get detected objects. - observables = self._get_last_n_observables( - start_time, self.object_dtctn_last_n_objects - ) - self.log.info(f"Observed objects: {observables}") + # Get centered detected objects. + centered_observables = \ + self._get_latest_centered_observables(start_time) + self.log.info(f"Observed objects: {centered_observables}") + + # Get all detected objects. + all_observables = self._get_latest_observables(start_time, + self.object_dtctn_last_n_obj_detections) + self.log.info(f"Observed objects: {all_observables}") # Generate response. 
response = self.get_response( @@ -455,7 +491,8 @@ def process_question_queue(self): self._get_dialogue_history(), self._get_current_step(), action, - observables, + centered_observables, + all_observables, ) self.publish_generated_response(question_msg.utterance_text, response) self._add_dialogue_history(question_msg.utterance_text, response) diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index 2eac012d6..6e8d6d5d8 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -12,7 +12,9 @@ My Emotion: {emotion}. My Current Action: {action}. -Objects In Front of Me: {observables}. +Objects In Front of Me: {centered_observables}. + +Objects Nearby: {all_observables} My Question: {question} diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_vocalized_question_answering.yml index 27d9594de..ebdbf40be 100644 --- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_vocalized_question_answering.yml @@ -101,22 +101,21 @@ windows: - object_detector: layout: even-vertical panes: - # - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args - # -r __ns:=${ROS_NAMESPACE} - # -p image_topic:=PVFramesBGR - # -p det_topic:=ObjectDetections2d - # -p det_conf_threshold:=0.1 - # -p cuda_device_id:=1 - # -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p image_topic:=PVFramesBGR -p det_topic:=ObjectDetections2d - -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt - -p inference_img_size:=1280 -p det_conf_threshold:=0.1 - -p 
cuda_device_id:=1 - + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args -r __ns:=${ROS_NAMESPACE} -p topic_input_images:=PVFramesBGR @@ -131,7 +130,7 @@ windows: -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json - -p model_device:=cuda:1 + -p model_device:=cuda:0 -p model_dets_conv_version:=5 -p window_size:=30 -p buffer_max_size_seconds:=5 @@ -167,7 +166,7 @@ windows: - intent_detection: layout: even-vertical panes: - - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p utterances_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic @@ -175,7 +174,7 @@ windows: - emotion_detection: layout: even-vertical panes: - - gpt_interp_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p user_intent_topic:=interp_user_intent_topic -p user_emotion_topic:=gpt_emotion_topic @@ -191,8 +190,8 @@ windows: -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p 
obj_det_filter:=CENTROID + -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 - -p must_contain_target_phrase:=False + -p must_contain_target_phrase:=True -p debug_mode:=True \ No newline at end of file From 96bba9143766089a092923c306ec1631141fb2e4 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Wed, 1 Nov 2023 21:26:54 -0400 Subject: [PATCH 26/46] Add general purpose dialogue utterance message --- ros/angel_msgs/CMakeLists.txt | 1 + ros/angel_msgs/msg/DialogueUtterance.msg | 24 +++++ .../Angel/msg/DialogueUtteranceMsg.cs | 99 +++++++++++++++++++ .../Angel/msg/DialogueUtteranceMsg.cs.meta | 11 +++ 4 files changed, 135 insertions(+) create mode 100644 ros/angel_msgs/msg/DialogueUtterance.msg create mode 100644 unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs create mode 100644 unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta diff --git a/ros/angel_msgs/CMakeLists.txt b/ros/angel_msgs/CMakeLists.txt index e5abe06e3..bd9890ce4 100644 --- a/ros/angel_msgs/CMakeLists.txt +++ b/ros/angel_msgs/CMakeLists.txt @@ -28,6 +28,7 @@ set( message_files msg/AruiObject3d.msg msg/AruiUpdate.msg msg/AruiUserNotification.msg + msg/DialogueUtterance.msg msg/EyeGazeData.msg msg/HandJointPose.msg msg/HandJointPosesUpdate.msg diff --git a/ros/angel_msgs/msg/DialogueUtterance.msg b/ros/angel_msgs/msg/DialogueUtterance.msg new file mode 100644 index 000000000..49d3122ee --- /dev/null +++ b/ros/angel_msgs/msg/DialogueUtterance.msg @@ -0,0 +1,24 @@ +# +# Dialogue Utterance with additional information about the environmental state +# and user model. +# + +# The header primarily encapsulates when this message was emitted. +# The time component of this may be utilized as an identifier for this user +# intent and utterance. +std_msgs/Header header + +# Speech-to-text of the user utterance we have interpreted +string utterance_text + +# Below are optional fields + +# Canonical user intent that has been interpreted. 
"Canonical" in this context +# is to mean that this string may be used as an identifier of this type of +# user intent. Should be in the range [0,1] where 1.0 means absolute confidence. +string intent +float64 intent_confidence_score + +# Emotion classification. Should be in the range [0,1] where 1.0 means absolute confidence. +string emotion +float64 emotion_confidence_score diff --git a/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs new file mode 100644 index 000000000..b1e86db02 --- /dev/null +++ b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs @@ -0,0 +1,99 @@ +//Do not edit! This file was generated by Unity-ROS MessageGeneration. +using System; +using System.Linq; +using System.Collections.Generic; +using System.Text; +using Unity.Robotics.ROSTCPConnector.MessageGeneration; + +namespace RosMessageTypes.Angel +{ + [Serializable] + public class DialogueUtteranceMsg : Message + { + public const string k_RosMessageName = "angel_msgs/DialogueUtterance"; + public override string RosMessageName => k_RosMessageName; + + // + // Dialogue Utterance with additional information about the environmental state + // and user model. + // + // The header primarily encapsulates when this message was emitted. + // The time component of this may be utilized as an identifier for this user + // intent and utterance. + public Std.HeaderMsg header; + // Speech-to-text of the user utterance we have interpreted + public string utterance_text; + // Below are optional fields + // Canonical user intent that has been interpreted. "Canonical" in this context + // is to mean that this string may be used as an identifier of this type of + // user intent. Should be in the range [0,1] where 1.0 means absolute confidence. + public string intent; + public double intent_confidence_score; + // Emotion classification. Should be in the range [0,1] where 1.0 means absolute confidence. 
+ public string emotion; + public double emotion_confidence_score; + + public DialogueUtteranceMsg() + { + this.header = new Std.HeaderMsg(); + this.utterance_text = ""; + this.intent = ""; + this.intent_confidence_score = 0.0; + this.emotion = ""; + this.emotion_confidence_score = 0.0; + } + + public DialogueUtteranceMsg(Std.HeaderMsg header, string utterance_text, string intent, double intent_confidence_score, string emotion, double emotion_confidence_score) + { + this.header = header; + this.utterance_text = utterance_text; + this.intent = intent; + this.intent_confidence_score = intent_confidence_score; + this.emotion = emotion; + this.emotion_confidence_score = emotion_confidence_score; + } + + public static DialogueUtteranceMsg Deserialize(MessageDeserializer deserializer) => new DialogueUtteranceMsg(deserializer); + + private DialogueUtteranceMsg(MessageDeserializer deserializer) + { + this.header = Std.HeaderMsg.Deserialize(deserializer); + deserializer.Read(out this.utterance_text); + deserializer.Read(out this.intent); + deserializer.Read(out this.intent_confidence_score); + deserializer.Read(out this.emotion); + deserializer.Read(out this.emotion_confidence_score); + } + + public override void SerializeTo(MessageSerializer serializer) + { + serializer.Write(this.header); + serializer.Write(this.utterance_text); + serializer.Write(this.intent); + serializer.Write(this.intent_confidence_score); + serializer.Write(this.emotion); + serializer.Write(this.emotion_confidence_score); + } + + public override string ToString() + { + return "DialogueUtteranceMsg: " + + "\nheader: " + header.ToString() + + "\nutterance_text: " + utterance_text.ToString() + + "\nintent: " + intent.ToString() + + "\nintent_confidence_score: " + intent_confidence_score.ToString() + + "\nemotion: " + emotion.ToString() + + "\nemotion_confidence_score: " + emotion_confidence_score.ToString(); + } + +#if UNITY_EDITOR + [UnityEditor.InitializeOnLoadMethod] +#else + 
[UnityEngine.RuntimeInitializeOnLoadMethod] +#endif + public static void Register() + { + MessageRegistry.Register(k_RosMessageName, Deserialize); + } + } +} diff --git a/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta new file mode 100644 index 000000000..cfee2a66a --- /dev/null +++ b/unity/ARUI/Assets/RosMessages/Angel/msg/DialogueUtteranceMsg.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 244f6af8d6d7e4c18a6e2d52b444d387 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: From b6709a0881a93a703c2af53f5ccd61594b27b4ea Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Wed, 1 Nov 2023 23:47:55 -0400 Subject: [PATCH 27/46] Refactor Dialogue System Nodes to inherit from BaseDialogueSystemNode and pub/sub DialogueUtterance --- .../angel_system_nodes/audio/asr.py | 20 +- .../base_dialogue_system_node.py | 62 ++++++ .../base_emotion_detector.py | 48 ++--- .../base_intent_detector.py | 59 +++--- .../gpt_emotion_detector.py | 3 +- .../angel_system_nodes/gpt_intent_detector.py | 5 +- .../angel_system_nodes/question_answerer.py | 46 ++-- .../visual_question_answerer.py | 50 ++--- ...al_visual_barebones_question_answering.yml | 196 ++++++++++++++++++ ...> eval_visual_full_question_answering.yml} | 11 +- ...sual_vocalized_question_answering_live.yml | 2 +- tmux/eval_vocalized_emotion_detection.yml | 94 +++++++++ tmux/eval_vocalized_intent_detection.yml | 8 +- ...al_vocalized_intent_emotion_detection.yml} | 17 +- tmux/eval_vocalized_question_answering.yml | 27 +-- 15 files changed, 489 insertions(+), 159 deletions(-) create mode 100644 ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py create mode 100644 tmux/eval_visual_barebones_question_answering.yml rename tmux/{eval_visual_vocalized_question_answering.yml => 
eval_visual_full_question_answering.yml} (95%) create mode 100644 tmux/eval_vocalized_emotion_detection.yml rename tmux/{eval_vocalized_emotional_detection.yml => eval_vocalized_intent_emotion_detection.yml} (81%) diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py index 5eb352e7b..e4c3598a9 100644 --- a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py +++ b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py @@ -11,7 +11,7 @@ from rclpy.node import Node import simpleaudio as sa -from angel_msgs.msg import HeadsetAudioData, Utterance +from angel_msgs.msg import HeadsetAudioData, DialogueUtterance AUDIO_TOPIC = "audio_topic" @@ -105,7 +105,7 @@ def __init__(self): self.subscription = self.create_subscription( HeadsetAudioData, self._audio_topic, self.listener_callback, 1 ) - self._publisher = self.create_publisher(Utterance, self._utterances_topic, 1) + self._publisher = self.create_publisher(DialogueUtterance, self._utterances_topic, 1) self.audio_stream = [] self.t = threading.Thread() @@ -204,15 +204,19 @@ def asr_server_request_thread(self, audio_data, num_channels, sample_rate): self.log.info("Complete ASR text is:\n" + f'"{response_text}"') if self._is_sentence_tokenize_mode: for sentence in sent_tokenize(response_text): - utterance_msg = Utterance() - utterance_msg.value = sentence + msg = DialogueUtterance() + msg.header.frame_id = "ASR" + msg.header.stamp = self.get_clock().now().to_msg() + msg.utterance_text = sentence self.log.info("Publishing message: " + f'"{sentence}"') - self._publisher.publish(utterance_msg) + self._publisher.publish(msg) else: - utterance_msg = Utterance() - utterance_msg.value = response_text + msg = DialogueUtterance() + msg.header.frame_id = "ASR" + msg.header.stamp = self.get_clock().now().to_msg() + msg.utterance_text = response_text self.log.info("Publishing message: " + f'"{response_text}"') - self._publisher.publish(utterance_msg) + 
self._publisher.publish(msg) def main(): diff --git a/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py b/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py new file mode 100644 index 000000000..b1da32783 --- /dev/null +++ b/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py @@ -0,0 +1,62 @@ +from abc import ABC +import rclpy +from rclpy.node import Node + +from angel_msgs.msg import DialogueUtterance + +class BaseDialogueSystemNode(Node): + """ + This class is used for all dialogue system nodes to inherit similar + functionality. + """ + def __init__(self): + super().__init__(self.__class__.__name__) + self.log = self.get_logger() + + def get_intent_or(self, src_msg: DialogueUtterance, or_value: str = "not available") -> str: + """ + Returns the src_msg intent classification information. If the value is absent, + the or_value is returned. + """ + return src_msg.intent if src_msg.intent else or_value + + def get_emotion_or(self, src_msg: DialogueUtterance, or_value: str = "not available") -> str: + """ + Returns the src_msg emotion classification information. If the value is absent, + the or_value is returned. + """ + return src_msg.emotion if src_msg.emotion else or_value + + def copy_dialogue_utterance(self, + src_msg: DialogueUtterance, + node_name: str = "Dialogue System Node" + ) -> DialogueUtterance: + msg = DialogueUtterance() + msg.header.frame_id = node_name + msg.utterance_text = src_msg.utterance_text + + # Assign new time for publication. + msg.header.stamp = self.get_clock().now().to_msg() + + # Copy over intent classification information if present. + if src_msg.intent: + msg.intent = src_msg.intent + msg.intent_confidence_score = src_msg.intent_confidence_score + + # Copy over emotion classification information if present. 
+ if src_msg.emotion: + msg.emotion = src_msg.emotion + msg.emotion_confidence_score = src_msg.emotion_confidence_score + + return msg + +def main(): + rclpy.init() + base_dialogue_node = BaseDialogueSystemNode() + rclpy.spin(base_dialogue_node) + base_dialogue_node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py index 86401f618..f123bafa8 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py @@ -5,10 +5,11 @@ import threading from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -from angel_msgs.msg import InterpretedAudioUserEmotion, InterpretedAudioUserIntent +from angel_msgs.msg import DialogueUtterance +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_utils import declare_and_get_parameters -IN_USER_INTENT_TOPIC = "user_intent_topic" +IN_TOPIC = "input_topic" OUT_INTERP_USER_EMOTION_TOPIC = "user_emotion_topic" # Currently supported emotions. This is tied with the emotions @@ -22,37 +23,36 @@ VADER_POSITIVE_COMPOUND_THRESHOLD = 0.05 -class BaseEmotionDetector(Node): +class BaseEmotionDetector(BaseDialogueSystemNode): """ This is the base emotion detection node that other emotion detection nodes should inherit from. """ def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = self.get_logger() # Handle parameterization. param_values = declare_and_get_parameters( self, [ - (IN_USER_INTENT_TOPIC,), + (IN_TOPIC,), (OUT_INTERP_USER_EMOTION_TOPIC,), ], ) - self._in_uintent_topic = param_values[IN_USER_INTENT_TOPIC] + self._input_topic = param_values[IN_TOPIC] self._out_interp_uemotion_topic = param_values[OUT_INTERP_USER_EMOTION_TOPIC] - # Handle subscription/publication topics. 
- self.uintent_subscription = self.create_subscription( - InterpretedAudioUserIntent, - self._in_uintent_topic, + self.subscription = self.create_subscription( + DialogueUtterance, + self._input_topic, self.emotion_detection_callback, 1, ) - self._interp_emo_publisher = self.create_publisher( - InterpretedAudioUserEmotion, self._out_interp_uemotion_topic, 1 + self.emotion_publication = self.create_publisher( + DialogueUtterance, self._out_interp_uemotion_topic, 1 ) self.message_queue = queue.Queue() @@ -85,7 +85,7 @@ def _get_vader_sentiment_analysis(self, utterance: str): ) return (classification, confidence) - def get_inference(self, msg): + def get_inference(self, msg: DialogueUtterance): """ Abstract away the different model inference calls depending on the node's configure model mode. @@ -109,24 +109,24 @@ def process_message_queue(self): self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') classification, confidence_score = self.get_inference(msg) self.publish_detected_emotion( - msg.utterance_text, classification, confidence_score + msg, classification, confidence_score ) def publish_detected_emotion( - self, utterance: str, classification: str, confidence_score: float + self, sub_msg: DialogueUtterance, classification: str, confidence_score: float ): """ Handles message publishing for an utterance with a detected emotion classification. """ - emotion_msg = InterpretedAudioUserEmotion() - emotion_msg.header.frame_id = "Emotion Detection" - emotion_msg.header.stamp = self.get_clock().now().to_msg() - emotion_msg.utterance_text = utterance - emotion_msg.user_emotion = classification - emotion_msg.confidence = confidence_score - self._interp_emo_publisher.publish(emotion_msg) - colored_utterance = colored(utterance, "light_blue") - colored_emotion = colored(classification, "light_green") + pub_msg = self.copy_dialogue_utterance(sub_msg, node_name="Emotion Detection") + # Overwrite the user emotion with the latest classification information. 
+ pub_msg.emotion = classification + pub_msg.emotion_confidence_score = confidence_score + self.emotion_publication.publish(pub_msg) + + # Log emotion detection information. + colored_utterance = colored(pub_msg.utterance_text, "light_blue") + colored_emotion = colored(pub_msg.emotion, "light_green") self.log.info( f'Publishing {{"{colored_emotion}": {confidence_score}}} ' + f'to {self._out_interp_uemotion_topic} for:\n>>> "{colored_utterance}"' diff --git a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py index c89d3e554..1790c6b56 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py @@ -4,7 +4,8 @@ from termcolor import colored import threading -from angel_msgs.msg import InterpretedAudioUserIntent, Utterance +from angel_msgs.msg import DialogueUtterance +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_utils import declare_and_get_parameters NEXT_STEP_KEYPHRASES = ["skip", "next", "next step"] @@ -18,38 +19,38 @@ # https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA . INTENT_LABELS = ["next_step", "prev_step", "inquiry", "other"] -UTTERANCES_TOPIC = "utterances_topic" +IN_TOPIC = "input_topic" PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" -class BaseIntentDetector(Node): +class BaseIntentDetector(BaseDialogueSystemNode): def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = self.get_logger() # Handle parameterization. 
param_values = declare_and_get_parameters( self, [ - (UTTERANCES_TOPIC,), + (IN_TOPIC,), (PARAM_EXPECT_USER_INTENT_TOPIC,), (PARAM_INTERP_USER_INTENT_TOPIC,), ], ) - self._utterances_topic = param_values[UTTERANCES_TOPIC] + self._input_topic = param_values[IN_TOPIC] self._expect_uintent_topic = param_values[PARAM_EXPECT_USER_INTENT_TOPIC] self._interp_uintent_topic = param_values[PARAM_INTERP_USER_INTENT_TOPIC] # Handle subscription/publication topics. self.subscription = self.create_subscription( - Utterance, self._utterances_topic, self.utterance_callback, 1 + DialogueUtterance, self._input_topic, self.utterance_callback, 1 ) self._expected_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._expect_uintent_topic, 1 + DialogueUtterance, self._expect_uintent_topic, 1 ) self._interp_publisher = self.create_publisher( - InterpretedAudioUserIntent, self._interp_uintent_topic, 1 + DialogueUtterance, self._interp_uintent_topic, 1 ) self.utterance_message_queue = queue.Queue() @@ -63,7 +64,7 @@ def utterance_callback(self, msg): This is the main ROS node listener callback loop that will process all messages received via subscribed topics. """ - self.log.debug(f'Received message:\n\n"{msg.value}"') + self.log.debug(f'Received message:\n\n"{msg.utterance_text}"') self.utterance_message_queue.put(msg) def process_utterance_message_queue(self): @@ -72,13 +73,13 @@ def process_utterance_message_queue(self): """ while True: msg = self.utterance_message_queue.get() - self.log.debug(f'Processing message:\n\n"{msg.value}"') + self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') intent, score = self.detect_intents(msg) if not intent: continue - self.publish_msg(msg.value, intent, score) + self.publish_msg(msg, intent, score) - def detect_intents(self, msg): + def detect_intents(self, msg: DialogueUtterance): """ Keyphrase search for intent detection. This implementation does simple string matching to assign a detected label. 
When multiple intents are @@ -98,7 +99,7 @@ def _tiebreak_intents(intents, confidences): ) return classification, score - lower_utterance = msg.value.lower() + lower_utterance = msg.utterance_text.lower() intents = [] confidences = [] if self._contains_phrase(lower_utterance, NEXT_STEP_KEYPHRASES): @@ -111,7 +112,7 @@ def _tiebreak_intents(intents, confidences): intents.append(INTENT_LABELS[2]) confidences.append(0.5) if not intents: - colored_utterance = colored(msg.value, "light_blue") + colored_utterance = colored(msg.utterance_text, "light_blue") self.log.info(f'No intents detected for:\n>>> "{colored_utterance}":') return None, -1.0 @@ -119,26 +120,28 @@ def _tiebreak_intents(intents, confidences): classification = colored(classification, "light_green") return classification, confidence - def publish_msg(self, utterance, intent, score): + def publish_msg(self, sub_msg: DialogueUtterance, intent: str, score: float): """ Handles message publishing for an utterance with a detected intent. """ - intent_msg = InterpretedAudioUserIntent() - intent_msg.header.frame_id = "Intent Detection" - intent_msg.header.stamp = self.get_clock().now().to_msg() - intent_msg.utterance_text = utterance - intent_msg.user_intent = intent - intent_msg.confidence = score + pub_msg = self.copy_dialogue_utterance(sub_msg, node_name="Intent Detection") + # Overwrite the user intent with the latest classification information. + pub_msg.intent = intent + pub_msg.intent_confidence_score = score + + # Decide which intent topic to publish the message to. 
published_topic = None - if self._contains_phrase(utterance.lower(), OVERRIDE_KEYPHRASES): - intent_msg.confidence = 1.0 - self._expected_publisher.publish(intent_msg) + if self._contains_phrase(pub_msg.utterance_text.lower(), + OVERRIDE_KEYPHRASES): + pub_msg.intent_confidence_score = 1.0 + self._expected_publisher.publish(pub_msg) published_topic = PARAM_EXPECT_USER_INTENT_TOPIC - self._interp_publisher.publish(intent_msg) + self._interp_publisher.publish(pub_msg) published_topic = PARAM_INTERP_USER_INTENT_TOPIC - colored_utterance = colored(utterance, "light_blue") - colored_intent = colored(intent_msg.user_intent, "light_green") + # Log intent detection information. + colored_utterance = colored(pub_msg.utterance_text, "light_blue") + colored_intent = colored(pub_msg.intent, "light_green") self.log.info( f'Publishing {{"{colored_intent}": {score}}} to {published_topic} ' + f'for:\n>>> "{colored_utterance}"' diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py index 19a15fdd5..259b27311 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py @@ -5,6 +5,7 @@ import os import rclpy +from angel_msgs.msg import DialogueUtterance from angel_system_nodes.base_emotion_detector import BaseEmotionDetector, LABEL_MAPPINGS openai.organization = os.getenv("OPENAI_ORG_ID") @@ -80,7 +81,7 @@ def _labels_list_str(labels): ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) - def get_inference(self, msg): + def get_inference(self, msg: DialogueUtterance): """ Detects the user intent via langchain execution of GPT. 
""" diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py index e09f6bffa..11e4780ce 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py @@ -6,6 +6,7 @@ import os import rclpy +from angel_msgs.msg import DialogueUtterance from angel_system_nodes.base_intent_detector import BaseIntentDetector, INTENT_LABELS openai.organization = os.getenv("OPENAI_ORG_ID") @@ -81,11 +82,11 @@ def _labels_list_str(labels): ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) - def detect_intents(self, msg): + def detect_intents(self, msg: DialogueUtterance): """ Detects the user intent via langchain execution of GPT. """ - return self.chain.run(utterance=msg), 0.5 + return self.chain.run(utterance=msg.utterance_text), 0.5 def main(): diff --git a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py index 7572f3641..fe4f42cd8 100644 --- a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py @@ -8,31 +8,32 @@ from termcolor import colored import threading -from angel_msgs.msg import InterpretedAudioUserEmotion, SystemTextResponse +from angel_msgs.msg import DialogueUtterance, SystemTextResponse +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_utils import declare_and_get_parameters openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") -IN_EMOTION_TOPIC = "user_emotion_topic" +INPUT_TOPIC = "input_topic" OUT_QA_TOPIC = "system_text_response_topic" FEW_SHOT_PROMPT = "few_shot_prompt_file" -class QuestionAnswerer(Node): +class QuestionAnswerer(BaseDialogueSystemNode): def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = 
self.get_logger() param_values = declare_and_get_parameters( self, [ - (IN_EMOTION_TOPIC,), + (INPUT_TOPIC,), (OUT_QA_TOPIC,), (FEW_SHOT_PROMPT,), ], ) - self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] + self._input_topic = param_values[INPUT_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] self.prompt_file = param_values[FEW_SHOT_PROMPT] @@ -58,8 +59,8 @@ def __init__(self): # Handle subscription/publication topics. self.subscription = self.create_subscription( - InterpretedAudioUserEmotion, - self._in_emotion_topic, + DialogueUtterance, + self._input_topic, self.question_answer_callback, 1, ) @@ -67,7 +68,7 @@ def __init__(self): SystemTextResponse, self._out_qa_topic, 1 ) - def get_response(self, user_utterance: str, user_emotion: str): + def get_response(self, sub_msg: DialogueUtterance): """ Generate a response to the utterance, enriched with the addition of the user's detected emotion. Inference calls can be added and revised @@ -77,14 +78,14 @@ def get_response(self, user_utterance: str, user_emotion: str): try: if self.is_openai_ready: return_msg = colored( - self.prompt_gpt(user_utterance) + "\n", "light_green" + self.prompt_gpt(sub_msg.utterance_text) + "\n", "light_green" ) except RuntimeError as err: self.log.info(err) colored_apology = colored( "I'm sorry. I don't know how to answer your statement.", "light_red" ) - colored_emotion = colored(user_emotion, "light_red") + colored_emotion = colored(sub_msg.emotion, "light_red") return_msg = ( f"{colored_apology} I understand that you feel {colored_emotion}." 
) @@ -106,23 +107,22 @@ def process_question_queue(self): """ while True: msg = self.question_queue.get() - emotion = msg.user_emotion - response = self.get_response(msg.utterance_text, emotion) - self.publish_generated_response(msg.utterance_text, response) - - def publish_generated_response(self, utterance: str, response: str): - msg = SystemTextResponse() - msg.header.frame_id = "GPT Question Answering" - msg.header.stamp = self.get_clock().now().to_msg() - msg.utterance_text = utterance - msg.response = response - colored_utterance = colored(utterance, "light_blue") + response = self.get_response(msg) + self.publish_generated_response(msg, response) + + def publish_generated_response(self, sub_msg: DialogueUtterance, response: str): + pub_msg = SystemTextResponse() + pub_msg.header.frame_id = "GPT Question Answering" + pub_msg.header.stamp = self.get_clock().now().to_msg() + pub_msg.utterance_text = sub_msg.utterance_text + pub_msg.response = response + colored_utterance = colored(sub_msg.utterance_text, "light_blue") colored_response = colored(response, "light_green") self.log.info( f'Responding to utterance:\n>>> "{colored_utterance}"\n>>> with:\n' + f'>>> "{colored_response}"' ) - self._qa_publisher.publish(msg) + self._qa_publisher.publish(pub_msg) def prompt_gpt(self, question, model: str = "gpt-3.5-turbo"): prompt = self.prompt.format(question) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 00749b5df..fe6b0e63d 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -9,27 +9,27 @@ import os import queue import rclpy -from rclpy.node import Node from termcolor import colored import threading from typing import * from angel_msgs.msg import ( ActivityDetection, - InterpretedAudioUserEmotion, + DialogueUtterance, ObjectDetection2dSet, 
SystemTextResponse, TaskUpdate, ) from angel_utils import declare_and_get_parameters from angel_system.data.common import bounding_boxes +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode from angel_system.utils.object_detection_queues import centroid_2d_strategy_queue openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") # Below is/are the subscribed topic(s). -IN_EMOTION_TOPIC = "user_emotion_topic" +IN_UTTERANCE_TOPIC = "utterance_topic" IN_OBJECT_DETECTION_TOPIC = "object_detections_topic" IN_ACT_CLFN_TOPIC = "action_classifications_topic" IN_TASK_STATE_TOPIC = "task_state_topic" @@ -93,7 +93,7 @@ ] -class VisualQuestionAnswerer(Node): +class VisualQuestionAnswerer(BaseDialogueSystemNode): class TimestampedEntity: """ @@ -106,12 +106,12 @@ def __init__(self, time, entity): self.entity = entity def __init__(self): - super().__init__(self.__class__.__name__) + super().__init__() self.log = self.get_logger() param_values = declare_and_get_parameters( self, [ - (IN_EMOTION_TOPIC,), + (IN_UTTERANCE_TOPIC,), (IN_TASK_STATE_TOPIC, ""), (IN_OBJECT_DETECTION_TOPIC, ""), (IN_ACT_CLFN_TOPIC, ""), @@ -128,7 +128,7 @@ def __init__(self): (PARAM_MUST_CONTAIN_TARGET_PHRASE, False), ], ) - self._in_emotion_topic = param_values[IN_EMOTION_TOPIC] + self._in_utterance_topic = param_values[IN_UTTERANCE_TOPIC] self._in_task_state_topic = param_values[IN_TASK_STATE_TOPIC] self._in_objects_topic = param_values[IN_OBJECT_DETECTION_TOPIC] self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] @@ -185,9 +185,9 @@ def __init__(self): self.handler_thread.start() # Configure the (necessary) emotional detection enriched utterance subscription. 
- self.emotion_subscription = self.create_subscription( - InterpretedAudioUserEmotion, - self._in_emotion_topic, + self.subscription = self.create_subscription( + DialogueUtterance, + self._in_utterance_topic, self.question_answer_callback, 1, ) @@ -202,7 +202,7 @@ def __init__(self): ) # Configure the optional object detection subscription. self.objects_subscription = None - if self._in_emotion_topic: + if self._in_objects_topic: self.objects_subscription = self.create_subscription( ObjectDetection2dSet, self._in_objects_topic, @@ -276,7 +276,7 @@ def _configure_langchain(self): ) return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) - def _get_sec(self, msg) -> int: + def _get_sec(self, msg: DialogueUtterance) -> int: return msg.header.stamp.sec def _set_current_step(self, msg: TaskUpdate): @@ -346,7 +346,7 @@ def _get_latest_action(self, curr_time: int) -> str: Returns the latest action classification in self.action_classification_queue that does not occur before a provided time. """ - latest_action = "nothing" + latest_action = "not available" while not self.action_classification_queue.empty(): next = self.action_classification_queue.queue[0] if next.time < curr_time: @@ -405,7 +405,7 @@ def _get_latest_observables(self, curr_time: int, n: int) -> str: def get_response( self, - msg: InterpretedAudioUserEmotion, + msg: DialogueUtterance, chat_history: str, current_step: str, action: str, @@ -417,17 +417,17 @@ def get_response( the user's detected emotion, chat history, current step information, action, and detected objects. Inference calls can be added and revised here. 
""" - return_msg = None + return_string = None try: - self.log.info(f"User emotion: {msg.user_emotion}") - return_msg = self.chain.run( + self.log.info(f"User emotion: {msg.emotion}") + return_string = self.chain.run( recipe=self.recipe, chat_history=chat_history, current_step=current_step, action=action, centered_observables=centered_observables, all_observables=all_observables, - emotion=msg.user_emotion, + emotion=self.get_emotion_or(msg), question=msg.utterance_text, ) if self.debug_mode: @@ -438,7 +438,7 @@ def get_response( action=action, centered_observables=centered_observables, all_observables=all_observables, - emotion=msg.user_emotion, + emotion=self.get_emotion_or(msg), question=msg.utterance_text, ).to_string() sent_prompt = colored(sent_prompt, "light_red") @@ -447,18 +447,18 @@ def get_response( ) except RuntimeError as err: self.log.info(err) - return_msg = ( + return_string = ( "I'm sorry. I don't know how to answer your statement. " - + f"I understand that you feel {msg.user_emotion}." + + f"I understand that you feel {self.get_emotion_or(msg)}." ) - return return_msg + return return_string - def question_answer_callback(self, msg): + def question_answer_callback(self, msg: DialogueUtterance): """ This is the main ROS node listener callback loop that will process all messages received via subscribed topics. """ - self.log.debug(f"Received message:\n\n{msg.utterance_text}") + self.log.info(f"Received message:\n\n{msg.utterance_text}") if not self._apply_filter(msg): return self.question_queue.put(msg) @@ -467,9 +467,11 @@ def process_question_queue(self): """ Constant loop to process received questions. """ + self.log.info("Spawning question-processing thread...") while True: question_msg = self.question_queue.get() start_time = self._get_sec(question_msg) + self.log.info(f"Processing utterance {question_msg.utterance_text}") # Get most recently detected action. 
action = self._get_latest_action(start_time) diff --git a/tmux/eval_visual_barebones_question_answering.yml b/tmux/eval_visual_barebones_question_answering.yml new file mode 100644 index 000000000..d9065b93a --- /dev/null +++ b/tmux/eval_visual_barebones_question_answering.yml @@ -0,0 +1,196 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.1.101} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. 
This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. +# attach: false + +windows: +# - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ROS_IP:=0.0.0.0 +# - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ip_addr:=${HL2_IP} +# -p image_topic:=PVFramesBGR +# -p image_ts_topic:=disable +# -p hand_pose_topic:=disable +# -p audio_topic:=HeadsetAudioData +# -p head_pose_topic:=HeadsetPoseData +# -p sm_topic:=disable +# -p rm_depth_AHAT:=disable +# -p pv_width:=760 +# -p pv_height:=428 +# -p pv_framerate:=30 +# -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + - ros_bag_play: sleep 2; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p 
det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:0 + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p 
vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True +# - intent_detection: +# layout: even-vertical +# panes: +# - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=utterances_topic +# -p expect_user_intent_topic:=expect_user_intent_topic +# -p interp_user_intent_topic:=interp_user_intent_topic +# - emotion_detection: +# layout: even-vertical +# panes: +# - emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=interp_user_intent_topic +# -p user_emotion_topic:=emotion_topic + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=utterances_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_last_n:=5 + -p pv_width:=1920 + -p pv_height:=1080 + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/eval_visual_vocalized_question_answering.yml b/tmux/eval_visual_full_question_answering.yml similarity index 95% rename from tmux/eval_visual_vocalized_question_answering.yml rename to tmux/eval_visual_full_question_answering.yml index ebdbf40be..5e2d6493b 100644 
--- a/tmux/eval_visual_vocalized_question_answering.yml +++ b/tmux/eval_visual_full_question_answering.yml @@ -168,22 +168,22 @@ windows: panes: - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic - emotion_detection: layout: even-vertical panes: - - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + - emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p user_intent_topic:=interp_user_intent_topic - -p user_emotion_topic:=gpt_emotion_topic + -p input_topic:=interp_user_intent_topic + -p user_emotion_topic:=emotion_topic - question_answering: layout: even-vertical panes: - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args -r __ns:=${ROS_NAMESPACE} - -p user_emotion_topic:=gpt_emotion_topic + -p utterance_topic:=emotion_topic -p task_state_topic:=task_state_topic -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections @@ -193,5 +193,4 @@ windows: -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 - -p must_contain_target_phrase:=True -p debug_mode:=True \ No newline at end of file diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index db3f45c40..069c4a74e 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -184,7 +184,7 @@ windows: panes: - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args -r __ns:=${ROS_NAMESPACE} - -p user_emotion_topic:=gpt_emotion_topic + -p input_topic:=gpt_emotion_topic -p task_state_topic:=task_state_topic -p object_detections_topic:=ObjectDetections2d -p 
action_classifications_topic:=ActivityDetections diff --git a/tmux/eval_vocalized_emotion_detection.yml b/tmux/eval_vocalized_emotion_detection.yml new file mode 100644 index 000000000..b88684fb8 --- /dev/null +++ b/tmux/eval_vocalized_emotion_detection.yml @@ -0,0 +1,94 @@ +# +# Used to evaluate Emotion Detection with vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: VAD + ASR + Emotion Detection +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export NODE_RESOURCES_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/resource +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. 
+# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. +# attach: false + +windows: + # - ros_bag_play: ros2 bag play <> + - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3 + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=3 + -p vad_margin:=0.20 + -p max_accumulation_length:=10 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=1 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - emotion_detection: + layout: even-vertical + panes: + - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=base_emotion_topic + - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=gpt_emotion_topic \ No newline at end of file diff --git a/tmux/eval_vocalized_intent_detection.yml b/tmux/eval_vocalized_intent_detection.yml index 559557e8a..bf160dc8f 100644 --- a/tmux/eval_vocalized_intent_detection.yml +++ 
b/tmux/eval_vocalized_intent_detection.yml @@ -10,7 +10,7 @@ # ``` # -name: Intent Detection with VAD +name: VAD + ASR + Intent Detection root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # Optional tmux socket @@ -60,7 +60,7 @@ tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf windows: # - ros_bag_play: ros2 bag play <> - - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00_0.db3 + - ros_bag_play: sleep 5; ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3 - vocal: layout: even-vertical panes: @@ -86,11 +86,11 @@ windows: panes: - base_intent_detection: ros2 run angel_system_nodes base_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic \ No newline at end of file diff --git a/tmux/eval_vocalized_emotional_detection.yml b/tmux/eval_vocalized_intent_emotion_detection.yml similarity index 81% rename from tmux/eval_vocalized_emotional_detection.yml rename to tmux/eval_vocalized_intent_emotion_detection.yml index f16d448a3..ae24dde32 100644 --- a/tmux/eval_vocalized_emotional_detection.yml +++ b/tmux/eval_vocalized_intent_emotion_detection.yml @@ -10,7 +10,7 @@ # ``` # -name: ASR Evaluation with VAD +name: VAD + ASR + Intent Detection + Emotion Detection root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # Optional tmux socket @@ -84,26 +84,15 @@ windows: - intent_detection: layout: even-vertical panes: - - base_intent_detection: ros2 run angel_system_nodes base_intent_detector --ros-args - -r __ns:=${ROS_NAMESPACE} 
- -p utterances_topic:=utterances_topic - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic - emotion_detection: layout: even-vertical panes: - - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - -p user_emotion_topic:=base_emotion_topic - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic + -p input_topic:=interp_user_intent_topic -p user_emotion_topic:=gpt_emotion_topic \ No newline at end of file diff --git a/tmux/eval_vocalized_question_answering.yml b/tmux/eval_vocalized_question_answering.yml index 3fd3c8dbb..b61ef88f1 100644 --- a/tmux/eval_vocalized_question_answering.yml +++ b/tmux/eval_vocalized_question_answering.yml @@ -1,16 +1,4 @@ -# -# Used to evaluate Question Answering with vocal processing for a specified ROS bag of data -# This configuration should be run by itself (e.g. not in combination with -# another tmuxinator launch). 
-# -# NOTE: In order to query GPT, you will need to execute -# ``` -# export OPENAI_API_KEY="YOUR API KEY" -# export OPENAI_ORG_ID="YOUR ORG ID" -# ``` -# - -name: ASR Evaluation with VAD +name: Vocal Question Answering root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # Optional tmux socket @@ -81,27 +69,18 @@ windows: -p asr_req_segment_duration:=1 -p is_sentence_tokenize:=False -p debug_mode:=True - - intent_detection: - layout: even-vertical - panes: - - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - emotion_detection: layout: even-vertical panes: - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic + -p input_topic:=utterances_topic -p user_emotion_topic:=gpt_emotion_topic - question_answering: layout: even-vertical panes: - gpt_question_answering: ros2 run angel_system_nodes question_answerer --ros-args -r __ns:=${ROS_NAMESPACE} - -p user_emotion_topic:=gpt_emotion_topic + -p input_topic:=gpt_emotion_topic -p system_text_response_topic:=system_text_response_topic -p few_shot_prompt_file:=${CONFIG_DIR}/llm_prompts/tourniquet_steps_prompt From 020ad07e958d329ce119dfc4d65ff401060b398f Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Wed, 1 Nov 2023 23:48:13 -0400 Subject: [PATCH 28/46] Add surrounding quotations to user question utterance --- .../configs/llm_prompts/vis_qa_teacher_prompt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index 6e8d6d5d8..082b32571 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ 
b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -16,6 +16,6 @@ Objects In Front of Me: {centered_observables}. Objects Nearby: {all_observables} -My Question: {question} +My Question: "{question}" Your Answer: \ No newline at end of file From 919a26c09baa1f67bcca38340756961cfc0cf6a1 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Wed, 1 Nov 2023 23:52:42 -0400 Subject: [PATCH 29/46] Rearrange visual and vocal dialogue system tmux configurations --- tmux/eval_asr.yml | 69 ------- ...al_visual_barebones_question_answering.yml | 0 ...al_visual_emotional_question_answering.yml | 188 ++++++++++++++++++ .../eval_visual_full_question_answering.yml | 0 .../vocal}/eval_vocal.yml | 0 .../eval_vocalized_emotion_detection.yml | 0 .../eval_vocalized_intent_detection.yml | 0 ...val_vocalized_intent_emotion_detection.yml | 0 .../eval_vocalized_question_answering.yml | 0 9 files changed, 188 insertions(+), 69 deletions(-) delete mode 100644 tmux/eval_asr.yml rename tmux/{ => vocalized_dialogue_systems/visual}/eval_visual_barebones_question_answering.yml (100%) create mode 100644 tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml rename tmux/{ => vocalized_dialogue_systems/visual}/eval_visual_full_question_answering.yml (100%) rename tmux/{ => vocalized_dialogue_systems/vocal}/eval_vocal.yml (100%) rename tmux/{ => vocalized_dialogue_systems/vocal}/eval_vocalized_emotion_detection.yml (100%) rename tmux/{ => vocalized_dialogue_systems/vocal}/eval_vocalized_intent_detection.yml (100%) rename tmux/{ => vocalized_dialogue_systems/vocal}/eval_vocalized_intent_emotion_detection.yml (100%) rename tmux/{ => vocalized_dialogue_systems/vocal}/eval_vocalized_question_answering.yml (100%) diff --git a/tmux/eval_asr.yml b/tmux/eval_asr.yml deleted file mode 100644 index 5c439bfc3..000000000 --- a/tmux/eval_asr.yml +++ /dev/null @@ -1,69 +0,0 @@ -# -# Used to evaluate ASR and intent detection for a specified ROS bag of data. 
-# This configuration should be run by itself (e.g. not in combination with -# another tmuxinator launch). -# - -name: ASR Evaluation -root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> - -# Optional tmux socket -# socket_name: foo - -# Note that the pre and post options have been deprecated and will be replaced by -# project hooks. - -# Project hooks - -# Runs on project start, always -# on_project_start: command -on_project_start: | - export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} - export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs - -# Run on project start, the first time -# on_project_first_start: command - -# Run on project start, after the first time -# on_project_restart: command - -# Run on project exit ( detaching from tmux session ) -# on_project_exit: command - -# Run on project stop -# on_project_stop: command - -# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. -# pre_window: rbenv shell 2.0.0-p247 - -# Pass command line options to tmux. Useful for specifying a different tmux.conf. -# tmux_options: -f ~/.tmux.mac.conf -tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf - -# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. -# tmux_command: byobu - -# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. -# startup_window: editor - -# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. -# startup_pane: 1 - -# Controls whether the tmux session should be attached to automatically. Defaults to true. 
-# attach: false - -windows: - # - ros_bag_play: ros2 bag play <> - - ros_bag_play: ros2 bag play /angel_workspace/ros_bags/rosbag2_2023_03_01-17_28_00/rosbag2_2023_03_01-17_28_00_0.db3 - - asr: ros2 run angel_system_nodes asr --ros-args - -r __ns:=${ROS_NAMESPACE} - -p audio_topic:=HeadsetAudioData - -p utterances_topic:=utterances_topic - -p asr_server_url:=http://communication.cs.columbia.edu:8058/asr - -p asr_req_segment_duration:=30 - - intent_detection: ros2 run angel_system_nodes intent_detector --ros-args - -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic - -p expect_user_intent_topic:=expect_user_intent_topic - -p interp_user_intent_topic:=interp_user_intent_topic - diff --git a/tmux/eval_visual_barebones_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_barebones_question_answering.yml similarity index 100% rename from tmux/eval_visual_barebones_question_answering.yml rename to tmux/vocalized_dialogue_systems/visual/eval_visual_barebones_question_answering.yml diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml new file mode 100644 index 000000000..c795b3795 --- /dev/null +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml @@ -0,0 +1,188 @@ +# +# Used to evaluate Question Answering with visual + vocal processing for a specified ROS bag of data +# This configuration should be run by itself (e.g. not in combination with +# another tmuxinator launch). +# +# NOTE: In order to query GPT, you will need to execute +# ``` +# export OPENAI_API_KEY="YOUR API KEY" +# export OPENAI_ORG_ID="YOUR ORG ID" +# ``` +# + +name: Visual Question Answering +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. 
+ +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/debug} + export HL2_IP=${HL2_IP:-192.168.1.101} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +# Change the command to call tmux. This can be used by derivatives/wrappers like byobu. +# tmux_command: byobu + +# Specifies (by name or index) which window will be selected on project startup. If not set, the first window is used. +# startup_window: editor + +# Specifies (by index) which pane of the specified window will be selected on project startup. If not set, the first pane is used. +# startup_pane: 1 + +# Controls whether the tmux session should be attached to automatically. Defaults to true. 
+# attach: false + +windows: +# - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ROS_IP:=0.0.0.0 +# - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p ip_addr:=${HL2_IP} +# -p image_topic:=PVFramesBGR +# -p image_ts_topic:=disable +# -p hand_pose_topic:=disable +# -p audio_topic:=HeadsetAudioData +# -p head_pose_topic:=HeadsetPoseData +# -p sm_topic:=disable +# -p rm_depth_AHAT:=disable +# -p pv_width:=760 +# -p pv_height:=428 +# -p pv_framerate:=30 +# -p sm_freq:=5 + - sensor_input: + layout: even-vertical + panes: + - ros_bag_play: sleep 2; ros2 bag play ros_bags/josh_rosbag/josh_rosbag.db3 + + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesRGB + -p output_topic:=PVFramesRGB_TS + + # Visualize RGB Images being output from the headset + - rqt_rgb_images: rqt -s rqt_image_view/ImageView + --args ${ROS_NAMESPACE}/PVFramesBGR + --ros-args -p _image_transport:=raw + - object_detector: + layout: even-vertical + panes: + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesBGR + # -p det_topic:=ObjectDetections2d + # -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + 
# -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt + -p model_mapping:=${MODEL_DIR}/activity_tcn-coffee-mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-coffee-det_label_mapping.json + -p model_device:=cuda:0 + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + - multi_task_monitor: ros2 run angel_system_nodes dummy_multi_task_monitor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=task_state_topic + -p task_error_topic:=TaskErrors + -p query_task_graph_topic:=query_task_graph + -p sys_cmd_topic:=SystemCommands + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - emotion_detection: + layout: 
even-vertical + panes: + - emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=emotion_topic + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=emotion_topic + -p task_state_topic:=task_state_topic + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_last_n:=5 + -p pv_width:=1920 + -p pv_height:=1080 + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/eval_visual_full_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml similarity index 100% rename from tmux/eval_visual_full_question_answering.yml rename to tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml diff --git a/tmux/eval_vocal.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocal.yml similarity index 100% rename from tmux/eval_vocal.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocal.yml diff --git a/tmux/eval_vocalized_emotion_detection.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_emotion_detection.yml similarity index 100% rename from tmux/eval_vocalized_emotion_detection.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_emotion_detection.yml diff --git a/tmux/eval_vocalized_intent_detection.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_detection.yml similarity index 100% rename from tmux/eval_vocalized_intent_detection.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_detection.yml diff --git 
a/tmux/eval_vocalized_intent_emotion_detection.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_emotion_detection.yml similarity index 100% rename from tmux/eval_vocalized_intent_emotion_detection.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_intent_emotion_detection.yml diff --git a/tmux/eval_vocalized_question_answering.yml b/tmux/vocalized_dialogue_systems/vocal/eval_vocalized_question_answering.yml similarity index 100% rename from tmux/eval_vocalized_question_answering.yml rename to tmux/vocalized_dialogue_systems/vocal/eval_vocalized_question_answering.yml From cfdf435c59fe1c2cc2eddfabbc504b0ce0389807 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 2 Nov 2023 00:09:00 -0400 Subject: [PATCH 30/46] Add timeout parameter to dialogue system nodes Users can now add a ros node parameterization of "-p timeout:= 2" in order to determine how much time is allowed for outstanding GPT requests before timeout. --- .../angel_system_nodes/gpt_emotion_detector.py | 10 ++++++++++ .../angel_system_nodes/gpt_intent_detector.py | 10 ++++++++++ .../angel_system_nodes/question_answerer.py | 5 ++++- .../angel_system_nodes/visual_question_answerer.py | 6 +++++- .../eval_visual_emotional_question_answering.yml | 3 ++- .../visual/eval_visual_full_question_answering.yml | 4 +++- 6 files changed, 34 insertions(+), 4 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py index 259b27311..bea8cda01 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py @@ -7,6 +7,7 @@ from angel_msgs.msg import DialogueUtterance from angel_system_nodes.base_emotion_detector import BaseEmotionDetector, LABEL_MAPPINGS +from angel_utils import declare_and_get_parameters openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") @@ 
-21,12 +22,20 @@ {"utterance": "We're doing great and I'm learning a lot!", "label": "positive"}, ] +PARAM_TIMEOUT = "timeout" class GptEmotionDetector(BaseEmotionDetector): def __init__(self): super().__init__() self.log = self.get_logger() + param_values = declare_and_get_parameters( + self, + [ + (PARAM_TIMEOUT,600), + ]) + self.timeout = param_values[PARAM_TIMEOUT] + # This node additionally includes fields for interacting with OpenAI # via LangChain. if not os.getenv("OPENAI_API_KEY"): @@ -78,6 +87,7 @@ def _labels_list_str(labels): openai_api_key=self.openai_api_key, temperature=0.0, max_tokens=1, + request_timeout=self.timeout ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py index 11e4780ce..3e2190fb1 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py @@ -8,6 +8,7 @@ from angel_msgs.msg import DialogueUtterance from angel_system_nodes.base_intent_detector import BaseIntentDetector, INTENT_LABELS +from angel_utils import declare_and_get_parameters openai.organization = os.getenv("OPENAI_ORG_ID") openai.api_key = os.getenv("OPENAI_API_KEY") @@ -20,12 +21,20 @@ {"utterance": "The sky is blue", "label": "other"}, ] +PARAM_TIMEOUT = "timeout" class GptIntentDetector(BaseIntentDetector): def __init__(self): super().__init__() self.log = self.get_logger() + param_values = declare_and_get_parameters( + self, + [ + (PARAM_TIMEOUT,600), + ]) + self.timeout = param_values[PARAM_TIMEOUT] + # This node additionally includes fields for interacting with OpenAI # via LangChain. if not os.getenv("OPENAI_API_KEY"): @@ -79,6 +88,7 @@ def _labels_list_str(labels): # Only 2 tokens needed for classification (tokens are delimited by use of '_', i.e. # 'next_step' counts as 2 tokens). 
max_tokens=2, + request_timeout=self.timeout ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) diff --git a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py index fe4f42cd8..f9ea13864 100644 --- a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py @@ -18,7 +18,7 @@ INPUT_TOPIC = "input_topic" OUT_QA_TOPIC = "system_text_response_topic" FEW_SHOT_PROMPT = "few_shot_prompt_file" - +PARAM_TIMEOUT = "timeout" class QuestionAnswerer(BaseDialogueSystemNode): def __init__(self): @@ -31,11 +31,13 @@ def __init__(self): (INPUT_TOPIC,), (OUT_QA_TOPIC,), (FEW_SHOT_PROMPT,), + (PARAM_TIMEOUT, 600), ], ) self._input_topic = param_values[INPUT_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] self.prompt_file = param_values[FEW_SHOT_PROMPT] + self.timeout = param_values[PARAM_TIMEOUT] self.question_queue = queue.Queue() self.handler_thread = threading.Thread(target=self.process_question_queue) @@ -137,6 +139,7 @@ def prompt_gpt(self, question, model: str = "gpt-3.5-turbo"): "https://api.openai.com/v1/chat/completions", json=payload, headers={"Authorization": "Bearer {}".format(self.openai_api_key)}, + timeout=self.timeout ) return ( json.loads(req.text)["choices"][0]["message"]["content"] diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index fe6b0e63d..fc1c6ee90 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -92,6 +92,7 @@ "question", ] +PARAM_TIMEOUT = "timeout" class VisualQuestionAnswerer(BaseDialogueSystemNode): @@ -124,8 +125,9 @@ def __init__(self): (PARAM_ACT_CLFN_THRESHOLD, 0.8), (OUT_QA_TOPIC,), (PARAM_CONTEXT_HISTORY_LENGTH, 3), - (PARAM_DEBUG_MODE, False), 
(PARAM_MUST_CONTAIN_TARGET_PHRASE, False), + (PARAM_TIMEOUT, 600), + (PARAM_DEBUG_MODE, False), ], ) self._in_utterance_topic = param_values[IN_UTTERANCE_TOPIC] @@ -134,6 +136,7 @@ def __init__(self): self._in_actions_topic = param_values[IN_ACT_CLFN_TOPIC] self._out_qa_topic = param_values[OUT_QA_TOPIC] self.dialogue_history_length = param_values[PARAM_CONTEXT_HISTORY_LENGTH] + self.timeout = param_values[PARAM_TIMEOUT] self.debug_mode = False if param_values[PARAM_DEBUG_MODE]: self.debug_mode = True @@ -269,6 +272,7 @@ def _configure_langchain(self): openai_api_key=self.openai_api_key, temperature=0.0, max_tokens=64, + request_timeout=self.timeout ) zero_shot_prompt = langchain.PromptTemplate( input_variables=PROMPT_VARIABLES, diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml index c795b3795..d03b43d21 100644 --- a/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml @@ -166,10 +166,11 @@ windows: - emotion_detection: layout: even-vertical panes: - - emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p input_topic:=utterances_topic -p user_emotion_topic:=emotion_topic + -p timeout:=2 - question_answering: layout: even-vertical panes: diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml index 5e2d6493b..3b4842e61 100644 --- a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml @@ -171,13 +171,15 @@ windows: -p input_topic:=utterances_topic -p 
expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic + -p timeout:=2 - emotion_detection: layout: even-vertical panes: - - emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p input_topic:=interp_user_intent_topic -p user_emotion_topic:=emotion_topic + -p timeout:=2 - question_answering: layout: even-vertical panes: From b8d6efdcd34a65a9a400ab7b9af9a96f7fe7ae54 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 2 Nov 2023 00:11:05 -0400 Subject: [PATCH 31/46] Remove outdated filter type code in Visual Question Answering Node --- .../angel_system_nodes/visual_question_answerer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index fc1c6ee90..ce8d31841 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -42,10 +42,6 @@ PARAM_MUST_CONTAIN_TARGET_PHRASE = "must_contain_target_phrase" TARGET_PHRASE = "hey angel" -# Below configures the filtering strategy for detected objects. It should correspond to -# VisualQuestionAnswerer.FilterType. -PARAM_OBJECT_DETECTION_FILTER_STRATEGY = "obj_det_filter" - # Below indicates how many of the last n detected objects should be surfaced # in the LLM prompt. These objects do NOT have to be unique. PARAM_OBJECT_LAST_N_OBJ_DETECTIONS = "obj_det_last_n" From ae8fd47888d475f381fa2c8f806027b03ec76141 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 2 Nov 2023 00:28:10 -0400 Subject: [PATCH 32/46] Add ignorables parameter to Visual Question Answering Node Users can now specify which objects label detections to ignore by providing a comma-delimited list of object labels, e.g. 
`-p object_det_ignored_objects:="hand (left),hand (right),background"` --- .../visual_question_answerer.py | 12 +++++++++++ ...sual_vocalized_question_answering_live.yml | 20 ++++++++++--------- ...al_visual_emotional_question_answering.yml | 1 + .../eval_visual_full_question_answering.yml | 1 + 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index ce8d31841..36665b4c8 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -46,6 +46,9 @@ # in the LLM prompt. These objects do NOT have to be unique. PARAM_OBJECT_LAST_N_OBJ_DETECTIONS = "obj_det_last_n" +# Comma-delimited list of objects to ignore. +PARAM_OBJECT_DETECTION_IGNORABLES = "object_det_ignored_objects" + # Below are the corresponding model thresholds. PARAM_OBJECT_DETECTION_THRESHOLD = "object_det_threshold" PARAM_ACT_CLFN_THRESHOLD = "action_classification_threshold" @@ -116,6 +119,7 @@ def __init__(self): (PARAM_PROMPT_TEMPLATE_PATH,), (PARAM_IMAGE_WIDTH,), (PARAM_IMAGE_HEIGHT,), + (PARAM_OBJECT_DETECTION_IGNORABLES,""), (PARAM_OBJECT_LAST_N_OBJ_DETECTIONS, 5), (PARAM_OBJECT_DETECTION_THRESHOLD, 0.8), (PARAM_ACT_CLFN_THRESHOLD, 0.8), @@ -161,6 +165,12 @@ def __init__(self): f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" ) + self.object_dtctn_ignorables = set([s.strip() for s in + param_values[PARAM_OBJECT_DETECTION_IGNORABLES].split(",")]) + self.log.info( + colored(f"Will be ignoring the following objects: {self.object_dtctn_ignorables}", + "light_red") + ) self.object_dtctn_threshold = param_values[PARAM_OBJECT_DETECTION_THRESHOLD] self.object_dtctn_last_n_obj_detections = param_values[PARAM_OBJECT_LAST_N_OBJ_DETECTIONS] @@ -379,6 +389,7 @@ def _get_latest_centered_observables(self, curr_time: int) -> str: centroid, 
obj_score = centered_obj_detection obj, score = obj_score observables.add(obj) + observables = observables - self.object_dtctn_ignorables return ", ".join(observables) def _get_latest_observables(self, curr_time: int, n: int) -> str: @@ -401,6 +412,7 @@ def _get_latest_observables(self, curr_time: int, n: int) -> str: for detection in detections[-n:]: for obj in detection.entity: observables.add(obj) + observables = observables - self.object_dtctn_ignorables return ", ".join(observables) def get_response( diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index 069c4a74e..8f5c9d6ba 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -143,7 +143,6 @@ windows: -p task_monitor_topic:=TaskUpdates -p arui_update_topic:=AruiUpdates -p interp_user_intent_topic:=InterpUserIntents - - vocal: layout: even-vertical panes: @@ -167,31 +166,34 @@ windows: - intent_detection: layout: even-vertical panes: - - gpt_intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args + - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterances_topic:=utterances_topic + -p input_topic:=utterances_topic -p expect_user_intent_topic:=expect_user_intent_topic -p interp_user_intent_topic:=interp_user_intent_topic + -p timeout:=2 - emotion_detection: layout: even-vertical panes: - - gpt_interp_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p user_intent_topic:=interp_user_intent_topic - -p user_emotion_topic:=gpt_emotion_topic + -p input_topic:=interp_user_intent_topic + -p user_emotion_topic:=emotion_topic + -p timeout:=2 - question_answering: layout: even-vertical panes: - gpt_qa: ros2 run angel_system_nodes 
visual_question_answerer --ros-args -r __ns:=${ROS_NAMESPACE} - -p input_topic:=gpt_emotion_topic + -p utterance_topic:=emotion_topic -p task_state_topic:=task_state_topic -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_filter:=CENTROID + -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 - -p debug_mode:=True + -p object_det_ignored_objects:="hand (left),hand (right),background" + -p debug_mode:=True \ No newline at end of file diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml index d03b43d21..225ba1387 100644 --- a/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_emotional_question_answering.yml @@ -186,4 +186,5 @@ windows: -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 + -p object_det_ignored_objects:="hand (left),hand (right),background" -p debug_mode:=True \ No newline at end of file diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml index 3b4842e61..7980189c4 100644 --- a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml @@ -195,4 +195,5 @@ windows: -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 + -p object_det_ignored_objects:="hand (left),hand (right),background" -p debug_mode:=True \ No newline at end of file From 1f16adaca8e043c267bd6414e1d7b1ee42274490 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 2 
Nov 2023 00:30:01 -0400 Subject: [PATCH 33/46] Delete unused prompt instructions in Visual Question Answering Node --- .../angel_system_nodes/visual_question_answerer.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 36665b4c8..77e396042 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -65,19 +65,6 @@ PARAM_IMAGE_HEIGHT = "pv_height" PARAM_DEBUG_MODE = "debug_mode" -# Below is the complete set of prompt instructions. -PROMPT_INSTRUCTIONS = """ -You are given a User Scenario. All the objects in front of and observable to the user are included. -Your task is to use the Action Steps to answer the user's Question. - -Action Steps: {recipe} - -User Scenario: -The User feels {emotion} while doing {action}. The User can see {observables}. - -User Question: {question} -Answer: """ - # Below are all the variables. These should correspond to the variables defined in the # PROMPT_TEMPLATE_PATH and will be indicated by surrounding '{' and '}'. 
PROMPT_VARIABLES = [ From af0759881de766f213686029e5d4252f4895eaa7 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Thu, 2 Nov 2023 00:31:21 -0400 Subject: [PATCH 34/46] Apply code formatting --- angel_system/data/common/bounding_boxes.py | 14 +- .../centroid_2d_strategy_queue.py | 32 ++-- .../centroid_2d_strategy_queue_test.py | 150 +++++++++++------- .../angel_system_nodes/audio/asr.py | 4 +- .../base_dialogue_system_node.py | 20 ++- .../base_emotion_detector.py | 5 +- .../base_intent_detector.py | 5 +- .../gpt_emotion_detector.py | 8 +- .../angel_system_nodes/gpt_intent_detector.py | 8 +- .../angel_system_nodes/question_answerer.py | 3 +- .../visual_question_answerer.py | 78 +++++---- 11 files changed, 199 insertions(+), 128 deletions(-) diff --git a/angel_system/data/common/bounding_boxes.py b/angel_system/data/common/bounding_boxes.py index 207f5962c..6811580e0 100644 --- a/angel_system/data/common/bounding_boxes.py +++ b/angel_system/data/common/bounding_boxes.py @@ -1,9 +1,15 @@ from typing import * + class BoundingBoxes: - - def __init__(self, left: List[int], right: List[int], top: List[int], bottom: List[int], - item: List[Any]): + def __init__( + self, + left: List[int], + right: List[int], + top: List[int], + bottom: List[int], + item: List[Any], + ): """ Wrapper of bounding boxes and a contained entity corresponding to each bounding box. The item is intentionally kept ambiguous to provide flexibility (e.g. 
can pass in @@ -14,4 +20,4 @@ def __init__(self, left: List[int], right: List[int], top: List[int], bottom: Li self.right = right self.top = top self.bottom = bottom - self.item = item \ No newline at end of file + self.item = item diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py index b9a2a94ef..ba4944435 100644 --- a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue.py @@ -25,8 +25,14 @@ class Centroid2DStrategyQueue: q.get_n_before(2) """ - def __init__(self, n : int, center_x: int, center_y: int, - k: int = 1, log_func: Optional[Callable[..., None]] = None): + def __init__( + self, + n: int, + center_x: int, + center_y: int, + k: int = 1, + log_func: Optional[Callable[..., None]] = None, + ): """ Additional arguments are passed to the logging method :param n: Whenever objects are retrieved, return the last n entries. @@ -36,7 +42,7 @@ def __init__(self, n : int, center_x: int, center_y: int, module is used. 
""" self._log_func = log_func - + self.n = n self.k = k @@ -71,9 +77,11 @@ def get_n_before(self, timestamp: int) -> List[Any]: break self.lock.release() if self._log_func: - self._log_func(f"Read up to {self.n} items from queue" +\ - "; ".join([f"{item} @ Time={time}" for time, item in items])) - return items[-self.n:] if items else items + self._log_func( + f"Read up to {self.n} items from queue" + + "; ".join([f"{item} @ Time={time}" for time, item in items]) + ) + return items[-self.n :] if items else items def _get_k_most_center_objects(self, bb: BoundingBoxes) -> List[Any]: """ @@ -87,11 +95,10 @@ def _get_k_most_center_objects(self, bb: BoundingBoxes) -> List[Any]: for item, left, right, top, bottom in zipped: centroid_x, centroid_y = self._get_centroid(left, right, top, bottom) dist = distance.euclidean( - [centroid_x, centroid_y], - [self.center_x, self.center_y] + [centroid_x, centroid_y], [self.center_x, self.center_y] ) heapq.heappush(k_most_centered_objects, (dist, item)) - + # Return the top k centered objects based on centroid distance. result = [] for _ in range(self.k): @@ -99,12 +106,13 @@ def _get_k_most_center_objects(self, bb: BoundingBoxes) -> List[Any]: break result.append(heapq.heappop(k_most_centered_objects)) return result - - def _get_centroid(self, left: int, right: int, top: int, bottom: int) -> Tuple[int, int]: + def _get_centroid( + self, left: int, right: int, top: int, bottom: int + ) -> Tuple[int, int]: """ Calculates the center 2D pixel of a 2D bounding box. 
""" width_center = left + int((right - left) / 2) height_center = top + int((bottom - top) / 2) - return [width_center, height_center] \ No newline at end of file + return [width_center, height_center] diff --git a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py index b33ad3699..36dbf4b4e 100644 --- a/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py +++ b/angel_system/utils/object_detection_queues/centroid_2d_strategy_queue_test.py @@ -7,15 +7,16 @@ RESOLUTION_W = 1920 RESOLUTION_H = 1080 -class Centroid2DStrategyQueueTest(unittest.TestCase): +class Centroid2DStrategyQueueTest(unittest.TestCase): def test_queue_n3_k1_insertion(self): """ Tests proper queue insertion when objects are inserted as strings. """ q = Centroid2DStrategyQueue( - n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2) - + n=5, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2 + ) + # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. first_objects_detected = BoundingBoxes( @@ -23,7 +24,7 @@ def test_queue_n3_k1_insertion(self): [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], - ['mug', 'computer', 'dog'] + ["mug", "computer", "dog"], ) # Ball is in top left of the screen. The butterfly is bottom right @@ -33,7 +34,7 @@ def test_queue_n3_k1_insertion(self): [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], [2, 4, RESOLUTION_H // 2 + 10], - ['ball', 'butterfly', 'cat'] + ["ball", "butterfly", "cat"], ) # Shoes is in bottom right of the screen. 
The pencil is in the top left @@ -42,30 +43,37 @@ def test_queue_n3_k1_insertion(self): [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], - [RESOLUTION_H, 4 , 2], - ['shoes', 'pencil', 'child'] + [RESOLUTION_H, 4, 2], + ["shoes", "pencil", "child"], ) q.add(timestamp=1, bounding_boxed_item=first_objects_detected) q.add(timestamp=2, bounding_boxed_item=second_objects_detected) q.add(timestamp=3, bounding_boxed_item=third_objects_detected) - + queue_state = q.get_queue() - first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ - queue_state[0], queue_state[1], queue_state[2] - first_top_k, second_top_k, third_top_k = \ - first_timestamped_item[-1], second_timetsamped_item[-1], third_timestamped_item[-1] + first_timestamped_item, second_timetsamped_item, third_timestamped_item = ( + queue_state[0], + queue_state[1], + queue_state[2], + ) + first_top_k, second_top_k, third_top_k = ( + first_timestamped_item[-1], + second_timetsamped_item[-1], + third_timestamped_item[-1], + ) # Recall that each object is a List of Tuples of (centroid distance, detected object) - self.assertEqual(first_top_k[0][-1], 'dog') - self.assertEqual(second_top_k[0][-1], 'cat') - self.assertEqual(third_top_k[0][-1], 'shoes') + self.assertEqual(first_top_k[0][-1], "dog") + self.assertEqual(second_top_k[0][-1], "cat") + self.assertEqual(third_top_k[0][-1], "shoes") def test_queue_n3_k1_insertion_with_confidence_scores(self): """ Tests proper queue insertion when objects are inserted as Tuples with confidence scores. """ q = Centroid2DStrategyQueue( - n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2) - + n=5, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2 + ) + # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. 
first_objects_detected = BoundingBoxes( @@ -73,7 +81,7 @@ def test_queue_n3_k1_insertion_with_confidence_scores(self): [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], - [('mug', 0.1), ('computer', 0.8), ('dog', 0.5)] + [("mug", 0.1), ("computer", 0.8), ("dog", 0.5)], ) # Ball is in top left of the screen. The butterfly is bottom right @@ -83,7 +91,7 @@ def test_queue_n3_k1_insertion_with_confidence_scores(self): [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], [2, 4, RESOLUTION_H // 2 + 10], - [('ball', 0.9), ('butterfly', 0.3), ('cat', 0.5)] + [("ball", 0.9), ("butterfly", 0.3), ("cat", 0.5)], ) # Shoes is in bottom right of the screen. The pencil is in the top left @@ -92,34 +100,41 @@ def test_queue_n3_k1_insertion_with_confidence_scores(self): [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], - [RESOLUTION_H, 4 , 2], - [('shoes', 0.9), ('pencil', 0.3), ('child', 0.5)] + [RESOLUTION_H, 4, 2], + [("shoes", 0.9), ("pencil", 0.3), ("child", 0.5)], ) q.add(timestamp=1, bounding_boxed_item=first_objects_detected) q.add(timestamp=2, bounding_boxed_item=second_objects_detected) q.add(timestamp=3, bounding_boxed_item=third_objects_detected) - + queue_state = q.get_queue() - first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ - queue_state[0], queue_state[1], queue_state[2] - first_top_k, second_top_k, third_top_k = \ - first_timestamped_item[-1], second_timetsamped_item[-1], third_timestamped_item[-1] + first_timestamped_item, second_timetsamped_item, third_timestamped_item = ( + queue_state[0], + queue_state[1], + queue_state[2], + ) + first_top_k, second_top_k, third_top_k = ( + first_timestamped_item[-1], + second_timetsamped_item[-1], + third_timestamped_item[-1], + ) # Recall that each object is a List of Tuples: # (centroid distance, (detected object, confidence score)) _, obj_with_conf_score = 
first_top_k[0] - self.assertEqual(obj_with_conf_score[0], 'dog') + self.assertEqual(obj_with_conf_score[0], "dog") _, obj_with_conf_score = second_top_k[0] - self.assertEqual(obj_with_conf_score[0], 'cat') + self.assertEqual(obj_with_conf_score[0], "cat") _, obj_with_conf_score = third_top_k[0] - self.assertEqual(obj_with_conf_score[0], 'shoes') + self.assertEqual(obj_with_conf_score[0], "shoes") def test_queue_n3_k2_insertion(self): """ Tests proper queue insertion when the top 2 objects are inserted as strings. """ q = Centroid2DStrategyQueue( - n=5, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) - + n=5, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. first_objects_detected = BoundingBoxes( @@ -127,7 +142,7 @@ def test_queue_n3_k2_insertion(self): [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], - ['mug', 'computer', 'dog'] + ["mug", "computer", "dog"], ) # Ball is in top left of the screen. The butterfly is bottom right @@ -137,7 +152,7 @@ def test_queue_n3_k2_insertion(self): [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], [2, 4, RESOLUTION_H // 2 + 10], - ['ball', 'butterfly', 'cat'] + ["ball", "butterfly", "cat"], ) # Shoes is in bottom right of the screen. 
The pencil is in the top left @@ -146,20 +161,26 @@ def test_queue_n3_k2_insertion(self): [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], - [RESOLUTION_H, 4 , 2], - ['shoes', 'pencil', 'child'] + [RESOLUTION_H, 4, 2], + ["shoes", "pencil", "child"], ) q.add(timestamp=1, bounding_boxed_item=first_objects_detected) q.add(timestamp=2, bounding_boxed_item=second_objects_detected) q.add(timestamp=3, bounding_boxed_item=third_objects_detected) - + queue_state = q.get_queue() - first_timestamped_item, second_timetsamped_item, third_timestamped_item = \ - queue_state[0], queue_state[1], queue_state[2] - first_top_k, second_top_k, third_top_k = \ - first_timestamped_item[-1], second_timetsamped_item[-1], third_timestamped_item[-1] + first_timestamped_item, second_timetsamped_item, third_timestamped_item = ( + queue_state[0], + queue_state[1], + queue_state[2], + ) + first_top_k, second_top_k, third_top_k = ( + first_timestamped_item[-1], + second_timetsamped_item[-1], + third_timestamped_item[-1], + ) # Recall that each object is a List of Tuples of (centroid distance, detected object) - + first_object_labels = [label for centroid, label in first_top_k] self.assertEqual(["dog", "computer"], first_object_labels) @@ -174,8 +195,9 @@ def test_queue_n3_k2_removal(self): Tests proper queueing of the last 3 top 2 objects are inserted as strings. """ q = Centroid2DStrategyQueue( - n=1, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) - + n=1, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. 
first_objects_detected = BoundingBoxes( @@ -183,7 +205,7 @@ def test_queue_n3_k2_removal(self): [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], - ['mug', 'computer', 'dog'] + ["mug", "computer", "dog"], ) # Ball is in top left of the screen. The butterfly is bottom right @@ -193,7 +215,7 @@ def test_queue_n3_k2_removal(self): [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], [2, 4, RESOLUTION_H // 2 + 10], - ['ball', 'butterfly', 'cat'] + ["ball", "butterfly", "cat"], ) # Shoes is in bottom right of the screen. The pencil is in the top left @@ -202,13 +224,13 @@ def test_queue_n3_k2_removal(self): [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], - [RESOLUTION_H, 4 , 2], - ['shoes', 'pencil', 'child'] + [RESOLUTION_H, 4, 2], + ["shoes", "pencil", "child"], ) q.add(timestamp=1, bounding_boxed_item=first_objects_detected) q.add(timestamp=2, bounding_boxed_item=second_objects_detected) q.add(timestamp=3, bounding_boxed_item=third_objects_detected) - + no_items = q.get_n_before(timestamp=1) self.assertEqual([], no_items) # Expects the last n=1 detections before timestamp 4. This should be timestamp 3's @@ -224,8 +246,9 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): when the top 2 objects are inserted as strings with confidence scores. """ q = Centroid2DStrategyQueue( - n=2, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) - + n=2, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) + # Dog is in the middle of the screen. Mug is in top left of the screen. # Computer is near bottom right of screen. 
first_objects_detected = BoundingBoxes( @@ -233,7 +256,7 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): [2, RESOLUTION_W * 3 // 4 + 10, RESOLUTION_W // 2 + 20], [1, RESOLUTION_H * 3 // 4, RESOLUTION_H // 2], [2, RESOLUTION_H * 3 // 4 + 10, RESOLUTION_H // 2 + 10], - [('mug', 0.1), ('computer', 0.8), ('dog', 0.5)] + [("mug", 0.1), ("computer", 0.8), ("dog", 0.5)], ) # Ball is in top left of the screen. The butterfly is bottom right @@ -243,7 +266,7 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): [2, 4, RESOLUTION_W // 2 + 20], [1, 2, RESOLUTION_H // 2], [2, 4, RESOLUTION_H // 2 + 10], - [('ball', 0.9), ('butterfly', 0.3), ('cat', 0.5)] + [("ball", 0.9), ("butterfly", 0.3), ("cat", 0.5)], ) # Shoes is in bottom right of the screen. The pencil is in the top left @@ -252,26 +275,30 @@ def test_queue_n2_k2_removal_with_confidence_scores(self): [RESOLUTION_W - 10, 2, 1], [RESOLUTION_W, 4, 2], [RESOLUTION_H - 10, 2, 1], - [RESOLUTION_H, 4 , 2], - [('shoes', 0.9), ('pencil', 0.3), ('child', 0.5)] + [RESOLUTION_H, 4, 2], + [("shoes", 0.9), ("pencil", 0.3), ("child", 0.5)], ) q.add(timestamp=1, bounding_boxed_item=first_objects_detected) q.add(timestamp=2, bounding_boxed_item=second_objects_detected) q.add(timestamp=3, bounding_boxed_item=third_objects_detected) - + no_items = q.get_n_before(timestamp=1) self.assertEqual([], no_items) # Expects the last n=2 detections before timestamp 4. This should be timestamp 2 and # timestamp 3's top k=2 objects. 
last_n_top_k = q.get_n_before(timestamp=4) discarded_timestamp, first_top_k_with_centroid_dist = last_n_top_k[0] - first_scored_top_k = [scored_item for discarded_dist, scored_item in - first_top_k_with_centroid_dist] + first_scored_top_k = [ + scored_item + for discarded_dist, scored_item in first_top_k_with_centroid_dist + ] first_top_k = [item for item, score in first_scored_top_k] self.assertEqual(["cat", "butterfly"], first_top_k) discarded_timestamp, second_top_k_with_centroid_dist = last_n_top_k[1] - second_scored_top_k = [scored_item for discarded_dist, scored_item in - second_top_k_with_centroid_dist] + second_scored_top_k = [ + scored_item + for discarded_dist, scored_item in second_top_k_with_centroid_dist + ] second_top_k = [item for item, score in second_scored_top_k] self.assertEqual(["shoes", "pencil"], second_top_k) @@ -280,9 +307,10 @@ def test_empty_queue(self): Tests proper get-behavior of an empty queue. """ q = Centroid2DStrategyQueue( - n=2, center_x=RESOLUTION_W/2, center_y=RESOLUTION_H/2, k=2) + n=2, center_x=RESOLUTION_W / 2, center_y=RESOLUTION_H / 2, k=2 + ) self.assertEqual([], q.get_n_before(timestamp=4)) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py index e4c3598a9..a8bbc0e9f 100644 --- a/ros/angel_system_nodes/angel_system_nodes/audio/asr.py +++ b/ros/angel_system_nodes/angel_system_nodes/audio/asr.py @@ -105,7 +105,9 @@ def __init__(self): self.subscription = self.create_subscription( HeadsetAudioData, self._audio_topic, self.listener_callback, 1 ) - self._publisher = self.create_publisher(DialogueUtterance, self._utterances_topic, 1) + self._publisher = self.create_publisher( + DialogueUtterance, self._utterances_topic, 1 + ) self.audio_stream = [] self.t = threading.Thread() diff --git a/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py 
b/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py index b1da32783..8e72bfd4f 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_dialogue_system_node.py @@ -4,33 +4,38 @@ from angel_msgs.msg import DialogueUtterance + class BaseDialogueSystemNode(Node): """ This class is used for all dialogue system nodes to inherit similar functionality. """ + def __init__(self): super().__init__(self.__class__.__name__) self.log = self.get_logger() - - def get_intent_or(self, src_msg: DialogueUtterance, or_value: str = "not available") -> str: + + def get_intent_or( + self, src_msg: DialogueUtterance, or_value: str = "not available" + ) -> str: """ Returns the src_msg intent classification information. If the value is absent, the or_value is passed in. """ return src_msg.intent if src_msg.intent else or_value - def get_emotion_or(self, src_msg: DialogueUtterance, or_value: str = "not available") -> str: + def get_emotion_or( + self, src_msg: DialogueUtterance, or_value: str = "not available" + ) -> str: """ Returns the src_msg emotion classification information. If the value is absent, the or_value is passed in. 
""" return src_msg.emotion if src_msg.emotion else or_value - def copy_dialogue_utterance(self, - src_msg: DialogueUtterance, - node_name: str = "Dialogue System Node" - ) -> DialogueUtterance: + def copy_dialogue_utterance( + self, src_msg: DialogueUtterance, node_name: str = "Dialogue System Node" + ) -> DialogueUtterance: msg = DialogueUtterance() msg.header.frame_id = node_name msg.utterance_text = src_msg.utterance_text @@ -50,6 +55,7 @@ def copy_dialogue_utterance(self, return msg + def main(): rclpy.init() base_dialogue_node = BaseDialogueSystemNode() diff --git a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py index f123bafa8..996688610 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_emotion_detector.py @@ -108,9 +108,7 @@ def process_message_queue(self): msg = self.message_queue.get() self.log.debug(f'Processing message:\n\n"{msg.utterance_text}"') classification, confidence_score = self.get_inference(msg) - self.publish_detected_emotion( - msg, classification, confidence_score - ) + self.publish_detected_emotion(msg, classification, confidence_score) def publish_detected_emotion( self, sub_msg: DialogueUtterance, classification: str, confidence_score: float @@ -132,6 +130,7 @@ def publish_detected_emotion( + f'to {self._out_interp_uemotion_topic} for:\n>>> "{colored_utterance}"' ) + def main(): rclpy.init() emotion_detector = BaseEmotionDetector() diff --git a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py index 1790c6b56..8cffbc75e 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py @@ -128,11 +128,10 @@ def publish_msg(self, sub_msg: DialogueUtterance, intent: str, score: float): # Overwrite the user 
intent with the latest classification information. pub_msg.intent = intent pub_msg.intent_confidence_score = score - + # Decide which intent topic to publish the message to. published_topic = None - if self._contains_phrase(pub_msg.utterance_text.lower(), - OVERRIDE_KEYPHRASES): + if self._contains_phrase(pub_msg.utterance_text.lower(), OVERRIDE_KEYPHRASES): pub_msg.intent_confidence_score = 1.0 self._expected_publisher.publish(pub_msg) published_topic = PARAM_EXPECT_USER_INTENT_TOPIC diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py index bea8cda01..c8eaaa2ce 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py @@ -24,6 +24,7 @@ PARAM_TIMEOUT = "timeout" + class GptEmotionDetector(BaseEmotionDetector): def __init__(self): super().__init__() @@ -32,8 +33,9 @@ def __init__(self): param_values = declare_and_get_parameters( self, [ - (PARAM_TIMEOUT,600), - ]) + (PARAM_TIMEOUT, 600), + ], + ) self.timeout = param_values[PARAM_TIMEOUT] # This node additionally includes fields for interacting with OpenAI @@ -87,7 +89,7 @@ def _labels_list_str(labels): openai_api_key=self.openai_api_key, temperature=0.0, max_tokens=1, - request_timeout=self.timeout + request_timeout=self.timeout, ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py index 3e2190fb1..e991d2152 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py @@ -23,6 +23,7 @@ PARAM_TIMEOUT = "timeout" + class GptIntentDetector(BaseIntentDetector): def __init__(self): super().__init__() @@ -31,8 +32,9 @@ def __init__(self): param_values = declare_and_get_parameters( self, [ - 
(PARAM_TIMEOUT,600), - ]) + (PARAM_TIMEOUT, 600), + ], + ) self.timeout = param_values[PARAM_TIMEOUT] # This node additionally includes fields for interacting with OpenAI @@ -88,7 +90,7 @@ def _labels_list_str(labels): # Only 2 tokens needed for classification (tokens are delimited by use of '_', i.e. # 'next_step' counts as 2 tokens). max_tokens=2, - request_timeout=self.timeout + request_timeout=self.timeout, ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) diff --git a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py index f9ea13864..d7c496b76 100644 --- a/ros/angel_system_nodes/angel_system_nodes/question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/question_answerer.py @@ -20,6 +20,7 @@ FEW_SHOT_PROMPT = "few_shot_prompt_file" PARAM_TIMEOUT = "timeout" + class QuestionAnswerer(BaseDialogueSystemNode): def __init__(self): super().__init__() @@ -139,7 +140,7 @@ def prompt_gpt(self, question, model: str = "gpt-3.5-turbo"): "https://api.openai.com/v1/chat/completions", json=payload, headers={"Authorization": "Bearer {}".format(self.openai_api_key)}, - timeout=self.timeout + timeout=self.timeout, ) return ( json.loads(req.text)["choices"][0]["message"]["content"] diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 77e396042..e84501e8b 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -80,8 +80,8 @@ PARAM_TIMEOUT = "timeout" -class VisualQuestionAnswerer(BaseDialogueSystemNode): +class VisualQuestionAnswerer(BaseDialogueSystemNode): class TimestampedEntity: """ This class is used internally as a container for recorded detections and classifications at @@ -106,7 +106,7 @@ def __init__(self): (PARAM_PROMPT_TEMPLATE_PATH,), (PARAM_IMAGE_WIDTH,), 
(PARAM_IMAGE_HEIGHT,), - (PARAM_OBJECT_DETECTION_IGNORABLES,""), + (PARAM_OBJECT_DETECTION_IGNORABLES, ""), (PARAM_OBJECT_LAST_N_OBJ_DETECTIONS, 5), (PARAM_OBJECT_DETECTION_THRESHOLD, 0.8), (PARAM_ACT_CLFN_THRESHOLD, 0.8), @@ -128,7 +128,9 @@ def __init__(self): if param_values[PARAM_DEBUG_MODE]: self.debug_mode = True - self.param_must_contain_target_phrase = param_values[PARAM_MUST_CONTAIN_TARGET_PHRASE] + self.param_must_contain_target_phrase = param_values[ + PARAM_MUST_CONTAIN_TARGET_PHRASE + ] # Used to obtain the center perspective point and how far detected objects # are from it. @@ -152,14 +154,22 @@ def __init__(self): f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" ) - self.object_dtctn_ignorables = set([s.strip() for s in - param_values[PARAM_OBJECT_DETECTION_IGNORABLES].split(",")]) + self.object_dtctn_ignorables = set( + [ + s.strip() + for s in param_values[PARAM_OBJECT_DETECTION_IGNORABLES].split(",") + ] + ) self.log.info( - colored(f"Will be ignoring the following objects: {self.object_dtctn_ignorables}", - "light_red") + colored( + f"Will be ignoring the following objects: {self.object_dtctn_ignorables}", + "light_red", + ) ) self.object_dtctn_threshold = param_values[PARAM_OBJECT_DETECTION_THRESHOLD] - self.object_dtctn_last_n_obj_detections = param_values[PARAM_OBJECT_LAST_N_OBJ_DETECTIONS] + self.object_dtctn_last_n_obj_detections = param_values[ + PARAM_OBJECT_LAST_N_OBJ_DETECTIONS + ] # Configure supplemental input action classification criteria. 
self.action_clfn_threshold = param_values[PARAM_ACT_CLFN_THRESHOLD] @@ -169,13 +179,13 @@ def __init__(self): self.current_step = "Unstarted" self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() - self.centroid_object_queue = \ - centroid_2d_strategy_queue.Centroid2DStrategyQueue( - self.object_dtctn_last_n_obj_detections, - self.pv_center_coordinate[0], self.pv_center_coordinate[1], - k=1, # the number of top-k objects to obtain from each detection. - ) - + self.centroid_object_queue = centroid_2d_strategy_queue.Centroid2DStrategyQueue( + self.object_dtctn_last_n_obj_detections, + self.pv_center_coordinate[0], + self.pv_center_coordinate[1], + k=1, # the number of top-k objects to obtain from each detection. + ) + self.dialogue_history = [] self.handler_thread = threading.Thread(target=self.process_question_queue) self.handler_thread.start() @@ -265,7 +275,7 @@ def _configure_langchain(self): openai_api_key=self.openai_api_key, temperature=0.0, max_tokens=64, - request_timeout=self.timeout + request_timeout=self.timeout, ) zero_shot_prompt = langchain.PromptTemplate( input_variables=PROMPT_VARIABLES, @@ -312,10 +322,14 @@ def _add_detected_objects(self, msg: ObjectDetection2dSet) -> str: self.centroid_object_queue.add( self._get_sec(msg), bounding_boxes.BoundingBoxes( - msg.left, msg.right, msg.top, msg.bottom, - item=list(zip(msg.label_vec, msg.label_confidences)) - )) - + msg.left, + msg.right, + msg.top, + msg.bottom, + item=list(zip(msg.label_vec, msg.label_confidences)), + ), + ) + # We queue ALL objects above threshold, regardless if they are centered in the user's # perspective. self._add_detected_objects_above_threshold(msg) @@ -363,17 +377,21 @@ def _get_latest_centered_observables(self, curr_time: int) -> str: """ observables = set() # handle 2D centroid distance queueing. 
- timestamped_detections = self.centroid_object_queue.get_n_before(timestamp=curr_time) + timestamped_detections = self.centroid_object_queue.get_n_before( + timestamp=curr_time + ) if timestamped_detections: if self.debug_mode: - print(f"Timestamped detections based on centroid distance are: " +\ - f"{timestamped_detections}") + print( + f"Timestamped detections based on centroid distance are: " + + f"{timestamped_detections}" + ) # Recall that we passed in timestamped lists of pairs of # (detection, confidence score). centered_obj_detections_lists = [j for _, j in timestamped_detections] for centered_obj_detections in centered_obj_detections_lists: for centered_obj_detection in centered_obj_detections: - centroid, obj_score = centered_obj_detection + centroid, obj_score = centered_obj_detection obj, score = obj_score observables.add(obj) observables = observables - self.object_dtctn_ignorables @@ -399,7 +417,7 @@ def _get_latest_observables(self, curr_time: int, n: int) -> str: for detection in detections[-n:]: for obj in detection.entity: observables.add(obj) - observables = observables - self.object_dtctn_ignorables + observables = observables - self.object_dtctn_ignorables return ", ".join(observables) def get_response( @@ -477,13 +495,13 @@ def process_question_queue(self): self.log.info(f"Latest action: {action}") # Get centered detected objects. - centered_observables = \ - self._get_latest_centered_observables(start_time) + centered_observables = self._get_latest_centered_observables(start_time) self.log.info(f"Observed objects: {centered_observables}") - + # Get all detected objects. - all_observables = self._get_latest_observables(start_time, - self.object_dtctn_last_n_obj_detections) + all_observables = self._get_latest_observables( + start_time, self.object_dtctn_last_n_obj_detections + ) self.log.info(f"Observed objects: {all_observables}") # Generate response. 
From 3277e733e10652144fb59d24e2993e6fc0b1c3e3 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 4 Nov 2023 14:11:35 -0400 Subject: [PATCH 35/46] Refactor optional field derivation in prompt construction --- .../visual_question_answerer.py | 55 +++++++++---------- .../configs/llm_prompts/vis_qa_teacher_prompt | 10 +--- 2 files changed, 27 insertions(+), 38 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index e84501e8b..4b5e83245 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -70,14 +70,13 @@ PROMPT_VARIABLES = [ "recipe", "chat_history", - "current_step", - "emotion", - "action", + "optional_fields", "centered_observables", "all_observables", "question", ] +# Below configures the GPT request timeout in seconds. PARAM_TIMEOUT = "timeout" @@ -176,7 +175,7 @@ def __init__(self): # Configure supplemental input resources. self.question_queue = queue.Queue() - self.current_step = "Unstarted" + self.current_step = None self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() self.centroid_object_queue = centroid_2d_strategy_queue.Centroid2DStrategyQueue( @@ -357,7 +356,7 @@ def _get_latest_action(self, curr_time: int) -> str: Returns the latest action classification in self.action_classification_queue that does not occur before a provided time. 
""" - latest_action = "not available" + latest_action = None while not self.action_classification_queue.empty(): next = self.action_classification_queue.queue[0] if next.time < curr_time: @@ -424,10 +423,9 @@ def get_response( self, msg: DialogueUtterance, chat_history: str, - current_step: str, - action: str, centered_observables: str, all_observables: str, + optional_fields: str ): """ Generate a response to the utterance, enriched with the addition of @@ -440,22 +438,18 @@ def get_response( return_string = self.chain.run( recipe=self.recipe, chat_history=chat_history, - current_step=current_step, - action=action, + optional_fields=optional_fields, centered_observables=centered_observables, all_observables=all_observables, - emotion=self.get_emotion_or(msg), question=msg.utterance_text, ) if self.debug_mode: sent_prompt = self.chain.prompt.format_prompt( recipe=self.recipe, chat_history=chat_history, - current_step=current_step, - action=action, + optional_fields=optional_fields, centered_observables=centered_observables, all_observables=all_observables, - emotion=self.get_emotion_or(msg), question=msg.utterance_text, ).to_string() sent_prompt = colored(sent_prompt, "light_red") @@ -464,10 +458,7 @@ def get_response( ) except RuntimeError as err: self.log.info(err) - return_string = ( - "I'm sorry. I don't know how to answer your statement. " - + f"I understand that you feel {self.get_emotion_or(msg)}." - ) + return_string = "I'm sorry. I don't know how to answer your statement." 
return return_string def question_answer_callback(self, msg: DialogueUtterance): @@ -480,6 +471,17 @@ def question_answer_callback(self, msg: DialogueUtterance): return self.question_queue.put(msg) + def _get_optional_fields_string(self, emotion: str, current_step: str, + current_action: str) -> str: + optional_fields_string = "" + if emotion: + optional_fields_string += f"Emotion: {emotion}\n" + if current_step: + optional_fields_string += f"My Current Step: {current_step}\n" + if current_action: + optional_fields_string += f"My Current Action: {current_action}\n" + return optional_fields_string.strip("\n") + def process_question_queue(self): """ Constant loop to process received questions. @@ -490,28 +492,23 @@ def process_question_queue(self): start_time = self._get_sec(question_msg) self.log.info(f"Processing utterance {question_msg.utterance_text}") - # Get most recently detected action. - action = self._get_latest_action(start_time) - self.log.info(f"Latest action: {action}") - + # Get the optional fields. + optional_fields = \ + self._get_optional_fields_string(question_msg.emotion, self._get_current_step(), + self._get_latest_action(start_time)) # Get centered detected objects. centered_observables = self._get_latest_centered_observables(start_time) - self.log.info(f"Observed objects: {centered_observables}") - # Get all detected objects. - all_observables = self._get_latest_observables( - start_time, self.object_dtctn_last_n_obj_detections - ) - self.log.info(f"Observed objects: {all_observables}") + all_observables = \ + self._get_latest_observables(start_time, self.object_dtctn_last_n_obj_detections) # Generate response. 
response = self.get_response( question_msg, self._get_dialogue_history(), - self._get_current_step(), - action, centered_observables, all_observables, + optional_fields ) self.publish_generated_response(question_msg.utterance_text, response) self._add_dialogue_history(question_msg.utterance_text, response) diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index 082b32571..f0b36e2c3 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -5,17 +5,9 @@ Task Steps: Chat History: {chat_history} - -My Current Step: {current_step} - -My Emotion: {emotion}. - -My Current Action: {action}. - +{optional_fields} Objects In Front of Me: {centered_observables}. - Objects Nearby: {all_observables} - My Question: "{question}" Your Answer: \ No newline at end of file From b733344fa10c2806864d9fe464707585d695afd8 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 4 Nov 2023 15:16:48 -0400 Subject: [PATCH 36/46] Add object clarification intent and fix intent and emotion detection end tokens to improve classification --- .../angel_system_nodes/base_intent_detector.py | 2 +- .../angel_system_nodes/gpt_emotion_detector.py | 9 +++++---- .../angel_system_nodes/gpt_intent_detector.py | 14 ++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py index 8cffbc75e..7dcc6d31e 100644 --- a/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/base_intent_detector.py @@ -17,7 +17,7 @@ # config/angel_system_cmds/user_intent_to_sys_cmd_v1.yaml. # Please refer to labels defined in # https://docs.google.com/document/d/1uuvSL5de3LVM9c0tKpRKYazDxckffRHf7IAcabSw9UA . 
-INTENT_LABELS = ["next_step", "prev_step", "inquiry", "other"] +INTENT_LABELS = ["next_step", "prev_step", "inquiry", "object_clarification", "other"] IN_TOPIC = "input_topic" PARAM_EXPECT_USER_INTENT_TOPIC = "expect_user_intent_topic" diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py index c8eaaa2ce..8b8402726 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py @@ -16,10 +16,10 @@ FEW_SHOT_EXAMPLES = [ { "utterance": "Go back to the previous step you dumb machine!", - "label": "negative.", + "label": "negative[eos]", }, - {"utterance": "Next step, please.", "label": "neutral"}, - {"utterance": "We're doing great and I'm learning a lot!", "label": "positive"}, + {"utterance": "Next step, please.", "label": "neutral[eos]"}, + {"utterance": "We're doing great and I'm learning a lot!", "label": "positive[eos]"}, ] PARAM_TIMEOUT = "timeout" @@ -97,7 +97,8 @@ def get_inference(self, msg: DialogueUtterance): """ Detects the user intent via langchain execution of GPT. """ - return (self.chain.run(utterance=msg.utterance_text), 0.5) + emotion = self.chain.run(utterance=msg.utterance_text) + return emotion.split('[eos]')[0], 0.5 def main(): diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py index e991d2152..b41a0d9ea 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_intent_detector.py @@ -15,10 +15,11 @@ # The following are few shot examples when prompting GPT. 
FEW_SHOT_EXAMPLES = [ - {"utterance": "Go back to the previous step!", "label": "prev_step."}, - {"utterance": "Next step, please.", "label": "next_step"}, - {"utterance": "How should I wrap this tourniquet?", "label": "inquiry"}, - {"utterance": "The sky is blue", "label": "other"}, + {"utterance": "Go back to the previous step!", "label": "prev_step[eos]"}, + {"utterance": "Next step, please.", "label": "next_step[eos]"}, + {"utterance": "How should I wrap this tourniquet?", "label": "inquiry[eos]"}, + {"utterance": "The sky is blue", "label": "other[eos]"}, + {"utterance": "What is this thing?", "label": "object_clarification[eos]"}, ] PARAM_TIMEOUT = "timeout" @@ -89,7 +90,7 @@ def _labels_list_str(labels): temperature=0.0, # Only 2 tokens needed for classification (tokens are delimited by use of '_', i.e. # 'next_step' counts as 2 tokens). - max_tokens=2, + # max_tokens=10, request_timeout=self.timeout, ) return LLMChain(llm=openai_llm, prompt=few_shot_prompt) @@ -98,7 +99,8 @@ def detect_intents(self, msg: DialogueUtterance): """ Detects the user intent via langchain execution of GPT. 
""" - return self.chain.run(utterance=msg.utterance_text), 0.5 + intent = self.chain.run(utterance=msg.utterance_text) + return intent.split('[eos]')[0], 0.5 def main(): From f67c38466ebff50f8399576343e2be346267d057 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 4 Nov 2023 15:20:14 -0400 Subject: [PATCH 37/46] Add override case for detected object clarification intent --- .../visual_question_answerer.py | 77 +++++++++++++------ .../eval_visual_full_question_answering.yml | 5 +- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 4b5e83245..dafb15a6c 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -22,7 +22,8 @@ ) from angel_utils import declare_and_get_parameters from angel_system.data.common import bounding_boxes -from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode +from angel_system_nodes.base_dialogue_system_node import BaseDialogueSystemNode +from angel_system_nodes.base_intent_detector import INTENT_LABELS from angel_system.utils.object_detection_queues import centroid_2d_strategy_queue openai.organization = os.getenv("OPENAI_ORG_ID") @@ -79,6 +80,11 @@ # Below configures the GPT request timeout in seconds. PARAM_TIMEOUT = "timeout" +OBJECT_CLARIFICATION_RESPONSE = """ +Based on the information provided, it seems like you are referring to an object that you are +unsure about. I'm unable to determine what it is. Please describe the object to me or get +closer so that I may help you." +""" class VisualQuestionAnswerer(BaseDialogueSystemNode): class TimestampedEntity: @@ -143,14 +149,22 @@ def __init__(self): # Read the configured recipe file. 
self._recipe_path = param_values[PARAM_RECIPE_PATH] self.recipe = self._configure_recipe(self._recipe_path) - self.log.info(f"Configured recipe to be: ~~~~~~~~~~\n{self.recipe}\n~~~~~~~~~~") + self.log.info( + colored( + f"Configured recipe to be: ~~~~~~~~~~\n{self.recipe}\n~~~~~~~~~~", + "light_red" + ) + ) # Read the configured prompt template. self._prompt_template_path = param_values[PARAM_PROMPT_TEMPLATE_PATH] with open(self._prompt_template_path, "r") as file: self.prompt_template = file.read() self.log.info( - f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~" + colored( + f"Prompt Template: ~~~~~~~~~~\n{self.prompt_template}\n~~~~~~~~~~", + "light_red" + ) ) self.object_dtctn_ignorables = set( @@ -366,7 +380,7 @@ def _get_latest_action(self, curr_time: int) -> str: break return latest_action - def _get_latest_centered_observables(self, curr_time: int) -> str: + def _get_latest_centered_observables(self, curr_time: int) -> Set: """ Returns a comma-delimited list of "centered" objects per all entities in self.detected_objects_queue that occurred before a provided time. @@ -381,10 +395,10 @@ def _get_latest_centered_observables(self, curr_time: int) -> str: ) if timestamped_detections: if self.debug_mode: - print( - f"Timestamped detections based on centroid distance are: " - + f"{timestamped_detections}" - ) + self.log.info("Timestamped detections based on centroid distance are: ") + for detection in timestamped_detections: + self.log.info( + f"- Timestamp = {detection[0]} Centroid-Dist-Object(s) = {detection[1]}") # Recall that we passed in timestamped lists of pairs of # (detection, confidence score). 
centered_obj_detections_lists = [j for _, j in timestamped_detections] @@ -394,9 +408,9 @@ def _get_latest_centered_observables(self, curr_time: int) -> str: obj, score = obj_score observables.add(obj) observables = observables - self.object_dtctn_ignorables - return ", ".join(observables) + return observables - def _get_latest_observables(self, curr_time: int, n: int) -> str: + def _get_latest_observables(self, curr_time: int, n: int) -> Set: """ Returns a comma-delimited list of all observed objects per all entities in self.detected_objects_queue that occurred before a provided time. @@ -417,7 +431,7 @@ def _get_latest_observables(self, curr_time: int, n: int) -> str: for obj in detection.entity: observables.add(obj) observables = observables - self.object_dtctn_ignorables - return ", ".join(observables) + return observables def get_response( self, @@ -452,7 +466,7 @@ def get_response( all_observables=all_observables, question=msg.utterance_text, ).to_string() - sent_prompt = colored(sent_prompt, "light_red") + sent_prompt = colored(sent_prompt, "light_blue") self.log.info( f"Prompt sent over:~~~~~~~~~~\n{sent_prompt}\n:~~~~~~~~~~" ) @@ -466,21 +480,21 @@ def question_answer_callback(self, msg: DialogueUtterance): This is the main ROS node listener callback loop that will process all messages received via subscribed topics. 
""" - self.log.info(f"Received message:\n\n{msg.utterance_text}") + self.log.info(f"Received message: \"{msg.utterance_text}\"") if not self._apply_filter(msg): return self.question_queue.put(msg) def _get_optional_fields_string(self, emotion: str, current_step: str, current_action: str) -> str: - optional_fields_string = "" + optional_fields_string = "\n" if emotion: optional_fields_string += f"Emotion: {emotion}\n" if current_step: optional_fields_string += f"My Current Step: {current_step}\n" if current_action: optional_fields_string += f"My Current Action: {current_action}\n" - return optional_fields_string.strip("\n") + return optional_fields_string.rstrip("\n") def process_question_queue(self): """ @@ -490,7 +504,7 @@ def process_question_queue(self): while True: question_msg = self.question_queue.get() start_time = self._get_sec(question_msg) - self.log.info(f"Processing utterance {question_msg.utterance_text}") + self.log.info(f"Processing utterance \"{question_msg.utterance_text}\"") # Get the optional fields. optional_fields = \ @@ -502,14 +516,27 @@ def process_question_queue(self): all_observables = \ self._get_latest_observables(start_time, self.object_dtctn_last_n_obj_detections) - # Generate response. - response = self.get_response( - question_msg, - self._get_dialogue_history(), - centered_observables, - all_observables, - optional_fields - ) + response = None + is_object_clarification = \ + question_msg.intent and question_msg.intent == INTENT_LABELS[3] + if is_object_clarification and len(centered_observables) > 1: + # Object Clarification override: If an associated intent exists and indicates + # object clarification in the presence of multiple objects, override the response with + # a clarification question. + self.log.info( + "Received confusing object clarification question from user " +\ + f"about multiple objects: ({centered_observables}). 
" +\ + "Inquiring for more details...") + response = OBJECT_CLARIFICATION_RESPONSE + else: + # Normal response generation. + response = self.get_response( + question_msg, + self._get_dialogue_history(), + ", ".join(centered_observables) if centered_observables else "", + ", ".join(all_observables) if all_observables else "", + optional_fields + ) self.publish_generated_response(question_msg.utterance_text, response) self._add_dialogue_history(question_msg.utterance_text, response) @@ -519,7 +546,7 @@ def publish_generated_response(self, utterance: str, response: str): msg.header.stamp = self.get_clock().now().to_msg() msg.utterance_text = utterance msg.response = response - colored_utterance = colored(utterance, "light_blue") + colored_utterance = colored(utterance, "magenta") colored_response = colored(response, "light_green") self.log.info( f'Responding to utterance:\n>>> "{colored_utterance}"\n>>> with:\n' diff --git a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml index 7980189c4..97982a035 100644 --- a/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml +++ b/tmux/vocalized_dialogue_systems/visual/eval_visual_full_question_answering.yml @@ -183,7 +183,7 @@ windows: - question_answering: layout: even-vertical panes: - - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --log-level visual_question_answerer:=DEBUG --ros-args -r __ns:=${ROS_NAMESPACE} -p utterance_topic:=emotion_topic -p task_state_topic:=task_state_topic @@ -192,8 +192,9 @@ windows: -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 + -p obj_det_last_n:=8 -p 
object_det_ignored_objects:="hand (left),hand (right),background" + -p must_contain_target_phrase:=False -p debug_mode:=True \ No newline at end of file From d9fbb9899d0d8e2e9520902c9b56312eaa0794cd Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 4 Nov 2023 15:35:11 -0400 Subject: [PATCH 38/46] Revise visual question answering prompt --- .../angel_system_nodes/visual_question_answerer.py | 9 +++++---- .../configs/llm_prompts/vis_qa_teacher_prompt | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index dafb15a6c..ba52d4735 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -451,19 +451,19 @@ def get_response( self.log.info(f"User emotion: {msg.emotion}") return_string = self.chain.run( recipe=self.recipe, - chat_history=chat_history, optional_fields=optional_fields, centered_observables=centered_observables, all_observables=all_observables, + chat_history=chat_history, question=msg.utterance_text, ) if self.debug_mode: sent_prompt = self.chain.prompt.format_prompt( recipe=self.recipe, - chat_history=chat_history, optional_fields=optional_fields, centered_observables=centered_observables, all_observables=all_observables, + chat_history=chat_history, question=msg.utterance_text, ).to_string() sent_prompt = colored(sent_prompt, "light_blue") @@ -529,12 +529,13 @@ def process_question_queue(self): "Inquiring for more details...") response = OBJECT_CLARIFICATION_RESPONSE else: + all_observables -= centered_observables # Normal response generation. 
response = self.get_response( question_msg, self._get_dialogue_history(), - ", ".join(centered_observables) if centered_observables else "", - ", ".join(all_observables) if all_observables else "", + ", ".join(centered_observables) if centered_observables else "Nothing", + ", ".join(all_observables) if all_observables else "Nothing", optional_fields ) self.publish_generated_response(question_msg.utterance_text, response) diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index f0b36e2c3..54cf8bf97 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -3,11 +3,11 @@ You are a teacher helping me learn how to complete a Task. I will tell you how I Task Steps: {recipe} -Chat History: -{chat_history} {optional_fields} -Objects In Front of Me: {centered_observables}. +Objects In Front of Me: {centered_observables} Objects Nearby: {all_observables} -My Question: "{question}" -Your Answer: \ No newline at end of file +Chat: +{chat_history} +Me: {question} +You: \ No newline at end of file From 18f7adf67ebafe06b326739556aa4207bbce4f59 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sat, 4 Nov 2023 15:37:05 -0400 Subject: [PATCH 39/46] Add latest live vqa tmux config --- ...sual_vocalized_question_answering_live.yml | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/tmux/eval_visual_vocalized_question_answering_live.yml b/tmux/eval_visual_vocalized_question_answering_live.yml index 8f5c9d6ba..2fba7790c 100644 --- a/tmux/eval_visual_vocalized_question_answering_live.yml +++ b/tmux/eval_visual_vocalized_question_answering_live.yml @@ -70,8 +70,8 @@ windows: - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args -r __ns:=${ROS_NAMESPACE} -p ip_addr:=${HL2_IP} - -p image_topic:=PVFramesBGR - -p image_ts_topic:=PVFramesBGR_TS + -p 
image_topic:=PVFramesRGB + -p image_ts_topic:=PVFramesRGB_TS -p hand_pose_topic:=disable -p audio_topic:=HeadsetAudioData -p sm_topic:=disable @@ -82,6 +82,12 @@ windows: -p sm_freq:=5 -p rm_depth_AHAT:=disable + # Old videos were recorded in NV12 + #- image_converter: ros2 run angel_datahub ImageConverter --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p topic_input_images:=PVFramesNV12 + # -p topic_output_images:=PVFramesRGB + - image_ts_relay: ros2 run angel_system_nodes image_timestamp_relay --ros-args -r __ns:=${ROS_NAMESPACE} -p image_topic:=PVFramesRGB @@ -89,35 +95,35 @@ windows: # Visualize RGB Images being output from the headset - rqt_rgb_images: rqt -s rqt_image_view/ImageView - --args ${ROS_NAMESPACE}/PVFramesBGR + --args ${ROS_NAMESPACE}/PVFramesRGB --ros-args -p _image_transport:=raw - - object_detector: layout: even-vertical panes: - # - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args - # -r __ns:=${ROS_NAMESPACE} - # -p image_topic:=PVFramesBGR - # -p det_topic:=ObjectDetections2d - # -p det_conf_threshold:=0.1 - # -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml - - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + - berkeley_object_detector: ros2 run angel_system_nodes berkeley_object_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p image_topic:=PVFramesBGR + -p image_topic:=PVFramesRGB -p det_topic:=ObjectDetections2d - -p net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt - -p inference_img_size:=1280 -p det_conf_threshold:=0.1 -p cuda_device_id:=0 + -p model_config:=${BERKELEY_CONFIG_DIR}/MC50-InstanceSegmentation/cooking/coffee/stage2/mask_rcnn_R_50_FPN_1x_demo.yaml + # - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + # -r __ns:=${ROS_NAMESPACE} + # -p image_topic:=PVFramesRGB + # -p det_topic:=ObjectDetections2d + # -p 
net_checkpoint:=${MODEL_DIR}/yolov7-combined_objects-weights.pt + # -p inference_img_size:=1280 + # -p det_conf_threshold:=0.5 + # -p cuda_device_id:=0 - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args -r __ns:=${ROS_NAMESPACE} - -p topic_input_images:=PVFramesBGR + -p topic_input_images:=PVFramesRGB -p topic_input_det_2d:=ObjectDetections2d -p topic_output_images:=pv_image_detections_2d -p filter_top_k:=-1 - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args -r __ns:=${ROS_NAMESPACE} - -p image_ts_topic:=PVFramesBGR_TS + -p image_ts_topic:=PVFramesRGB_TS -p det_topic:=ObjectDetections2d -p act_topic:=ActivityDetections -p model_weights:=${MODEL_DIR}/activity_tcn-coffee-checkpoint.ckpt @@ -136,13 +142,6 @@ windows: -p task_error_topic:=TaskErrors -p query_task_graph_topic:=query_task_graph -p sys_cmd_topic:=SystemCommands - - feedback_generator: ros2 run angel_system_nodes feedback_generator --ros-args - -r __ns:=${ROS_NAMESPACE} - -p activity_detector_topic:=ActivityDetections - -p object_detection_topic:=ObjectDetections3d - -p task_monitor_topic:=TaskUpdates - -p arui_update_topic:=AruiUpdates - -p interp_user_intent_topic:=InterpUserIntents - vocal: layout: even-vertical panes: @@ -183,7 +182,7 @@ windows: - question_answering: layout: even-vertical panes: - - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --log-level visual_question_answerer:=DEBUG --ros-args -r __ns:=${ROS_NAMESPACE} -p utterance_topic:=emotion_topic -p task_state_topic:=task_state_topic @@ -192,8 +191,9 @@ windows: -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_last_n:=5 -p pv_width:=1920 -p pv_height:=1080 + -p obj_det_last_n:=8 -p object_det_ignored_objects:="hand 
(left),hand (right),background" + -p must_contain_target_phrase:=False -p debug_mode:=True \ No newline at end of file From 83e6dbe61ad8d4ac5945fb9e9a31eea67d9e9062 Mon Sep 17 00:00:00 2001 From: Derek Ahmed Date: Sun, 5 Nov 2023 13:02:14 -0500 Subject: [PATCH 40/46] Change object clarification response message --- .../angel_system_nodes/visual_question_answerer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index ba52d4735..e7df1e98c 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -80,11 +80,7 @@ # Below configures the GPT request timeout in seconds. PARAM_TIMEOUT = "timeout" -OBJECT_CLARIFICATION_RESPONSE = """ -Based on the information provided, it seems like you are referring to an object that you are -unsure about. I'm unable to determine what it is. Please describe the object to me or get -closer so that I may help you." -""" + class VisualQuestionAnswerer(BaseDialogueSystemNode): class TimestampedEntity: @@ -527,7 +523,9 @@ def process_question_queue(self): "Received confusing object clarification question from user " +\ f"about multiple objects: ({centered_observables}). " +\ "Inquiring for more details...") - response = OBJECT_CLARIFICATION_RESPONSE + response = "It seems you are asking about an object you are unsure about. " +\ + "I am detecting the following: {}. ".format(centered_observables) +\ + "Is the object you are referenceing one of these objects?" else: all_observables -= centered_observables # Normal response generation. 
From 97f42e207a20fc1bfa99487b428d8711aa96b479 Mon Sep 17 00:00:00 2001 From: BS Date: Sun, 5 Nov 2023 15:36:39 -0500 Subject: [PATCH 41/46] Changes for voice demo --- config/activity_labels/recipe_coffee.yaml | 2 +- config/tasks/multi-task-config.yaml | 2 +- .../gpt_emotion_detector.py | 4 +- .../visual_question_answerer.py | 25 +- .../configs/llm_prompts/vis_qa_teacher_prompt | 25 +- ...it_ll_eval_one_coffee_recipe_steps_v2.json | 286 ++++++++---------- tmux/Nov23-voice-live.yml | 209 +++++++++++++ tmux/demos/2023-10-eval_prep-live.yml | 2 +- 8 files changed, 363 insertions(+), 192 deletions(-) create mode 100644 tmux/Nov23-voice-live.yml diff --git a/config/activity_labels/recipe_coffee.yaml b/config/activity_labels/recipe_coffee.yaml index 22ccea358..0fd0d5dea 100644 --- a/config/activity_labels/recipe_coffee.yaml +++ b/config/activity_labels/recipe_coffee.yaml @@ -69,7 +69,7 @@ labels: - id: 14 label: "pour-beans-filter" #full_str: "Transfer the grounds to the filter cone" - full_str: "pour the grounded coffee beans into the filter cone prepared in step 2" + full_str: "Pour the grounded coffee beans into the filter cone prepared in step 2" depends: [7, 13] - id: 15 label: "thermometer-turn-on" diff --git a/config/tasks/multi-task-config.yaml b/config/tasks/multi-task-config.yaml index 21daa9239..30979b16d 100644 --- a/config/tasks/multi-task-config.yaml +++ b/config/tasks/multi-task-config.yaml @@ -18,7 +18,7 @@ tasks: - id: 1 label: "tea" config_file: "./config/tasks/recipe_tea.yaml" - active: true + active: false - id: 2 label: "pinwheel" config_file: "./config/tasks/recipe_pinwheel.yaml" diff --git a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py index c8eaaa2ce..c34ae434a 100644 --- a/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py +++ b/ros/angel_system_nodes/angel_system_nodes/gpt_emotion_detector.py @@ -33,10 +33,10 @@ def __init__(self): param_values 
= declare_and_get_parameters( self, [ - (PARAM_TIMEOUT, 600), + (PARAM_TIMEOUT, 10), ], ) - self.timeout = param_values[PARAM_TIMEOUT] + self.timeout = 10 # This node additionally includes fields for interacting with OpenAI # via LangChain. diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index e84501e8b..22b27e84e 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -40,7 +40,7 @@ # Below is used to filter out incoming questions. Toggle this parameter to True if questions # are only responded to if they contain the TARGET_PHRASE. PARAM_MUST_CONTAIN_TARGET_PHRASE = "must_contain_target_phrase" -TARGET_PHRASE = "hey angel" +TARGET_PHRASE = "angel" # Below indicates how many of the last n detected objects should be surfaced # in the LLM prompt. These objects do NOT have to be unique. @@ -113,7 +113,7 @@ def __init__(self): (OUT_QA_TOPIC,), (PARAM_CONTEXT_HISTORY_LENGTH, 3), (PARAM_MUST_CONTAIN_TARGET_PHRASE, False), - (PARAM_TIMEOUT, 600), + (PARAM_TIMEOUT, 10), (PARAM_DEBUG_MODE, False), ], ) @@ -349,15 +349,15 @@ def _add_detected_objects_above_threshold(self, msg): ) self.detected_objects_queue.put(te) - def _add_dialogue_history(self, question: str, response: str): - self.dialogue_history.append((f"Me: {question}", f"You: {response}")) + def _add_dialogue_history(self, question: str, response: str, emotion: str): + self.dialogue_history.append((f"Me ({emotion}): {question}", f"You: {response}")) def _get_latest_action(self, curr_time: int) -> str: """ Returns the latest action classification in self.action_classification_queue that does not occur before a provided time. 
""" - latest_action = "not available" + latest_action = "" while not self.action_classification_queue.empty(): next = self.action_classification_queue.queue[0] if next.time < curr_time: @@ -396,6 +396,8 @@ def _get_latest_centered_observables(self, curr_time: int) -> str: observables.add(obj) observables = observables - self.object_dtctn_ignorables return ", ".join(observables) + else: + return "nothing" def _get_latest_observables(self, curr_time: int, n: int) -> str: """ @@ -418,6 +420,8 @@ def _get_latest_observables(self, curr_time: int, n: int) -> str: for obj in detection.entity: observables.add(obj) observables = observables - self.object_dtctn_ignorables + if len(observables)==0: + return "nothing" return ", ".join(observables) def get_response( @@ -478,6 +482,12 @@ def question_answer_callback(self, msg: DialogueUtterance): self.log.info(f"Received message:\n\n{msg.utterance_text}") if not self._apply_filter(msg): return + + msg.utterance_text= msg.utterance_text.replace("Angel, ", "") + msg.utterance_text= msg.utterance_text.replace("angel, ", "") + msg.utterance_text= msg.utterance_text.replace("angel", "") + msg.utterance_text= msg.utterance_text.replace("Angel", "") + msg.utterance_text= msg.utterance_text.capitalize() self.question_queue.put(msg) def process_question_queue(self): @@ -514,7 +524,7 @@ def process_question_queue(self): all_observables, ) self.publish_generated_response(question_msg.utterance_text, response) - self._add_dialogue_history(question_msg.utterance_text, response) + self._add_dialogue_history(question_msg.utterance_text, response,self.get_emotion_or(question_msg)) def publish_generated_response(self, utterance: str, response: str): msg = SystemTextResponse() @@ -536,7 +546,8 @@ def _apply_filter(self, msg): a boolean value indicating if the message passes a filter and should be processed. 
""" if self.param_must_contain_target_phrase: - return TARGET_PHRASE in msg.utterance_text.lower() + return TARGET_PHRASE in msg.utterance_text.lower() or "angela" in msg.utterance_text.lower() + else: return True diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index 082b32571..c82adf915 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -1,21 +1,14 @@ -You are a teacher helping me learn how to complete a Task. I will tell you how I am feeling (positive, negative, neutral), all the objects that I can see, and what I am currently doing. I will ask you a question and you will respond with an answer. +You are a professional chef teaching me how to best make this recipe. I will ask you a question about cooking and you should respond with a short and efficient answer. To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that". -Task Steps: +Here is some context: +Currently I am working on the following recipe: {recipe} -Chat History: -{chat_history} - -My Current Step: {current_step} - -My Emotion: {emotion}. - -My Current Action: {action}. +I finished all steps up until but not including: {current_step} {action} -Objects In Front of Me: {centered_observables}. 
+Here are objects that I see: {centered_observables}, and objects that you can see: {all_observables} -Objects Nearby: {all_observables} - -My Question: "{question}" - -Your Answer: \ No newline at end of file +Our conversation so far: +{chat_history} +Me ({emotion}): {question} +Your Answer (short, helpful with empathy): diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json index 2310d05b5..b2ab7e33a 100644 --- a/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json +++ b/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json @@ -1,187 +1,145 @@ { - "Measure 12 ounces of cold water and transfer to a kettle. Boil the water.": { + "Measure 12 ounces of water in the liquid measuring cup.": { "level": 0, "index": 0, - "sub-steps": { - "Measure 12 ounces of water in the liquid measuring cup": { - "level": 1, - "index": 0, - "activity": "Measure 12 ounces of water in the liquid measuring cup" - }, - "Pour the water from the liquid measuring cup into the electric kettle": { - "level": 1, - "index": 1, - "activity": "Pour the water from the liquid measuring cup into the electric kettle" - }, - "Turn on the Kettle": { - "level": 1, - "index": 2, - "activity": "Turn on the Kettle" - } - } - }, - - "While the water is boiling, place the dripper on top of a coffee mug.": { + "sub-steps": {} + }, + + "Pour the water from the liquid measuring cup into the electric kettle.": { "level": 0, "index": 1, - "sub-steps": { - "Place the dripper on top of the mug": { - "level": 1, - "index": 0, - "activity": "Place the dripper on top of the mug" - } - } + "sub-steps": {} }, - "Prepare the filter insert by folding the paper filter in half to create a semi-circle, and in half again to create a quarter-circle. 
Place the paper filter in the dripper and spread open to create a cone.": { + "Place the dripper on top of the mug.": { "level": 0, "index": 2, - "sub-steps": { - "Take the coffee filter and fold it in half to create a semi-circle": { - "level": 1, - "index": 0, - "activity": "Take the coffee filter and fold it in half to create a semi-circle" - }, - "Fold the filter in half again to create a quarter-circle": { - "level": 1, - "index": 1, - "activity": "Fold the filter in half again to create a quarter-circle" - }, - "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper": { - "level": 1, - "index": 2, - "activity": "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper" - }, - "Spread the filter open to create a cone inside the dripper": { - "level": 1, - "index": 3, - "activity": "Spread the filter open to create a cone inside the dripper" - } - } - }, - - "Weigh the coffee beans and grind until the coffee grounds are the consistency of coarse sand, about 20 seconds. 
Transfer the grounds to the filter cone.": { + "sub-steps": {} + }, + + "Take the coffee filter and fold it in half to create a semi-circle.": { "level": 0, "index": 3, - "sub-steps": { - "Turn on the kitchen scale": { - "level": 1, - "index": 0, - "activity": "Turn on the kitchen scale" - }, - "Place a bowl on the scale": { - "level": 1, - "index": 1, - "activity": "Place a bowl on the scale" - }, - "Zero the scale": { - "level": 1, - "index": 2, - "activity": "Zero the scale" - }, - "Add coffee beans to the bowl until the scale reads 25 grams": { - "level": 1, - "index": 3, - "activity": "Add coffee beans to the bowl until the scale reads 25 grams" - }, - "Pour the measured coffee beans into the coffee grinder": { - "level": 1, - "index": 4, - "activity": "Pour the measured coffee beans into the coffee grinder" - }, - "Set timer for 20 seconds": { - "level": 1, - "index": 5, - "activity": "Set timer for 20 seconds" - }, - "Turn on the timer": { - "level": 1, - "index": 6, - "activity": "Turn on the timer" - }, - "Grind the coffee beans by pressing and holding down on the black part of the lid": { - "level": 1, - "index": 7, - "activity": "Grind the coffee beans by pressing and holding down on the black part of the lid" - }, - "Pour the grounded coffee beans into the filter cone prepared in step 2": { - "level": 1, - "index": 8, - "activity": "Pour the grounded coffee beans into the filter cone prepared in step 2" - } - } - }, - - "Check the temperature.": { + "sub-steps": {} + }, + + "Fold the filter in half again to create a quarter-circle.": { "level": 0, "index": 4, - "sub-steps": { - "Turn on the thermometer": { - "level": 1, - "index": 0, - "activity": "Turn on the thermometer" - }, - "Place the end of the thermometer into the water": { - "level": 1, - "index": 1, - "activity": "Place the end of the thermometer into the water" - } - } - }, - - "Pour a small amount of water in the filter to wet the grounds. 
Wait about 30 seconds.": { + "sub-steps": {} + }, + + "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper.": { "level": 0, "index": 5, - "sub-steps": { - "Set timer to 30 seconds": { - "level": 1, - "index": 0, - "activity": "Set timer to 30 seconds" - }, - "Pour a small amount of water over the grounds in order to wet the grounds": { - "level": 1, - "index": 1, - "activity": "Pour a small amount of water over the grounds in order to wet the grounds" - } - } - }, - - "Slowly pour the rest of the water over the grounds in a circular motion. Do not overfill beyond the top of the paper filter.": { + "sub-steps": {} + }, + + "Spread the filter open to create a cone inside the dripper.": { "level": 0, "index": 6, - "sub-steps": { - "Slowly pour the water over the grounds in a circular motion. Do not overfill beyond the top of the paper filter": { - "level": 1, - "index": 0, - "activity": "Slowly pour the water over the grounds in a circular motion. Do not overfill beyond the top of the paper filter" - } - } + "sub-steps": {} }, - "Let the coffee drain completely into the mug before removing the dripper. 
Discard the paper filter and coffee grounds.": { + "Turn on the kitchen scale.": { "level": 0, "index": 7, - "sub-steps": { - "Allow the rest of the water in the dripper to drain": { - "level": 1, - "index": 0, - "activity": "Allow the rest of the water in the dripper to drain" - }, - "Remove the dripper from the cup": { - "level": 1, - "index": 1, - "activity": "Remove the dripper from the cup" - }, - "Remove the coffee grounds and paper filter from the dripper": { - "level": 1, - "index": 2, - "activity": "Remove the coffee grounds and paper filter from the dripper" - }, - "Discard the coffee grounds and paper filter": { - "level": 1, - "index": 3, - "activity": "Discard the coffee grounds and paper filter" - } - } + "sub-steps": {} + }, + + "Place a bowl on the scale.": { + "level": 0, + "index": 8, + "sub-steps": {} + }, + + "Zero the scale.": { + "level": 0, + "index": 9, + "sub-steps": {} + }, + + "Add coffee beans to the bowl until the scale reads 25 grams.": { + "level": 0, + "index": 10, + "sub-steps": {} + }, + + "Pour the measured coffee beans into the coffee grinder.": { + "level": 0, + "index": 11, + "sub-steps": {} + }, + + "Grind the coffee beans by pressing and holding down on the black part of the lid.": { + "level": 0, + "index": 12, + "sub-steps": {} + }, + + "Pour the grounded coffee beans into the filter cone prepared in step 2.": { + "level": 0, + "index": 13, + "sub-steps": {} + }, + + "Turn on the thermometer.": { + "level": 0, + "index": 14, + "sub-steps": {} + }, + + "Place the end of the thermometer into the water.": { + "level": 0, + "index": 15, + "sub-steps": {} + }, + + "Check the temperature displayed on the thermometer.": { + "level": 0, + "index": 16, + "sub-steps": {} + }, + + "Pour a small amount of water over the grounds in order to wet the grounds": { + "level": 0, + "index": 17, + "sub-steps": {} + }, + + "Slowly pour the water over the grounds in a circular motion. 
do not overfill beyond the top of the paper filter": { + "level": 0, + "index": 18, + "sub-steps": {} + }, + + "Allow the rest of the water in the dripper to drain.": { + "level": 0, + "index": 19, + "sub-steps": {} + }, + + "Remove the dripper from the cup.": { + "level": 0, + "index": 20, + "sub-steps": {} + }, + + "Remove the coffee grounds and paper filter from the dripper.": { + "level": 0, + "index": 21, + "sub-steps": {} + }, + + "Discard the coffee grounds and paper filter.": { + "level": 0, + "index": 22, + "sub-steps": {} + }, + + "Turn on the kettle.": { + "level": 0, + "index": 23, + "sub-steps": {} } } diff --git a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml new file mode 100644 index 000000000..b4e977978 --- /dev/null +++ b/tmux/Nov23-voice-live.yml @@ -0,0 +1,209 @@ +# +# System configuration to run the ANGEL system for the 2022/11 PI meeting and +# Evaluation 1. +# + +name: 2023-10-eval-live +root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> + +# Optional tmux socket +# socket_name: foo + +# Note that the pre and post options have been deprecated and will be replaced by +# project hooks. + +# Project hooks + +# Runs on project start, always +# on_project_start: command +on_project_start: | + export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} + export HL2_IP=${HL2_IP:-192.168.0.23} + export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config + export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs + export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs + export MODEL_DIR=${ANGEL_WORKSPACE_DIR}/model_files + export BAGS_DIR=${ANGEL_WORKSPACE_DIR}/ros_bags + #export RMW_IMPLEMENTATION=rmw_cyclonedds_cpp + + # Changing the domain ID was important at KHQ to unblock perceived network + # congestion slowdowns to message sending. + export ROS_DOMAIN_ID=77 + + # Set the frame-rate to be used by multiple sources. This should be in frames + # per second (Hz). 
+ export FRAME_RATE=15 + +# Run on project start, the first time +# on_project_first_start: command + +# Run on project start, after the first time +# on_project_restart: command + +# Run on project exit ( detaching from tmux session ) +# on_project_exit: command + +# Run on project stop +# on_project_stop: command + +# Runs in each window and pane before window/pane specific commands. Useful for setting up interpreter versions. +# pre_window: rbenv shell 2.0.0-p247 + +# Pass command line options to tmux. Useful for specifying a different tmux.conf. +# tmux_options: -f ~/.tmux.mac.conf +tmux_options: -f <%= ENV["ANGEL_WORKSPACE_DIR"] %>/tmux/tmux.conf + +windows: + - sensor_input: + layout: even-vertical + panes: + - datahub: ros2 run ros_tcp_endpoint default_server_endpoint --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ROS_IP:=0.0.0.0 + - hl2ss_bridge: ros2 run angel_system_nodes hl2ss_ros_bridge --ros-args + -r __ns:=${ROS_NAMESPACE} + -p ip_addr:=${HL2_IP} + -p image_topic:=PVFramesBGR + -p image_ts_topic:=PVFramesBGR_TS + -p hand_pose_topic:=disable + -p audio_topic:=HeadsetAudioData + -p sm_topic:=disable + -p head_pose_topic:=disable + -p pv_width:=1280 + -p pv_height:=720 + -p pv_framerate:=${FRAME_RATE} + -p sm_freq:=5 + -p rm_depth_AHAT:=disable + + # Visualize RGB Images being output from the headset + #- rqt_rgb_images: rqt -s rqt_image_view/ImageView + # --args ${ROS_NAMESPACE}/PVFramesBGR + # --ros-args -p _image_transport:=raw + + - vocal: + layout: even-vertical + panes: + - vad: ros2 run angel_system_nodes voice_activity_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_audio_topic:=HeadsetAudioData + -p output_voice_activity_topic:=DetectedVoiceData + -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_cadence:=4 + -p vad_margin:=0.50 + -p max_accumulation_length:=15 + -p debug_mode:=True + - asr: ros2 run angel_system_nodes asr --ros-args + -r __ns:=${ROS_NAMESPACE} + -p audio_topic:=DetectedVoiceData + -p 
utterances_topic:=utterances_topic + -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_req_segment_duration:=2 + -p is_sentence_tokenize:=False + -p debug_mode:=True + - emotion_detection: + layout: even-vertical + panes: + - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=base_emotion_topic + - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -r __ns:=${ROS_NAMESPACE} + -p input_topic:=utterances_topic + -p user_emotion_topic:=gpt_emotion_topic + -p timeout:=2 + - question_answering: + layout: even-vertical + panes: + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + -r __ns:=${ROS_NAMESPACE} + -p utterance_topic:=gpt_emotion_topic + -p task_state_topic:=TaskUpdates + -p object_detections_topic:=ObjectDetections2d + -p action_classifications_topic:=ActivityDetections + -p system_text_response_topic:=system_text_response_topic + -p recipe_path:=${NODE_CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p prompt_template_path:=${NODE_CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt + -p obj_det_last_n:=5 + -p pv_width:=1920 + -p pv_height:=1080 + -p must_contain_target_phrase:=True + -p object_det_ignored_objects:="hand (left),hand (right),background,cutting board,trash can,banana (peeled),microwave (open), microwave (closed)" + -p debug_mode:=True + + - object_detector: + layout: even-vertical + panes: + - object_detector: ros2 run angel_system_nodes object_detection_yolo_v7 --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_topic:=PVFramesBGR + -p det_topic:=ObjectDetections2d + -p net_checkpoint:=${MODEL_DIR}/all_recipes+additional_objs+bkgd_yolov7.pt + -p inference_img_size:=1280 + -p det_conf_threshold:=0.1 + -p cuda_device_id:=0 + + - simple_2d_overlay: ros2 run angel_debug Simple2dDetectionOverlay --ros-args + -r __ns:=${ROS_NAMESPACE} + -p 
topic_input_images:=PVFramesBGR + -p topic_input_det_2d:=ObjectDetections2d + -p topic_output_images:=pv_image_detections_2d + -p filter_top_k:=-1 + + - activity_classifier: ros2 run angel_system_nodes activity_classifier_tcn --ros-args + -r __ns:=${ROS_NAMESPACE} + -p image_ts_topic:=PVFramesBGR_TS + -p det_topic:=ObjectDetections2d + -p act_topic:=ActivityDetections + -p model_weights:=${MODEL_DIR}/yolo_all_recipes_additional_objs_bkgd_sample_rate_2.ckpt + -p model_mapping:=${MODEL_DIR}/yolo_all_recipes_additional_objs_bkgd_act_mapping.txt + -p model_det_label_mapping:=${MODEL_DIR}/activity_tcn-all_activities-det_label_mapping.json + -p model_device:=cuda + -p model_dets_conv_version:=5 + -p window_size:=30 + -p buffer_max_size_seconds:=5 + -p image_pix_width:=1280 + -p image_pix_height:=720 + + - keyboard_sys_cmd: ros2 run angel_system_nodes keyboard_to_sys_cmd --ros-args + -r __ns:=${ROS_NAMESPACE} + -p system_command_topic:=SystemCommands + + - task_monitor: ros2 run angel_system_nodes global_step_predictor --ros-args + -r __ns:=${ROS_NAMESPACE} + -p det_topic:=ActivityDetections + -p model_file:=${MODEL_DIR}/global_step_predictor_act_avgs_all_classes_v2.0_sample_rate_2.npy + -p threshold_multiplier_weak:=0.05 + -p thresh_frame_count:=$((8 / (30 / ${FRAME_RATE}))) + -p threshold_frame_count_weak:=2 + -p deactivate_thresh_frame_count:=$((20 / (30 / ${FRAME_RATE}))) + -p step_mode:=granular + -p config_file:=${CONFIG_DIR}/tasks/multi-task-config.yaml + -p task_state_topic:=TaskUpdates + -p query_task_graph_topic:=query_task_graph + -p task_error_topic:=TaskErrors + -p system_command_topic:=SystemCommands + #-p gt_activity_mscoco:=model_files/test_activity_preds.mscoco.json + #-p gt_video_id:=8 + #-p gt_output_dir:="${BAGS_DIR}" + + - feedback_generator: ros2 run angel_system_nodes feedback_generator --ros-args + -r __ns:=${ROS_NAMESPACE} + -p activity_detector_topic:=ActivityDetections + -p object_detection_topic:=ObjectDetections3d + -p 
task_monitor_topic:=TaskUpdates + -p arui_update_topic:=AruiUpdates + -p interp_user_intent_topic:=InterpUserIntents + -p system_text_response_topic:=system_text_response_topic + + - engineering-ui: + layout: even-vertical + panes: + - engineering_ui_websocket: ros2 launch rosbridge_server rosbridge_websocket_launch.xml port:=9090 + - engineering_ui_server: node src/angel_utils/multi_task_demo_ui/index.js + --namespace=${ROS_NAMESPACE} + --image_topic=pv_image_detections_2d/compressed + --query_task_graph_topic=query_task_graph + --task_updates_topic=TaskUpdates + --activity_detections_topic=ActivityDetections + --task_errors_topic=TaskErrors diff --git a/tmux/demos/2023-10-eval_prep-live.yml b/tmux/demos/2023-10-eval_prep-live.yml index fa65b4e5e..1c561f034 100644 --- a/tmux/demos/2023-10-eval_prep-live.yml +++ b/tmux/demos/2023-10-eval_prep-live.yml @@ -18,7 +18,7 @@ root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # on_project_start: command on_project_start: | export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} - export HL2_IP=${HL2_IP:-192.168.1.4} + export HL2_IP=${HL2_IP:-192.168.4.65} export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs From 2727083a3e896ef1cd572e6fec66ce7eee0e94f5 Mon Sep 17 00:00:00 2001 From: BS Date: Sun, 5 Nov 2023 22:46:11 -0500 Subject: [PATCH 42/46] Adding utterance to feedback generator --- q | 155 ++++++++++++++++++ .../angel_system_nodes/feedback_generator.py | 45 ++++- tmux/Nov23-voice-live.yml | 5 +- 3 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 q diff --git a/q b/q new file mode 100644 index 000000000..1e1a202d8 --- /dev/null +++ b/q @@ -0,0 +1,155 @@ +diff --cc ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +index 22b27e84,e7df1e98..00000000 +--- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py ++++ 
b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +@@@ -395,11 -404,9 +404,11 @@@ class VisualQuestionAnswerer(BaseDialog + obj, score = obj_score + observables.add(obj) + observables = observables - self.object_dtctn_ignorables +- return ", ".join(observables) + - return observables  +++ return observables + + else:  + + return "nothing" +  +- def _get_latest_observables(self, curr_time: int, n: int) -> str: ++ def _get_latest_observables(self, curr_time: int, n: int) -> Set: + """ + Returns a comma-delimited list of all observed objects per all + entities in self.detected_objects_queue that occurred before a provided time. +@@@ -420,9 -427,7 +429,9 @@@ + for obj in detection.entity: + observables.add(obj) + observables = observables - self.object_dtctn_ignorables + + if len(observables)==0: + + return "nothing" +- return ", ".join(observables) ++ return observables +  + def get_response( + self, +@@@ -479,17 -476,22 +480,28 @@@ + This is the main ROS node listener callback loop that will process + all messages received via subscribed topics. 
+ """ +- self.log.info(f"Received message:\n\n{msg.utterance_text}") ++ self.log.info(f"Received message: \"{msg.utterance_text}\"") + if not self._apply_filter(msg): + return + +  + + msg.utterance_text= msg.utterance_text.replace("Angel, ", "") + + msg.utterance_text= msg.utterance_text.replace("angel, ", "") + + msg.utterance_text= msg.utterance_text.replace("angel", "") + + msg.utterance_text= msg.utterance_text.replace("Angel", "") + + msg.utterance_text= msg.utterance_text.capitalize() + self.question_queue.put(msg) +  ++ def _get_optional_fields_string(self, emotion: str, current_step: str, ++ current_action: str) -> str: ++ optional_fields_string = "\n" ++ if emotion: ++ optional_fields_string += f"Emotion: {emotion}\n" ++ if current_step: ++ optional_fields_string += f"My Current Step: {current_step}\n" ++ if current_action: ++ optional_fields_string += f"My Current Action: {current_action}\n" ++ return optional_fields_string.rstrip("\n") ++  + def process_question_queue(self): + """ + Constant loop to process received questions. +@@@ -498,33 -500,44 +510,44 @@@ + while True: + question_msg = self.question_queue.get() + start_time = self._get_sec(question_msg) +- self.log.info(f"Processing utterance {question_msg.utterance_text}") +-  +- # Get most recently detected action. +- action = self._get_latest_action(start_time) +- self.log.info(f"Latest action: {action}") ++ self.log.info(f"Processing utterance \"{question_msg.utterance_text}\"") +  ++ # Get the optional fields. ++ optional_fields = \ ++ self._get_optional_fields_string(question_msg.emotion, self._get_current_step(), ++ self._get_latest_action(start_time)) + # Get centered detected objects. + centered_observables = self._get_latest_centered_observables(start_time) +- self.log.info(f"Observed objects: {centered_observables}") +-  + # Get all detected objects. 
+- all_observables = self._get_latest_observables( +- start_time, self.object_dtctn_last_n_obj_detections +- ) +- self.log.info(f"Observed objects: {all_observables}") +-  +- # Generate response. +- response = self.get_response( +- question_msg, +- self._get_dialogue_history(), +- self._get_current_step(), +- action, +- centered_observables, +- all_observables, +- ) ++ all_observables = \ ++ self._get_latest_observables(start_time, self.object_dtctn_last_n_obj_detections) ++  ++ response = None ++ is_object_clarification = \ ++ question_msg.intent and question_msg.intent == INTENT_LABELS[3] ++ if is_object_clarification and len(centered_observables) > 1: ++ # Object Clarification override: If an associated intent exists and indicates ++ # object clarification in the presence of multiple objects, override the response with ++ # a clarification question. ++ self.log.info( ++ "Received confusing object clarification question from user " +\ ++ f"about multiple objects: ({centered_observables}). " +\ ++ "Inquiring for more details...") ++ response = "It seems you are asking about an object you are unsure about. " +\ ++ "I am detecting the following: {}. ".format(centered_observables) +\ ++ "Is the object you are referenceing one of these objects?" ++ else: ++ all_observables -= centered_observables ++ # Normal response generation. 
++ response = self.get_response( ++ question_msg, ++ self._get_dialogue_history(), ++ ", ".join(centered_observables) if centered_observables else "Nothing", ++ ", ".join(all_observables) if all_observables else "Nothing", ++ optional_fields ++ ) + self.publish_generated_response(question_msg.utterance_text, response) + - self._add_dialogue_history(question_msg.utterance_text, response) + + self._add_dialogue_history(question_msg.utterance_text, response,self.get_emotion_or(question_msg)) +  + def publish_generated_response(self, utterance: str, response: str): + msg = SystemTextResponse() +diff --cc ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +index c82adf91,54cf8bf9..00000000 +--- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt ++++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +@@@ -1,14 -1,13 +1,14 @@@ +- You are a professional chef teaching me how to best make this recipe. I will ask you a question about cooking and you should respond with a short and efficient answer. To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that".  + -You are a teacher helping me learn how to complete a Task. I will tell you how I am feeling (positive, negative, neutral), all the objects that I can see, and what I am currently doing. I will ask you a question and you will respond with an answer. +++You are a professional chef teaching me how to best make this recipe. I will ask you questions about cooking and you should respond with a short and efficient answer. To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that".  
+  + -Task Steps: + +Here is some context: + +Currently I am working on the following recipe: + {recipe} +  + -{optional_fields} + -Objects In Front of Me: {centered_observables} + -Objects Nearby: {all_observables} + +I finished all steps up until but not including: {current_step} {action} +  + -Chat:  + +Here are objects that I see: {centered_observables}, and objects that you can see: {all_observables} + + + +Our conversation so far: + {chat_history} + -Me: {question} + -You: + +Me ({emotion}): {question} + +Your Answer (short, helpful with empathy): diff --git a/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py b/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py index 79714f821..acc20a437 100644 --- a/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py +++ b/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py @@ -15,6 +15,7 @@ SystemTextResponse, TaskUpdate, VisionBoundingBox3d, + DialogueUtterance, ) from angel_utils import declare_and_get_parameters @@ -26,6 +27,7 @@ PARAM_ARUI_UPDATE_TOPIC = "arui_update_topic" PARAM_INTERP_USER_INTENT_TOPIC = "interp_user_intent_topic" PARAM_SYSTEM_TEXT_RESPONSE_TOPIC = "system_text_response_topic" +PARAM_UTTERANCE_TOPIC = "utterances_topic" class FeedbackGenerator(Node): @@ -55,6 +57,7 @@ def __init__(self): (PARAM_ARUI_UPDATE_TOPIC,), (PARAM_INTERP_USER_INTENT_TOPIC,), (PARAM_SYSTEM_TEXT_RESPONSE_TOPIC,), + (PARAM_UTTERANCE_TOPIC,), ], ) @@ -66,6 +69,9 @@ def __init__(self): self._system_text_response_topic = param_values[ PARAM_SYSTEM_TEXT_RESPONSE_TOPIC ] + self._utterance_topic = param_values[ + PARAM_UTTERANCE_TOPIC + ] # subscribers self.activity_subscriber = self.create_subscription( @@ -94,6 +100,13 @@ def __init__(self): 1, ) + self.utterance_subscriber = self.create_subscription( + DialogueUtterance, + self._utterance_topic, + self.utterance_callback, + 1, + ) + # publisher self.arui_update_publisher = self.create_publisher( AruiUpdate, self._arui_update_topic, 
1 @@ -248,11 +261,41 @@ def system_text_response_callback(self, msg: SystemTextResponse) -> None: notification.category = notification.N_CAT_NOTICE notification.context = notification.N_CONTEXT_USER_MODELING - notification.title = f"System response for: {msg.utterance_text}" + notification.title = f"{msg.utterance_text}" notification.description = f"{msg.response}" self.publish_update(notifications=[notification]) + def utterance_callback(self, msg: DialogueUtterance) -> None: + """ + This is the main ROS node listener callback loop that will process + all messages received via subscribed topics. + """ + keyword_check = msg.utterance_text[0:8] + if (keyword_check.contains("Angel,") or keyword_check.contains("angel,") or keyword_check.contains("Angela") or keyword_check.contains("angela,") or keyword_check.contains("Angel,") or keyword_check.contains("angel") or keyword_check.contains("Angela") or keyword_check.contains("angela")): + arui_message = msg.utterance_text + arui_message= arui_message.replace("Angel, ", "") + arui_message= arui_message.replace("angel, ", "") + arui_message= arui_message.replace("Angela, ", "") + arui_message= arui_message.replace("angela, ", "") + arui_message= arui_message.replace("Angel ", "") + arui_message= arui_message.replace("angel ", "") + arui_message= arui_message.replace("Angela ", "") + arui_message= arui_message.replace("angela ", "") + arui_message= arui_message.capitalize() + + # Create an AruiUserNotification msg with this information + notification = AruiUserNotification() + + notification.category = notification.N_CAT_NOTICE + notification.context = notification.N_CONTEXT_USER_MODELING + + notification.title = f"{arui_message}" + notification.description = "" + + self.publish_update(notifications=[notification]) + + def main(): rclpy.init() diff --git a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml index b4e977978..34bb6ce60 100644 --- a/tmux/Nov23-voice-live.yml +++ b/tmux/Nov23-voice-live.yml @@ -18,7 +18,7 @@ 
root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # on_project_start: command on_project_start: | export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} - export HL2_IP=${HL2_IP:-192.168.0.23} + export HL2_IP=${HL2_IP:-172.20.10.12} export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs @@ -124,7 +124,7 @@ windows: -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${NODE_CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${NODE_CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_last_n:=5 + -p obj_det_last_n:=8 -p pv_width:=1920 -p pv_height:=1080 -p must_contain_target_phrase:=True @@ -193,6 +193,7 @@ windows: -p object_detection_topic:=ObjectDetections3d -p task_monitor_topic:=TaskUpdates -p arui_update_topic:=AruiUpdates + -p utterances_topic:=utterances_topic -p interp_user_intent_topic:=InterpUserIntents -p system_text_response_topic:=system_text_response_topic From 5f91c8deefd0cac3080b4493b62c08bd4a5567f0 Mon Sep 17 00:00:00 2001 From: BS Date: Mon, 6 Nov 2023 16:52:47 -0500 Subject: [PATCH 43/46] final touches --- .../global_step_predictor.py | 6 --- .../angel_system_nodes/feedback_generator.py | 5 ++- .../visual_question_answerer.py | 9 ++--- .../configs/llm_prompts/vis_qa_teacher_prompt | 9 +++-- ...it_ll_eval_one_coffee_recipe_steps_v2.json | 40 +++++++------------ tmux/Nov23-voice-live.yml | 40 +++++++++++++------ 6 files changed, 54 insertions(+), 55 deletions(-) diff --git a/angel_system/global_step_prediction/global_step_predictor.py b/angel_system/global_step_prediction/global_step_predictor.py index 824f66953..5cb2d1fe0 100644 --- a/angel_system/global_step_prediction/global_step_predictor.py +++ b/angel_system/global_step_prediction/global_step_predictor.py @@ -684,8 +684,6 @@ def conditionally_reset_irrational_trackers(self, tracker, skip=False): ): 
print("reset condition hit!!") # import ipdb; ipdb.set_trace() - if tracker["recipe"] == "coffee": - print(f"tea step = {self.trackers[1]['current_granular_step']}") for tracker_ind in self.find_trackers_by_recipe( resetter_granular_step[recipe][1] ): @@ -696,10 +694,6 @@ def conditionally_reset_irrational_trackers(self, tracker, skip=False): ][0] ): self.reset_one_tracker(tracker_ind) - if tracker["recipe"] == "coffee": - print( - f"tea step after = {self.trackers[1]['current_granular_step']}" - ) else: for recipe in resetter_granular_step: granular_steps = [ diff --git a/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py b/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py index acc20a437..3a9c8b873 100644 --- a/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py +++ b/ros/angel_system_nodes/angel_system_nodes/feedback_generator.py @@ -272,7 +272,10 @@ def utterance_callback(self, msg: DialogueUtterance) -> None: all messages received via subscribed topics. 
""" keyword_check = msg.utterance_text[0:8] - if (keyword_check.contains("Angel,") or keyword_check.contains("angel,") or keyword_check.contains("Angela") or keyword_check.contains("angela,") or keyword_check.contains("Angel,") or keyword_check.contains("angel") or keyword_check.contains("Angela") or keyword_check.contains("angela")): + if "Angel," in keyword_check or "angel," in keyword_check or\ + "Angela," in keyword_check or "angela," in keyword_check or\ + "Angel" in keyword_check or "angel" in keyword_check or\ + "Angela" in keyword_check or "angela" in keyword_check: arui_message = msg.utterance_text arui_message= arui_message.replace("Angel, ", "") arui_message= arui_message.replace("angel, ", "") diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index f86a2ec81..8c5c8ac0c 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -242,9 +242,6 @@ def __init__(self): self.openai_api_key = self._configure_openai_api_key() self.openai_org_id = self._configure_openai_org_id() - # Configure LangChain. - self.chain = self._configure_langchain() - def _configure_openai_org_id(self): if not os.getenv("OPENAI_ORG_ID"): raise ValueError( @@ -290,6 +287,8 @@ def _configure_langchain(self): input_variables=PROMPT_VARIABLES, template=self.prompt_template, ) + zero_shot_example = langchain.PromptTemplate.from_template("Tell me a joke") + return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) def _get_sec(self, msg: DialogueUtterance) -> int: @@ -472,7 +471,7 @@ def get_response( ) except RuntimeError as err: self.log.info(err) - return_string = "I'm sorry. I don't know how to answer your statement." + return_string = "I'm sorry. I don't know how to answer your question." 
return return_string def question_answer_callback(self, msg: DialogueUtterance): @@ -531,7 +530,7 @@ def process_question_queue(self): "I am detecting the following: {}. ".format(centered_observables) +\ "Is the object you are referenceing one of these objects?" else: - all_observables -= centered_observables + all_observables = centered_observables # Normal response generation. response = self.get_response( question_msg, diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index 7bbff9641..a3db4d85c 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -1,3 +1,4 @@ +### You are a professional chef teaching me how to best make this recipe. I will ask you questions about cooking and you should respond with a short and efficient answer. To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that". 
Here is some context: @@ -5,10 +6,10 @@ Currently I am working on the following recipe: {recipe} {optional_fields} +Here are objects that you can see: {all_observables} ({centered_observables}) +### -Here are objects that I see: {centered_observables}, and objects that you can see: {all_observables} - -Our conversation so far: +Our chat history: {chat_history} {question} -Your Answer (short, helpful with empathy): +Your Answer (short, helpful with empathy): \ No newline at end of file diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json index b2ab7e33a..5ad52699f 100644 --- a/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json +++ b/ros/angel_system_nodes/configs/mit_ll_eval_one_coffee_recipe_steps_v2.json @@ -11,37 +11,37 @@ "sub-steps": {} }, - "Place the dripper on top of the mug.": { + "Turn on the Kettle.": { "level": 0, "index": 2, "sub-steps": {} }, - "Take the coffee filter and fold it in half to create a semi-circle.": { + "Place the dripper on top of the mug.": { "level": 0, "index": 3, "sub-steps": {} }, - "Fold the filter in half again to create a quarter-circle.": { + "Take the coffee filter and fold it in half to create a semi-circle.": { "level": 0, "index": 4, "sub-steps": {} }, - "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper.": { + "Fold the filter in half again to create a quarter-circle.": { "level": 0, "index": 5, "sub-steps": {} }, - "Spread the filter open to create a cone inside the dripper.": { + "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper.": { "level": 0, "index": 6, "sub-steps": {} }, - "Turn on the kitchen scale.": { + "Spread the filter open to create a cone inside the dripper.": { "level": 0, "index": 7, "sub-steps": {} @@ -53,7 +53,7 @@ "sub-steps": 
{} }, - "Zero the scale.": { + "Turn on the kitchen scale and zero the scale.": { "level": 0, "index": 9, "sub-steps": {} @@ -77,7 +77,7 @@ "sub-steps": {} }, - "Pour the grounded coffee beans into the filter cone prepared in step 2.": { + "Transfer the grounds to the filter cone.": { "level": 0, "index": 13, "sub-steps": {} @@ -95,51 +95,39 @@ "sub-steps": {} }, - "Check the temperature displayed on the thermometer.": { + "Slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter.": { "level": 0, "index": 16, "sub-steps": {} }, - "Pour a small amount of water over the grounds in order to wet the grounds": { + "Continue slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter...": { "level": 0, "index": 17, "sub-steps": {} }, - "Slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter": { - "level": 0, - "index": 18, - "sub-steps": {} - }, - "Allow the rest of the water in the dripper to drain.": { "level": 0, - "index": 19, + "index": 18, "sub-steps": {} }, "Remove the dripper from the cup.": { "level": 0, - "index": 20, + "index": 19, "sub-steps": {} }, "Remove the coffee grounds and paper filter from the dripper.": { "level": 0, - "index": 21, + "index": 20, "sub-steps": {} }, "Discard the coffee grounds and paper filter.": { "level": 0, - "index": 22, - "sub-steps": {} - }, - - "Turn on the kettle.": { - "level": 0, - "index": 23, + "index": 21, "sub-steps": {} } } diff --git a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml index 34bb6ce60..a2dca3f14 100644 --- a/tmux/Nov23-voice-live.yml +++ b/tmux/Nov23-voice-live.yml @@ -18,7 +18,7 @@ root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # on_project_start: command on_project_start: | export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} - export HL2_IP=${HL2_IP:-172.20.10.12} + export HL2_IP=${HL2_IP:-192.168.1.3} export 
CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs @@ -90,7 +90,7 @@ windows: -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad -p vad_cadence:=4 -p vad_margin:=0.50 - -p max_accumulation_length:=15 + -p max_accumulation_length:=4 -p debug_mode:=True - asr: ros2 run angel_system_nodes asr --ros-args -r __ns:=${ROS_NAMESPACE} @@ -99,36 +99,50 @@ windows: -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr -p asr_req_segment_duration:=2 -p is_sentence_tokenize:=False - -p debug_mode:=True - - emotion_detection: + -p debug_mode:=True + - intent_detection: layout: even-vertical panes: - - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args + - intent_detection: ros2 run angel_system_nodes gpt_intent_detector --ros-args -r __ns:=${ROS_NAMESPACE} -p input_topic:=utterances_topic - -p user_emotion_topic:=base_emotion_topic - - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args + -p expect_user_intent_topic:=expect_user_intent_topic + -p interp_user_intent_topic:=interp_user_intent_topic + -p timeout:=2 + - emotion_detection: + layout: even-vertical + panes: +# - base_emotion_detection: ros2 run angel_system_nodes base_emotion_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=utterances_topic +# -p user_emotion_topic:=base_emotion_topic +# - gpt_emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args +# -r __ns:=${ROS_NAMESPACE} +# -p input_topic:=utterances_topic +# -p user_emotion_topic:=gpt_emotion_topic +# -p timeout:=2 + - emotion_detection: ros2 run angel_system_nodes gpt_emotion_detector --ros-args -r __ns:=${ROS_NAMESPACE} - -p input_topic:=utterances_topic - -p user_emotion_topic:=gpt_emotion_topic + -p input_topic:=interp_user_intent_topic + -p user_emotion_topic:=emotion_topic -p 
timeout:=2 - question_answering: layout: even-vertical panes: - - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --ros-args + - gpt_qa: ros2 run angel_system_nodes visual_question_answerer --log-level visual_question_answerer:=DEBUG --ros-args -r __ns:=${ROS_NAMESPACE} - -p utterance_topic:=gpt_emotion_topic + -p utterance_topic:=emotion_topic -p task_state_topic:=TaskUpdates -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic -p recipe_path:=${NODE_CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json -p prompt_template_path:=${NODE_CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt - -p obj_det_last_n:=8 -p pv_width:=1920 -p pv_height:=1080 + -p obj_det_last_n:=8 -p must_contain_target_phrase:=True - -p object_det_ignored_objects:="hand (left),hand (right),background,cutting board,trash can,banana (peeled),microwave (open), microwave (closed)" + -p object_det_ignored_objects:="hand (left),hand (right),background,cutting board,trash can,banana (peeled),microwave (open), microwave (closed), paper towel" -p debug_mode:=True - object_detector: From c3bdccb2c783045796e468f2e299c85577653f61 Mon Sep 17 00:00:00 2001 From: BS Date: Tue, 7 Nov 2023 10:26:51 -0500 Subject: [PATCH 44/46] Small fixes for demo --- config/tasks/multi-task-config.yaml | 4 +- config/tasks/recipe_coffee.yaml | 2 +- ros/angel_msgs/msg/TaskUpdate.msg | 1 + .../visual_question_answerer.py | 7 +- .../mit_ll_eval_one_oatmeal_recipe_steps.json | 133 ++++++++++++++++++ tmux/Nov23-voice-live.yml | 4 +- 6 files changed, 143 insertions(+), 8 deletions(-) create mode 100644 ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json diff --git a/config/tasks/multi-task-config.yaml b/config/tasks/multi-task-config.yaml index 30979b16d..bfad10c19 100644 --- a/config/tasks/multi-task-config.yaml +++ b/config/tasks/multi-task-config.yaml @@ -14,7 +14,7 @@ tasks: - id: 0 label: 
"coffee" config_file: "./config/tasks/recipe_coffee.yaml" - active: true + active: false - id: 1 label: "tea" config_file: "./config/tasks/recipe_tea.yaml" @@ -30,4 +30,4 @@ tasks: - id: 4 label: "dessert quesadilla" config_file: "./config/tasks/recipe_dessertquesadilla.yaml" - active: false + active: true diff --git a/config/tasks/recipe_coffee.yaml b/config/tasks/recipe_coffee.yaml index ad8e723cc..a5f851598 100644 --- a/config/tasks/recipe_coffee.yaml +++ b/config/tasks/recipe_coffee.yaml @@ -30,7 +30,7 @@ to create a quarter-circle. Place the paper filter in the dripper and spread ope label: "coffee-beans-to-grounds" full_str: "Weigh the coffee beans and grind until the coffee grounds are the consistency of coarse sand, about 20 seconds. Transfer the grounds to the filter cone." - activity_ids: [28, 29, 28, 30, 31, 32, 33] + activity_ids: [29, 28, 30, 31, 32, 33] - id: 5 label: "check-temp" full_str: "Check the temperature of the water." diff --git a/ros/angel_msgs/msg/TaskUpdate.msg b/ros/angel_msgs/msg/TaskUpdate.msg index fdfd5005b..73fe8db56 100644 --- a/ros/angel_msgs/msg/TaskUpdate.msg +++ b/ros/angel_msgs/msg/TaskUpdate.msg @@ -20,6 +20,7 @@ int8 current_step_id # String of the step currently in progress. string current_step + # Previous step is the step worked on before the current step. string previous_step diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 8c5c8ac0c..c3e7669ba 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -80,8 +80,6 @@ # Below configures the GPT request timeout in seconds. 
PARAM_TIMEOUT = "timeout" - - class VisualQuestionAnswerer(BaseDialogueSystemNode): class TimestampedEntity: """ @@ -242,6 +240,9 @@ def __init__(self): self.openai_api_key = self._configure_openai_api_key() self.openai_org_id = self._configure_openai_org_id() + # Configure LangChain. + self.chain = self._configure_langchain() + def _configure_openai_org_id(self): if not os.getenv("OPENAI_ORG_ID"): raise ValueError( @@ -288,7 +289,7 @@ def _configure_langchain(self): template=self.prompt_template, ) zero_shot_example = langchain.PromptTemplate.from_template("Tell me a joke") - + return LLMChain(llm=openai_llm, prompt=zero_shot_prompt) def _get_sec(self, msg: DialogueUtterance) -> int: diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json new file mode 100644 index 000000000..5ad52699f --- /dev/null +++ b/ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json @@ -0,0 +1,133 @@ +{ + "Measure 12 ounces of water in the liquid measuring cup.": { + "level": 0, + "index": 0, + "sub-steps": {} + }, + + "Pour the water from the liquid measuring cup into the electric kettle.": { + "level": 0, + "index": 1, + "sub-steps": {} + }, + + "Turn on the Kettle.": { + "level": 0, + "index": 2, + "sub-steps": {} + }, + + "Place the dripper on top of the mug.": { + "level": 0, + "index": 3, + "sub-steps": {} + }, + + "Take the coffee filter and fold it in half to create a semi-circle.": { + "level": 0, + "index": 4, + "sub-steps": {} + }, + + "Fold the filter in half again to create a quarter-circle.": { + "level": 0, + "index": 5, + "sub-steps": {} + }, + + "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper.": { + "level": 0, + "index": 6, + "sub-steps": {} + }, + + "Spread the filter open to create a cone inside the dripper.": { + "level": 0, + "index": 7, + "sub-steps": {} + }, + + "Place 
a bowl on the scale.": { + "level": 0, + "index": 8, + "sub-steps": {} + }, + + "Turn on the kitchen scale and zero the scale.": { + "level": 0, + "index": 9, + "sub-steps": {} + }, + + "Add coffee beans to the bowl until the scale reads 25 grams.": { + "level": 0, + "index": 10, + "sub-steps": {} + }, + + "Pour the measured coffee beans into the coffee grinder.": { + "level": 0, + "index": 11, + "sub-steps": {} + }, + + "Grind the coffee beans by pressing and holding down on the black part of the lid.": { + "level": 0, + "index": 12, + "sub-steps": {} + }, + + "Transfer the grounds to the filter cone.": { + "level": 0, + "index": 13, + "sub-steps": {} + }, + + "Turn on the thermometer.": { + "level": 0, + "index": 14, + "sub-steps": {} + }, + + "Place the end of the thermometer into the water.": { + "level": 0, + "index": 15, + "sub-steps": {} + }, + + "Slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter.": { + "level": 0, + "index": 16, + "sub-steps": {} + }, + + "Continue slowly pour the water over the grounds in a circular motion. 
do not overfill beyond the top of the paper filter...": { + "level": 0, + "index": 17, + "sub-steps": {} + }, + + "Allow the rest of the water in the dripper to drain.": { + "level": 0, + "index": 18, + "sub-steps": {} + }, + + "Remove the dripper from the cup.": { + "level": 0, + "index": 19, + "sub-steps": {} + }, + + "Remove the coffee grounds and paper filter from the dripper.": { + "level": 0, + "index": 20, + "sub-steps": {} + }, + + "Discard the coffee grounds and paper filter.": { + "level": 0, + "index": 21, + "sub-steps": {} + } +} diff --git a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml index a2dca3f14..d0a82c9c6 100644 --- a/tmux/Nov23-voice-live.yml +++ b/tmux/Nov23-voice-live.yml @@ -18,7 +18,7 @@ root: <%= ENV["ANGEL_WORKSPACE_DIR"] %> # on_project_start: command on_project_start: | export ROS_NAMESPACE=${ROS_NAMESPACE:-/kitware} - export HL2_IP=${HL2_IP:-192.168.1.3} + export HL2_IP=${HL2_IP:-172.20.10.12} export CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/config export BERKELEY_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/angel_system/berkeley/configs export NODE_CONFIG_DIR=${ANGEL_WORKSPACE_DIR}/src/angel_system_nodes/configs @@ -142,7 +142,7 @@ windows: -p pv_height:=1080 -p obj_det_last_n:=8 -p must_contain_target_phrase:=True - -p object_det_ignored_objects:="hand (left),hand (right),background,cutting board,trash can,banana (peeled),microwave (open), microwave (closed), paper towel" + -p object_det_ignored_objects:="hand (left),hand (right),background,trash can,microwave (open),microwave (closed),peanut butter,nut butter jar lid,nut butter jar (open),nut butter jar (closed),jelly jar lid,jelly jar (open),jelly jar (closed),butter knife + nut butter,butter knife + jelly,tortilla + nut butter,tortilla + jelly" -p debug_mode:=True - object_detector: From 91daa8d900545590b71c339d6a8106e43f31e4f9 Mon Sep 17 00:00:00 2001 From: BS Date: Tue, 7 Nov 2023 22:26:41 -0500 Subject: [PATCH 45/46] Optimize prompt for demo --- config/tasks/multi-task-config.yaml | 4 +- 
q | 155 ------------------ .../visual_question_answerer.py | 90 +++++++--- .../configs/llm_prompts/vis_qa_teacher_prompt | 23 ++- .../mit_ll_eval_one_oatmeal_recipe_steps.json | 133 --------------- ...t_ll_eval_one_quesadilla_recipe_steps.json | 67 ++++++++ tmux/Nov23-voice-live.yml | 6 +- 7 files changed, 155 insertions(+), 323 deletions(-) delete mode 100644 q delete mode 100644 ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json create mode 100644 ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json diff --git a/config/tasks/multi-task-config.yaml b/config/tasks/multi-task-config.yaml index bfad10c19..30979b16d 100644 --- a/config/tasks/multi-task-config.yaml +++ b/config/tasks/multi-task-config.yaml @@ -14,7 +14,7 @@ tasks: - id: 0 label: "coffee" config_file: "./config/tasks/recipe_coffee.yaml" - active: false + active: true - id: 1 label: "tea" config_file: "./config/tasks/recipe_tea.yaml" @@ -30,4 +30,4 @@ tasks: - id: 4 label: "dessert quesadilla" config_file: "./config/tasks/recipe_dessertquesadilla.yaml" - active: true + active: false diff --git a/q b/q deleted file mode 100644 index 1e1a202d8..000000000 --- a/q +++ /dev/null @@ -1,155 +0,0 @@ -diff --cc ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py -index 22b27e84,e7df1e98..00000000 ---- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py -+++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py -@@@ -395,11 -404,9 +404,11 @@@ class VisualQuestionAnswerer(BaseDialog - obj, score = obj_score - observables.add(obj) - observables = observables - self.object_dtctn_ignorables -- return ", ".join(observables) - - return observables  -++ return observables - + else:  - + return "nothing" -  -- def _get_latest_observables(self, curr_time: int, n: int) -> str: -+ def _get_latest_observables(self, curr_time: int, n: int) -> Set: - """ - Returns a comma-delimited list of all observed objects per all - entities 
in self.detected_objects_queue that occurred before a provided time. -@@@ -420,9 -427,7 +429,9 @@@ - for obj in detection.entity: - observables.add(obj) - observables = observables - self.object_dtctn_ignorables - + if len(observables)==0: - + return "nothing" -- return ", ".join(observables) -+ return observables -  - def get_response( - self, -@@@ -479,17 -476,22 +480,28 @@@ - This is the main ROS node listener callback loop that will process - all messages received via subscribed topics. - """ -- self.log.info(f"Received message:\n\n{msg.utterance_text}") -+ self.log.info(f"Received message: \"{msg.utterance_text}\"") - if not self._apply_filter(msg): - return - +  - + msg.utterance_text= msg.utterance_text.replace("Angel, ", "") - + msg.utterance_text= msg.utterance_text.replace("angel, ", "") - + msg.utterance_text= msg.utterance_text.replace("angel", "") - + msg.utterance_text= msg.utterance_text.replace("Angel", "") - + msg.utterance_text= msg.utterance_text.capitalize() - self.question_queue.put(msg) -  -+ def _get_optional_fields_string(self, emotion: str, current_step: str, -+ current_action: str) -> str: -+ optional_fields_string = "\n" -+ if emotion: -+ optional_fields_string += f"Emotion: {emotion}\n" -+ if current_step: -+ optional_fields_string += f"My Current Step: {current_step}\n" -+ if current_action: -+ optional_fields_string += f"My Current Action: {current_action}\n" -+ return optional_fields_string.rstrip("\n") -+  - def process_question_queue(self): - """ - Constant loop to process received questions. -@@@ -498,33 -500,44 +510,44 @@@ - while True: - question_msg = self.question_queue.get() - start_time = self._get_sec(question_msg) -- self.log.info(f"Processing utterance {question_msg.utterance_text}") --  -- # Get most recently detected action. 
-- action = self._get_latest_action(start_time) -- self.log.info(f"Latest action: {action}") -+ self.log.info(f"Processing utterance \"{question_msg.utterance_text}\"") -  -+ # Get the optional fields. -+ optional_fields = \ -+ self._get_optional_fields_string(question_msg.emotion, self._get_current_step(), -+ self._get_latest_action(start_time)) - # Get centered detected objects. - centered_observables = self._get_latest_centered_observables(start_time) -- self.log.info(f"Observed objects: {centered_observables}") --  - # Get all detected objects. -- all_observables = self._get_latest_observables( -- start_time, self.object_dtctn_last_n_obj_detections -- ) -- self.log.info(f"Observed objects: {all_observables}") --  -- # Generate response. -- response = self.get_response( -- question_msg, -- self._get_dialogue_history(), -- self._get_current_step(), -- action, -- centered_observables, -- all_observables, -- ) -+ all_observables = \ -+ self._get_latest_observables(start_time, self.object_dtctn_last_n_obj_detections) -+  -+ response = None -+ is_object_clarification = \ -+ question_msg.intent and question_msg.intent == INTENT_LABELS[3] -+ if is_object_clarification and len(centered_observables) > 1: -+ # Object Clarification override: If an associated intent exists and indicates -+ # object clarification in the presence of multiple objects, override the response with -+ # a clarification question. -+ self.log.info( -+ "Received confusing object clarification question from user " +\ -+ f"about multiple objects: ({centered_observables}). " +\ -+ "Inquiring for more details...") -+ response = "It seems you are asking about an object you are unsure about. " +\ -+ "I am detecting the following: {}. ".format(centered_observables) +\ -+ "Is the object you are referenceing one of these objects?" -+ else: -+ all_observables -= centered_observables -+ # Normal response generation. 
-+ response = self.get_response( -+ question_msg, -+ self._get_dialogue_history(), -+ ", ".join(centered_observables) if centered_observables else "Nothing", -+ ", ".join(all_observables) if all_observables else "Nothing", -+ optional_fields -+ ) - self.publish_generated_response(question_msg.utterance_text, response) - - self._add_dialogue_history(question_msg.utterance_text, response) - + self._add_dialogue_history(question_msg.utterance_text, response,self.get_emotion_or(question_msg)) -  - def publish_generated_response(self, utterance: str, response: str): - msg = SystemTextResponse() -diff --cc ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt -index c82adf91,54cf8bf9..00000000 ---- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt -+++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt -@@@ -1,14 -1,13 +1,14 @@@ -- You are a professional chef teaching me how to best make this recipe. I will ask you a question about cooking and you should respond with a short and efficient answer. To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that".  - -You are a teacher helping me learn how to complete a Task. I will tell you how I am feeling (positive, negative, neutral), all the objects that I can see, and what I am currently doing. I will ask you a question and you will respond with an answer. -++You are a professional chef teaching me how to best make this recipe. I will ask you questions about cooking and you should respond with a short and efficient answer. To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that".  
-  - -Task Steps: - +Here is some context: - +Currently I am working on the following recipe: - {recipe} -  - -{optional_fields} - -Objects In Front of Me: {centered_observables} - -Objects Nearby: {all_observables} - +I finished all steps up until but not including: {current_step} {action} -  - -Chat:  - +Here are objects that I see: {centered_observables}, and objects that you can see: {all_observables} - + - +Our conversation so far: - {chat_history} - -Me: {question} - -You: - +Me ({emotion}): {question} - +Your Answer (short, helpful with empathy): diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index c3e7669ba..89db911c1 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -184,6 +184,7 @@ def __init__(self): # Configure supplemental input resources. self.question_queue = queue.Queue() self.current_step = None + self.completed_steps = None self.action_classification_queue = queue.Queue() self.detected_objects_queue = queue.Queue() self.centroid_object_queue = centroid_2d_strategy_queue.Centroid2DStrategyQueue( @@ -213,6 +214,7 @@ def __init__(self): self._set_current_step, 1, ) + # Configure the optional object detection subscription. self.objects_subscription = None if self._in_objects_topic: @@ -296,11 +298,15 @@ def _get_sec(self, msg: DialogueUtterance) -> int: return msg.header.stamp.sec def _set_current_step(self, msg: TaskUpdate): - self.current_step = msg.current_step + self.current_step = msg.current_step_id + self.completed_steps = msg.completed_steps def _get_current_step(self): return self.current_step + def _get_completed_steps(self): + return self.completed_steps + def _get_dialogue_history(self): """ Gets a string concatenation of the last self.dialogue_history_length turns of conversation. 
@@ -404,6 +410,7 @@ def _get_latest_centered_observables(self, curr_time: int) -> Set: obj, score = obj_score observables.add(obj) observables = observables - self.object_dtctn_ignorables + self.log.info(f"CENTERED OBJECTS:" + str(observables)) return observables else: return "nothing" @@ -429,8 +436,7 @@ def _get_latest_observables(self, curr_time: int, n: int) -> Set: for obj in detection.entity: observables.add(obj) observables = observables - self.object_dtctn_ignorables - if len(observables)==0: - return "nothing" + self.log.info(f"ALL OBJECTS:" + str(observables)) return observables def get_response( @@ -484,17 +490,42 @@ def question_answer_callback(self, msg: DialogueUtterance): if not self._apply_filter(msg): return - msg.utterance_text= msg.utterance_text.replace("Angel, ", "") - msg.utterance_text= msg.utterance_text.replace("angel, ", "") - msg.utterance_text= msg.utterance_text.replace("angel", "") - msg.utterance_text= msg.utterance_text.replace("Angel", "") - msg.utterance_text= msg.utterance_text.capitalize() - self.question_queue.put(msg) - - def _get_optional_fields_string(self, current_step: str) -> str: + utt = msg.utterance_text + res = utt.split("Angel", 1) + if len(res)==1: + res = utt.split("angel", 1) + if len(res)==1: + res = utt.split("angel,", 1) + if len(res)==1: + res = utt.split("Angel,", 1) + + splitString = res[1] + splitString = splitString.lstrip(',') + splitString = splitString.lstrip(' ') + + if len(splitString)>1: + msg.utterance_text = splitString.capitalize() + self.question_queue.put(msg) + + def _get_optional_fields_string(self, current_step: int, completed_steps: list) -> str: optional_fields_string = "\n" - if current_step: - optional_fields_string += f"I finished all steps up until: {current_step}\n" + + if current_step==None: + #non started case + optional_fields_string += "I didn't start the recipe yet." 
+ else: + if completed_steps[-1]==True: + #the last step is finished + optional_fields_string += f"I am done with all steps." + elif current_step==0: + #user is at step 1 + optional_fields_string += f"I am doing {current_step+1}" + optional_fields_string += f" and I am about to do {current_step+2}" + else: + optional_fields_string += f"I am doing {current_step+1}" + if current_step<=len(completed_steps)-2: + optional_fields_string += f" and I am about to do {current_step+2}" + return optional_fields_string.rstrip("\n") def process_question_queue(self): @@ -509,39 +540,52 @@ def process_question_queue(self): # Get the optional fields. optional_fields = \ - self._get_optional_fields_string(self._get_current_step()) + self._get_optional_fields_string(self._get_current_step(),self._get_completed_steps()) # Get centered detected objects. centered_observables = self._get_latest_centered_observables(start_time) # Get all detected objects. all_observables = \ self._get_latest_observables(start_time, self.object_dtctn_last_n_obj_detections) + self.log.info(f"Current action detected: \"{self._get_latest_action(start_time)}\"") response = None is_object_clarification = \ question_msg.intent and question_msg.intent == INTENT_LABELS[3] - if is_object_clarification and len(centered_observables) > 1: + if is_object_clarification and len(all_observables) > 1: # Object Clarification override: If an associated intent exists and indicates # object clarification in the presence of multiple objects, override the response with # a clarification question. self.log.info( "Received confusing object clarification question from user " +\ - f"about multiple objects: ({centered_observables}). " +\ + f"about multiple objects: ({all_observables}). " +\ "Inquiring for more details...") - response = "It seems you are asking about an object you are unsure about. " +\ - "I am detecting the following: {}. ".format(centered_observables) +\ - "Is the object you are referenceing one of these objects?" 
+ response = "I am seeing the following objects: " + for obs in all_observables: + response+= f"{str(obs)}, " + response.rsplit(",") + response +="What object are you referring to?" + self._add_dialogue_history("What is this?", response,"neutral") + elif is_object_clarification and len(all_observables) == 0: + output = "I don't see any objects. Could you look at it directly and ask again?" + self.log.info(output) + response = output + self._add_dialogue_history("What is this?", response, "neutral") + elif is_object_clarification and len(all_observables) == 1: + response = f"I think that is a {list(all_observables)[0]}?" + self._add_dialogue_history("What is this?", response, "neutral") else: - all_observables = centered_observables # Normal response generation. response = self.get_response( question_msg, self._get_dialogue_history(), - ", ".join(centered_observables) if centered_observables else "Nothing", - ", ".join(all_observables) if all_observables else "Nothing", + "", + ", ".join(all_observables) if len(all_observables) > 0 else "nothing", optional_fields ) + self._add_dialogue_history(question_msg.utterance_text, response,self.get_emotion_or(question_msg)) + self.publish_generated_response(question_msg.utterance_text, response) - self._add_dialogue_history(question_msg.utterance_text, response,self.get_emotion_or(question_msg)) + def publish_generated_response(self, utterance: str, response: str): msg = SystemTextResponse() diff --git a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt index a3db4d85c..b1df5ffdf 100644 --- a/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt +++ b/ros/angel_system_nodes/configs/llm_prompts/vis_qa_teacher_prompt @@ -1,15 +1,24 @@ ### -You are a professional chef teaching me how to best make this recipe. I will ask you questions about cooking and you should respond with a short and efficient answer. 
To provide an answer, use the context below and if you do not have an answer, say "Sorry I can't help you with that". +Pretend you are a professional chef teaching me how to best make the recipe below. You can tell me how to use the utensils or so certain actions, let me know what is the current status of the recipe, let me know what objects are in front of me and let me know about alternative paths of the recipe if utensils are not available. -Here is some context: -Currently I am working on the following recipe: -{recipe} +When you answer my question, follow the these rules: +* Use information from the recipe below and the available objects. +* Is should not deviate from the instructions, except there ingredients or utensils are not available. +* If I ask you something unrelated to cooking or the recipe, answer with: "Sorry, I can't help you with that". +* You should always resond in a conversational tone. +* DO NOT ANSWER "I'm sorry, I am an AI language model and I cannot see or perceive anything." +* You can differentiate between objects you see in the environment and perceive them as well. +* Don't use the phrasing "However,.." +* Don't say "Based on the information you provided," +Here are the instructions of the recipe you are trying to teach me: +{recipe} {optional_fields} -Here are objects that you can see: {all_observables} ({centered_observables}) + +Here are the objects that are directly accessible to me: {all_observables} {centered_observables}. Objects not mentioned in this list are not directly in my environment. Objects in the environment which are not mentioned in the recipe above, are not relevant. 
### -Our chat history: +Here is our conversation history: {chat_history} {question} -Your Answer (short, helpful with empathy): \ No newline at end of file +Your answer (very short, precise, helpful with empathy): \ No newline at end of file diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json deleted file mode 100644 index 5ad52699f..000000000 --- a/ros/angel_system_nodes/configs/mit_ll_eval_one_oatmeal_recipe_steps.json +++ /dev/null @@ -1,133 +0,0 @@ -{ - "Measure 12 ounces of water in the liquid measuring cup.": { - "level": 0, - "index": 0, - "sub-steps": {} - }, - - "Pour the water from the liquid measuring cup into the electric kettle.": { - "level": 0, - "index": 1, - "sub-steps": {} - }, - - "Turn on the Kettle.": { - "level": 0, - "index": 2, - "sub-steps": {} - }, - - "Place the dripper on top of the mug.": { - "level": 0, - "index": 3, - "sub-steps": {} - }, - - "Take the coffee filter and fold it in half to create a semi-circle.": { - "level": 0, - "index": 4, - "sub-steps": {} - }, - - "Fold the filter in half again to create a quarter-circle.": { - "level": 0, - "index": 5, - "sub-steps": {} - }, - - "Place the folded filter into the dripper such that the the point of the quarter-circle rests in the center of the dripper.": { - "level": 0, - "index": 6, - "sub-steps": {} - }, - - "Spread the filter open to create a cone inside the dripper.": { - "level": 0, - "index": 7, - "sub-steps": {} - }, - - "Place a bowl on the scale.": { - "level": 0, - "index": 8, - "sub-steps": {} - }, - - "Turn on the kitchen scale and zero the scale.": { - "level": 0, - "index": 9, - "sub-steps": {} - }, - - "Add coffee beans to the bowl until the scale reads 25 grams.": { - "level": 0, - "index": 10, - "sub-steps": {} - }, - - "Pour the measured coffee beans into the coffee grinder.": { - "level": 0, - "index": 11, - "sub-steps": {} - }, - - "Grind the coffee beans 
by pressing and holding down on the black part of the lid.": { - "level": 0, - "index": 12, - "sub-steps": {} - }, - - "Transfer the grounds to the filter cone.": { - "level": 0, - "index": 13, - "sub-steps": {} - }, - - "Turn on the thermometer.": { - "level": 0, - "index": 14, - "sub-steps": {} - }, - - "Place the end of the thermometer into the water.": { - "level": 0, - "index": 15, - "sub-steps": {} - }, - - "Slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter.": { - "level": 0, - "index": 16, - "sub-steps": {} - }, - - "Continue slowly pour the water over the grounds in a circular motion. do not overfill beyond the top of the paper filter...": { - "level": 0, - "index": 17, - "sub-steps": {} - }, - - "Allow the rest of the water in the dripper to drain.": { - "level": 0, - "index": 18, - "sub-steps": {} - }, - - "Remove the dripper from the cup.": { - "level": 0, - "index": 19, - "sub-steps": {} - }, - - "Remove the coffee grounds and paper filter from the dripper.": { - "level": 0, - "index": 20, - "sub-steps": {} - }, - - "Discard the coffee grounds and paper filter.": { - "level": 0, - "index": 21, - "sub-steps": {} - } -} diff --git a/ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json b/ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json new file mode 100644 index 000000000..c0bdedaed --- /dev/null +++ b/ros/angel_system_nodes/configs/mit_ll_eval_one_quesadilla_recipe_steps.json @@ -0,0 +1,67 @@ +{ + "Place tortilla on cutting board": { + "level": 0, + "index": 0, + "sub-steps": {} + }, + + "Use the butter knife to scoop nutella from the jar.": { + "level": 0, + "index": 1, + "sub-steps": {} + }, + + "Spread nutella onto tortilla, leaving ½ inch uncovered at the edges.": { + "level": 0, + "index": 2, + "sub-steps": {} + }, + + "Clean the knife by wiping with a paper towel": { + "level": 0, + "index": 3, + "sub-steps": {} + }, + + "Slice one 
banana.": { + "level": 0, + "index": 4, + "sub-steps": {} + }, + + "Top with banana slices.": { + "level": 0, + "index": 5, + "sub-steps": {} + }, + + "Clean the knife by wiping with a paper towel ": { + "level": 0, + "index": 6, + "sub-steps": {} + }, + + "Sprinkle small amount of cinnamon onto tortilla.": { + "level": 0, + "index": 7, + "sub-steps": {} + }, + + "Fold tortilla in half into semi-cirlce": { + "level": 0, + "index": 8, + "sub-steps": {} + }, + + "Slice tortilla in half using butter knife to create two triangular wedges.": { + "level": 0, + "index": 9, + "sub-steps": {} + }, + + "Place tortilla wedge on the plate.": { + "level": 0, + "index": 10, + "sub-steps": {} + } +} \ No newline at end of file diff --git a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml index d0a82c9c6..bdbd869d9 100644 --- a/tmux/Nov23-voice-live.yml +++ b/tmux/Nov23-voice-live.yml @@ -87,7 +87,7 @@ windows: -r __ns:=${ROS_NAMESPACE} -p input_audio_topic:=HeadsetAudioData -p output_voice_activity_topic:=DetectedVoiceData - -p vad_server_url:=http://communication.cs.columbia.edu:55667/vad + -p vad_server_url:=http://localhost:55667/vad -p vad_cadence:=4 -p vad_margin:=0.50 -p max_accumulation_length:=4 @@ -96,7 +96,7 @@ windows: -r __ns:=${ROS_NAMESPACE} -p audio_topic:=DetectedVoiceData -p utterances_topic:=utterances_topic - -p asr_server_url:=http://communication.cs.columbia.edu:55667/asr + -p asr_server_url:=http://localhost:55667/asr -p asr_req_segment_duration:=2 -p is_sentence_tokenize:=False -p debug_mode:=True @@ -142,7 +142,7 @@ windows: -p pv_height:=1080 -p obj_det_last_n:=8 -p must_contain_target_phrase:=True - -p object_det_ignored_objects:="hand (left),hand (right),background,trash can,microwave (open),microwave (closed),peanut butter,nut butter jar lid,nut butter jar (open),nut butter jar (closed),jelly jar lid,jelly jar (open),jelly jar (closed),butter knife + nut butter,butter knife + jelly,tortilla + nut butter,tortilla + jelly" + -p 
object_det_ignored_objects:="hand (left),hand (right),microwave (closed),microwave (open),background,trash can,peanut butter,nut butter jar lid,nut butter jar (open),nut butter jar (closed),jelly jar lid,jelly jar (open),jelly jar (closed),butter knife + nut butter,butter knife + jelly,tortilla + nut butter,tortilla + jelly" -p debug_mode:=True - object_detector: From dbd99d3a3eb1ff9d7030911f1b6b59e081c8b76d Mon Sep 17 00:00:00 2001 From: BS Date: Wed, 15 Nov 2023 13:56:12 -0500 Subject: [PATCH 46/46] Last adjustments for demo --- config/tasks/multi-task-config.yaml | 4 ++-- .../angel_system_nodes/visual_question_answerer.py | 6 +----- tmux/Nov23-voice-live.yml | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/config/tasks/multi-task-config.yaml b/config/tasks/multi-task-config.yaml index 30979b16d..bfad10c19 100644 --- a/config/tasks/multi-task-config.yaml +++ b/config/tasks/multi-task-config.yaml @@ -14,7 +14,7 @@ tasks: - id: 0 label: "coffee" config_file: "./config/tasks/recipe_coffee.yaml" - active: true + active: false - id: 1 label: "tea" config_file: "./config/tasks/recipe_tea.yaml" @@ -30,4 +30,4 @@ tasks: - id: 4 label: "dessert quesadilla" config_file: "./config/tasks/recipe_dessertquesadilla.yaml" - active: false + active: true diff --git a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py index 89db911c1..57c9243de 100644 --- a/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py +++ b/ros/angel_system_nodes/angel_system_nodes/visual_question_answerer.py @@ -493,11 +493,7 @@ def question_answer_callback(self, msg: DialogueUtterance): utt = msg.utterance_text res = utt.split("Angel", 1) if len(res)==1: - res = utt.split("angel", 1) - if len(res)==1: - res = utt.split("angel,", 1) - if len(res)==1: - res = utt.split("Angel,", 1) + res = utt.split("angel", 1) splitString = res[1] splitString = splitString.lstrip(',') diff --git 
a/tmux/Nov23-voice-live.yml b/tmux/Nov23-voice-live.yml index bdbd869d9..7d7f84282 100644 --- a/tmux/Nov23-voice-live.yml +++ b/tmux/Nov23-voice-live.yml @@ -136,7 +136,7 @@ windows: -p object_detections_topic:=ObjectDetections2d -p action_classifications_topic:=ActivityDetections -p system_text_response_topic:=system_text_response_topic - -p recipe_path:=${NODE_CONFIG_DIR}/mit_ll_eval_one_coffee_recipe_steps_v2.json + -p recipe_path:=${NODE_CONFIG_DIR}/mit_ll_eval_one_quesadilla_recipe_steps.json -p prompt_template_path:=${NODE_CONFIG_DIR}/llm_prompts/vis_qa_teacher_prompt -p pv_width:=1920 -p pv_height:=1080