From 64ec3f6c342090901d716d6023235f8412b5f5ba Mon Sep 17 00:00:00 2001 From: Mariozada Date: Wed, 20 May 2026 22:36:49 +1000 Subject: [PATCH 1/2] fix: enable click_at under vision and respect explicit disabled_tool --- docs/sdk/configuration.mdx | 20 ++++--- mobilerun/agent/droid/droid_agent.py | 56 ++++++++++++++++--- mobilerun/config_example.yaml | 17 ++++-- mobilerun/config_manager/config_manager.py | 43 ++++++++++---- .../config_manager/migrations/__init__.py | 2 +- .../v006_disabled_tools_default_sentinel.py | 42 ++++++++++++++ mobilerun/tools/ui/provider.py | 13 +++++ 7 files changed, 160 insertions(+), 33 deletions(-) create mode 100644 mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py diff --git a/docs/sdk/configuration.mdx b/docs/sdk/configuration.mdx index 9849566c..d5f388e0 100644 --- a/docs/sdk/configuration.mdx +++ b/docs/sdk/configuration.mdx @@ -186,8 +186,8 @@ TelemetryConfig( from mobilerun import ToolsConfig ToolsConfig( - disabled_tools=["click_at", "click_area", "long_press_at"], # Tools to disable (default disables coordinate-based tools) - stealth=False, # Enable stealth mode (human-like timing + randomized coordinates) + disabled_tools=None, # None = use framework default (see below); pass a list to override + stealth=False, # Enable stealth mode (human-like timing + randomized coordinates) ) ``` @@ -199,7 +199,12 @@ tools: - wait stealth: false ``` -Disabled tools will not be available to agents during execution. The default `disabled_tools` list disables `click_at`, `click_area`, and `long_press_at`. + +**Default behavior (`disabled_tools=None` / key omitted):** `click_at`, `click_area`, and `long_press_at` are disabled. When the active action agent has vision enabled (`fast_agent.vision=True` in direct mode, or both `manager.vision=True` and `executor.vision=True` in reasoning mode, or `vision_only=True`), `click_at` is automatically re-enabled. + +**Explicit list:** If you pass any list (even empty), it is honored as-is — no defaults, no auto-unmask. You take full control of what is disabled. + +**Screenshot-only modes:** When `vision_only=True` or `control_backend: visual-remote`, coordinate tools are required to operate. Disabling any of them explicitly raises a `ValueError` at startup. --- @@ -536,10 +541,11 @@ device: auto_setup: true # Auto-install/fix Portal APK before each run tools: - disabled_tools: - - click_at - - click_area - - long_press_at + # disabled_tools: null # null/omit = framework default (coord tools off, click_at auto-unmasked when vision is on) + # disabled_tools: # uncomment to take full control — list is honored verbatim (no auto-unmask) + # - click_at + # - click_area + # - long_press_at stealth: false telemetry: diff --git a/mobilerun/agent/droid/droid_agent.py b/mobilerun/agent/droid/droid_agent.py index 4f04c64c..c2fc6038 100644 --- a/mobilerun/agent/droid/droid_agent.py +++ b/mobilerun/agent/droid/droid_agent.py @@ -54,6 +54,7 @@ ) from mobilerun.agent.utils.trajectory import Trajectory from mobilerun.config_manager.config_manager import ( + DEFAULT_DISABLED_TOOLS, AgentConfig, CredentialsConfig, DeviceConfig, @@ -114,9 +115,33 @@ def _force_screenshot_only_vision(agent_config: AgentConfig) -> None: agent_config.fast_agent.vision = True -def _effective_disabled_tools(disabled_tools: list[str], state_provider) -> list[str]: - if getattr(state_provider, "requires_coordinate_tools", False): +def _effective_disabled_tools( + disabled_tools: list[str], + state_provider, + vision_enabled: bool = False, + explicit: bool = False, +) -> list[str]: + requires_coords = getattr(state_provider, "requires_coordinate_tools", False) + if requires_coords: + # Screenshot-only / visual-remote modes cannot operate without coordinate + # tools — fail loudly rather than silently ignoring an explicit disable. + if explicit: + blocked = sorted(set(disabled_tools) & _COORDINATE_TOOL_NAMES) + if blocked: + raise ValueError( + f"Cannot disable coordinate tools {blocked} when the state " + "provider requires them (vision_only=True or visual remote " + "control_backend). Remove these from tools.disabled_tools." + ) return [name for name in disabled_tools if name not in _COORDINATE_TOOL_NAMES] + # Auto-unmask click_at only when (a) the caller didn't supply an explicit + # list, and (b) the provider's screenshot pixel space matches the driver's + # tap input space. iOS in normal mode is excluded — the screenshot is + # physical pixels while taps use XCTest points, so screenshot coords would + # tap the wrong location. + coords_align = getattr(state_provider, "screenshot_matches_input_coords", False) + if vision_enabled and not explicit and coords_align: + return [name for name in disabled_tools if name != "click_at"] return disabled_tools @@ -541,13 +566,28 @@ async def start_handler( capabilities = driver.supported | self.state_provider.supported registry.disable_unsupported(capabilities) - # Config-level filtering - disabled_tools = ( - self.config.tools.disabled_tools - if self.config.tools and self.config.tools.disabled_tools - else [] + # Config-level filtering. ``disabled_tools=None`` means "framework + # default"; an explicit list (even empty) is honored verbatim. + user_disabled = self.config.tools.disabled_tools if self.config.tools else None + explicit_disabled = user_disabled is not None + disabled_tools = list(user_disabled if explicit_disabled else DEFAULT_DISABLED_TOOLS) + # In reasoning mode the Executor only sees a screenshot when the Manager + # also captured one (manager.vision=True), so require both before + # exposing coordinate clicks. + if self.config.agent.reasoning: + active_action_vision = ( + self.config.agent.manager.vision + and self.config.agent.executor.vision + ) + else: + active_action_vision = self.config.agent.fast_agent.vision + action_agent_has_vision = self.config.agent.vision_only or active_action_vision + disabled_tools = _effective_disabled_tools( + disabled_tools, + self.state_provider, + vision_enabled=action_agent_has_vision, + explicit=explicit_disabled, ) - disabled_tools = _effective_disabled_tools(disabled_tools, self.state_provider) if disabled_tools: registry.disable(disabled_tools) diff --git a/mobilerun/config_example.yaml b/mobilerun/config_example.yaml index 0f963767..465c2bed 100644 --- a/mobilerun/config_example.yaml +++ b/mobilerun/config_example.yaml @@ -1,7 +1,7 @@ # Mobilerun Configuration File # This file is auto-generated. Edit values as needed. -_version: 5 +_version: 6 # === Agent Settings === agent: @@ -175,11 +175,16 @@ tools: # - wait # Wait for duration # - open_app # Open apps by name # - type_secret # Type credentials (requires credentials.enabled: true) - # Coordinate-based tools disabled by default (enable if needed) - disabled_tools: - - click_at - - click_area - - long_press_at + # Coordinate-based tools (click_at, click_area, long_press_at) are + # disabled by default. When an action agent has vision enabled, click_at + # is auto-unmasked. Uncomment below to provide an explicit list — your + # list is then honored verbatim (no auto-unmask, no defaults). + # disabled_tools: + # - click_at + # - click_area + # - long_press_at + # Stealth mode adds human-like timing and randomized coordinates. + stealth: false # === Credential Settings === credentials: diff --git a/mobilerun/config_manager/config_manager.py b/mobilerun/config_manager/config_manager.py index d2d29d41..0bf7740c 100644 --- a/mobilerun/config_manager/config_manager.py +++ b/mobilerun/config_manager/config_manager.py @@ -181,15 +181,20 @@ class LoggingConfig: trajectory_gifs: bool = True -def _default_disabled_tools() -> List[str]: - return ["click_at", "click_area", "long_press_at"] +DEFAULT_DISABLED_TOOLS: tuple[str, ...] = ("click_at", "click_area", "long_press_at") @dataclass class ToolsConfig: - """Tools configuration.""" + """Tools configuration. - disabled_tools: List[str] = field(default_factory=_default_disabled_tools) + ``disabled_tools=None`` means "use the framework default" — coordinate + tools are disabled, and ``click_at`` is auto-unmasked when the active + action agent has vision. Pass an explicit list (even an empty one) to + take full control: the list is then honored as-is with no auto-unmask. + """ + + disabled_tools: Optional[List[str]] = None stealth: bool = False @@ -268,7 +273,21 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "MobileConfig": - """Create config from dictionary.""" + """Create config from dictionary. + + If ``data`` carries a ``_version`` lower than the current schema + version, migrations are applied before parsing so SDK callers using + ``MobileConfig.from_yaml()`` / ``from_dict()`` get the same upgrade + path as ``ConfigLoader``. In-memory dicts without ``_version`` are + assumed to already match the current schema. + """ + import copy as _copy + + from mobilerun.config_manager.migrations import CURRENT_VERSION, migrate + + if "_version" in data and data["_version"] < CURRENT_VERSION: + data = migrate(_copy.deepcopy(data)) + # Parse LLM profiles llm_profiles = {} for name, profile_data in data.get("llm_profiles", {}).items(): @@ -336,15 +355,17 @@ def from_dict(cls, data: Dict[str, Any]) -> "MobileConfig": servers=mcp_servers, ) + # ``data.get("X") or {}`` so a section present-but-null in YAML + # (e.g. ``tools:`` followed only by comments) is treated as empty. return cls( agent=agent_config, llm_profiles=llm_profiles, - device=DeviceConfig(**data.get("device", {})), - telemetry=TelemetryConfig(**data.get("telemetry", {})), - tracing=TracingConfig(**data.get("tracing", {})), - logging=LoggingConfig(**data.get("logging", {})), - tools=ToolsConfig(**data.get("tools", {})), - credentials=CredentialsConfig(**data.get("credentials", {})), + device=DeviceConfig(**(data.get("device") or {})), + telemetry=TelemetryConfig(**(data.get("telemetry") or {})), + tracing=TracingConfig(**(data.get("tracing") or {})), + logging=LoggingConfig(**(data.get("logging") or {})), + tools=ToolsConfig(**(data.get("tools") or {})), + credentials=CredentialsConfig(**(data.get("credentials") or {})), external_agents=external_agents, mcp=mcp_config, ) diff --git a/mobilerun/config_manager/migrations/__init__.py b/mobilerun/config_manager/migrations/__init__.py index a50e5981..a16e36f7 100644 --- a/mobilerun/config_manager/migrations/__init__.py +++ b/mobilerun/config_manager/migrations/__init__.py @@ -6,7 +6,7 @@ from pathlib import Path -CURRENT_VERSION = 5 +CURRENT_VERSION = 6 def get_migrations() -> List: diff --git a/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py b/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py new file mode 100644 index 00000000..b8318591 --- /dev/null +++ b/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py @@ -0,0 +1,42 @@ +"""Migration v6: Treat default-shaped ``tools.disabled_tools`` as the sentinel. + +Older generated configs (v5) shipped the literal default list +``[click_at, click_area, long_press_at]``. The schema now uses ``None`` as the +"use framework default" sentinel — an explicit list is honored verbatim and +disables the vision auto-unmask for ``click_at`` (and raises in +screenshot-only modes when coordinate tools are listed). + +Migration rules for ``tools.disabled_tools``: + +* Exact match for the old default list → convert to ``None`` (implicit default). +* Superset of the old default (e.g. ``[click_at, click_area, long_press_at, + wait]``) → strip the three legacy coordinate entries and keep the user's + additions. If nothing remains, fall through to ``None``. +* Anything else (custom list, missing entries, ``None``) → left untouched. +""" + +from typing import Any, Dict + +VERSION = 6 + +_OLD_DEFAULT = {"click_at", "click_area", "long_press_at"} + + +def migrate(config: Dict[str, Any]) -> Dict[str, Any]: + tools = config.get("tools") + if not isinstance(tools, dict): + return config + + disabled = tools.get("disabled_tools") + if not isinstance(disabled, list): + return config + + if not _OLD_DEFAULT.issubset(set(disabled)): + # User removed entries from the old default — treat as explicit choice. + return config + + # Drop legacy default entries while preserving user additions and order. + extras = [t for t in disabled if t not in _OLD_DEFAULT] + tools["disabled_tools"] = extras if extras else None + + return config diff --git a/mobilerun/tools/ui/provider.py b/mobilerun/tools/ui/provider.py index 63928517..1d4956ee 100644 --- a/mobilerun/tools/ui/provider.py +++ b/mobilerun/tools/ui/provider.py @@ -126,6 +126,13 @@ class StateProvider: """Base class — subclass to support different platforms.""" supported: set[str] = set() + # True when raw screenshot pixel coordinates can be sent directly to driver + # tap actions without scaling (e.g. Android, where screenshot and input + # coords are both device pixels). iOS in normal mode is False — the + # screenshot is physical pixels while taps use XCTest points, so a model + # picking from the screenshot would tap the wrong location. Screenshot-only + # providers handle scaling explicitly via ``coordinate_scale_x/y``. + screenshot_matches_input_coords: bool = False def __init__(self, driver: "DeviceDriver") -> None: self.driver = driver @@ -158,6 +165,12 @@ def __init__( self.tree_formatter = tree_formatter self.use_normalized = use_normalized self._ui_cls = ui_cls or (StealthUIState if stealth else UIState) + # Android screenshots and input taps share device-pixel coordinates, + # but only when not in normalized mode. ``use_normalized=True`` makes + # ``UIState.convert_point`` treat inputs as [0-1000] normalized + # coordinates, which is incompatible with picking coordinates off the + # screenshot — keep click_at masked in that case. + self.screenshot_matches_input_coords = not use_normalized async def _recover_portal(self) -> None: """Restart Portal's accessibility service and TCP socket server.""" From 47e90af9a20ec0224ff22e59ec71157052e9a55a Mon Sep 17 00:00:00 2001 From: Mariozada Date: Thu, 21 May 2026 00:48:30 +1000 Subject: [PATCH 2/2] fix: legacy-superset grace path and narrow v6 migration to exact-default Replaces hard raise with warn-and-strip for screenshot-only mode when disabled_tools is a superset of the old v5 default (legacy-extension pattern). Genuine v6 explicit lists that block coord tools still raise. Narrows v006 migration to only convert the exact default list to None; supersets are left untouched so non-vision runs keep disabling coord tools. --- mobilerun/agent/droid/droid_agent.py | 28 ++++++++++++++----- .../v006_disabled_tools_default_sentinel.py | 25 ++++++----------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/mobilerun/agent/droid/droid_agent.py b/mobilerun/agent/droid/droid_agent.py index c2fc6038..d1ee05a9 100644 --- a/mobilerun/agent/droid/droid_agent.py +++ b/mobilerun/agent/droid/droid_agent.py @@ -124,15 +124,29 @@ def _effective_disabled_tools( requires_coords = getattr(state_provider, "requires_coordinate_tools", False) if requires_coords: # Screenshot-only / visual-remote modes cannot operate without coordinate - # tools — fail loudly rather than silently ignoring an explicit disable. + # tools. Treat lists that are supersets of the legacy v5 default coord + # set as a legacy-extension pattern (warn-and-strip) rather than + # genuine v6 intent (raise). Genuine v6 explicit lists fail loudly. if explicit: - blocked = sorted(set(disabled_tools) & _COORDINATE_TOOL_NAMES) + disabled_set = set(disabled_tools) + blocked = sorted(disabled_set & _COORDINATE_TOOL_NAMES) if blocked: - raise ValueError( - f"Cannot disable coordinate tools {blocked} when the state " - "provider requires them (vision_only=True or visual remote " - "control_backend). Remove these from tools.disabled_tools." - ) + if set(DEFAULT_DISABLED_TOOLS).issubset(disabled_set): + logger.warning( + "Legacy disabled_tools list %s contains coordinate tools " + "that the active state provider requires; stripping them " + "to allow startup. Consider setting tools.disabled_tools " + "to null (framework default) and listing only the extras " + "you actually want disabled.", + disabled_tools, + ) + else: + raise ValueError( + f"Cannot disable coordinate tools {blocked} when the " + "state provider requires them (vision_only=True or " + "visual remote control_backend). Remove these from " + "tools.disabled_tools." + ) return [name for name in disabled_tools if name not in _COORDINATE_TOOL_NAMES] # Auto-unmask click_at only when (a) the caller didn't supply an explicit # list, and (b) the provider's screenshot pixel space matches the driver's diff --git a/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py b/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py index b8318591..f0438332 100644 --- a/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py +++ b/mobilerun/config_manager/migrations/v006_disabled_tools_default_sentinel.py @@ -6,13 +6,12 @@ disables the vision auto-unmask for ``click_at`` (and raises in screenshot-only modes when coordinate tools are listed). -Migration rules for ``tools.disabled_tools``: - -* Exact match for the old default list → convert to ``None`` (implicit default). -* Superset of the old default (e.g. ``[click_at, click_area, long_press_at, - wait]``) → strip the three legacy coordinate entries and keep the user's - additions. If nothing remains, fall through to ``None``. -* Anything else (custom list, missing entries, ``None``) → left untouched. +This migration only converts the **exact** default list to ``None``. Supersets +like ``[click_at, click_area, long_press_at, wait]`` are intentionally left +untouched so non-vision runs continue disabling the coordinate tools the user +expected. ``_effective_disabled_tools`` gives those legacy supersets a +graceful path through screenshot-only modes (coord tools are stripped with a +warning instead of raising ValueError). """ from typing import Any, Dict @@ -28,15 +27,7 @@ def migrate(config: Dict[str, Any]) -> Dict[str, Any]: return config disabled = tools.get("disabled_tools") - if not isinstance(disabled, list): - return config - - if not _OLD_DEFAULT.issubset(set(disabled)): - # User removed entries from the old default — treat as explicit choice. - return config - - # Drop legacy default entries while preserving user additions and order. - extras = [t for t in disabled if t not in _OLD_DEFAULT] - tools["disabled_tools"] = extras if extras else None + if isinstance(disabled, list) and set(disabled) == _OLD_DEFAULT: + tools["disabled_tools"] = None return config