Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions docs/sdk/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,8 @@ TelemetryConfig(
from mobilerun import ToolsConfig

ToolsConfig(
disabled_tools=["click_at", "click_area", "long_press_at"], # Tools to disable (default disables coordinate-based tools)
stealth=False, # Enable stealth mode (human-like timing + randomized coordinates)
disabled_tools=None, # None = use framework default (see below); pass a list to override
stealth=False, # Enable stealth mode (human-like timing + randomized coordinates)
)
```

Expand All @@ -199,7 +199,12 @@ tools:
- wait
stealth: false
```
Disabled tools will not be available to agents during execution. The default `disabled_tools` list disables `click_at`, `click_area`, and `long_press_at`.

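**Default behavior (`disabled_tools=None` / key omitted):** `click_at`, `click_area`, and `long_press_at` are disabled. When the active action agent has vision enabled (`fast_agent.vision=True` in direct mode, or both `manager.vision=True` and `executor.vision=True` in reasoning mode, or `vision_only=True`), `click_at` is automatically re-enabled — but only if screenshot pixel coordinates can be sent directly to the driver as tap coordinates. On Android this holds unless normalized coordinates (`use_normalized`) are in use; on iOS in normal mode it does not hold (screenshots are in physical pixels while taps use XCTest points), so `click_at` stays disabled there.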

**Explicit list:** If you pass any list (even empty), it is honored as-is — no defaults, no auto-unmask. You take full control of what is disabled.

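**Screenshot-only modes:** When `vision_only=True` or `control_backend: visual-remote`, coordinate tools are required to operate. Explicitly disabling any of them raises a `ValueError` at startup. The one exception is a legacy-style list that contains all three default coordinate tools (`click_at`, `click_area`, `long_press_at`), optionally with extras: in that case the coordinate tools are stripped from the list and a warning is logged instead of raising.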

---

Expand Down Expand Up @@ -536,10 +541,11 @@ device:
auto_setup: true # Auto-install/fix Portal APK before each run

tools:
disabled_tools:
- click_at
- click_area
- long_press_at
# disabled_tools: null # null/omit = framework default (coord tools off, click_at auto-unmasked when vision is on)
# disabled_tools: # uncomment to take full control — list is honored verbatim (no auto-unmask)
# - click_at
# - click_area
# - long_press_at
stealth: false

telemetry:
Expand Down
70 changes: 62 additions & 8 deletions mobilerun/agent/droid/droid_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
)
from mobilerun.agent.utils.trajectory import Trajectory
from mobilerun.config_manager.config_manager import (
DEFAULT_DISABLED_TOOLS,
AgentConfig,
CredentialsConfig,
DeviceConfig,
Expand Down Expand Up @@ -114,9 +115,47 @@ def _force_screenshot_only_vision(agent_config: AgentConfig) -> None:
agent_config.fast_agent.vision = True


def _effective_disabled_tools(disabled_tools: list[str], state_provider) -> list[str]:
if getattr(state_provider, "requires_coordinate_tools", False):
def _effective_disabled_tools(
    disabled_tools: list[str],
    state_provider,
    vision_enabled: bool = False,
    explicit: bool = False,
) -> list[str]:
    """Resolve which tool names should actually be disabled for this run.

    Args:
        disabled_tools: Candidate list of tool names to disable.
        state_provider: Active state provider. Its optional attributes
            ``requires_coordinate_tools`` and
            ``screenshot_matches_input_coords`` are read; both are treated
            as ``False`` when absent.
        vision_enabled: Whether the active action agent has vision.
        explicit: Whether ``disabled_tools`` was supplied by the caller
            rather than taken from the framework default.

    Returns:
        The list of tool names to disable.

    Raises:
        ValueError: If the provider requires coordinate tools and an
            explicit list disables some of them without containing every
            entry of ``DEFAULT_DISABLED_TOOLS``.
    """
    if getattr(state_provider, "requires_coordinate_tools", False):
        # The provider cannot operate without coordinate tools, so they are
        # always removed from the result. An explicit list that names them
        # is either a legacy superset of the old default (warn, then strip)
        # or a deliberate request that cannot be satisfied (raise).
        if explicit:
            requested = set(disabled_tools)
            blocked = sorted(requested & _COORDINATE_TOOL_NAMES)
            if blocked:
                is_legacy_superset = set(DEFAULT_DISABLED_TOOLS).issubset(requested)
                if not is_legacy_superset:
                    raise ValueError(
                        f"Cannot disable coordinate tools {blocked} when the "
                        "state provider requires them (vision_only=True or "
                        "visual remote control_backend). Remove these from "
                        "tools.disabled_tools."
                    )
                logger.warning(
                    "Legacy disabled_tools list %s contains coordinate tools "
                    "that the active state provider requires; stripping them "
                    "to allow startup. Consider setting tools.disabled_tools "
                    "to null (framework default) and listing only the extras "
                    "you actually want disabled.",
                    disabled_tools,
                )
        return [tool for tool in disabled_tools if tool not in _COORDINATE_TOOL_NAMES]

    # click_at is only auto-unmasked when the list is the framework default,
    # the action agent can see the screen, and the provider reports that
    # screenshot coordinates can be used directly as tap coordinates.
    coords_align = getattr(state_provider, "screenshot_matches_input_coords", False)
    if explicit or not vision_enabled or not coords_align:
        return disabled_tools
    return [tool for tool in disabled_tools if tool != "click_at"]


Expand Down Expand Up @@ -541,13 +580,28 @@ async def start_handler(
capabilities = driver.supported | self.state_provider.supported
registry.disable_unsupported(capabilities)

# Config-level filtering
disabled_tools = (
self.config.tools.disabled_tools
if self.config.tools and self.config.tools.disabled_tools
else []
# Config-level filtering. ``disabled_tools=None`` means "framework
# default"; an explicit list (even empty) is honored verbatim.
user_disabled = self.config.tools.disabled_tools if self.config.tools else None
explicit_disabled = user_disabled is not None
disabled_tools = list(user_disabled if explicit_disabled else DEFAULT_DISABLED_TOOLS)
# In reasoning mode the Executor only sees a screenshot when the Manager
# also captured one (manager.vision=True), so require both before
# exposing coordinate clicks.
if self.config.agent.reasoning:
active_action_vision = (
self.config.agent.manager.vision
and self.config.agent.executor.vision
)
else:
active_action_vision = self.config.agent.fast_agent.vision
action_agent_has_vision = self.config.agent.vision_only or active_action_vision
disabled_tools = _effective_disabled_tools(
disabled_tools,
self.state_provider,
vision_enabled=action_agent_has_vision,
explicit=explicit_disabled,
)
disabled_tools = _effective_disabled_tools(disabled_tools, self.state_provider)
if disabled_tools:
registry.disable(disabled_tools)

Expand Down
17 changes: 11 additions & 6 deletions mobilerun/config_example.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Mobilerun Configuration File
# This file is auto-generated. Edit values as needed.

_version: 5
_version: 6

# === Agent Settings ===
agent:
Expand Down Expand Up @@ -175,11 +175,16 @@ tools:
# - wait # Wait for duration
# - open_app # Open apps by name
# - type_secret # Type credentials (requires credentials.enabled: true)
# Coordinate-based tools disabled by default (enable if needed)
disabled_tools:
- click_at
- click_area
- long_press_at
# Coordinate-based tools (click_at, click_area, long_press_at) are
# disabled by default. When an action agent has vision enabled, click_at
# is auto-unmasked. Uncomment below to provide an explicit list — your
# list is then honored verbatim (no auto-unmask, no defaults).
# disabled_tools:
# - click_at
# - click_area
# - long_press_at
# Stealth mode adds human-like timing and randomized coordinates.
stealth: false

# === Credential Settings ===
credentials:
Expand Down
43 changes: 32 additions & 11 deletions mobilerun/config_manager/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,20 @@ class LoggingConfig:
trajectory_gifs: bool = True


def _default_disabled_tools() -> List[str]:
return ["click_at", "click_area", "long_press_at"]
DEFAULT_DISABLED_TOOLS: tuple[str, ...] = ("click_at", "click_area", "long_press_at")


@dataclass
class ToolsConfig:
"""Tools configuration."""
"""Tools configuration.

disabled_tools: List[str] = field(default_factory=_default_disabled_tools)
``disabled_tools=None`` means "use the framework default" — coordinate
tools are disabled, and ``click_at`` is auto-unmasked when the active
action agent has vision. Pass an explicit list (even an empty one) to
take full control: the list is then honored as-is with no auto-unmask.
"""

disabled_tools: Optional[List[str]] = None
stealth: bool = False


Expand Down Expand Up @@ -268,7 +273,21 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "MobileConfig":
"""Create config from dictionary."""
"""Create config from dictionary.

If ``data`` carries a ``_version`` lower than the current schema
version, migrations are applied before parsing so SDK callers using
``MobileConfig.from_yaml()`` / ``from_dict()`` get the same upgrade
path as ``ConfigLoader``. In-memory dicts without ``_version`` are
assumed to already match the current schema.
"""
import copy as _copy

from mobilerun.config_manager.migrations import CURRENT_VERSION, migrate

if "_version" in data and data["_version"] < CURRENT_VERSION:
data = migrate(_copy.deepcopy(data))

# Parse LLM profiles
llm_profiles = {}
for name, profile_data in data.get("llm_profiles", {}).items():
Expand Down Expand Up @@ -336,15 +355,17 @@ def from_dict(cls, data: Dict[str, Any]) -> "MobileConfig":
servers=mcp_servers,
)

# ``data.get("X") or {}`` so a section present-but-null in YAML
# (e.g. ``tools:`` followed only by comments) is treated as empty.
return cls(
agent=agent_config,
llm_profiles=llm_profiles,
device=DeviceConfig(**data.get("device", {})),
telemetry=TelemetryConfig(**data.get("telemetry", {})),
tracing=TracingConfig(**data.get("tracing", {})),
logging=LoggingConfig(**data.get("logging", {})),
tools=ToolsConfig(**data.get("tools", {})),
credentials=CredentialsConfig(**data.get("credentials", {})),
device=DeviceConfig(**(data.get("device") or {})),
telemetry=TelemetryConfig(**(data.get("telemetry") or {})),
tracing=TracingConfig(**(data.get("tracing") or {})),
logging=LoggingConfig(**(data.get("logging") or {})),
tools=ToolsConfig(**(data.get("tools") or {})),
credentials=CredentialsConfig(**(data.get("credentials") or {})),
external_agents=external_agents,
mcp=mcp_config,
)
Expand Down
2 changes: 1 addition & 1 deletion mobilerun/config_manager/migrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path


CURRENT_VERSION = 5
CURRENT_VERSION = 6


def get_migrations() -> List:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Migration v6: Treat default-shaped ``tools.disabled_tools`` as the sentinel.

Older generated configs (v5) shipped the literal default list
``[click_at, click_area, long_press_at]``. The schema now uses ``None`` as the
"use framework default" sentinel — an explicit list is honored verbatim and
disables the vision auto-unmask for ``click_at`` (and raises in
screenshot-only modes when coordinate tools are listed).

This migration only converts the **exact** default list to ``None``. Supersets
like ``[click_at, click_area, long_press_at, wait]`` are intentionally left
untouched so non-vision runs continue disabling the coordinate tools the user
expected. ``_effective_disabled_tools`` gives those legacy supersets a
graceful path through screenshot-only modes (coord tools are stripped with a
warning instead of raising ValueError).
"""

from typing import Any, Dict

VERSION = 6

_OLD_DEFAULT = {"click_at", "click_area", "long_press_at"}


def migrate(config: Dict[str, Any]) -> Dict[str, Any]:
    """Replace the exact legacy default ``tools.disabled_tools`` with ``None``.

    Only a list whose entries are exactly the three legacy default
    coordinate tools (in any order) is converted. Any other value —
    including supersets of the default and malformed lists — is left
    untouched.

    Args:
        config: Parsed configuration mapping. It is modified in place.

    Returns:
        The same ``config`` object that was passed in.
    """
    tools = config.get("tools")
    if not isinstance(tools, dict):
        # Section missing or present-but-null: nothing to migrate.
        return config

    disabled = tools.get("disabled_tools")
    if (
        isinstance(disabled, list)
        # Check entry types before building a set: a malformed config may
        # contain unhashable entries (e.g. nested lists), and ``set()`` would
        # raise TypeError on them. Such lists cannot equal the old default,
        # so they are simply left as-is.
        and all(isinstance(name, str) for name in disabled)
        and set(disabled) == _OLD_DEFAULT
    ):
        tools["disabled_tools"] = None

    return config
13 changes: 13 additions & 0 deletions mobilerun/tools/ui/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ class StateProvider:
"""Base class — subclass to support different platforms."""

supported: set[str] = set()
# True when raw screenshot pixel coordinates can be sent directly to driver
# tap actions without scaling (e.g. Android, where screenshot and input
# coords are both device pixels). iOS in normal mode is False — the
# screenshot is physical pixels while taps use XCTest points, so a model
# picking from the screenshot would tap the wrong location. Screenshot-only
# providers handle scaling explicitly via ``coordinate_scale_x/y``.
screenshot_matches_input_coords: bool = False

def __init__(self, driver: "DeviceDriver") -> None:
self.driver = driver
Expand Down Expand Up @@ -158,6 +165,12 @@ def __init__(
self.tree_formatter = tree_formatter
self.use_normalized = use_normalized
self._ui_cls = ui_cls or (StealthUIState if stealth else UIState)
# Android screenshots and input taps share device-pixel coordinates,
# but only when not in normalized mode. ``use_normalized=True`` makes
# ``UIState.convert_point`` treat inputs as [0-1000] normalized
# coordinates, which is incompatible with picking coordinates off the
# screenshot — keep click_at masked in that case.
self.screenshot_matches_input_coords = not use_normalized

async def _recover_portal(self) -> None:
"""Restart Portal's accessibility service and TCP socket server."""
Expand Down
Loading