diff --git a/Cargo.toml b/Cargo.toml index 8b1ed4f..ac99876 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ rmcp = { version = "1.5.0", features = ["transport-io"] } schemars = "1.0" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" -tokio = { version = "1.51.1", features = ["macros", "rt", "time"] } +tokio = { version = "1.51.1", features = ["macros", "process", "rt", "time"] } wayland-client = "0.31.11" wayland-protocols = { version = "0.32.9", features = ["client", "staging"] } xkeysym = "0.2.1" diff --git a/README.md b/README.md index ab749e0..952a4aa 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The Rust crate is published as [`computer-use-linux`](https://crates.io/crates/c Most computer-use MCP servers are macOS-only (they lean on AppKit, AXUIElement, CGEvent). The few that target Linux either drive `xdotool` against an X11 root window or shell out to OCR over screenshots. Four things set this one apart: -- **Wayland actually works.** Pointer actions can use the `org.freedesktop.portal.RemoteDesktop` interface on Wayland, with `ydotool` / `ydotoold` (uinput) as the deterministic fallback and keyboard/text path. Screenshots use the GNOME Shell DBus screenshot method when present and `org.freedesktop.portal.Screenshot` otherwise. +- **Wayland actually works.** Pointer actions can use the `org.freedesktop.portal.RemoteDesktop` interface on Wayland, with `ydotool` / `ydotoold` (uinput) as the deterministic fallback and keyboard/text path. Screenshots use the GNOME Shell DBus screenshot method when present, `org.freedesktop.portal.Screenshot` otherwise, and fall back to spawning `gnome-screenshot` for background/systemd contexts where both DBus paths are denied. 
- **Window targeting is compositor-aware.** The window registry tries GNOME Shell extension, GNOME Shell Introspect, COSMIC Wayland helper, KWin DBus scripting, Hyprland `hyprctl`, and i3 IPC in order, then reports exactly which backend won or why each backend failed. - **Semantic selectors, not pixel coordinates.** Tools like `click`, `perform_action`, and `set_value` accept `role` / `name` / `text` / `states` selectors backed by AT-SPI. Pixel coordinates remain available as a fallback for rendering-only surfaces (canvas, games, X clients without ATK). - **One JSON readiness report.** `computer-use-linux doctor` returns a structured document covering platform, portals, AT-SPI, windowing, input, and a `readiness` summary with explicit blockers and a recommended next step. MCP hosts can render or surface that to the user without parsing prose. @@ -310,6 +310,7 @@ Most setups need none of these — `doctor` and the installers pick sensible def | `CU_DISABLE_ABS_POINTER` | Disable the uinput absolute pointer and click through `ydotool` instead (for setups where the abs-pointer device misbehaves); embedded Codex builds may use `CODEX_COMPUTER_USE_DISABLE_ABS_POINTER`. | | `COMPUTER_USE_LINUX_FORCE_PORTAL_POINTER` / `…_KEYBOARD` | Always route pointer / keyboard through the RemoteDesktop portal on Wayland, skipping auto-detection; embedded Codex builds may use `CODEX_COMPUTER_USE_FORCE_PORTAL_POINTER` / `…_KEYBOARD`. | | `COMPUTER_USE_LINUX_FORCE_YDOTOOL_POINTER` / `…_KEYBOARD` | Always route pointer / keyboard through `ydotool`, skipping the portal and KDE clipboard paths; embedded Codex builds may use `CODEX_COMPUTER_USE_FORCE_YDOTOOL_POINTER` / `…_KEYBOARD`. | +| `COMPUTER_USE_LINUX_SCREENSHOT_BACKEND` | Force a single screenshot backend, skipping the fallback chain. Accepts `gnome-shell`, `portal`, or `gnome-screenshot`. Pin `gnome-screenshot` for background/systemd contexts where the GNOME Shell and portal DBus paths are denied. 
| **Build-time identity overrides** (set while compiling a downstream embedded bundle): `CUL_GNOME_EXTENSION_UUID`, `CUL_DBUS_SERVICE`, and diff --git a/src/diagnostics.rs b/src/diagnostics.rs index d393bc4..506967b 100644 --- a/src/diagnostics.rs +++ b/src/diagnostics.rs @@ -80,6 +80,7 @@ pub struct PlatformReport { pub dbus_session_bus_address: Option, pub xdg_runtime_dir: Option, pub gnome_shell_version: Check, + pub gnome_screenshot: Check, } #[derive(Debug, Clone, Serialize, JsonSchema)] @@ -218,6 +219,10 @@ fn capability_map( if portals.screenshot.ok { screenshot_backends.push("portal".to_string()); } + // Subprocess fallback for background/systemd contexts the DBus paths reject. + if platform.gnome_screenshot.ok { + screenshot_backends.push("gnome_screenshot".to_string()); + } let mut window_backends = Vec::new(); if windowing.computer_use_linux_gnome_shell_extension.ok { @@ -519,6 +524,7 @@ fn platform_report() -> PlatformReport { dbus_session_bus_address: dbus_session_address(), xdg_runtime_dir: xdg_runtime_dir().map(|path| path.display().to_string()), gnome_shell_version: command_check("gnome-shell", &["--version"]), + gnome_screenshot: command_check("gnome-screenshot", &["--version"]), } } @@ -969,6 +975,7 @@ mod tests { dbus_session_bus_address: Some("unix:path=/run/user/1000/bus".to_string()), xdg_runtime_dir: Some("/run/user/1000".to_string()), gnome_shell_version: Check::ok("GNOME Shell 46.0"), + gnome_screenshot: Check::ok("gnome-screenshot 41.0"), } } diff --git a/src/screenshot.rs b/src/screenshot.rs index c4eff42..110642e 100644 --- a/src/screenshot.rs +++ b/src/screenshot.rs @@ -11,8 +11,10 @@ use std::{ fs, io::Cursor, path::{Path, PathBuf}, + process::Stdio, time::{Duration, SystemTime, UNIX_EPOCH}, }; +use tokio::process::Command; use zbus::{ message::{Message, Type as MessageType}, zvariant::{OwnedObjectPath, OwnedValue, Value}, @@ -141,17 +143,83 @@ impl ScreenshotPayloadOptions { } } +/// Environment variable forcing a single capture 
backend, skipping the +/// fallback chain. Accepts `gnome-shell`, `portal`, or `gnome-screenshot`. +const SCREENSHOT_BACKEND_ENV: &str = "COMPUTER_USE_LINUX_SCREENSHOT_BACKEND"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ScreenshotBackend { + GnomeShell, + Portal, + GnomeScreenshot, +} + +impl ScreenshotBackend { + fn parse(value: &str) -> Option { + match value.trim().to_ascii_lowercase().as_str() { + "gnome-shell" | "gnome_shell" | "shell" => Some(Self::GnomeShell), + "portal" | "xdg-portal" | "xdg_portal" => Some(Self::Portal), + "gnome-screenshot" | "gnome_screenshot" => Some(Self::GnomeScreenshot), + _ => None, + } + } + + async fn capture(self) -> Result { + match self { + Self::GnomeShell => capture_with_gnome_shell().await, + Self::Portal => capture_with_portal().await, + Self::GnomeScreenshot => capture_with_gnome_screenshot().await, + } + } +} + pub async fn capture_screenshot_raw() -> Result { hydrate_session_bus_env(); - match capture_with_gnome_shell().await { - Ok(capture) => Ok(capture), - Err(gnome_error) => match capture_with_portal().await { - Ok(capture) => Ok(capture), - Err(portal_error) => Err(anyhow!( - "GNOME Shell screenshot failed: {gnome_error}; XDG portal screenshot failed: {portal_error}" - )), - }, + // Explicit override: use exactly the requested backend, no fallback. Lets + // background/systemd contexts pin `gnome-screenshot` when the DBus paths are + // blocked, and aids debugging. + if let Some(forced) = forced_backend()? { + return forced.capture().await; + } + + // The Shell and portal DBus paths fail for background processes (systemd + // user services, non-interactive parent shells): GNOME Shell's + // DBusSenderChecker rejects unknown bus names, and the portal cancels with + // response code 2 when there is no foreground window. `gnome-screenshot` + // claims an allowlisted bus name and works regardless, so it is the final + // fallback. See issue #20. 
+    let gnome_error = match capture_with_gnome_shell().await {
+        Ok(capture) => return Ok(capture),
+        Err(error) => error,
+    };
+    let portal_error = match capture_with_portal().await {
+        Ok(capture) => return Ok(capture),
+        Err(error) => error,
+    };
+    let cli_error = match capture_with_gnome_screenshot().await {
+        Ok(capture) => return Ok(capture),
+        Err(error) => error,
+    };
+
+    Err(anyhow!(
+        "GNOME Shell screenshot failed: {gnome_error}; \
+         XDG portal screenshot failed: {portal_error}; \
+         gnome-screenshot fallback failed: {cli_error}"
+    ))
+}
+
+fn forced_backend() -> Result<Option<ScreenshotBackend>> {
+    match std::env::var(SCREENSHOT_BACKEND_ENV) {
+        Ok(value) if !value.trim().is_empty() => {
+            ScreenshotBackend::parse(&value).map(Some).ok_or_else(|| {
+                anyhow!(
+                    "{SCREENSHOT_BACKEND_ENV}={value:?} is not a recognized backend \
+                     (expected gnome-shell, portal, or gnome-screenshot)"
+                )
+            })
+        }
+        _ => Ok(None),
     }
 }
 
@@ -302,6 +370,70 @@ async fn capture_with_portal() -> Result {
     read_png_as_capture(path, "xdg-desktop-portal", ScreenshotCleanup::Preserve).await
 }
 
+/// Upper bound on how long we wait for `gnome-screenshot` before killing it.
+/// Matches the portal timeout: a hung capture must not block the tool forever.
+const GNOME_SCREENSHOT_TIMEOUT: Duration = Duration::from_secs(20);
+
+/// Captures the full screen by spawning the `gnome-screenshot` CLI.
+///
+/// Fails when the temporary path is not UTF-8, the process cannot be spawned
+/// or awaited, it outlives `GNOME_SCREENSHOT_TIMEOUT`, or it exits
+/// unsuccessfully. Every failure from the spawn attempt onward calls
+/// `cleanup_gnome_requested_path` on the temporary path.
+async fn capture_with_gnome_screenshot() -> Result {
+    let path = temp_png_path("gnome-screenshot");
+    let filename = path
+        .to_str()
+        .context("temporary screenshot path is not valid UTF-8")?;
+
+    // `-f <file>` writes a full-screen PNG without prompting; no portal, no
+    // foreground window required. `tokio::process::Command` searches PATH and
+    // provides an async, non-polling wait.
+    let mut child = match Command::new("gnome-screenshot")
+        .args(["-f", filename])
+        .stdin(Stdio::null())
+        .stdout(Stdio::null())
+        .stderr(Stdio::null())
+        // If this future is dropped mid-wait (caller cancellation or an outer
+        // timeout), kill the child instead of leaving it running unsupervised.
+        .kill_on_drop(true)
+        .spawn()
+    {
+        Ok(child) => child,
+        Err(error) => {
+            cleanup_gnome_requested_path(&path);
+            return Err(error).context("failed to spawn gnome-screenshot");
+        }
+    };
+
+    // A hung capture must not block the tool forever, so bound the wait and
+    // kill the child if it outlives the deadline.
+    let status = match tokio::time::timeout(GNOME_SCREENSHOT_TIMEOUT, child.wait()).await {
+        Ok(Ok(status)) => status,
+        Ok(Err(error)) => {
+            cleanup_gnome_requested_path(&path);
+            return Err(error).context("failed to wait for gnome-screenshot");
+        }
+        Err(_) => {
+            let _ = child.kill().await;
+            cleanup_gnome_requested_path(&path);
+            bail!("gnome-screenshot timed out");
+        }
+    };
+
+    if !status.success() {
+        cleanup_gnome_requested_path(&path);
+        bail!("gnome-screenshot exited with {status}");
+    }
+
+    read_png_as_capture(
+        path.clone(),
+        "gnome-screenshot",
+        ScreenshotCleanup::DeletePath(path),
+    )
+    .await
+}
+
 async fn portal_response_stream(connection: &zbus::Connection) -> Result {
     let response_rule = MatchRule::builder()
         .msg_type(MessageType::Signal)
@@ -607,6 +739,43 @@ mod tests {
         );
     }
 
+    #[test]
+    fn parses_known_backend_names() {
+        assert_eq!(
+            ScreenshotBackend::parse("gnome-shell"),
+            Some(ScreenshotBackend::GnomeShell)
+        );
+        assert_eq!(
+            ScreenshotBackend::parse(" Portal "),
+            Some(ScreenshotBackend::Portal)
+        );
+        assert_eq!(
+            ScreenshotBackend::parse("GNOME_SCREENSHOT"),
+            Some(ScreenshotBackend::GnomeScreenshot)
+        );
+        assert_eq!(ScreenshotBackend::parse("nonsense"), None);
+    }
+
+    #[test]
+    fn forced_backend_reads_env_override() {
+        // Only this test touches SCREENSHOT_BACKEND_ENV, so no cross-test race.
+ std::env::set_var(SCREENSHOT_BACKEND_ENV, "gnome-screenshot"); + assert_eq!( + forced_backend().unwrap(), + Some(ScreenshotBackend::GnomeScreenshot) + ); + + std::env::set_var(SCREENSHOT_BACKEND_ENV, " "); + assert_eq!(forced_backend().unwrap(), None); + + std::env::set_var(SCREENSHOT_BACKEND_ENV, "bogus"); + let error = forced_backend().unwrap_err(); + assert!(error.to_string().contains("not a recognized backend")); + + std::env::remove_var(SCREENSHOT_BACKEND_ENV); + assert_eq!(forced_backend().unwrap(), None); + } + #[test] fn request_token_is_portal_safe() { let token = request_token();