From a931759868eb107eec0cfe7cc01ccd2f43e8c31d Mon Sep 17 00:00:00 2001 From: Alessandro Frau <155005371+3clyp50@users.noreply.github.com> Date: Sat, 23 May 2026 15:23:38 +0200 Subject: [PATCH] Keep backend computer-use actions out of generic guidance Move explicit AX action names and argument details out of the always-loaded computer_use_remote prompt and generic host-computer-use skill. The generic guidance now only explains backend discovery and skill loading, while host-computer-use-macos remains the detailed home for macOS structural targeting. Also remove the old Ubuntu/GNOME Super+H hide-window guidance (and the matching "hide window" / "minimize window" skill triggers) so window actions are chosen from the reported backend and verified visually. --- .../agent.system.tool.computer_use_remote.md | 11 +++------- .../skills/host-computer-use/SKILL.md | 18 ++++++--------- tests/test_a0_connector_prompt_gating.py | 22 ++++++++++++------- tests/test_tool_action_contracts.py | 6 +++++ 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md b/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md index 3b4ed31d7..721cce83f 100644 --- a/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md +++ b/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md @@ -10,9 +10,9 @@ If the tool reports no CLI, disabled computer use, or `COMPUTER_USE_REARM_REQUIR Call `start_session` before screen-driven tasks. Use `status` for state only, `capture` for screenshots without an action, and `stop_session` when the desktop task is complete. Interactive coordinate actions should use normalized global-screen coordinates from the most recent capture. -Some actions are backend-specific. Use `ax_snapshot` and `ax_action` only when `status` or `start_session` reports a macOS backend with Accessibility-tree features such as `accessibility-tree-snapshot` or `accessibility-structural-targeting`. 
For macOS structural targeting details, load and follow skill `host-computer-use-macos`; do not apply AX guidance to non-macOS backends. +Some actions are backend-specific and intentionally documented only in backend skills. If `status` or `start_session` reports backend-specific features or tells you to load a backend skill, load and follow that skill before using those backend-only actions. For macOS structural targeting details, load and follow skill `host-computer-use-macos`; do not apply macOS guidance to non-macOS backends. -State-changing actions automatically attach a fresh screen after they run. Treat key presses, clicks, scrolling, typing, and window-manager shortcuts as attempts, not success: inspect the latest attached screen, or one explicit `capture` if it is unclear or unchanged, before saying the requested outcome happened. If the tool says a screen was attached but you cannot actually inspect the image, stop and report that visual verification is unavailable; do not continue by assuming the host state. For Ubuntu/GNOME/Wayland hide-window tasks, prefer `Super+H` (`{"action":"key","keys":["Super","H"]}`) for the active window; do not use `Alt+F9` as the primary hide/minimize shortcut because it often leaves the window visible. A `type` result only proves keystrokes were sent; it does not prove the window was hidden or that text landed in the intended place. +State-changing actions automatically attach a fresh screen after they run. Treat key presses, clicks, scrolling, and typing as attempts, not success: inspect the latest attached screen, or one explicit `capture` if it is unclear or unchanged, before saying the requested outcome happened. If the tool says a screen was attached but you cannot actually inspect the image, stop and report that visual verification is unavailable; do not continue by assuming the host state. A `type` result only proves keystrokes were sent; it does not prove that text landed in the intended place. 
```json { @@ -24,18 +24,13 @@ State-changing actions automatically attach a fresh screen after they run. Treat ``` Required argument: -- `action`: one of `start_session`, `status`, `capture`, `ax_snapshot`, `ax_action`, `move`, `click`, `scroll`, `key`, `type`, `stop_session` +- `action`: one of `start_session`, `status`, `capture`, `move`, `click`, `scroll`, `key`, `type`, `stop_session`; backend skills may document additional backend-only action values Optional arguments by action: - `session_id`: session returned by `start_session` - `x`, `y`: normalized `[0,1]` global-screen coordinates for `move` and `click` - `button`: `left`, `right`, or `middle` for `click` - `count`: click count for `click` -- `max_depth`, `max_nodes`: optional bounds for backend-gated `ax_snapshot` -- `path`: element path from `ax_snapshot` for backend-gated `ax_action` -- `target`: semantic element target for backend-gated `ax_action`, for example role/title/description/value/identifier -- `operation` or `ax_action`: backend-gated AX operation such as `press`, `focus`, or `set_value` -- `value` or `text`: value for backend-gated `ax_action` with `set_value` - `dx`, `dy`: scroll amounts for `scroll` - `key` or `keys`: key press value for `key` - `text`: text to type for `type` diff --git a/plugins/_a0_connector/skills/host-computer-use/SKILL.md b/plugins/_a0_connector/skills/host-computer-use/SKILL.md index ccb501ce3..e3675f051 100644 --- a/plugins/_a0_connector/skills/host-computer-use/SKILL.md +++ b/plugins/_a0_connector/skills/host-computer-use/SKILL.md @@ -13,8 +13,6 @@ triggers: - "host screen" - "local screen" - "Ubuntu Wayland desktop" - - "hide window" - - "minimize window" --- # Host Computer Use @@ -57,9 +55,9 @@ Use: Arguments: -- `action`: `start_session`, `status`, `capture`, `ax_snapshot`, `ax_action`, `move`, `click`, `scroll`, `key`, `type`, `stop_session` +- `action`: `start_session`, `status`, `capture`, `move`, `click`, `scroll`, `key`, `type`, `stop_session` - 
`session_id`: optional after `start_session` -- `ax_snapshot`, `ax_action`: backend-gated structural accessibility actions; use only when backend metadata advertises matching support, and load the backend-specific skill first +- backend skills may document additional backend-only action values; use them only when backend metadata advertises matching support and after loading the backend-specific skill - `move`: `x`, `y` normalized to `[0,1]` - `click`: optional `x`, `y`, optional `button` (`left`, `right`, `middle`), optional `count` - `scroll`: `dx`, `dy` @@ -75,30 +73,28 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi 1. Call `start_session` first. 2. Read the returned `backend_id`, `backend_family`, and `features`; load a backend-specific Computer Use skill when the task needs backend-only affordances. 3. Decide final success from the latest screenshot, not from memory. -4. Interactive actions (`ax_action`, `move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run; inspect it before claiming the requested outcome succeeded. +4. Interactive actions already attach a fresh screenshot after they run; inspect it before claiming the requested outcome succeeded. 5. Use `status` for state without starting a session. 6. Use `capture` only when you need another screenshot without taking an action. ## Backend Skills -- If the backend is macOS or features include `accessibility-tree-snapshot` / `accessibility-structural-targeting`, load `host-computer-use-macos` before using `ax_snapshot` or `ax_action`. +- If the backend is macOS or features include `accessibility-tree-snapshot` / `accessibility-structural-targeting`, load `host-computer-use-macos` before using macOS structural Accessibility actions. - Do not use backend-specific actions just because their argument names exist in the generic contract. Treat them as unavailable unless the connected CLI advertises the matching feature. 
## Operating Rules - Only the latest screenshot or a definitive tool result counts as evidence. - If a tool result says a screenshot was attached but you cannot actually see the image, stop and report that visual verification is unavailable. Do not continue with another action from an assumed host state. -- Outside advertised AX support, use normalized global screen coordinates; do not assume window ids, element indexes, background-safe input, or semantic click targets unless the runtime explicitly advertises them. +- Outside advertised structural accessibility support, use normalized global screen coordinates; do not assume window ids, element indexes, background-safe input, or semantic click targets unless the runtime explicitly advertises them. - Prefer accessibility and semantic UI paths first: shortcuts, command palettes, menu accelerators, address/search bars, focus traversal, and other keyboard-accessible controls. - Prefer `key` and `type` over pointer actions whenever a reliable keyboard path exists. - When a menu or popup is open, treat it as the active UI and prefer keyboard navigation over clicking small transient rows by coordinate. - If a click dismisses a menu or popup without producing the expected next UI, treat that attempt as failed. - If the same approach has already failed twice without visible progress, switch strategy instead of repeating it. - Do not infer focus or task completion from chat logs, sidebars, tool summaries, or status text. -- Never claim a window was hidden, minimized, moved, text was submitted, or navigation completed until the latest screenshot visibly confirms it. -- On Ubuntu/GNOME/Wayland, use `Super+H` (`{"action":"key","keys":["Super","H"]}`) to hide the active window. Do not use `Alt+F9` as the primary hide/minimize shortcut on this environment; it often leaves the window visible. -- After any hide/minimize shortcut, inspect the fresh screenshot. 
If the target window or focused composer is still visible, treat the attempt as failed and do not type follow-up text into the active field. -- A `type` tool result only confirms keystrokes were sent. It is not evidence that the text landed in the intended application, nor evidence that a window was hidden first. +- Never claim a state-changing action succeeded until the latest screenshot visibly confirms it. +- A `type` tool result only confirms keystrokes were sent. It is not evidence that the text landed in the intended application. - For browser-navigation tasks done through this tool, only claim success if the browser content area visibly shows the destination page or result. - If the attached screenshot appears unchanged after a state-changing action, use one explicit `capture` to verify before repeating the same action. - Use `type(..., submit=true)` only for URL or navigation-style entry where Enter should fire immediately after typing. diff --git a/tests/test_a0_connector_prompt_gating.py b/tests/test_a0_connector_prompt_gating.py index 044b07d69..0f06ce0e6 100644 --- a/tests/test_a0_connector_prompt_gating.py +++ b/tests/test_a0_connector_prompt_gating.py @@ -134,19 +134,22 @@ def test_computer_use_remote_prompt_requires_visual_verification_after_actions() / "SKILL.md" ).read_text(encoding="utf-8") - assert "Treat key presses, clicks, scrolling, typing" in prompt + assert "Treat key presses, clicks, scrolling, and typing" in prompt assert "attempts, not success" in prompt assert "visual verification is unavailable" in prompt assert "do not continue by assuming the host state" in prompt - assert "Super+H" in prompt - assert '["Super","H"]' in prompt - assert "Alt+F9" in prompt + assert "Super+H" not in prompt + assert "Alt+F9" not in prompt + assert "hide" not in prompt.lower() + assert "minimize" not in prompt.lower() + assert "window-manager" not in prompt assert "cannot actually see the image" in skill - assert "Do not use `Alt+F9` as the primary 
hide/minimize shortcut" in skill assert "A `type` tool result only confirms keystrokes were sent" in skill - assert "do not type follow-up text into the active field" in skill assert "visibly confirms" in skill - assert "Ubuntu/GNOME/Wayland" in skill + assert "hide window" not in skill + assert "minimize window" not in skill + assert "hide/minimize" not in skill + assert "window-manager" not in skill def test_remote_file_and_exec_tools_are_standard_tool_prompts_independent_from_context(): @@ -398,11 +401,14 @@ def test_remote_tool_stubs_are_self_contained_and_reference_per_tool_skills(): assert '"tool_name": "computer_use_remote"' in computer_stub assert "load and follow skill `host-computer-use`" in computer_stub assert "host-computer-use-macos" in computer_stub + assert "ax_snapshot" not in computer_stub + assert "ax_action" not in computer_stub assert "Do not substitute the `linux-desktop` skill" in computer_stub assert '"tool_name": "computer_use_remote"' in computer_skill assert '"tool_name": "computer_use_remote"' in macos_computer_skill assert "ax_snapshot" in macos_computer_skill - assert "ax_snapshot`/`ax_action` are structural Accessibility targeting" not in computer_skill + assert "ax_snapshot" not in computer_skill + assert "ax_action" not in computer_skill assert "Availability, backend support, and trust mode are checked when the tool runs" in computer_stub assert "not `code_execution_tool`" in exec_stub assert "not to" in exec_stub diff --git a/tests/test_tool_action_contracts.py b/tests/test_tool_action_contracts.py index e3289d56b..b3e3d70e4 100644 --- a/tests/test_tool_action_contracts.py +++ b/tests/test_tool_action_contracts.py @@ -618,8 +618,14 @@ def test_computer_use_remote_is_runtime_checked_standard_tool(): assert "checked when the tool runs" in standard_prompt_text assert "visual verification is unavailable" in standard_prompt_text assert "host-computer-use-macos" in standard_prompt_text + assert "ax_snapshot" not in standard_prompt_text + 
assert "ax_action" not in standard_prompt_text assert '"tool_name": "computer_use_remote"' in skill_text + assert "ax_snapshot" not in skill_text + assert "ax_action" not in skill_text assert '"tool_name": "computer_use_remote"' in macos_skill_text + assert "ax_snapshot" in macos_skill_text + assert "ax_action" in macos_skill_text assert "Backend-specific macOS guidance" in macos_skill_text assert "Beta desktop control" in skill_text