From 1f34b87c00edacbc5eeea9d6a41bc16f2d7f062d Mon Sep 17 00:00:00 2001 From: Alessandro <155005371+3clyp50@users.noreply.github.com> Date: Sat, 23 May 2026 10:32:37 +0200 Subject: [PATCH] Require visual verification for computer-use captures Sanitize embedded image data URLs from prompt token estimates so screenshot attachments do not explode context accounting.

Strengthen computer_use_remote prompt, skill, and capture-result text so state-changing desktop actions are treated as attempts until a fresh screen visibly confirms the requested outcome. --- agent.py | 2 +- helpers/tokens.py | 19 +++++++++++++++++++ plugins/_a0_connector/api/v1/token_status.py | 2 +- .../agent.system.tool.computer_use_remote.md | 2 ++ .../skills/host-computer-use/SKILL.md | 4 +++- .../tools/computer_use_remote.py | 7 +++++-- tests/test_a0_connector_prompt_gating.py | 18 ++++++++++++++++++ tests/test_default_prompt_budget.py | 13 +++++++++++++ 8 files changed, 62 insertions(+), 5 deletions(-) diff --git a/agent.py b/agent.py index 7fc238ce6..149c900e3 100644 --- a/agent.py +++ b/agent.py @@ -584,7 +584,7 @@ class Agent: Agent.DATA_NAME_CTX_WINDOW, { "text": full_text, - "tokens": tokens.approximate_tokens(full_text), + "tokens": tokens.approximate_prompt_tokens(full_text), }, ) diff --git a/helpers/tokens.py b/helpers/tokens.py index 42224a495..a80b28ead 100644 --- a/helpers/tokens.py +++ b/helpers/tokens.py @@ -1,8 +1,13 @@ +import re from typing import Literal import tiktoken APPROX_BUFFER = 1.1 TRIM_BUFFER = 0.8 +EMBEDDED_IMAGE_DATA_PLACEHOLDER = "[embedded image data omitted from token estimate]" +_EMBEDDED_IMAGE_DATA_URL_PATTERN = re.compile( + r"data:(image/[A-Za-z0-9.+-]+(?:;[A-Za-z0-9.+-]+=[A-Za-z0-9.+/=_-]+)*);base64,[A-Za-z0-9+/=_-]+" +) def count_tokens(text: str, encoding_name="cl100k_base") -> int: @@ -25,6 +30,20 @@ def approximate_tokens( return int(count_tokens(text) * APPROX_BUFFER) +def sanitize_embedded_image_data_urls(text: str) -> str: + if not text: + return text + 
+ return _EMBEDDED_IMAGE_DATA_URL_PATTERN.sub( + f"data:\\1;base64,{EMBEDDED_IMAGE_DATA_PLACEHOLDER}", + text, + ) + + +def approximate_prompt_tokens(text: str) -> int: + return approximate_tokens(sanitize_embedded_image_data_urls(text)) + + def trim_to_tokens( text: str, max_tokens: int, diff --git a/plugins/_a0_connector/api/v1/token_status.py b/plugins/_a0_connector/api/v1/token_status.py index 40380cc64..8183be4ae 100644 --- a/plugins/_a0_connector/api/v1/token_status.py +++ b/plugins/_a0_connector/api/v1/token_status.py @@ -49,7 +49,7 @@ class TokenStatus(connector_base.ProtectedConnectorApiHandler): history_output, ai_label="assistant", human_label="user" ) if full_text.strip(): - token_count = tokens_helper.approximate_tokens(full_text) + token_count = tokens_helper.approximate_prompt_tokens(full_text) except Exception: token_count = None diff --git a/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md b/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md index c35285b98..0a840f4dd 100644 --- a/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md +++ b/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md @@ -8,6 +8,8 @@ If the tool reports no CLI, disabled computer use, or `COMPUTER_USE_REARM_REQUIR Call `start_session` before screen-driven tasks. Use `status` for state only, `capture` for screenshots without an action, and `stop_session` when the desktop task is complete. Interactive actions should use normalized global-screen coordinates from the most recent capture. +State-changing actions automatically attach a fresh screen after they run. Treat key presses, clicks, scrolling, typing, and window-manager shortcuts as attempts, not success: inspect the latest attached screen, or one explicit `capture` if it is unclear or unchanged, before saying the requested outcome happened. This is mandatory for Ubuntu/Wayland shortcuts such as `Alt+F9`. 
+ ```json { "tool_name": "computer_use_remote", diff --git a/plugins/_a0_connector/skills/host-computer-use/SKILL.md b/plugins/_a0_connector/skills/host-computer-use/SKILL.md index 04a730e86..8b3c19abc 100644 --- a/plugins/_a0_connector/skills/host-computer-use/SKILL.md +++ b/plugins/_a0_connector/skills/host-computer-use/SKILL.md @@ -53,7 +53,7 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi 1. Call `start_session` first. 2. Decide from the latest screenshot, not from memory. -3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run. +3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run; inspect it before claiming the requested outcome succeeded. 4. Use `status` for state without starting a session. 5. Use `capture` only when you need another screenshot without taking an action. @@ -67,6 +67,8 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi - If a click dismisses a menu or popup without producing the expected next UI, treat that attempt as failed. - If the same approach has already failed twice without visible progress, switch strategy instead of repeating it. - Do not infer focus or task completion from chat logs, sidebars, tool summaries, or status text. +- Never claim a window was hidden, minimized, moved, text was submitted, or navigation completed until the latest screenshot visibly confirms it. +- Treat Ubuntu/Wayland window-manager shortcuts such as `Alt+F9` as attempts only; verify the result from the fresh screenshot before deciding what happened. - For browser-navigation tasks done through this tool, only claim success if the browser content area visibly shows the destination page or result. - If the attached screenshot appears unchanged after a state-changing action, use one explicit `capture` to verify before repeating the same action. 
- Use `type(..., submit=true)` only for URL or navigation-style entry where Enter should fire immediately after typing. diff --git a/plugins/_a0_connector/tools/computer_use_remote.py b/plugins/_a0_connector/tools/computer_use_remote.py index df5487dfc..48352e03b 100644 --- a/plugins/_a0_connector/tools/computer_use_remote.py +++ b/plugins/_a0_connector/tools/computer_use_remote.py @@ -23,6 +23,9 @@ COMPUTER_USE_OP_TIMEOUT = 180.0 COMPUTER_USE_OP_EVENT = "connector_computer_use_op" CAPTURE_TOKENS_ESTIMATE = 1500 MAX_CAPTURE_ARTIFACT_SIZE_BYTES = 25 * 1024 * 1024 +CAPTURE_VERIFICATION_NOTE = ( + "Do not claim success unless this screen visibly confirms the requested outcome." +) REARM_REQUIRED_DEFAULT_MESSAGE = ( "Computer use is configured, but the installed desktop-control backend is not armed." ) @@ -204,7 +207,7 @@ class ComputerUseRemote(Tool): summary = self._record_capture(capture_data) except Exception as exc: return f"Automatic screen refresh failed: {exc}" - return f"Latest screen attached: {summary}" + return f"Latest screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}" def _auto_capture_settle_seconds(self, action: str) -> float: if action == "start_session": @@ -288,7 +291,7 @@ class ComputerUseRemote(Tool): if action == "capture": summary = self._record_capture(data) - return f"Current screen attached: {summary}" + return f"Current screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}" if action == "status": return self._format_status(data) if action == "start_session": diff --git a/tests/test_a0_connector_prompt_gating.py b/tests/test_a0_connector_prompt_gating.py index 1f65c9ece..9c9920bee 100644 --- a/tests/test_a0_connector_prompt_gating.py +++ b/tests/test_a0_connector_prompt_gating.py @@ -123,6 +123,24 @@ def test_computer_use_remote_prompt_keeps_runtime_failures_actionable(): assert "/computer-use on" in prompt +def test_computer_use_remote_prompt_requires_visual_verification_after_actions(): + prompt = _apply_gate(_context_id()) + skill = ( 
+ PROJECT_ROOT + / "plugins" + / "_a0_connector" + / "skills" + / "host-computer-use" + / "SKILL.md" + ).read_text(encoding="utf-8") + + assert "Treat key presses, clicks, scrolling, typing" in prompt + assert "attempts, not success" in prompt + assert "Alt+F9" in prompt + assert "visibly confirms" in skill + assert "Ubuntu/Wayland" in skill + + def test_remote_file_and_exec_tools_are_standard_tool_prompts_independent_from_context(): text_stub = (PROMPT_ROOT / "agent.system.tool.text_editor_remote.md").read_text(encoding="utf-8") exec_stub = (PROMPT_ROOT / "agent.system.tool.code_execution_remote.md").read_text(encoding="utf-8") diff --git a/tests/test_default_prompt_budget.py b/tests/test_default_prompt_budget.py index c186cff9f..d0c29f351 100644 --- a/tests/test_default_prompt_budget.py +++ b/tests/test_default_prompt_budget.py @@ -74,3 +74,16 @@ def test_a0_small_profile_removed_and_prompt_text_generic(): for path in _iter_prompt_files(): assert "a0_small" not in path.read_text(encoding="utf-8") + + +def test_prompt_token_estimate_omits_embedded_image_data_urls(): + embedded_png = "data:image/png;base64," + ("ABCDabcd0123+/==" * 20_000) + prompt_text = f"user: please inspect this screenshot {embedded_png}" + + sanitized = tokens.sanitize_embedded_image_data_urls(prompt_text) + + assert "ABCDabcd0123+/==" not in sanitized + assert "data:image/png;base64," in sanitized + assert tokens.EMBEDDED_IMAGE_DATA_PLACEHOLDER in sanitized + assert tokens.approximate_prompt_tokens(prompt_text) < 100 + assert tokens.approximate_prompt_tokens(prompt_text) < tokens.approximate_tokens(prompt_text) / 100