From 1f34b87c00edacbc5eeea9d6a41bc16f2d7f062d Mon Sep 17 00:00:00 2001
From: Alessandro <155005371+3clyp50@users.noreply.github.com>
Date: Sat, 23 May 2026 10:32:37 +0200
Subject: [PATCH] Require visual verification for computer-use captures

Sanitize embedded image data URLs from prompt token estimates so screenshot attachments do not explode context accounting.

Strengthen computer_use_remote prompt, skill, and capture-result text so state-changing desktop actions are treated as attempts until a fresh screen visibly confirms the requested outcome.
---
 agent.py                                      |  2 +-
 helpers/tokens.py                             | 19 +++++++++++++++++++
 plugins/_a0_connector/api/v1/token_status.py  |  2 +-
 .../agent.system.tool.computer_use_remote.md  |  2 ++
 .../skills/host-computer-use/SKILL.md         |  4 +++-
 .../tools/computer_use_remote.py              |  7 +++++--
 tests/test_a0_connector_prompt_gating.py      | 18 ++++++++++++++++++
 tests/test_default_prompt_budget.py           | 13 +++++++++++++
 8 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/agent.py b/agent.py
index 7fc238ce6..149c900e3 100644
--- a/agent.py
+++ b/agent.py
@@ -584,7 +584,7 @@ class Agent:
             Agent.DATA_NAME_CTX_WINDOW,
             {
                 "text": full_text,
-                "tokens": tokens.approximate_tokens(full_text),
+                "tokens": tokens.approximate_prompt_tokens(full_text),
             },
         )
 
diff --git a/helpers/tokens.py b/helpers/tokens.py
index 42224a495..a80b28ead 100644
--- a/helpers/tokens.py
+++ b/helpers/tokens.py
@@ -1,8 +1,13 @@
+import re
 from typing import Literal
 import tiktoken
 
 APPROX_BUFFER = 1.1
 TRIM_BUFFER = 0.8
+EMBEDDED_IMAGE_DATA_PLACEHOLDER = "[embedded image data omitted from token estimate]"
+_EMBEDDED_IMAGE_DATA_URL_PATTERN = re.compile(
+    r"data:(image/[A-Za-z0-9.+-]+(?:;[A-Za-z0-9.+-]+=[A-Za-z0-9.+/=_-]+)*);base64,[A-Za-z0-9+/=_-]+"
+)
 
 
 def count_tokens(text: str, encoding_name="cl100k_base") -> int:
@@ -25,6 +30,20 @@ def approximate_tokens(
     return int(count_tokens(text) * APPROX_BUFFER)
 
 
+def sanitize_embedded_image_data_urls(text: str) -> str:
+    if not text:
+        return text
+
+    return _EMBEDDED_IMAGE_DATA_URL_PATTERN.sub(
+        f"data:\\1;base64,{EMBEDDED_IMAGE_DATA_PLACEHOLDER}",
+        text,
+    )
+
+
+def approximate_prompt_tokens(text: str) -> int:
+    return approximate_tokens(sanitize_embedded_image_data_urls(text))
+
+
 def trim_to_tokens(
     text: str,
     max_tokens: int,
diff --git a/plugins/_a0_connector/api/v1/token_status.py b/plugins/_a0_connector/api/v1/token_status.py
index 40380cc64..8183be4ae 100644
--- a/plugins/_a0_connector/api/v1/token_status.py
+++ b/plugins/_a0_connector/api/v1/token_status.py
@@ -49,7 +49,7 @@ class TokenStatus(connector_base.ProtectedConnectorApiHandler):
                     history_output, ai_label="assistant", human_label="user"
                 )
                 if full_text.strip():
-                    token_count = tokens_helper.approximate_tokens(full_text)
+                    token_count = tokens_helper.approximate_prompt_tokens(full_text)
             except Exception:
                 token_count = None
 
diff --git a/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md b/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md
index c35285b98..0a840f4dd 100644
--- a/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md
+++ b/plugins/_a0_connector/prompts/agent.system.tool.computer_use_remote.md
@@ -8,6 +8,8 @@ If the tool reports no CLI, disabled computer use, or `COMPUTER_USE_REARM_REQUIR
 
 Call `start_session` before screen-driven tasks. Use `status` for state only, `capture` for screenshots without an action, and `stop_session` when the desktop task is complete. Interactive actions should use normalized global-screen coordinates from the most recent capture.
 
+State-changing actions automatically attach a fresh screen after they run. Treat key presses, clicks, scrolling, typing, and window-manager shortcuts as attempts, not success: inspect the latest attached screen, or one explicit `capture` if it is unclear or unchanged, before saying the requested outcome happened. This is mandatory for Ubuntu/Wayland shortcuts such as `Alt+F9`.
+
 ```json
 {
   "tool_name": "computer_use_remote",
diff --git a/plugins/_a0_connector/skills/host-computer-use/SKILL.md b/plugins/_a0_connector/skills/host-computer-use/SKILL.md
index 04a730e86..8b3c19abc 100644
--- a/plugins/_a0_connector/skills/host-computer-use/SKILL.md
+++ b/plugins/_a0_connector/skills/host-computer-use/SKILL.md
@@ -53,7 +53,7 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi
 
 1. Call `start_session` first.
 2. Decide from the latest screenshot, not from memory.
-3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run.
+3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run; inspect it before claiming the requested outcome succeeded.
 4. Use `status` for state without starting a session.
 5. Use `capture` only when you need another screenshot without taking an action.
 
@@ -67,6 +67,8 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi
 - If a click dismisses a menu or popup without producing the expected next UI, treat that attempt as failed.
 - If the same approach has already failed twice without visible progress, switch strategy instead of repeating it.
 - Do not infer focus or task completion from chat logs, sidebars, tool summaries, or status text.
+- Never claim a window was hidden, minimized, moved, text was submitted, or navigation completed until the latest screenshot visibly confirms it.
+- Treat Ubuntu/Wayland window-manager shortcuts such as `Alt+F9` as attempts only; verify the result from the fresh screenshot before deciding what happened.
 - For browser-navigation tasks done through this tool, only claim success if the browser content area visibly shows the destination page or result.
 - If the attached screenshot appears unchanged after a state-changing action, use one explicit `capture` to verify before repeating the same action.
 - Use `type(..., submit=true)` only for URL or navigation-style entry where Enter should fire immediately after typing.
diff --git a/plugins/_a0_connector/tools/computer_use_remote.py b/plugins/_a0_connector/tools/computer_use_remote.py
index df5487dfc..48352e03b 100644
--- a/plugins/_a0_connector/tools/computer_use_remote.py
+++ b/plugins/_a0_connector/tools/computer_use_remote.py
@@ -23,6 +23,9 @@ COMPUTER_USE_OP_TIMEOUT = 180.0
 COMPUTER_USE_OP_EVENT = "connector_computer_use_op"
 CAPTURE_TOKENS_ESTIMATE = 1500
 MAX_CAPTURE_ARTIFACT_SIZE_BYTES = 25 * 1024 * 1024
+CAPTURE_VERIFICATION_NOTE = (
+    "Do not claim success unless this screen visibly confirms the requested outcome."
+)
 REARM_REQUIRED_DEFAULT_MESSAGE = (
     "Computer use is configured, but the installed desktop-control backend is not armed."
 )
@@ -204,7 +207,7 @@ class ComputerUseRemote(Tool):
             summary = self._record_capture(capture_data)
         except Exception as exc:
             return f"Automatic screen refresh failed: {exc}"
-        return f"Latest screen attached: {summary}"
+        return f"Latest screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}"
 
     def _auto_capture_settle_seconds(self, action: str) -> float:
         if action == "start_session":
@@ -288,7 +291,7 @@ class ComputerUseRemote(Tool):
 
         if action == "capture":
             summary = self._record_capture(data)
-            return f"Current screen attached: {summary}"
+            return f"Current screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}"
         if action == "status":
             return self._format_status(data)
         if action == "start_session":
diff --git a/tests/test_a0_connector_prompt_gating.py b/tests/test_a0_connector_prompt_gating.py
index 1f65c9ece..9c9920bee 100644
--- a/tests/test_a0_connector_prompt_gating.py
+++ b/tests/test_a0_connector_prompt_gating.py
@@ -123,6 +123,24 @@ def test_computer_use_remote_prompt_keeps_runtime_failures_actionable():
     assert "/computer-use on" in prompt
 
 
+def test_computer_use_remote_prompt_requires_visual_verification_after_actions():
+    prompt = _apply_gate(_context_id())
+    skill = (
+        PROJECT_ROOT
+        / "plugins"
+        / "_a0_connector"
+        / "skills"
+        / "host-computer-use"
+        / "SKILL.md"
+    ).read_text(encoding="utf-8")
+
+    assert "Treat key presses, clicks, scrolling, typing" in prompt
+    assert "attempts, not success" in prompt
+    assert "Alt+F9" in prompt
+    assert "visibly confirms" in skill
+    assert "Ubuntu/Wayland" in skill
+
+
 def test_remote_file_and_exec_tools_are_standard_tool_prompts_independent_from_context():
     text_stub = (PROMPT_ROOT / "agent.system.tool.text_editor_remote.md").read_text(encoding="utf-8")
     exec_stub = (PROMPT_ROOT / "agent.system.tool.code_execution_remote.md").read_text(encoding="utf-8")
diff --git a/tests/test_default_prompt_budget.py b/tests/test_default_prompt_budget.py
index c186cff9f..d0c29f351 100644
--- a/tests/test_default_prompt_budget.py
+++ b/tests/test_default_prompt_budget.py
@@ -74,3 +74,16 @@ def test_a0_small_profile_removed_and_prompt_text_generic():
 
     for path in _iter_prompt_files():
         assert "a0_small" not in path.read_text(encoding="utf-8")
+
+
+def test_prompt_token_estimate_omits_embedded_image_data_urls():
+    embedded_png = "data:image/png;base64," + ("ABCDabcd0123+/==" * 20_000)
+    prompt_text = f"user: please inspect this screenshot {embedded_png}"
+
+    sanitized = tokens.sanitize_embedded_image_data_urls(prompt_text)
+
+    assert "ABCDabcd0123+/==" not in sanitized
+    assert "data:image/png;base64," in sanitized
+    assert tokens.EMBEDDED_IMAGE_DATA_PLACEHOLDER in sanitized
+    assert tokens.approximate_prompt_tokens(prompt_text) < 100
+    assert tokens.approximate_prompt_tokens(prompt_text) < tokens.approximate_tokens(prompt_text) / 100