Require visual verification for computer-use captures

Strip embedded image data URLs out of prompt token estimates so screenshot attachments do not explode context accounting.

Strengthen computer_use_remote prompt, skill, and capture-result text so state-changing desktop actions are treated as attempts until a fresh screen visibly confirms the requested outcome.
This commit is contained in:
Alessandro 2026-05-23 10:32:37 +02:00
parent 60c36d16d8
commit 1f34b87c00
8 changed files with 62 additions and 5 deletions

View file

@ -584,7 +584,7 @@ class Agent:
Agent.DATA_NAME_CTX_WINDOW,
{
"text": full_text,
"tokens": tokens.approximate_tokens(full_text),
"tokens": tokens.approximate_prompt_tokens(full_text),
},
)

View file

@ -1,8 +1,13 @@
import re
from typing import Literal
import tiktoken
# Multiplier applied to the raw token count when producing an approximate estimate.
APPROX_BUFFER = 1.1
# NOTE(review): presumably the fraction of the token budget targeted when trimming
# text; its use is not visible in this view — confirm against trim_to_tokens.
TRIM_BUFFER = 0.8
# Text substituted for the base64 payload of an embedded image data URL before
# tokens are estimated.
EMBEDDED_IMAGE_DATA_PLACEHOLDER = "[embedded image data omitted from token estimate]"
# Matches `data:image/<subtype>[;key=value ...];base64,<payload>` URLs. Group 1
# captures the media type plus any `;key=value` parameters so that prefix can be
# kept while the payload is replaced. The payload class also accepts `_` and `-`.
_EMBEDDED_IMAGE_DATA_URL_PATTERN = re.compile(
r"data:(image/[A-Za-z0-9.+-]+(?:;[A-Za-z0-9.+-]+=[A-Za-z0-9.+/=_-]+)*);base64,[A-Za-z0-9+/=_-]+"
)
def count_tokens(text: str, encoding_name="cl100k_base") -> int:
@ -25,6 +30,20 @@ def approximate_tokens(
return int(count_tokens(text) * APPROX_BUFFER)
def sanitize_embedded_image_data_urls(text: str) -> str:
    """Replace the base64 payload of embedded image data URLs with a placeholder.

    Every ``data:image/...;base64,<payload>`` URL found in *text* keeps its
    ``data:<media type>;base64,`` prefix, while the payload is swapped for
    ``EMBEDDED_IMAGE_DATA_PLACEHOLDER``. Text without such URLs is returned
    unchanged.

    Args:
        text: Prompt text that may contain embedded image data URLs.

    Returns:
        The text with image payloads replaced. Falsy input (``""`` or
        ``None``) is returned as-is.
    """
    if not text:
        return text

    def _omit_payload(match):
        # A callable replacement inserts the placeholder literally. With a
        # template string, any backslash or group reference inside the
        # placeholder would be expanded by ``re.sub``.
        return f"data:{match.group(1)};base64,{EMBEDDED_IMAGE_DATA_PLACEHOLDER}"

    return _EMBEDDED_IMAGE_DATA_URL_PATTERN.sub(_omit_payload, text)
def approximate_prompt_tokens(text: str) -> int:
    """Approximate the token count of prompt text, ignoring embedded image payloads.

    The text is first passed through ``sanitize_embedded_image_data_urls`` and
    the result is then measured with ``approximate_tokens``.
    """
    sanitized_text = sanitize_embedded_image_data_urls(text)
    return approximate_tokens(sanitized_text)
def trim_to_tokens(
text: str,
max_tokens: int,

View file

@ -49,7 +49,7 @@ class TokenStatus(connector_base.ProtectedConnectorApiHandler):
history_output, ai_label="assistant", human_label="user"
)
if full_text.strip():
token_count = tokens_helper.approximate_tokens(full_text)
token_count = tokens_helper.approximate_prompt_tokens(full_text)
except Exception:
token_count = None

View file

@ -8,6 +8,8 @@ If the tool reports no CLI, disabled computer use, or `COMPUTER_USE_REARM_REQUIR
Call `start_session` before screen-driven tasks. Use `status` for state only, `capture` for screenshots without an action, and `stop_session` when the desktop task is complete. Interactive actions should use normalized global-screen coordinates from the most recent capture.
State-changing actions automatically attach a fresh screen after they run. Treat key presses, clicks, scrolling, typing, and window-manager shortcuts as attempts, not success: inspect the latest attached screen, or one explicit `capture` if it is unclear or unchanged, before saying the requested outcome happened. This is mandatory for Ubuntu/Wayland shortcuts such as `Alt+F9`.
```json
{
"tool_name": "computer_use_remote",

View file

@ -53,7 +53,7 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi
1. Call `start_session` first.
2. Decide from the latest screenshot, not from memory.
3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run.
3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run; inspect it before claiming the requested outcome succeeded.
4. Use `status` for state without starting a session.
5. Use `capture` only when you need another screenshot without taking an action.
@ -67,6 +67,8 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi
- If a click dismisses a menu or popup without producing the expected next UI, treat that attempt as failed.
- If the same approach has already failed twice without visible progress, switch strategy instead of repeating it.
- Do not infer focus or task completion from chat logs, sidebars, tool summaries, or status text.
- Never claim a window was hidden, minimized, moved, text was submitted, or navigation completed until the latest screenshot visibly confirms it.
- Treat Ubuntu/Wayland window-manager shortcuts such as `Alt+F9` as attempts only; verify the result from the fresh screenshot before deciding what happened.
- For browser-navigation tasks done through this tool, only claim success if the browser content area visibly shows the destination page or result.
- If the attached screenshot appears unchanged after a state-changing action, use one explicit `capture` to verify before repeating the same action.
- Use `type(..., submit=true)` only for URL or navigation-style entry where Enter should fire immediately after typing.

View file

@ -23,6 +23,9 @@ COMPUTER_USE_OP_TIMEOUT = 180.0
# NOTE(review): presumably the event name used to dispatch computer-use
# operations through the connector; its use is not visible here — confirm.
COMPUTER_USE_OP_EVENT = "connector_computer_use_op"
# NOTE(review): presumably the token cost assumed for one attached capture;
# its use is not visible here — confirm.
CAPTURE_TOKENS_ESTIMATE = 1500
# 25 MiB expressed in bytes.
# NOTE(review): presumably the largest capture artifact accepted — confirm.
MAX_CAPTURE_ARTIFACT_SIZE_BYTES = 25 * 1024 * 1024
# Reminder appended to the "Latest screen attached" and "Current screen attached"
# tool results.
CAPTURE_VERIFICATION_NOTE = (
"Do not claim success unless this screen visibly confirms the requested outcome."
)
# NOTE(review): presumably the fallback text shown when the backend reports that
# re-arming is required without a message of its own — confirm against callers.
REARM_REQUIRED_DEFAULT_MESSAGE = (
"Computer use is configured, but the installed desktop-control backend is not armed."
)
@ -204,7 +207,7 @@ class ComputerUseRemote(Tool):
summary = self._record_capture(capture_data)
except Exception as exc:
return f"Automatic screen refresh failed: {exc}"
return f"Latest screen attached: {summary}"
return f"Latest screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}"
def _auto_capture_settle_seconds(self, action: str) -> float:
if action == "start_session":
@ -288,7 +291,7 @@ class ComputerUseRemote(Tool):
if action == "capture":
summary = self._record_capture(data)
return f"Current screen attached: {summary}"
return f"Current screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}"
if action == "status":
return self._format_status(data)
if action == "start_session":

View file

@ -123,6 +123,24 @@ def test_computer_use_remote_prompt_keeps_runtime_failures_actionable():
assert "/computer-use on" in prompt
def test_computer_use_remote_prompt_requires_visual_verification_after_actions():
    """The gated prompt and the host-computer-use skill both require visual verification."""
    prompt = _apply_gate(_context_id())

    skill_dir = PROJECT_ROOT / "plugins" / "_a0_connector" / "skills" / "host-computer-use"
    skill = (skill_dir / "SKILL.md").read_text(encoding="utf-8")

    # Wording the tool prompt must carry about treating actions as attempts.
    prompt_phrases = (
        "Treat key presses, clicks, scrolling, typing",
        "attempts, not success",
        "Alt+F9",
    )
    for phrase in prompt_phrases:
        assert phrase in prompt

    # Wording the skill document must carry about confirming results on screen.
    skill_phrases = (
        "visibly confirms",
        "Ubuntu/Wayland",
    )
    for phrase in skill_phrases:
        assert phrase in skill
def test_remote_file_and_exec_tools_are_standard_tool_prompts_independent_from_context():
text_stub = (PROMPT_ROOT / "agent.system.tool.text_editor_remote.md").read_text(encoding="utf-8")
exec_stub = (PROMPT_ROOT / "agent.system.tool.code_execution_remote.md").read_text(encoding="utf-8")

View file

@ -74,3 +74,16 @@ def test_a0_small_profile_removed_and_prompt_text_generic():
for path in _iter_prompt_files():
assert "a0_small" not in path.read_text(encoding="utf-8")
def test_prompt_token_estimate_omits_embedded_image_data_urls():
    """A large embedded PNG data URL must not inflate the prompt token estimate."""
    payload_chunk = "ABCDabcd0123+/=="
    url_prefix = "data:image/png;base64,"
    embedded_png = url_prefix + (payload_chunk * 20_000)
    prompt_text = f"user: please inspect this screenshot {embedded_png}"

    sanitized = tokens.sanitize_embedded_image_data_urls(prompt_text)

    # The payload is gone, but the URL prefix and the placeholder remain.
    assert payload_chunk not in sanitized
    assert url_prefix in sanitized
    assert tokens.EMBEDDED_IMAGE_DATA_PLACEHOLDER in sanitized

    # The sanitized estimate is small in absolute terms and relative to the raw one.
    prompt_estimate = tokens.approximate_prompt_tokens(prompt_text)
    raw_estimate = tokens.approximate_tokens(prompt_text)
    assert prompt_estimate < 100
    assert prompt_estimate < raw_estimate / 100