mirror of
https://github.com/agent0ai/agent-zero.git
synced 2026-05-29 19:15:35 +00:00
Require visual verification for computer-use captures
Sanitize embedded image data URLs from prompt token estimates so screenshot attachments do not explode context accounting.

Strengthen computer_use_remote prompt, skill, and capture-result text so state-changing desktop actions are treated as attempts until a fresh screen visibly confirms the requested outcome.
This commit is contained in:
parent
60c36d16d8
commit
1f34b87c00
8 changed files with 62 additions and 5 deletions
2
agent.py
2
agent.py
|
|
@ -584,7 +584,7 @@ class Agent:
|
|||
Agent.DATA_NAME_CTX_WINDOW,
|
||||
{
|
||||
"text": full_text,
|
||||
"tokens": tokens.approximate_tokens(full_text),
|
||||
"tokens": tokens.approximate_prompt_tokens(full_text),
|
||||
},
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +1,13 @@
|
|||
import re
|
||||
from typing import Literal
|
||||
import tiktoken
|
||||
|
||||
# Multiplier applied to the exact token count when approximate_tokens() builds its estimate.
APPROX_BUFFER = 1.1
|
||||
# NOTE(review): presumably the safety factor used by trim_to_tokens(); its use is not visible in this hunk.
TRIM_BUFFER = 0.8
|
||||
# Text substituted for the base64 payload of an embedded image data URL before tokens are estimated.
EMBEDDED_IMAGE_DATA_PLACEHOLDER = "[embedded image data omitted from token estimate]"
|
||||
# Matches "data:image/<subtype>" with optional ";key=value" parameters, the ";base64," marker,
# and a payload made of standard or URL-safe base64 characters.
# Capture group 1 holds the media type together with its parameters.
_EMBEDDED_IMAGE_DATA_URL_PATTERN = re.compile(
|
||||
r"data:(image/[A-Za-z0-9.+-]+(?:;[A-Za-z0-9.+-]+=[A-Za-z0-9.+/=_-]+)*);base64,[A-Za-z0-9+/=_-]+"
|
||||
)
|
||||
|
||||
|
||||
def count_tokens(text: str, encoding_name="cl100k_base") -> int:
|
||||
|
|
@ -25,6 +30,20 @@ def approximate_tokens(
|
|||
return int(count_tokens(text) * APPROX_BUFFER)
|
||||
|
||||
|
||||
def sanitize_embedded_image_data_urls(text: str) -> str:
    """Replace the base64 payload of embedded image data URLs with a placeholder.

    Every match of ``_EMBEDDED_IMAGE_DATA_URL_PATTERN`` keeps its
    ``data:<media type>;base64,`` prefix (capture group 1 carries the media
    type and any parameters) while the payload is swapped for
    ``EMBEDDED_IMAGE_DATA_PLACEHOLDER``.

    Args:
        text: Text that may contain ``data:image/...;base64,...`` URLs.

    Returns:
        ``text`` itself when it is empty or otherwise falsy; otherwise a copy
        in which each matched payload has been replaced by the placeholder.
    """
    if not text:
        return text

    def _omit_payload(match: "re.Match[str]") -> str:
        # A callable replacement is inserted verbatim, so the placeholder is
        # never parsed as an ``re`` template (no backslash or group-reference
        # processing), whatever characters the constant contains.
        return f"data:{match.group(1)};base64,{EMBEDDED_IMAGE_DATA_PLACEHOLDER}"

    return _EMBEDDED_IMAGE_DATA_URL_PATTERN.sub(_omit_payload, text)
|
||||
|
||||
|
||||
def approximate_prompt_tokens(text: str) -> int:
    """Estimate the token count of prompt text with embedded image payloads omitted.

    The text is first passed through ``sanitize_embedded_image_data_urls`` and
    the sanitized result is then measured with ``approximate_tokens``.
    """
    sanitized_text = sanitize_embedded_image_data_urls(text)
    return approximate_tokens(sanitized_text)
|
||||
|
||||
|
||||
def trim_to_tokens(
|
||||
text: str,
|
||||
max_tokens: int,
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ class TokenStatus(connector_base.ProtectedConnectorApiHandler):
|
|||
history_output, ai_label="assistant", human_label="user"
|
||||
)
|
||||
if full_text.strip():
|
||||
token_count = tokens_helper.approximate_tokens(full_text)
|
||||
token_count = tokens_helper.approximate_prompt_tokens(full_text)
|
||||
except Exception:
|
||||
token_count = None
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ If the tool reports no CLI, disabled computer use, or `COMPUTER_USE_REARM_REQUIR
|
|||
|
||||
Call `start_session` before screen-driven tasks. Use `status` for state only, `capture` for screenshots without an action, and `stop_session` when the desktop task is complete. Interactive actions should use normalized global-screen coordinates from the most recent capture.
|
||||
|
||||
State-changing actions automatically attach a fresh screen after they run. Treat key presses, clicks, scrolling, typing, and window-manager shortcuts as attempts, not success: inspect the latest attached screen, or one explicit `capture` if it is unclear or unchanged, before saying the requested outcome happened. This is mandatory for Ubuntu/Wayland shortcuts such as `Alt+F9`.
|
||||
|
||||
```json
|
||||
{
|
||||
"tool_name": "computer_use_remote",
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi
|
|||
|
||||
1. Call `start_session` first.
|
||||
2. Decide from the latest screenshot, not from memory.
|
||||
3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run.
|
||||
3. Interactive actions (`move`, `click`, `scroll`, `key`, `type`) already attach a fresh screenshot after they run; inspect it before claiming the requested outcome succeeded.
|
||||
4. Use `status` for state without starting a session.
|
||||
5. Use `capture` only when you need another screenshot without taking an action.
|
||||
|
||||
|
|
@ -67,6 +67,8 @@ If any tool result contains `COMPUTER_USE_REARM_REQUIRED` or `status=rearm requi
|
|||
- If a click dismisses a menu or popup without producing the expected next UI, treat that attempt as failed.
|
||||
- If the same approach has already failed twice without visible progress, switch strategy instead of repeating it.
|
||||
- Do not infer focus or task completion from chat logs, sidebars, tool summaries, or status text.
|
||||
- Never claim a window was hidden, minimized, moved, text was submitted, or navigation completed until the latest screenshot visibly confirms it.
|
||||
- Treat Ubuntu/Wayland window-manager shortcuts such as `Alt+F9` as attempts only; verify the result from the fresh screenshot before deciding what happened.
|
||||
- For browser-navigation tasks done through this tool, only claim success if the browser content area visibly shows the destination page or result.
|
||||
- If the attached screenshot appears unchanged after a state-changing action, use one explicit `capture` to verify before repeating the same action.
|
||||
- Use `type(..., submit=true)` only for URL or navigation-style entry where Enter should fire immediately after typing.
|
||||
|
|
|
|||
|
|
@ -23,6 +23,9 @@ COMPUTER_USE_OP_TIMEOUT = 180.0
|
|||
COMPUTER_USE_OP_EVENT = "connector_computer_use_op"
|
||||
CAPTURE_TOKENS_ESTIMATE = 1500
|
||||
MAX_CAPTURE_ARTIFACT_SIZE_BYTES = 25 * 1024 * 1024
|
||||
CAPTURE_VERIFICATION_NOTE = (
|
||||
"Do not claim success unless this screen visibly confirms the requested outcome."
|
||||
)
|
||||
REARM_REQUIRED_DEFAULT_MESSAGE = (
|
||||
"Computer use is configured, but the installed desktop-control backend is not armed."
|
||||
)
|
||||
|
|
@ -204,7 +207,7 @@ class ComputerUseRemote(Tool):
|
|||
summary = self._record_capture(capture_data)
|
||||
except Exception as exc:
|
||||
return f"Automatic screen refresh failed: {exc}"
|
||||
return f"Latest screen attached: {summary}"
|
||||
return f"Latest screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}"
|
||||
|
||||
def _auto_capture_settle_seconds(self, action: str) -> float:
|
||||
if action == "start_session":
|
||||
|
|
@ -288,7 +291,7 @@ class ComputerUseRemote(Tool):
|
|||
|
||||
if action == "capture":
|
||||
summary = self._record_capture(data)
|
||||
return f"Current screen attached: {summary}"
|
||||
return f"Current screen attached: {summary} {CAPTURE_VERIFICATION_NOTE}"
|
||||
if action == "status":
|
||||
return self._format_status(data)
|
||||
if action == "start_session":
|
||||
|
|
|
|||
|
|
@ -123,6 +123,24 @@ def test_computer_use_remote_prompt_keeps_runtime_failures_actionable():
|
|||
assert "/computer-use on" in prompt
|
||||
|
||||
|
||||
def test_computer_use_remote_prompt_requires_visual_verification_after_actions():
    """Check that the gated prompt and the host-computer-use skill require visual confirmation."""
    skill_path = (
        PROJECT_ROOT
        / "plugins"
        / "_a0_connector"
        / "skills"
        / "host-computer-use"
        / "SKILL.md"
    )

    gated_prompt = _apply_gate(_context_id())
    skill_text = skill_path.read_text(encoding="utf-8")

    # Phrases the prompt returned by _apply_gate must contain.
    for phrase in (
        "Treat key presses, clicks, scrolling, typing",
        "attempts, not success",
        "Alt+F9",
    ):
        assert phrase in gated_prompt

    # Phrases the SKILL.md text must contain.
    for phrase in ("visibly confirms", "Ubuntu/Wayland"):
        assert phrase in skill_text
|
||||
|
||||
|
||||
def test_remote_file_and_exec_tools_are_standard_tool_prompts_independent_from_context():
|
||||
text_stub = (PROMPT_ROOT / "agent.system.tool.text_editor_remote.md").read_text(encoding="utf-8")
|
||||
exec_stub = (PROMPT_ROOT / "agent.system.tool.code_execution_remote.md").read_text(encoding="utf-8")
|
||||
|
|
|
|||
|
|
@ -74,3 +74,16 @@ def test_a0_small_profile_removed_and_prompt_text_generic():
|
|||
|
||||
for path in _iter_prompt_files():
|
||||
assert "a0_small" not in path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_prompt_token_estimate_omits_embedded_image_data_urls():
    """Check that a large embedded PNG data URL barely contributes to the prompt token estimate."""
    payload_chunk = "ABCDabcd0123+/=="
    data_url_prefix = "data:image/png;base64,"
    prompt_text = (
        "user: please inspect this screenshot "
        + data_url_prefix
        + payload_chunk * 20_000
    )

    sanitized = tokens.sanitize_embedded_image_data_urls(prompt_text)

    # The payload is gone, but the URL prefix and the placeholder remain.
    assert payload_chunk not in sanitized
    assert data_url_prefix in sanitized
    assert tokens.EMBEDDED_IMAGE_DATA_PLACEHOLDER in sanitized

    prompt_estimate = tokens.approximate_prompt_tokens(prompt_text)
    raw_estimate = tokens.approximate_tokens(prompt_text)

    assert prompt_estimate < 100
    assert prompt_estimate < raw_estimate / 100
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue