mirror of
https://github.com/agent0ai/agent-zero.git
synced 2026-06-02 07:11:56 +00:00
Fix durable screenshot artifacts and Xpra sizing
Materialize browser, desktop, computer-use, and vision-load screenshots into chat-scoped artifacts so historical image refs survive temporary screenshot pruning. Keep history serialization free of rescue assumptions, document durable screenshot behavior in tool prompts/skills, and size Xpra canvases from backend-normalized display dimensions to prevent stretched desktop views. Verified with focused pytest coverage plus live Docker checks for browser screenshot persistence and Xpra canvas dimensions.
This commit is contained in:
parent
45e4bd892c
commit
edd58a42d2
18 changed files with 688 additions and 81 deletions
244
helpers/chat_media.py
Normal file
244
helpers/chat_media.py
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
from helpers import files, media_artifacts
|
||||
|
||||
|
||||
DEFAULT_MAX_IMAGE_BYTES = media_artifacts.DEFAULT_MAX_ARTIFACT_SIZE_BYTES
|
||||
ImageCategory = Literal["images", "screenshots"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ChatImage:
    """Immutable description of an image stored in a chat's artifact folder."""

    # Filesystem location of the stored image, kept as a string.
    path: str
    # The same location after ``files.normalize_a0_path``.
    a0_path: str
    # MIME type recorded for the stored image.
    mime: str
    # Number of payload bytes written.
    size: int
|
||||
|
||||
|
||||
def screenshot_dir(context_id: str, source: str) -> Path:
    """Return the chat's ``screenshots`` directory for *source*.

    Convenience wrapper around ``artifact_dir`` with the category pinned to
    ``"screenshots"``.
    """
    directory = artifact_dir(context_id, category="screenshots", source=source)
    return directory
|
||||
|
||||
|
||||
def artifact_dir(
    context_id: str,
    *,
    category: ImageCategory = "images",
    source: str = "vision-load",
) -> Path:
    """Build the ``usr/chats/<context>/<category>/<source>`` directory path.

    Every segment goes through ``files.safe_file_name`` and then has leading
    and trailing ``.``/``_`` characters stripped; a segment that ends up empty
    falls back to ``"default"``, ``"images"`` or ``"vision-load"``
    respectively. The path is rooted via ``files.get_abs_path``. The directory
    itself is not created here.
    """

    def _segment(raw: str, fallback: str) -> str:
        cleaned = files.safe_file_name(raw).strip("._")
        return cleaned if cleaned else fallback

    chat_segment = _segment(str(context_id or "default"), "default")
    category_segment = _segment(category, "images")
    source_segment = _segment(source, "vision-load")

    chat_root = Path(files.get_abs_path("usr/chats", chat_segment))
    return chat_root / category_segment / source_segment
|
||||
|
||||
|
||||
def save_image_bytes(
    *,
    context_id: str,
    payload: bytes,
    mime_type: str = "image/png",
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Write raw image bytes into the chat's artifact folder.

    The file is named ``<stem>-<YYYYmmdd-HHMMSS>-<8 hex chars><suffix>`` so
    repeated saves with the same preferred name do not overwrite each other.

    Args:
        context_id: Chat identifier used to pick the artifact folder.
        payload: Image bytes; must not be empty.
        mime_type: Declared MIME type, normalised through
            ``media_artifacts.normalize_mime`` with an ``image/`` prefix
            requirement and an ``image/png`` default.
        category: Artifact category sub-folder.
        source: Producer sub-folder; also seeds the default file name.
        preferred_name: Optional file name hint.
        max_bytes: Upper size bound, or ``None`` to skip the size check.

    Returns:
        A ``ChatImage`` describing the file that was written.

    Raises:
        media_artifacts.EmptyBase64Data: If *payload* is empty.
        media_artifacts.ArtifactTooLarge: If *payload* exceeds *max_bytes*.
    """
    blob = bytes(payload or b"")
    blob_size = len(blob)
    if blob_size == 0:
        raise media_artifacts.EmptyBase64Data("image payload is empty")
    if max_bytes is not None and blob_size > max_bytes:
        raise media_artifacts.ArtifactTooLarge(blob_size, max_bytes)

    safe_mime = media_artifacts.normalize_mime(
        mime_type,
        default="image/png",
        required_prefix="image/",
    )
    fallback_extension = media_artifacts.guess_extension(safe_mime, ".png")
    fallback_name = f"{source or 'image'}{fallback_extension}"
    chosen = Path(
        media_artifacts.safe_filename(
            preferred_name,
            default=fallback_name,
            default_extension=fallback_extension,
        )
    )
    stem = chosen.stem or Path(fallback_name).stem or "image"
    suffix = chosen.suffix or fallback_extension

    # Timestamp plus a short random tag keeps each saved file name unique.
    stamp = time.strftime("%Y%m%d-%H%M%S")
    target_dir = artifact_dir(context_id, category=category, source=source)
    target = target_dir / f"{stem}-{stamp}-{uuid.uuid4().hex[:8]}{suffix}"
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(blob)

    target_str = str(target)
    return ChatImage(
        path=target_str,
        a0_path=files.normalize_a0_path(target_str),
        mime=safe_mime,
        size=blob_size,
    )
|
||||
|
||||
|
||||
def save_image_base64(
    *,
    context_id: str,
    data: str,
    mime_type: str = "image/png",
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Decode a base64 string and store the result as a chat-scoped image.

    Decoding is delegated to ``media_artifacts.decode_base64_payload`` (which
    receives the same *max_bytes* limit); the decoded bytes are then handed to
    ``save_image_bytes`` with all other arguments passed through unchanged.
    """
    decoded = media_artifacts.decode_base64_payload(data, max_bytes=max_bytes)
    raw_bytes = decoded.payload
    return save_image_bytes(
        context_id=context_id,
        payload=raw_bytes,
        mime_type=mime_type,
        category=category,
        source=source,
        preferred_name=preferred_name,
        max_bytes=max_bytes,
    )
|
||||
|
||||
|
||||
def save_image_file(
    *,
    context_id: str,
    path: str | Path,
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Copy an existing image file into the chat's artifact folder.

    The MIME type is guessed from the file name and normalised with an
    ``image/`` prefix requirement; the stored copy is written by
    ``save_image_bytes``.

    Args:
        context_id: Chat identifier used to pick the artifact folder.
        path: Location of the source image on disk.
        category: Artifact category sub-folder.
        source: Producer sub-folder.
        preferred_name: Optional file name hint; defaults to the source
            file's own name.
        max_bytes: Upper size bound, or ``None`` to skip the size check.

    Returns:
        A ``ChatImage`` describing the stored copy.

    Raises:
        OSError: If the source file cannot be inspected or read (for example
            ``FileNotFoundError``).
        media_artifacts.ArtifactTooLarge: If the file exceeds *max_bytes*.
    """
    image_path = Path(path)
    if max_bytes is not None:
        # Reject oversized files from their stat size so a huge file is never
        # loaded into memory only to be refused afterwards. save_image_bytes
        # still re-checks the actual payload length.
        file_size = image_path.stat().st_size
        if file_size > max_bytes:
            raise media_artifacts.ArtifactTooLarge(file_size, max_bytes)

    payload = image_path.read_bytes()
    mime = media_artifacts.normalize_mime(
        _guess_image_mime(image_path),
        default="image/png",
        required_prefix="image/",
    )
    return save_image_bytes(
        context_id=context_id,
        payload=payload,
        mime_type=mime,
        category=category,
        source=source,
        preferred_name=preferred_name or image_path.name,
        max_bytes=max_bytes,
    )
|
||||
|
||||
|
||||
def save_image_data_url(
    *,
    context_id: str,
    data_url: str,
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Store the image carried by a ``data:image/*;base64,...`` URL.

    The MIME type is taken from the URL header (the part between the scheme
    and the first ``;``) and the base64 body is handed to
    ``save_image_base64``.

    Raises:
        ValueError: If *data_url* is not a base64 image data URL (raised by
            ``_split_image_data_url``).
    """
    header, encoded = _split_image_data_url(data_url)
    # The header was validated case-insensitively, so the scheme may be
    # spelled "DATA:" or "Data:". Slice it off by length instead of using a
    # case-sensitive removeprefix, which would leave the scheme in the MIME.
    mime = header[len("data:"):].split(";", 1)[0] or "image/png"
    return save_image_base64(
        context_id=context_id,
        data=encoded,
        mime_type=mime,
        category=category,
        source=source,
        preferred_name=preferred_name,
        max_bytes=max_bytes,
    )
|
||||
|
||||
|
||||
def materialize_image_ref(
    *,
    context_id: str,
    url: str,
    source: str = "",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> str:
    """Turn an image reference into a chat-scoped artifact path.

    Behaviour by input:

    * Empty reference or blank *context_id*: the stripped reference is
      returned unchanged.
    * Base64 image data URL: the image is saved and its ``a0_path`` returned.
    * Anything else: resolved through ``images.resolve_ref``. A path already
      inside this chat's folder is only normalised; otherwise the file is
      copied into the chat folder and the copy's ``a0_path`` is returned.

    When *source* is empty it is inferred from the reference and
    *preferred_name*; the category then follows from the source.
    """
    ref = str(url or "").strip()
    has_context = bool(str(context_id or "").strip())
    if not (ref and has_context):
        return ref

    effective_source = source if source else infer_source(ref, preferred_name)
    category = category_for_source(effective_source)

    if _is_data_image_url(ref):
        return save_image_data_url(
            context_id=context_id,
            data_url=ref,
            category=category,
            source=effective_source,
            preferred_name=preferred_name,
            max_bytes=max_bytes,
        ).a0_path

    # Imported at call time rather than module level; presumably this avoids
    # an import cycle with helpers.images — confirm before hoisting.
    from helpers import images

    resolved = images.resolve_ref(ref)
    if is_chat_scoped_path(context_id=context_id, path=resolved):
        # Already owned by this chat: no copy needed.
        return files.normalize_a0_path(str(resolved))

    copied = save_image_file(
        context_id=context_id,
        path=resolved,
        category=category,
        source=effective_source,
        preferred_name=preferred_name or resolved.name,
        max_bytes=max_bytes,
    )
    return copied.a0_path
|
||||
|
||||
|
||||
def is_chat_scoped_path(*, context_id: str, path: str | Path) -> bool:
    """Report whether *path* is the chat's root folder or lies beneath it.

    The chat root is derived as the grandparent of the default artifact
    directory (``<root>/images/vision-load``). Both paths are resolved
    non-strictly before comparison. A blank *context_id* or an ``OSError``
    during resolution yields ``False``.
    """
    if str(context_id or "").strip() == "":
        return False
    try:
        candidate = Path(path).resolve(strict=False)
        sample_dir = artifact_dir(context_id, category="images", source="vision-load")
        chat_root = sample_dir.parents[1].resolve(strict=False)
    except OSError:
        return False
    return chat_root in (candidate, *candidate.parents)
|
||||
|
||||
|
||||
def infer_source(value: str = "", preferred_name: str = "") -> str:
    """Guess which tool produced an image from its reference and name.

    Both arguments are joined with a space, lower-cased and scanned for known
    substrings. Rules are tried in order (computer-use, desktop, browser) and
    the first match wins; ``"vision-load"`` is returned when nothing matches.
    """
    haystack = f"{value or ''} {preferred_name or ''}".lower()
    rules = (
        ("computer-use", ("computer-use", "computer_use", "_a0_connector/computer_use")),
        ("desktop", ("/desktop/screenshots/", "\\desktop\\screenshots\\", "desktop-")),
        (
            "browser",
            ("/browser/screenshots/", "\\browser\\screenshots\\", "host-browser", "browser-"),
        ),
    )
    for label, markers in rules:
        if any(marker in haystack for marker in markers):
            return label
    return "vision-load"
|
||||
|
||||
|
||||
def category_for_source(source: str) -> ImageCategory:
    """Map a producer name to its artifact category.

    ``desktop``, ``browser`` and ``computer-use`` are screenshot producers;
    every other source is filed under ``images``.
    """
    screenshot_sources = frozenset({"desktop", "browser", "computer-use"})
    if source in screenshot_sources:
        return "screenshots"
    return "images"
|
||||
|
||||
|
||||
def _guess_image_mime(path: Path) -> str:
|
||||
import mimetypes
|
||||
|
||||
return mimetypes.guess_type(path.name)[0] or "image/png"
|
||||
|
||||
|
||||
def _is_data_image_url(value: str) -> bool:
|
||||
normalized = str(value or "").strip().lower()
|
||||
return normalized.startswith("data:image/") and ";base64," in normalized
|
||||
|
||||
|
||||
def _split_image_data_url(data_url: str) -> tuple[str, str]:
    """Split a base64 image data URL into ``(header, encoded_body)``.

    The split happens at the first comma of the whitespace-stripped URL; the
    header keeps its ``data:`` scheme and parameters.

    Raises:
        ValueError: If *data_url* is not a ``data:image/*;base64,...`` URL.
    """
    value = str(data_url or "").strip()
    header, separator, encoded = value.partition(",")
    if not separator or not _is_data_image_url(value):
        raise ValueError("image data URL must be data:image/*;base64,...")
    # Return a real tuple, matching the annotation (str.split yields a list).
    return header, encoded
|
||||
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
|||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from helpers import history, media_artifacts
|
||||
from helpers import chat_media, history, media_artifacts
|
||||
from helpers.print_style import PrintStyle
|
||||
from helpers.tool import Response, Tool
|
||||
from helpers.ws import NAMESPACE
|
||||
|
|
@ -744,7 +744,15 @@ class ComputerUseRemote(Tool):
|
|||
except FileNotFoundError as exc:
|
||||
path_error = exc
|
||||
else:
|
||||
return display_path, image_path.stem
|
||||
saved = chat_media.save_image_file(
|
||||
context_id=self.agent.context.id,
|
||||
path=image_path,
|
||||
category="screenshots",
|
||||
source="computer-use",
|
||||
preferred_name=Path(display_path).name or image_path.name,
|
||||
max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES,
|
||||
)
|
||||
return saved.a0_path, Path(saved.path).stem
|
||||
|
||||
artifact = data.get("artifact")
|
||||
if isinstance(artifact, dict) and str(artifact.get("encoding", "")).strip().lower() == "base64":
|
||||
|
|
@ -764,7 +772,16 @@ class ComputerUseRemote(Tool):
|
|||
default=f"computer-use-{uuid.uuid4().hex}.png",
|
||||
default_extension=".png",
|
||||
)
|
||||
return f"data:{mime};base64,{encoded}", Path(filename).stem
|
||||
saved = chat_media.save_image_base64(
|
||||
context_id=self.agent.context.id,
|
||||
data=encoded,
|
||||
mime_type=mime,
|
||||
category="screenshots",
|
||||
source="computer-use",
|
||||
preferred_name=filename,
|
||||
max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES,
|
||||
)
|
||||
return saved.a0_path, Path(saved.path).stem
|
||||
|
||||
if path_error is not None:
|
||||
raise path_error
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from pathlib import Path
|
|||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from helpers import ephemeral_images, media_artifacts
|
||||
from helpers import chat_media, media_artifacts
|
||||
|
||||
try:
|
||||
from helpers.ws import NAMESPACE
|
||||
|
|
@ -451,12 +451,16 @@ class ConnectorBrowserRuntime:
|
|||
default=f"host-browser-{uuid.uuid4().hex}.jpg",
|
||||
default_extension=".jpg",
|
||||
)
|
||||
mime = str(artifact.get("mime") or result.get("mime") or "image/jpeg")
|
||||
try:
|
||||
ref = ephemeral_images.put_image(
|
||||
saved = chat_media.save_image_base64(
|
||||
context_id=self.context_id,
|
||||
mime=str(artifact.get("mime") or result.get("mime") or "image/jpeg"),
|
||||
data=data,
|
||||
name=filename,
|
||||
mime_type=mime,
|
||||
category="screenshots",
|
||||
source="browser",
|
||||
preferred_name=filename,
|
||||
max_bytes=MAX_ARTIFACT_SIZE_BYTES,
|
||||
)
|
||||
except Exception as exc:
|
||||
raise RuntimeError("Host browser artifact could not be decoded.") from exc
|
||||
|
|
@ -466,11 +470,14 @@ class ConnectorBrowserRuntime:
|
|||
materialized.pop("a0_path", None)
|
||||
materialized.pop("host_path", None)
|
||||
materialized.setdefault("context_id", self.context_id)
|
||||
materialized["ephemeral"] = True
|
||||
materialized["ephemeral_ref"] = ref
|
||||
materialized["path"] = saved.path
|
||||
materialized["a0_path"] = saved.a0_path
|
||||
materialized["mime"] = saved.mime
|
||||
materialized["ephemeral"] = False
|
||||
materialized["chat_scoped"] = True
|
||||
materialized["vision_load"] = {
|
||||
"tool_name": "vision_load",
|
||||
"tool_args": {"paths": [ref]},
|
||||
"tool_args": {"paths": [saved.a0_path]},
|
||||
}
|
||||
return materialized
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ from dataclasses import dataclass
|
|||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from helpers import ephemeral_images, files
|
||||
from helpers import chat_media, files
|
||||
from helpers.defer import DeferredTask
|
||||
from helpers.errors import RepairableException
|
||||
from helpers.print_style import PrintStyle
|
||||
|
|
@ -1558,23 +1558,27 @@ class _BrowserRuntimeCore:
|
|||
quality=max(20, min(95, int(quality))),
|
||||
full_page=bool(full_page),
|
||||
)
|
||||
ref = ephemeral_images.put_image_bytes(
|
||||
saved = chat_media.save_image_bytes(
|
||||
context_id=self.context_id,
|
||||
mime="image/jpeg",
|
||||
payload=image,
|
||||
name=f"browser-{resolved_id}.jpg",
|
||||
mime_type="image/jpeg",
|
||||
category="screenshots",
|
||||
source="browser",
|
||||
preferred_name=f"browser-{resolved_id}.jpg",
|
||||
)
|
||||
return {
|
||||
"browser_id": resolved_id,
|
||||
"context_id": self.context_id,
|
||||
"path": saved.path,
|
||||
"a0_path": saved.a0_path,
|
||||
"mime": "image/jpeg",
|
||||
"ephemeral": True,
|
||||
"ephemeral_ref": ref,
|
||||
"ephemeral": False,
|
||||
"chat_scoped": True,
|
||||
"state": await self._state(resolved_id),
|
||||
"vision_load": {
|
||||
"tool_name": "vision_load",
|
||||
"tool_args": {
|
||||
"paths": [ref],
|
||||
"paths": [saved.a0_path],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Workflow:
|
|||
- For same-page controls that are easier to identify structurally, `click`, `type`, `submit`, `type_submit`, `scroll`, `select_option`, `set_checked`, and `upload_file` may use `selector` instead of `ref`; the tool resolves the selector through `content` first.
|
||||
- `click` with `x`/`y` and no `ref` is treated as a coordinate mouse click. `type` with text and no `ref` types into the currently focused element. `key_chord` accepts either `["Control", "A"]` or `"CTRL+A"`.
|
||||
- `navigate` reuses an existing `browser_id` and is preferred for serial browsing.
|
||||
- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are ephemeral refs rather than conserved files.
|
||||
- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are saved as chat-scoped artifacts; explicit `path` requests remain user-owned files.
|
||||
- Keep the tab set small; close pages after extracting what you need.
|
||||
|
||||
`multi` is only a browser action: use `tool_name: "browser"` with `tool_args.action: "multi"`. Never use `tool_name: "multi"`.
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ Screenshots are explicit only; the browser does not automatically load images in
|
|||
2. Call `vision_load` with the returned `vision_load.tool_args.paths` value.
|
||||
3. Reason from the latest loaded screenshot.
|
||||
|
||||
Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is an ephemeral ref consumed by `vision_load`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used.
|
||||
Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is saved as a chat-scoped artifact and returned through `vision_load.tool_args.paths`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used.
|
||||
|
||||
## Forms And Files
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ Start with `browser:content` to capture current refs, then use `browser:detail`
|
|||
|
||||
Use `select_option`, `set_checked`, `upload_file`, `type`, `type_submit`, and `submit` for form interaction. Use coordinates only when no stable ref exists or the UI is intentionally canvas-like.
|
||||
|
||||
Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return ephemeral refs for `vision_load`.
|
||||
Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return chat-scoped artifact paths for `vision_load`.
|
||||
|
||||
Verify after submission with `browser:content`, `browser:state`, or another explicit `browser:screenshot` plus `vision_load`.
|
||||
|
||||
|
|
|
|||
|
|
@ -37,6 +37,19 @@ def context_screenshot_dir(context_id: str = "") -> Path:
|
|||
return SCREENSHOT_DIR / _safe_context_id(context_id)
|
||||
|
||||
|
||||
def chat_screenshot_dir(context_id: str = "") -> Path:
    """Return ``<BASE_DIR>/usr/chats/<context>/screenshots/desktop``.

    The context segment is sanitised by ``_safe_context_id``. The directory is
    not created here.
    """
    chat_root = BASE_DIR / "usr" / "chats" / _safe_context_id(context_id)
    return chat_root / "screenshots" / "desktop"
|
||||
|
||||
|
||||
def normalize_a0_path(path: str | Path) -> str:
    """Express *path* as an ``/a0/...`` path when it lies under ``BASE_DIR``.

    Both *path* and ``BASE_DIR`` are resolved non-strictly. A path outside
    ``BASE_DIR`` is returned as given (converted to ``str``); otherwise the
    relative part is appended to ``/a0/`` using forward slashes.
    """
    candidate = Path(path)
    try:
        resolved = candidate.resolve(strict=False)
        relative = resolved.relative_to(BASE_DIR.resolve(strict=False))
    except ValueError:
        # relative_to raises ValueError when the path is not under BASE_DIR.
        return str(candidate)
    return "/a0/" + relative.as_posix()
|
||||
|
||||
|
||||
def _safe_context_id(context_id: str = "") -> str:
    """Return a filesystem-safe context identifier.

    Falls back to the ``A0_DESKTOP_CONTEXT_ID`` environment variable and then
    to ``"default"`` when *context_id* is empty. Every match of
    ``_SAFE_CONTEXT_RE`` is replaced with ``_``, leading and trailing
    ``.``/``_`` characters are stripped, and an empty result again becomes
    ``"default"``.
    """
    candidate = context_id
    if not candidate:
        candidate = os.environ.get("A0_DESKTOP_CONTEXT_ID")
    if not candidate:
        candidate = "default"
    sanitized = _SAFE_CONTEXT_RE.sub("_", str(candidate)).strip("._")
    return sanitized if sanitized else "default"
|
||||
|
|
@ -118,9 +131,11 @@ def capture_screenshot(
|
|||
return {"ok": False, "path": "", "format": "", "captured_at": "", "error": message}
|
||||
|
||||
explicit_path = path is not None and str(path).strip() != ""
|
||||
ephemeral_ref = not explicit_path and str(transport or "").strip().lower() != "path"
|
||||
screenshot_dir = context_screenshot_dir(context_id)
|
||||
if not explicit_path:
|
||||
transport_mode = str(transport or "").strip().lower()
|
||||
chat_scoped = bool(not explicit_path and transport_mode == "path" and str(context_id or "").strip())
|
||||
ephemeral_ref = not explicit_path and transport_mode != "path"
|
||||
screenshot_dir = chat_screenshot_dir(context_id) if chat_scoped else context_screenshot_dir(context_id)
|
||||
if not explicit_path and not chat_scoped:
|
||||
prune_context_screenshots(context_id=context_id)
|
||||
screenshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
timestamp = time.strftime("%Y%m%d-%H%M%S")
|
||||
|
|
@ -138,15 +153,17 @@ def capture_screenshot(
|
|||
return {"ok": False, "path": "", "format": "", "captured_at": "", "error": detail}
|
||||
|
||||
if target.suffix.lower() == ".xwd":
|
||||
if not explicit_path:
|
||||
if not explicit_path and not chat_scoped:
|
||||
prune_context_screenshots(context_id=context_id, keep_path=raw_path)
|
||||
return {
|
||||
"ok": True,
|
||||
"path": str(raw_path),
|
||||
"a0_path": normalize_a0_path(raw_path),
|
||||
"format": "xwd",
|
||||
"captured_at": iso_now(),
|
||||
"recent": True,
|
||||
"ephemeral": not explicit_path,
|
||||
"ephemeral": not explicit_path and not chat_scoped,
|
||||
"chat_scoped": chat_scoped,
|
||||
"context_id": safe_context,
|
||||
"error": "",
|
||||
}
|
||||
|
|
@ -167,17 +184,19 @@ def capture_screenshot(
|
|||
width=width,
|
||||
height=height,
|
||||
)
|
||||
if not explicit_path:
|
||||
if not explicit_path and not chat_scoped:
|
||||
prune_context_screenshots(context_id=context_id, keep_path=target)
|
||||
return {
|
||||
"ok": True,
|
||||
"path": str(target),
|
||||
"a0_path": normalize_a0_path(target),
|
||||
"format": target.suffix.lower().lstrip(".") or "png",
|
||||
"width": width,
|
||||
"height": height,
|
||||
"captured_at": iso_now(),
|
||||
"recent": True,
|
||||
"ephemeral": not explicit_path,
|
||||
"ephemeral": not explicit_path and not chat_scoped,
|
||||
"chat_scoped": chat_scoped,
|
||||
"context_id": safe_context,
|
||||
"error": "",
|
||||
}
|
||||
|
|
@ -193,17 +212,19 @@ def capture_screenshot(
|
|||
width=converted["width"],
|
||||
height=converted["height"],
|
||||
)
|
||||
if not explicit_path:
|
||||
if not explicit_path and not chat_scoped:
|
||||
prune_context_screenshots(context_id=context_id, keep_path=target)
|
||||
return {
|
||||
"ok": True,
|
||||
"path": str(target),
|
||||
"a0_path": normalize_a0_path(target),
|
||||
"format": target.suffix.lower().lstrip(".") or "png",
|
||||
"width": converted["width"],
|
||||
"height": converted["height"],
|
||||
"captured_at": iso_now(),
|
||||
"recent": True,
|
||||
"ephemeral": not explicit_path,
|
||||
"ephemeral": not explicit_path and not chat_scoped,
|
||||
"chat_scoped": chat_scoped,
|
||||
"context_id": safe_context,
|
||||
"error": "",
|
||||
}
|
||||
|
|
@ -226,10 +247,12 @@ def capture_screenshot(
|
|||
return {
|
||||
"ok": True,
|
||||
"path": str(raw_path),
|
||||
"a0_path": normalize_a0_path(raw_path),
|
||||
"format": "xwd",
|
||||
"captured_at": iso_now(),
|
||||
"recent": True,
|
||||
"ephemeral": not explicit_path,
|
||||
"ephemeral": not explicit_path and not chat_scoped,
|
||||
"chat_scoped": chat_scoped,
|
||||
"context_id": safe_context,
|
||||
"error": message,
|
||||
}
|
||||
|
|
@ -575,8 +598,36 @@ def parse_xprop(output: str) -> dict[str, str]:
|
|||
|
||||
|
||||
def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
    """Describe the most recent Desktop screenshot for a context.

    The chat-scoped screenshot folder is consulted first and nothing in it is
    pruned. Only when it yields no screenshot are the temporary context
    screenshots pruned by age and the temporary folder searched, this time
    with older candidates removed.
    """
    # NOTE(review): a chat-scoped hit wins even when the temporary folder
    # holds a newer capture — confirm this precedence is intended.
    durable = _latest_screenshot_from_dir(
        chat_screenshot_dir(context_id),
        context_id=context_id,
        ephemeral=False,
        chat_scoped=True,
        prune_older=False,
    )
    if durable.get("ok"):
        return durable

    prune_context_screenshots(
        context_id=context_id,
        max_age_seconds=RECENT_SCREENSHOT_SECONDS,
    )
    temporary_dir = context_screenshot_dir(context_id)
    return _latest_screenshot_from_dir(
        temporary_dir,
        context_id=context_id,
        ephemeral=True,
        chat_scoped=False,
        prune_older=True,
    )
|
||||
|
||||
|
||||
def _latest_screenshot_from_dir(
|
||||
screenshot_dir: Path,
|
||||
*,
|
||||
context_id: str = "",
|
||||
ephemeral: bool,
|
||||
chat_scoped: bool,
|
||||
prune_older: bool,
|
||||
) -> dict[str, Any]:
|
||||
if not screenshot_dir.exists():
|
||||
return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
|
||||
candidates = [
|
||||
|
|
@ -587,17 +638,20 @@ def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
|
|||
if not candidates:
|
||||
return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
|
||||
latest = max(candidates, key=lambda item: item.stat().st_mtime)
|
||||
for candidate in candidates:
|
||||
if candidate != latest:
|
||||
candidate.unlink(missing_ok=True)
|
||||
if prune_older:
|
||||
for candidate in candidates:
|
||||
if candidate != latest:
|
||||
candidate.unlink(missing_ok=True)
|
||||
age = max(0.0, time.time() - latest.stat().st_mtime)
|
||||
return {
|
||||
"ok": True,
|
||||
"path": str(latest),
|
||||
"a0_path": normalize_a0_path(latest),
|
||||
"format": latest.suffix.lower().lstrip("."),
|
||||
"captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(latest.stat().st_mtime)),
|
||||
"recent": age <= RECENT_SCREENSHOT_SECONDS,
|
||||
"ephemeral": True,
|
||||
"ephemeral": ephemeral,
|
||||
"chat_scoped": chat_scoped,
|
||||
"context_id": _safe_context_id(context_id),
|
||||
}
|
||||
|
||||
|
|
@ -660,7 +714,8 @@ def compact_prompt_context(state: dict[str, Any] | None = None) -> str:
|
|||
screenshot = state.get("screenshot") or {}
|
||||
if screenshot.get("recent") and screenshot.get("path"):
|
||||
ephemeral = " ephemeral" if screenshot.get("ephemeral") else ""
|
||||
lines.append(f"- recent_screenshot={screenshot['path']}{ephemeral}")
|
||||
screenshot_ref = screenshot.get("a0_path") or screenshot["path"]
|
||||
lines.append(f"- recent_screenshot={screenshot_ref}{ephemeral}")
|
||||
context_id = str(state.get("context_id") or "").strip()
|
||||
if context_id:
|
||||
lines.append(f"- screenshot_context={context_id}")
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ The Desktop is an observe-act-verify control surface. Use this decision hierarch
|
|||
3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands.
|
||||
4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation.
|
||||
5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output.
|
||||
6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations return temporary context paths. Do not report from an earlier screenshot path.
|
||||
6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations with `--context-id` return chat-scoped screenshot paths. Do not report from an earlier screenshot path.
|
||||
|
||||
Keep these standing rules:
|
||||
|
||||
|
|
@ -68,7 +68,7 @@ $DESKTOP key ctrl+s
|
|||
|
||||
The script targets the persistent `agent-zero-desktop` X display, sets `DISPLAY`, `XAUTHORITY`, and `HOME` to the XFCE profile, then uses `xdotool` for input. Startup normally prepares this session. If `check` fails during explicit Desktop work, report that the Desktop runtime is not ready instead of installing packages ad hoc.
|
||||
|
||||
If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Use any returned shell screenshot path promptly; only the latest temporary context screenshot is retained.
|
||||
If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Shell screenshots captured with `--context-id` live in the owning chat's screenshot folder; screenshots without a chat context remain temporary.
|
||||
|
||||
For direct app launches without coordinates:
|
||||
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ Commands:
|
|||
observe --json [--screenshot] [--context-id ID]
|
||||
Return structured state, optionally with a fresh screenshot.
|
||||
screenshot [PATH] [--context-id ID]
|
||||
Capture the Desktop to PATH, or to the temporary context screenshot directory.
|
||||
Capture the Desktop to PATH, or to the chat screenshot directory when context-id is set.
|
||||
active-window Print the active window name.
|
||||
geometry PATTERN Print the first matching visible window geometry.
|
||||
wait-window PATTERN Wait for a visible matching window and print its id.
|
||||
|
|
|
|||
|
|
@ -258,6 +258,7 @@ const model = {
|
|||
_desktopFrameHost: null,
|
||||
_desktopFrameLoadHandler: null,
|
||||
_desktopKeepaliveHost: null,
|
||||
_desktopDisplaySizes: {},
|
||||
_desktopIntentionalShutdown: false,
|
||||
|
||||
async init(element = null) {
|
||||
|
|
@ -1499,7 +1500,7 @@ const model = {
|
|||
this.stopXpraDesktopPrime();
|
||||
this._desktopPrimeAttempts = 0;
|
||||
}
|
||||
if (this.applyXpraDesktopFrameMode(options.frame || null)) return;
|
||||
if (this.applyXpraDesktopFrameMode(options.frame || null, options)) return;
|
||||
if (this._desktopPrimeAttempts >= XPRA_DESKTOP_PRIME_ATTEMPTS) return;
|
||||
this._desktopPrimeAttempts += 1;
|
||||
if (this._desktopPrimeTimer) globalThis.clearTimeout(this._desktopPrimeTimer);
|
||||
|
|
@ -1540,8 +1541,12 @@ const model = {
|
|||
const windows = Object.values(client.id_to_window || {});
|
||||
if (!client.connected || !windows.length) return false;
|
||||
|
||||
const width = Math.round(container.clientWidth || remoteWindow.innerWidth || 0);
|
||||
const height = Math.round(container.clientHeight || remoteWindow.innerHeight || 0);
|
||||
const token = options.token || this.session?.desktop?.token || "";
|
||||
const displaySize = options.displaySize || this.desktopDisplaySizeForToken(token);
|
||||
const viewportWidth = Math.round(container.clientWidth || remoteWindow.innerWidth || 0);
|
||||
const viewportHeight = Math.round(container.clientHeight || remoteWindow.innerHeight || 0);
|
||||
const width = Math.round(displaySize?.width || viewportWidth || 0);
|
||||
const height = Math.round(displaySize?.height || viewportHeight || 0);
|
||||
if (width > 0 && height > 0) {
|
||||
client.desktop_width = width;
|
||||
client.desktop_height = height;
|
||||
|
|
@ -1574,6 +1579,26 @@ const model = {
|
|||
}
|
||||
},
|
||||
|
||||
desktopDisplaySizeForToken(token = "") {
|
||||
const key = String(token || "").trim();
|
||||
const size = key ? this._desktopDisplaySizes?.[key] : null;
|
||||
const width = Math.round(Number(size?.width || 0));
|
||||
const height = Math.round(Number(size?.height || 0));
|
||||
return width > 0 && height > 0 ? { width, height } : null;
|
||||
},
|
||||
|
||||
rememberDesktopDisplaySize(token = "", width = 0, height = 0) {
|
||||
const key = String(token || "").trim();
|
||||
const normalizedWidth = Math.round(Number(width || 0));
|
||||
const normalizedHeight = Math.round(Number(height || 0));
|
||||
if (!key || normalizedWidth <= 0 || normalizedHeight <= 0) return null;
|
||||
this._desktopDisplaySizes = {
|
||||
...(this._desktopDisplaySizes || {}),
|
||||
[key]: { width: normalizedWidth, height: normalizedHeight },
|
||||
};
|
||||
return this._desktopDisplaySizes[key];
|
||||
},
|
||||
|
||||
installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container) {
|
||||
if (!frame || !remoteWindow || !remoteDocument || !client) return null;
|
||||
const store = this;
|
||||
|
|
@ -1584,8 +1609,10 @@ const model = {
|
|||
const metrics = () => {
|
||||
const desktopWidth = Math.max(1, finite(client.desktop_width || container?.clientWidth || remoteWindow.innerWidth, 1));
|
||||
const desktopHeight = Math.max(1, finite(client.desktop_height || container?.clientHeight || remoteWindow.innerHeight, 1));
|
||||
const clientWidth = Math.max(1, finite(container?.clientWidth || remoteWindow.innerWidth, desktopWidth));
|
||||
const clientHeight = Math.max(1, finite(container?.clientHeight || remoteWindow.innerHeight, desktopHeight));
|
||||
const primaryWindow = Object.values(client.id_to_window || {})[0];
|
||||
const canvas = primaryWindow?.canvas;
|
||||
const clientWidth = Math.max(1, finite(canvas?.clientWidth || canvas?.width || container?.clientWidth || remoteWindow.innerWidth, desktopWidth));
|
||||
const clientHeight = Math.max(1, finite(canvas?.clientHeight || canvas?.height || container?.clientHeight || remoteWindow.innerHeight, desktopHeight));
|
||||
return {
|
||||
desktopWidth,
|
||||
desktopHeight,
|
||||
|
|
@ -1683,8 +1710,10 @@ const model = {
|
|||
},
|
||||
|
||||
fitXpraDesktopWindowElement(xpraWindow, width, height) {
|
||||
const cssWidth = `${Math.max(1, Number(width || 0))}px`;
|
||||
const cssHeight = `${Math.max(1, Number(height || 0))}px`;
|
||||
const normalizedWidth = Math.max(1, Math.round(Number(width || 0)));
|
||||
const normalizedHeight = Math.max(1, Math.round(Number(height || 0)));
|
||||
const cssWidth = `${normalizedWidth}px`;
|
||||
const cssHeight = `${normalizedHeight}px`;
|
||||
const windowElement = xpraWindow?.div;
|
||||
const canvas = xpraWindow?.canvas;
|
||||
windowElement?.style?.setProperty("left", "0px", "important");
|
||||
|
|
@ -1698,6 +1727,12 @@ const model = {
|
|||
canvas?.style?.setProperty("height", cssHeight, "important");
|
||||
canvas?.style?.setProperty("display", "block", "important");
|
||||
canvas?.style?.setProperty("margin", "0", "important");
|
||||
if (canvas) {
|
||||
if (canvas.width !== normalizedWidth) canvas.width = normalizedWidth;
|
||||
if (canvas.height !== normalizedHeight) canvas.height = normalizedHeight;
|
||||
canvas.setAttribute("width", String(normalizedWidth));
|
||||
canvas.setAttribute("height", String(normalizedHeight));
|
||||
}
|
||||
},
|
||||
|
||||
installXpraDesktopWheelBridge(remoteWindow, xpraWindow) {
|
||||
|
|
@ -2139,6 +2174,11 @@ const model = {
|
|||
const response = await fetch(`/desktop/resize?${params.toString()}`, { credentials: "same-origin" });
|
||||
if (response.ok) {
|
||||
const result = await response.json().catch(() => ({}));
|
||||
const displaySize = this.rememberDesktopDisplaySize(
|
||||
token,
|
||||
result?.width || width,
|
||||
result?.height || height,
|
||||
);
|
||||
this._desktopResizeKey = key;
|
||||
const activeFrame = this.desktopFrame(frame);
|
||||
const activeTarget = activeFrame?.parentElement || activeFrame;
|
||||
|
|
@ -2153,7 +2193,7 @@ const model = {
|
|||
}
|
||||
}
|
||||
if (result?.reload) this.reloadDesktopFrame(activeFrame || frame);
|
||||
this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame });
|
||||
this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame, token, displaySize });
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn("Desktop resize skipped", error);
|
||||
|
|
|
|||
|
|
@ -2477,7 +2477,7 @@ async def test_browser_runtime_remounts_initial_changed_viewport():
|
|||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeypatch, tmp_path):
|
||||
async def test_browser_runtime_screenshot_file_defaults_to_chat_scoped_artifact(monkeypatch, tmp_path):
|
||||
screenshot_calls = []
|
||||
|
||||
def fake_get_abs_path(*parts):
|
||||
|
|
@ -2512,15 +2512,15 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
|
|||
|
||||
result = await core.screenshot_file(5, quality=500)
|
||||
|
||||
assert "path" not in result
|
||||
assert "a0_path" not in result
|
||||
assert Path(result["path"]).read_bytes() == b"image-bytes"
|
||||
assert result["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/browser/browser-5-")
|
||||
assert result["context_id"] == "ctx/id"
|
||||
assert result["mime"] == "image/jpeg"
|
||||
assert result["ephemeral"] is True
|
||||
assert result["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX)
|
||||
assert result["ephemeral"] is False
|
||||
assert result["chat_scoped"] is True
|
||||
assert result["vision_load"] == {
|
||||
"tool_name": "vision_load",
|
||||
"tool_args": {"paths": [result["ephemeral_ref"]]},
|
||||
"tool_args": {"paths": [result["a0_path"]]},
|
||||
}
|
||||
assert "image" not in result
|
||||
assert not list((tmp_path / "tmp" / "browser" / "screenshots").rglob("*.jpg"))
|
||||
|
|
@ -2528,7 +2528,6 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
|
|||
assert screenshot_calls[-1]["quality"] == 95
|
||||
assert screenshot_calls[-1]["full_page"] is False
|
||||
assert "path" not in screenshot_calls[-1]
|
||||
assert ephemeral_images.consume_image(result["ephemeral_ref"], context_id="ctx/id").data_url == "data:image/jpeg;base64,aW1hZ2UtYnl0ZXM="
|
||||
|
||||
png_path = tmp_path / "custom.png"
|
||||
png_result = await core.screenshot_file(5, quality=1, full_page=True, path=str(png_path))
|
||||
|
|
@ -2543,9 +2542,27 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
|
|||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
|
||||
async def test_vision_load_materializes_ephemeral_browser_refs(monkeypatch, tmp_path):
|
||||
monkeypatch.setitem(sys.modules, "helpers.tool", SimpleNamespace(Response=_TestResponse, Tool=_TestTool))
|
||||
history_stub = ModuleType("helpers.history")
|
||||
|
||||
class _RawMessage(dict):
|
||||
def __init__(self, raw_content, preview):
|
||||
super().__init__(raw_content=raw_content, preview=preview)
|
||||
|
||||
history_stub.RawMessage = _RawMessage
|
||||
monkeypatch.setitem(sys.modules, "helpers.history", history_stub)
|
||||
monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
|
||||
import tools.vision_load as vision_load_module
|
||||
|
||||
def fake_get_abs_path(*parts):
|
||||
return str(tmp_path.joinpath(*parts))
|
||||
|
||||
def fake_normalize_a0_path(path):
|
||||
return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
|
||||
|
||||
monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path)
|
||||
monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
|
||||
monkeypatch.setattr(
|
||||
vision_load_module.plugins,
|
||||
"get_plugin_config",
|
||||
|
|
@ -2561,7 +2578,7 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
|
|||
hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)),
|
||||
hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)),
|
||||
)
|
||||
ref = ephemeral_images.put_image(
|
||||
ref = vision_load_module.ephemeral_images.put_image(
|
||||
context_id="ctx-vision",
|
||||
mime="image/jpeg",
|
||||
data=SMALL_JPEG_10X10,
|
||||
|
|
@ -2580,10 +2597,13 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
|
|||
response = await tool.execute(paths=[ref])
|
||||
await tool.after_execution(response)
|
||||
|
||||
assert ephemeral_images.get_image(ref, context_id="ctx-vision") is None
|
||||
assert vision_load_module.ephemeral_images.get_image(ref, context_id="ctx-vision") is None
|
||||
assert tool.loaded_paths == ["browser-shot.jpg"]
|
||||
raw_message = messages[0][1]["content"]
|
||||
assert raw_message.raw_content[0]["image_url"]["url"] == f"data:image/jpeg;base64,{SMALL_JPEG_10X10}"
|
||||
stored_ref = raw_message["raw_content"][0]["image_url"]["url"]
|
||||
assert stored_ref.startswith("/a0/usr/chats/ctx-vision/screenshots/browser/browser-shot-")
|
||||
stored_path = tmp_path / stored_ref.removeprefix("/a0/")
|
||||
assert stored_path.read_bytes() == __import__("base64").b64decode(SMALL_JPEG_10X10)
|
||||
assert updates[-1]["result"] == "1 images loaded, 0 skipped"
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -12,8 +12,8 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
|||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from helpers import ephemeral_images
|
||||
from plugins._a0_connector.helpers import ws_runtime
|
||||
from plugins._browser.helpers import connector_runtime as connector_runtime_module
|
||||
from plugins._browser.helpers.connector_runtime import (
|
||||
ConnectorBrowserRuntime,
|
||||
_agent_uses_local_chat_model,
|
||||
|
|
@ -330,7 +330,15 @@ def test_connector_runtime_adds_docker_recovery_to_host_errors():
|
|||
assert "/browser container" in message
|
||||
|
||||
|
||||
def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path):
|
||||
def test_host_browser_artifacts_become_chat_scoped_files(monkeypatch, tmp_path):
|
||||
def fake_get_abs_path(*parts):
|
||||
return str(tmp_path.joinpath(*parts))
|
||||
|
||||
def fake_normalize_a0_path(path):
|
||||
return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
|
||||
|
||||
monkeypatch.setattr(connector_runtime_module.chat_media.files, "get_abs_path", fake_get_abs_path)
|
||||
monkeypatch.setattr(connector_runtime_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
|
||||
runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host"))
|
||||
|
||||
result = runtime._materialize_artifact(
|
||||
|
|
@ -352,19 +360,15 @@ def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path):
|
|||
|
||||
inner = result[0]["result"]
|
||||
assert "artifact" not in inner
|
||||
assert "path" not in inner
|
||||
assert "a0_path" not in inner
|
||||
assert Path(inner["path"]).read_bytes() == b"fake"
|
||||
assert inner["a0_path"].startswith("/a0/usr/chats/ctx-host/screenshots/browser/shot-")
|
||||
assert inner["context_id"] == "ctx-host"
|
||||
assert inner["ephemeral"] is True
|
||||
assert inner["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX)
|
||||
assert inner["vision_load"]["tool_args"]["paths"] == [inner["ephemeral_ref"]]
|
||||
assert ephemeral_images.consume_image(inner["ephemeral_ref"], context_id="ctx-host").data_url == "data:image/jpeg;base64,ZmFrZQ=="
|
||||
assert not list(tmp_path.rglob("shot.jpg"))
|
||||
assert inner["ephemeral"] is False
|
||||
assert inner["chat_scoped"] is True
|
||||
assert inner["vision_load"]["tool_args"]["paths"] == [inner["a0_path"]]
|
||||
|
||||
|
||||
def test_host_browser_artifact_materialization_rejects_oversized_payload(monkeypatch, tmp_path):
|
||||
import plugins._browser.helpers.connector_runtime as connector_runtime_module
|
||||
|
||||
monkeypatch.setattr(connector_runtime_module, "MAX_ARTIFACT_SIZE_BYTES", 2)
|
||||
runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host"))
|
||||
|
||||
|
|
|
|||
|
|
@ -264,6 +264,14 @@ def test_desktop_plugin_owns_routes_runtime_surface_and_state_paths():
|
|||
assert "DESKTOP_RUNTIME_INSTALL_MESSAGE" in desktop_store
|
||||
assert "openDesktopWhenRuntimeReady" in desktop_store
|
||||
assert "isDesktopRuntimeInstalling" in desktop_store
|
||||
assert "_desktopDisplaySizes: {}" in desktop_store
|
||||
assert "desktopDisplaySizeForToken(token" in desktop_store
|
||||
assert "rememberDesktopDisplaySize(token" in desktop_store
|
||||
assert "options.displaySize || this.desktopDisplaySizeForToken(token)" in desktop_store
|
||||
assert "result?.width || width" in desktop_store
|
||||
assert "canvas.width = normalizedWidth" in desktop_store
|
||||
assert "canvas.height = normalizedHeight" in desktop_store
|
||||
assert "canvas?.clientWidth || canvas?.width" in desktop_store
|
||||
assert "Installing Agent Zero Desktop runtime dependencies" in desktop_session
|
||||
assert "__a0XpraOffsetWarnPatched" in desktop_store
|
||||
assert "window does not fit in canvas, offsets" in desktop_store
|
||||
|
|
|
|||
|
|
@ -191,7 +191,8 @@ def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp
|
|||
|
||||
|
||||
def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path)
|
||||
monkeypatch.setattr(desktop_state, "BASE_DIR", tmp_path)
|
||||
monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path / "tmp" / "desktop" / "screenshots")
|
||||
capabilities = {"xwd": "/usr/bin/xwd"}
|
||||
env = {"DISPLAY": ":120"}
|
||||
|
||||
|
|
@ -222,7 +223,7 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp
|
|||
monkeypatch.setattr(desktop_state, "run", fake_run)
|
||||
monkeypatch.setitem(sys.modules, "PIL", pil_module)
|
||||
monkeypatch.setitem(sys.modules, "PIL.Image", image_module)
|
||||
stale_path = tmp_path / "ctx_id" / "stale.png"
|
||||
stale_path = tmp_path / "tmp" / "desktop" / "screenshots" / "ctx_id" / "stale.png"
|
||||
stale_path.parent.mkdir(parents=True)
|
||||
stale_path.write_bytes(b"stale")
|
||||
|
||||
|
|
@ -236,12 +237,14 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp
|
|||
|
||||
path = Path(screenshot["path"])
|
||||
assert screenshot["ok"] is True
|
||||
assert screenshot["ephemeral"] is True
|
||||
assert screenshot["ephemeral"] is False
|
||||
assert screenshot["chat_scoped"] is True
|
||||
assert screenshot["context_id"] == "ctx_id"
|
||||
assert path.parent == tmp_path / "ctx_id"
|
||||
assert screenshot["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/desktop/desktop-")
|
||||
assert path.parent == tmp_path / "usr" / "chats" / "ctx_id" / "screenshots" / "desktop"
|
||||
assert path.name.startswith("desktop-")
|
||||
assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path)
|
||||
assert not stale_path.exists()
|
||||
assert stale_path.exists()
|
||||
|
||||
|
||||
def test_desktop_state_default_screenshot_returns_ephemeral_ref(tmp_path, monkeypatch):
|
||||
|
|
|
|||
|
|
@ -699,3 +699,35 @@ def test_computer_use_remote_start_session_reports_backend_features_and_windows_
|
|||
assert "backend=windows/windows" in message
|
||||
assert "features=uia-tree-snapshot, uia-structural-targeting" in message
|
||||
assert "host-computer-use-windows" in message
|
||||
|
||||
|
||||
def test_computer_use_remote_capture_artifact_is_chat_scoped(monkeypatch, tmp_path: Path):
|
||||
module = _load_computer_use_remote_tool(monkeypatch)
|
||||
|
||||
def fake_get_abs_path(*parts):
|
||||
return str(tmp_path.joinpath(*parts))
|
||||
|
||||
def fake_normalize_a0_path(path):
|
||||
return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
|
||||
|
||||
monkeypatch.setattr(module.chat_media.files, "get_abs_path", fake_get_abs_path)
|
||||
monkeypatch.setattr(module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
|
||||
|
||||
tool = object.__new__(module.ComputerUseRemote)
|
||||
tool.agent = types.SimpleNamespace(context=types.SimpleNamespace(id="ctx-computer"))
|
||||
|
||||
display_ref, capture_id = tool._resolve_capture_ref(
|
||||
{
|
||||
"artifact": {
|
||||
"filename": "capture.png",
|
||||
"mime": "image/png",
|
||||
"encoding": "base64",
|
||||
"data": "ZmFrZQ==",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
assert display_ref.startswith("/a0/usr/chats/ctx-computer/screenshots/computer-use/capture-")
|
||||
stored_path = tmp_path / display_ref.removeprefix("/a0/")
|
||||
assert stored_path.read_bytes() == b"fake"
|
||||
assert capture_id == stored_path.stem
|
||||
|
|
|
|||
123
tests/test_vision_load_image_refs.py
Normal file
123
tests/test_vision_load_image_refs.py
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
import types
|
||||
from types import SimpleNamespace
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from helpers import images
|
||||
|
||||
|
||||
class _TestResponse(SimpleNamespace):
|
||||
def __init__(self, message="", break_loop=False, **kwargs):
|
||||
super().__init__(message=message, break_loop=break_loop, **kwargs)
|
||||
|
||||
|
||||
class _TestTool:
|
||||
def __init__(
|
||||
self,
|
||||
agent=None,
|
||||
name="",
|
||||
method=None,
|
||||
args=None,
|
||||
message="",
|
||||
loop_data=None,
|
||||
**kwargs,
|
||||
):
|
||||
self.agent = agent
|
||||
self.name = name
|
||||
self.method = method
|
||||
self.args = args or {}
|
||||
self.message = message
|
||||
self.loop_data = loop_data
|
||||
|
||||
|
||||
def _install_tool_stub(monkeypatch):
|
||||
tool_stub = types.ModuleType("helpers.tool")
|
||||
tool_stub.Response = _TestResponse
|
||||
tool_stub.Tool = _TestTool
|
||||
history_stub = types.ModuleType("helpers.history")
|
||||
|
||||
class _RawMessage(dict):
|
||||
def __init__(self, raw_content, preview):
|
||||
super().__init__(raw_content=raw_content, preview=preview)
|
||||
|
||||
history_stub.RawMessage = _RawMessage
|
||||
monkeypatch.setitem(sys.modules, "helpers.tool", tool_stub)
|
||||
monkeypatch.setitem(sys.modules, "helpers.history", history_stub)
|
||||
monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
|
||||
|
||||
|
||||
def test_prepare_content_keeps_missing_local_image_refs_strict():
|
||||
missing_path = "/tmp/a0-missing-desktop-screenshot.png"
|
||||
|
||||
with pytest.raises(FileNotFoundError):
|
||||
images.prepare_content(
|
||||
[{"type": "image_url", "image_url": {"url": missing_path}}]
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_vision_load_materializes_local_image_to_chat_artifact(monkeypatch, tmp_path):
|
||||
_install_tool_stub(monkeypatch)
|
||||
import tools.vision_load as vision_load_module
|
||||
|
||||
def fake_get_abs_path(*parts):
|
||||
return str(tmp_path.joinpath(*parts))
|
||||
|
||||
def fake_normalize_a0_path(path):
|
||||
return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
|
||||
|
||||
monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path)
|
||||
monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
|
||||
monkeypatch.setattr(
|
||||
vision_load_module.plugins,
|
||||
"get_plugin_config",
|
||||
lambda *args, **kwargs: {"chat_model": {"max_embeds": 10}},
|
||||
)
|
||||
|
||||
async def direct_call(func, *args, **kwargs):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(
|
||||
vision_load_module.runtime,
|
||||
"call_development_function",
|
||||
direct_call,
|
||||
)
|
||||
|
||||
image_path = tmp_path / "sample-image.png"
|
||||
image_path.write_bytes(b"png-data")
|
||||
|
||||
tool_results = []
|
||||
messages = []
|
||||
updates = []
|
||||
agent = SimpleNamespace(
|
||||
context=SimpleNamespace(id="ctx-vision"),
|
||||
agent_name="Agent 0",
|
||||
hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)),
|
||||
hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)),
|
||||
)
|
||||
tool = vision_load_module.VisionLoad(
|
||||
agent=agent,
|
||||
name="vision_load",
|
||||
method=None,
|
||||
args={"paths": [str(image_path)]},
|
||||
message="",
|
||||
loop_data=None,
|
||||
)
|
||||
tool.log = SimpleNamespace(id="vision-log", update=lambda **kwargs: updates.append(kwargs))
|
||||
|
||||
response = await tool.execute(paths=[str(image_path)])
|
||||
image_path.unlink()
|
||||
await tool.after_execution(response)
|
||||
|
||||
raw_message = messages[0][1]["content"]
|
||||
stored_ref = raw_message["raw_content"][0]["image_url"]["url"]
|
||||
assert stored_ref.startswith("/a0/usr/chats/ctx-vision/images/vision-load/sample-image-")
|
||||
stored_path = tmp_path / stored_ref.removeprefix("/a0/")
|
||||
assert stored_path.read_bytes() == b"png-data"
|
||||
assert updates[-1]["result"] == "1 images loaded, 0 skipped"
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
from helpers.print_style import PrintStyle
|
||||
from helpers.tool import Tool, Response
|
||||
from helpers import runtime, files, plugins, ephemeral_images
|
||||
from helpers import runtime, files, plugins, ephemeral_images, images, chat_media
|
||||
from mimetypes import guess_type
|
||||
from helpers import history
|
||||
|
||||
|
|
@ -27,7 +27,7 @@ class VisionLoad(Tool):
|
|||
else []
|
||||
)
|
||||
|
||||
for path, display_path in limited_paths:
|
||||
for idx, (path, display_path) in enumerate(limited_paths):
|
||||
if not path:
|
||||
continue
|
||||
if ephemeral_images.is_ref(path):
|
||||
|
|
@ -38,12 +38,16 @@ class VisionLoad(Tool):
|
|||
if image is None:
|
||||
continue
|
||||
display = image.display_name or display_path
|
||||
self.images_dict[display] = image.data_url
|
||||
self.loaded_paths.append(display)
|
||||
stored_ref = self._store_ephemeral_image(image)
|
||||
if stored_ref:
|
||||
self.images_dict[display] = stored_ref
|
||||
self.loaded_paths.append(display)
|
||||
continue
|
||||
if self._is_data_image_url(path):
|
||||
self.images_dict[display_path] = path
|
||||
self.loaded_paths.append(display_path)
|
||||
stored_ref = self._store_data_url(path, preferred_name=f"vision-load-{idx + 1}.png")
|
||||
if stored_ref:
|
||||
self.images_dict[display_path] = stored_ref
|
||||
self.loaded_paths.append(display_path)
|
||||
continue
|
||||
if not await runtime.call_development_function(files.exists, str(path)):
|
||||
continue
|
||||
|
|
@ -51,8 +55,12 @@ class VisionLoad(Tool):
|
|||
if path not in self.images_dict:
|
||||
mime_type, _ = guess_type(str(path))
|
||||
if mime_type and mime_type.startswith("image/"):
|
||||
self.images_dict[display_path] = str(path)
|
||||
self.loaded_paths.append(display_path)
|
||||
try:
|
||||
stored_ref = self._store_local_image(path, preferred_name=files.basename(path))
|
||||
self.images_dict[display_path] = stored_ref
|
||||
self.loaded_paths.append(display_path)
|
||||
except (FileNotFoundError, OSError, ValueError):
|
||||
continue
|
||||
|
||||
return Response(message="dummy", break_loop=False)
|
||||
|
||||
|
|
@ -65,6 +73,48 @@ class VisionLoad(Tool):
|
|||
def _context_id(self) -> str:
|
||||
return str(getattr(getattr(self.agent, "context", None), "id", "") or "").strip()
|
||||
|
||||
def _store_ephemeral_image(self, image: ephemeral_images.EphemeralImage) -> str:
|
||||
context_id = self._context_id()
|
||||
if not context_id:
|
||||
return image.data_url
|
||||
source = chat_media.infer_source(image.ref, image.display_name)
|
||||
category = chat_media.category_for_source(source)
|
||||
saved = chat_media.save_image_base64(
|
||||
context_id=context_id,
|
||||
data=image.data,
|
||||
mime_type=image.mime,
|
||||
category=category,
|
||||
source=source,
|
||||
preferred_name=image.display_name,
|
||||
)
|
||||
return saved.a0_path
|
||||
|
||||
def _store_data_url(self, data_url: str, *, preferred_name: str = "") -> str:
|
||||
context_id = self._context_id()
|
||||
if not context_id:
|
||||
return data_url
|
||||
source = chat_media.infer_source(data_url, preferred_name)
|
||||
category = chat_media.category_for_source(source)
|
||||
saved = chat_media.save_image_data_url(
|
||||
context_id=context_id,
|
||||
data_url=data_url,
|
||||
category=category,
|
||||
source=source,
|
||||
preferred_name=preferred_name,
|
||||
)
|
||||
return saved.a0_path
|
||||
|
||||
def _store_local_image(self, path: str, *, preferred_name: str = "") -> str:
|
||||
context_id = self._context_id()
|
||||
if not context_id:
|
||||
return images.to_data_url(path)
|
||||
return chat_media.materialize_image_ref(
|
||||
context_id=context_id,
|
||||
url=path,
|
||||
source=chat_media.infer_source(path, preferred_name),
|
||||
preferred_name=preferred_name,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _is_data_image_url(value: str) -> bool:
|
||||
normalized = str(value or "").strip().lower()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue