Fix durable screenshot artifacts and Xpra sizing

Materialize browser, desktop, computer-use, and vision-load screenshots into chat-scoped artifacts so historical image refs survive temporary screenshot pruning.

Keep history serialization free of rescue assumptions, document durable screenshot behavior in tool prompts/skills, and size Xpra canvases from backend-normalized display dimensions to prevent stretched desktop views.

Verified with focused pytest coverage plus live Docker checks for browser screenshot persistence and Xpra canvas dimensions.
This commit is contained in:
Alessandro 2026-05-30 17:45:19 +02:00
parent 45e4bd892c
commit edd58a42d2
18 changed files with 688 additions and 81 deletions

244
helpers/chat_media.py Normal file
View file

@ -0,0 +1,244 @@
from __future__ import annotations
import time
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from helpers import files, media_artifacts
# Default size cap for saved images; reuses the generic media-artifact limit.
DEFAULT_MAX_IMAGE_BYTES = media_artifacts.DEFAULT_MAX_ARTIFACT_SIZE_BYTES
# Category folder (below the chat's directory) that an image is filed into.
ImageCategory = Literal["images", "screenshots"]
@dataclass(frozen=True)
class ChatImage:
    """Immutable description of an image stored in a chat's artifact folder."""

    # Filesystem path of the written file.
    path: str
    # The same location as returned by ``files.normalize_a0_path``.
    a0_path: str
    # Normalized image MIME type.
    mime: str
    # Size of the stored payload in bytes.
    size: int
def screenshot_dir(context_id: str, source: str) -> Path:
    """Return the chat-scoped screenshot folder for ``source``.

    Shorthand for :func:`artifact_dir` with the category pinned to
    ``"screenshots"``.
    """
    location = artifact_dir(context_id, source=source, category="screenshots")
    return location
def artifact_dir(
    context_id: str,
    *,
    category: ImageCategory = "images",
    source: str = "vision-load",
) -> Path:
    """Build the artifact folder path for a chat, category and source.

    Every component is passed through ``files.safe_file_name`` and stripped of
    leading/trailing dots and underscores. A component that ends up empty falls
    back to ``"default"``, ``"images"`` or ``"vision-load"`` respectively. The
    path is only computed here; nothing is created on disk.
    """

    def _clean(raw: str, fallback: str) -> str:
        return files.safe_file_name(raw).strip("._") or fallback

    chat_part = _clean(str(context_id or "default"), "default")
    category_part = _clean(category, "images")
    source_part = _clean(source, "vision-load")
    chat_root = Path(files.get_abs_path("usr/chats", chat_part))
    return chat_root / category_part / source_part
def save_image_bytes(
    *,
    context_id: str,
    payload: bytes,
    mime_type: str = "image/png",
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Write raw image bytes into the chat's artifact folder.

    The file is named ``<stem>-<YYYYmmdd-HHMMSS>-<8 hex chars><suffix>``, where
    stem and suffix come from ``preferred_name`` (falling back to ``source``
    and the extension guessed from the MIME type).

    Raises:
        media_artifacts.EmptyBase64Data: if ``payload`` is empty.
        media_artifacts.ArtifactTooLarge: if ``max_bytes`` is not ``None`` and
            the payload is larger than it.
    """
    blob = bytes(payload or b"")
    if not blob:
        raise media_artifacts.EmptyBase64Data("image payload is empty")
    byte_count = len(blob)
    if max_bytes is not None and byte_count > max_bytes:
        raise media_artifacts.ArtifactTooLarge(byte_count, max_bytes)

    mime = media_artifacts.normalize_mime(
        mime_type,
        default="image/png",
        required_prefix="image/",
    )
    fallback_extension = media_artifacts.guess_extension(mime, ".png")
    fallback_name = f"{source or 'image'}{fallback_extension}"
    chosen = Path(
        media_artifacts.safe_filename(
            preferred_name,
            default=fallback_name,
            default_extension=fallback_extension,
        )
    )
    stem = chosen.stem or Path(fallback_name).stem or "image"
    suffix = chosen.suffix or fallback_extension

    # Timestamp plus a short random token keeps repeated captures with the
    # same preferred name from overwriting each other.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    folder = artifact_dir(context_id, category=category, source=source)
    destination = folder / f"{stem}-{timestamp}-{uuid.uuid4().hex[:8]}{suffix}"
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(blob)

    stored_path = str(destination)
    return ChatImage(
        path=stored_path,
        a0_path=files.normalize_a0_path(stored_path),
        mime=mime,
        size=byte_count,
    )
def save_image_base64(
    *,
    context_id: str,
    data: str,
    mime_type: str = "image/png",
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Decode a base64 image and store it through :func:`save_image_bytes`.

    Decoding is delegated to ``media_artifacts.decode_base64_payload`` with the
    same ``max_bytes`` limit; all other arguments are forwarded unchanged.
    """
    decoded = media_artifacts.decode_base64_payload(data, max_bytes=max_bytes)
    forwarded = {
        "context_id": context_id,
        "payload": decoded.payload,
        "mime_type": mime_type,
        "category": category,
        "source": source,
        "preferred_name": preferred_name,
        "max_bytes": max_bytes,
    }
    return save_image_bytes(**forwarded)
def save_image_file(
    *,
    context_id: str,
    path: str | Path,
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Copy an existing image file into the chat's artifact folder.

    The MIME type is guessed from the file name (``image/png`` when unknown)
    and the stored name defaults to the source file's name.

    Raises:
        OSError: if the file cannot be inspected or read (for example
            ``FileNotFoundError``).
        media_artifacts.ArtifactTooLarge: if ``max_bytes`` is not ``None`` and
            the file is larger than it.
    """
    image_path = Path(path)
    if max_bytes is not None:
        # Reject oversized files from their on-disk size so a huge file is
        # never loaded into memory just to be refused afterwards.
        size_on_disk = image_path.stat().st_size
        if size_on_disk > max_bytes:
            raise media_artifacts.ArtifactTooLarge(size_on_disk, max_bytes)
    payload = image_path.read_bytes()
    mime = media_artifacts.normalize_mime(
        _guess_image_mime(image_path),
        default="image/png",
        required_prefix="image/",
    )
    return save_image_bytes(
        context_id=context_id,
        payload=payload,
        mime_type=mime,
        category=category,
        source=source,
        preferred_name=preferred_name or image_path.name,
        max_bytes=max_bytes,
    )
def save_image_data_url(
    *,
    context_id: str,
    data_url: str,
    category: ImageCategory = "images",
    source: str = "vision-load",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> ChatImage:
    """Store the image carried by a ``data:image/*;base64,...`` URL.

    Raises:
        ValueError: if ``data_url`` is not a base64 image data URL.
    """
    header, encoded = _split_image_data_url(data_url)
    # The URL is validated case-insensitively, so drop the scheme by position
    # rather than by a case-sensitive "data:" prefix; otherwise "DATA:image/jpeg"
    # would leak the scheme into the MIME type.
    media_type = header.partition(":")[2]
    mime = media_type.split(";", 1)[0] or "image/png"
    return save_image_base64(
        context_id=context_id,
        data=encoded,
        mime_type=mime,
        category=category,
        source=source,
        preferred_name=preferred_name,
        max_bytes=max_bytes,
    )
def materialize_image_ref(
    *,
    context_id: str,
    url: str,
    source: str = "",
    preferred_name: str = "",
    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
) -> str:
    """Turn an image reference into a chat-scoped artifact path.

    Base64 image data URLs are decoded and saved. Any other reference is
    resolved with ``images.resolve_ref`` and copied into the chat's folder,
    unless it already lives there. A blank ``url`` or ``context_id`` returns
    the stripped reference untouched. When ``source`` is empty it is inferred
    from the reference and ``preferred_name``.
    """
    ref = str(url or "").strip()
    if not (ref and str(context_id or "").strip()):
        return ref

    origin = source or infer_source(ref, preferred_name)
    shared = {
        "context_id": context_id,
        "category": category_for_source(origin),
        "source": origin,
        "max_bytes": max_bytes,
    }

    if _is_data_image_url(ref):
        stored = save_image_data_url(
            data_url=ref,
            preferred_name=preferred_name,
            **shared,
        )
        return stored.a0_path

    # Imported at call time, as in the original module layout.
    from helpers import images

    resolved = images.resolve_ref(ref)
    if is_chat_scoped_path(context_id=context_id, path=resolved):
        return files.normalize_a0_path(str(resolved))

    stored = save_image_file(
        path=resolved,
        preferred_name=preferred_name or resolved.name,
        **shared,
    )
    return stored.a0_path
def is_chat_scoped_path(*, context_id: str, path: str | Path) -> bool:
    """Tell whether ``path`` is the chat's root folder or lies beneath it.

    Returns ``False`` for a blank ``context_id`` and when resolving either
    path raises ``OSError``.
    """
    if not str(context_id or "").strip():
        return False
    try:
        candidate = Path(path).resolve(strict=False)
        # artifact_dir() yields <chat root>/<category>/<source>; two levels up
        # is the chat root itself.
        leaf = artifact_dir(context_id, category="images", source="vision-load")
        chat_root = leaf.parents[1].resolve(strict=False)
    except OSError:
        return False
    return chat_root in (candidate, *candidate.parents)
def infer_source(value: str = "", preferred_name: str = "") -> str:
    """Guess which tool produced an image from its reference and name.

    The reference and preferred name are matched case-insensitively against
    known markers. Computer-use markers win over desktop markers, which win
    over browser markers; with no match the source is ``"vision-load"``.
    """
    haystack = f"{value or ''} {preferred_name or ''}".lower()
    # Ordered: the first source with any matching marker is returned.
    rules = (
        ("computer-use", ("computer-use", "computer_use", "_a0_connector/computer_use")),
        ("desktop", ("/desktop/screenshots/", "\\desktop\\screenshots\\", "desktop-")),
        (
            "browser",
            ("/browser/screenshots/", "\\browser\\screenshots\\", "host-browser", "browser-"),
        ),
    )
    for label, markers in rules:
        if any(marker in haystack for marker in markers):
            return label
    return "vision-load"
def category_for_source(source: str) -> ImageCategory:
    """Map a source name to its category folder.

    Desktop, browser and computer-use images are screenshots; every other
    source is filed under ``"images"``.
    """
    screenshot_sources = {"browser", "computer-use", "desktop"}
    if source in screenshot_sources:
        return "screenshots"
    return "images"
def _guess_image_mime(path: Path) -> str:
    """Guess a MIME type from the file name, defaulting to ``image/png``."""
    import mimetypes

    guessed, _encoding = mimetypes.guess_type(path.name)
    return guessed if guessed else "image/png"
def _is_data_image_url(value: str) -> bool:
    """Return True for a ``data:image/...;base64,`` URL (case-insensitive)."""
    text = str(value or "").strip().lower()
    if not text.startswith("data:image/"):
        return False
    return ";base64," in text
def _split_image_data_url(data_url: str) -> tuple[str, str]:
    """Split a base64 image data URL into ``(header, encoded_payload)``.

    The header is everything before the first comma (for example
    ``data:image/png;base64``); the payload is everything after it.

    Raises:
        ValueError: if ``data_url`` is not a ``data:image/*;base64,`` URL.
    """
    value = str(data_url or "").strip()
    # A valid URL contains ";base64,", so a comma is guaranteed to be present.
    if not _is_data_image_url(value):
        raise ValueError("image data URL must be data:image/*;base64,...")
    header, _, encoded = value.partition(",")
    # Return a real tuple, as annotated, rather than the list from str.split.
    return header, encoded

View file

@ -6,7 +6,7 @@ from pathlib import Path
import uuid
from typing import Any
from helpers import history, media_artifacts
from helpers import chat_media, history, media_artifacts
from helpers.print_style import PrintStyle
from helpers.tool import Response, Tool
from helpers.ws import NAMESPACE
@ -744,7 +744,15 @@ class ComputerUseRemote(Tool):
except FileNotFoundError as exc:
path_error = exc
else:
return display_path, image_path.stem
saved = chat_media.save_image_file(
context_id=self.agent.context.id,
path=image_path,
category="screenshots",
source="computer-use",
preferred_name=Path(display_path).name or image_path.name,
max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES,
)
return saved.a0_path, Path(saved.path).stem
artifact = data.get("artifact")
if isinstance(artifact, dict) and str(artifact.get("encoding", "")).strip().lower() == "base64":
@ -764,7 +772,16 @@ class ComputerUseRemote(Tool):
default=f"computer-use-{uuid.uuid4().hex}.png",
default_extension=".png",
)
return f"data:{mime};base64,{encoded}", Path(filename).stem
saved = chat_media.save_image_base64(
context_id=self.agent.context.id,
data=encoded,
mime_type=mime,
category="screenshots",
source="computer-use",
preferred_name=filename,
max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES,
)
return saved.a0_path, Path(saved.path).stem
if path_error is not None:
raise path_error

View file

@ -9,7 +9,7 @@ from pathlib import Path
from typing import Any
from urllib.parse import urlparse
from helpers import ephemeral_images, media_artifacts
from helpers import chat_media, media_artifacts
try:
from helpers.ws import NAMESPACE
@ -451,12 +451,16 @@ class ConnectorBrowserRuntime:
default=f"host-browser-{uuid.uuid4().hex}.jpg",
default_extension=".jpg",
)
mime = str(artifact.get("mime") or result.get("mime") or "image/jpeg")
try:
ref = ephemeral_images.put_image(
saved = chat_media.save_image_base64(
context_id=self.context_id,
mime=str(artifact.get("mime") or result.get("mime") or "image/jpeg"),
data=data,
name=filename,
mime_type=mime,
category="screenshots",
source="browser",
preferred_name=filename,
max_bytes=MAX_ARTIFACT_SIZE_BYTES,
)
except Exception as exc:
raise RuntimeError("Host browser artifact could not be decoded.") from exc
@ -466,11 +470,14 @@ class ConnectorBrowserRuntime:
materialized.pop("a0_path", None)
materialized.pop("host_path", None)
materialized.setdefault("context_id", self.context_id)
materialized["ephemeral"] = True
materialized["ephemeral_ref"] = ref
materialized["path"] = saved.path
materialized["a0_path"] = saved.a0_path
materialized["mime"] = saved.mime
materialized["ephemeral"] = False
materialized["chat_scoped"] = True
materialized["vision_load"] = {
"tool_name": "vision_load",
"tool_args": {"paths": [ref]},
"tool_args": {"paths": [saved.a0_path]},
}
return materialized

View file

@ -15,7 +15,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Any
from helpers import ephemeral_images, files
from helpers import chat_media, files
from helpers.defer import DeferredTask
from helpers.errors import RepairableException
from helpers.print_style import PrintStyle
@ -1558,23 +1558,27 @@ class _BrowserRuntimeCore:
quality=max(20, min(95, int(quality))),
full_page=bool(full_page),
)
ref = ephemeral_images.put_image_bytes(
saved = chat_media.save_image_bytes(
context_id=self.context_id,
mime="image/jpeg",
payload=image,
name=f"browser-{resolved_id}.jpg",
mime_type="image/jpeg",
category="screenshots",
source="browser",
preferred_name=f"browser-{resolved_id}.jpg",
)
return {
"browser_id": resolved_id,
"context_id": self.context_id,
"path": saved.path,
"a0_path": saved.a0_path,
"mime": "image/jpeg",
"ephemeral": True,
"ephemeral_ref": ref,
"ephemeral": False,
"chat_scoped": True,
"state": await self._state(resolved_id),
"vision_load": {
"tool_name": "vision_load",
"tool_args": {
"paths": [ref],
"paths": [saved.a0_path],
},
},
}

View file

@ -20,7 +20,7 @@ Workflow:
- For same-page controls that are easier to identify structurally, `click`, `type`, `submit`, `type_submit`, `scroll`, `select_option`, `set_checked`, and `upload_file` may use `selector` instead of `ref`; the tool resolves the selector through `content` first.
- `click` with `x`/`y` and no `ref` is treated as a coordinate mouse click. `type` with text and no `ref` types into the currently focused element. `key_chord` accepts either `["Control", "A"]` or `"CTRL+A"`.
- `navigate` reuses an existing `browser_id` and is preferred for serial browsing.
- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are ephemeral refs rather than conserved files.
- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are saved as chat-scoped artifacts; explicit `path` requests remain user-owned files.
- Keep the tab set small; close pages after extracting what you need.
`multi` is only a browser action: use `tool_name: "browser"` with `tool_args.action: "multi"`. Never use `tool_name: "multi"`.

View file

@ -32,7 +32,7 @@ Screenshots are explicit only; the browser does not automatically load images in
2. Call `vision_load` with the returned `vision_load.tool_args.paths` value.
3. Reason from the latest loaded screenshot.
Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is an ephemeral ref consumed by `vision_load`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used.
Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is saved as a chat-scoped artifact and returned through `vision_load.tool_args.paths`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used.
## Forms And Files

View file

@ -11,7 +11,7 @@ Start with `browser:content` to capture current refs, then use `browser:detail`
Use `select_option`, `set_checked`, `upload_file`, `type`, `type_submit`, and `submit` for form interaction. Use coordinates only when no stable ref exists or the UI is intentionally canvas-like.
Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return ephemeral refs for `vision_load`.
Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return chat-scoped artifact paths for `vision_load`.
Verify after submission with `browser:content`, `browser:state`, or another explicit `browser:screenshot` plus `vision_load`.

View file

@ -37,6 +37,19 @@ def context_screenshot_dir(context_id: str = "") -> Path:
return SCREENSHOT_DIR / _safe_context_id(context_id)
def chat_screenshot_dir(context_id: str = "") -> Path:
    """Return ``<BASE_DIR>/usr/chats/<safe context id>/screenshots/desktop``."""
    chat_root = BASE_DIR / "usr" / "chats" / _safe_context_id(context_id)
    return chat_root / "screenshots" / "desktop"
def normalize_a0_path(path: str | Path) -> str:
    """Express a path under ``BASE_DIR`` as ``/a0/<relative posix path>``.

    Paths that do not resolve to a location under ``BASE_DIR`` are returned
    unchanged (as a string).
    """
    candidate = Path(path)
    try:
        inside = candidate.resolve(strict=False).relative_to(BASE_DIR.resolve(strict=False))
    except ValueError:
        # Not below BASE_DIR: no /a0 form exists for this path.
        return str(candidate)
    return f"/a0/{inside.as_posix()}"
def _safe_context_id(context_id: str = "") -> str:
    """Return a filesystem-safe context id.

    Falls back to the ``A0_DESKTOP_CONTEXT_ID`` environment variable and then
    to ``"default"``. Matches of ``_SAFE_CONTEXT_RE`` are replaced with ``_``
    and leading/trailing dots and underscores are stripped.
    """
    chosen = context_id or os.environ.get("A0_DESKTOP_CONTEXT_ID") or "default"
    cleaned = _SAFE_CONTEXT_RE.sub("_", str(chosen)).strip("._")
    return cleaned if cleaned else "default"
@ -118,9 +131,11 @@ def capture_screenshot(
return {"ok": False, "path": "", "format": "", "captured_at": "", "error": message}
explicit_path = path is not None and str(path).strip() != ""
ephemeral_ref = not explicit_path and str(transport or "").strip().lower() != "path"
screenshot_dir = context_screenshot_dir(context_id)
if not explicit_path:
transport_mode = str(transport or "").strip().lower()
chat_scoped = bool(not explicit_path and transport_mode == "path" and str(context_id or "").strip())
ephemeral_ref = not explicit_path and transport_mode != "path"
screenshot_dir = chat_screenshot_dir(context_id) if chat_scoped else context_screenshot_dir(context_id)
if not explicit_path and not chat_scoped:
prune_context_screenshots(context_id=context_id)
screenshot_dir.mkdir(parents=True, exist_ok=True)
timestamp = time.strftime("%Y%m%d-%H%M%S")
@ -138,15 +153,17 @@ def capture_screenshot(
return {"ok": False, "path": "", "format": "", "captured_at": "", "error": detail}
if target.suffix.lower() == ".xwd":
if not explicit_path:
if not explicit_path and not chat_scoped:
prune_context_screenshots(context_id=context_id, keep_path=raw_path)
return {
"ok": True,
"path": str(raw_path),
"a0_path": normalize_a0_path(raw_path),
"format": "xwd",
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"ephemeral": not explicit_path and not chat_scoped,
"chat_scoped": chat_scoped,
"context_id": safe_context,
"error": "",
}
@ -167,17 +184,19 @@ def capture_screenshot(
width=width,
height=height,
)
if not explicit_path:
if not explicit_path and not chat_scoped:
prune_context_screenshots(context_id=context_id, keep_path=target)
return {
"ok": True,
"path": str(target),
"a0_path": normalize_a0_path(target),
"format": target.suffix.lower().lstrip(".") or "png",
"width": width,
"height": height,
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"ephemeral": not explicit_path and not chat_scoped,
"chat_scoped": chat_scoped,
"context_id": safe_context,
"error": "",
}
@ -193,17 +212,19 @@ def capture_screenshot(
width=converted["width"],
height=converted["height"],
)
if not explicit_path:
if not explicit_path and not chat_scoped:
prune_context_screenshots(context_id=context_id, keep_path=target)
return {
"ok": True,
"path": str(target),
"a0_path": normalize_a0_path(target),
"format": target.suffix.lower().lstrip(".") or "png",
"width": converted["width"],
"height": converted["height"],
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"ephemeral": not explicit_path and not chat_scoped,
"chat_scoped": chat_scoped,
"context_id": safe_context,
"error": "",
}
@ -226,10 +247,12 @@ def capture_screenshot(
return {
"ok": True,
"path": str(raw_path),
"a0_path": normalize_a0_path(raw_path),
"format": "xwd",
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"ephemeral": not explicit_path and not chat_scoped,
"chat_scoped": chat_scoped,
"context_id": safe_context,
"error": message,
}
@ -575,8 +598,36 @@ def parse_xprop(output: str) -> dict[str, str]:
def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
    """Describe the newest screenshot available for a context.

    The chat-scoped screenshot folder is consulted first, without pruning
    older files. Only when it yields nothing are temporary screenshots pruned
    (with ``RECENT_SCREENSHOT_SECONDS`` as the max age) and the temporary
    context folder consulted instead.
    """
    # NOTE(review): a chat-scoped screenshot of any age takes precedence over
    # a newer temporary one - confirm that this ordering is intended.
    durable = _latest_screenshot_from_dir(
        chat_screenshot_dir(context_id),
        context_id=context_id,
        ephemeral=False,
        chat_scoped=True,
        prune_older=False,
    )
    if durable.get("ok"):
        return durable

    prune_context_screenshots(context_id=context_id, max_age_seconds=RECENT_SCREENSHOT_SECONDS)
    return _latest_screenshot_from_dir(
        context_screenshot_dir(context_id),
        context_id=context_id,
        ephemeral=True,
        chat_scoped=False,
        prune_older=True,
    )
def _latest_screenshot_from_dir(
screenshot_dir: Path,
*,
context_id: str = "",
ephemeral: bool,
chat_scoped: bool,
prune_older: bool,
) -> dict[str, Any]:
if not screenshot_dir.exists():
return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
candidates = [
@ -587,17 +638,20 @@ def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
if not candidates:
return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
latest = max(candidates, key=lambda item: item.stat().st_mtime)
for candidate in candidates:
if candidate != latest:
candidate.unlink(missing_ok=True)
if prune_older:
for candidate in candidates:
if candidate != latest:
candidate.unlink(missing_ok=True)
age = max(0.0, time.time() - latest.stat().st_mtime)
return {
"ok": True,
"path": str(latest),
"a0_path": normalize_a0_path(latest),
"format": latest.suffix.lower().lstrip("."),
"captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(latest.stat().st_mtime)),
"recent": age <= RECENT_SCREENSHOT_SECONDS,
"ephemeral": True,
"ephemeral": ephemeral,
"chat_scoped": chat_scoped,
"context_id": _safe_context_id(context_id),
}
@ -660,7 +714,8 @@ def compact_prompt_context(state: dict[str, Any] | None = None) -> str:
screenshot = state.get("screenshot") or {}
if screenshot.get("recent") and screenshot.get("path"):
ephemeral = " ephemeral" if screenshot.get("ephemeral") else ""
lines.append(f"- recent_screenshot={screenshot['path']}{ephemeral}")
screenshot_ref = screenshot.get("a0_path") or screenshot["path"]
lines.append(f"- recent_screenshot={screenshot_ref}{ephemeral}")
context_id = str(state.get("context_id") or "").strip()
if context_id:
lines.append(f"- screenshot_context={context_id}")

View file

@ -38,7 +38,7 @@ The Desktop is an observe-act-verify control surface. Use this decision hierarch
3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands.
4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation.
5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output.
6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations return temporary context paths. Do not report from an earlier screenshot path.
6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations with `--context-id` return chat-scoped screenshot paths. Do not report from an earlier screenshot path.
Keep these standing rules:
@ -68,7 +68,7 @@ $DESKTOP key ctrl+s
The script targets the persistent `agent-zero-desktop` X display, sets `DISPLAY`, `XAUTHORITY`, and `HOME` to the XFCE profile, then uses `xdotool` for input. Startup normally prepares this session. If `check` fails during explicit Desktop work, report that the Desktop runtime is not ready instead of installing packages ad hoc.
If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Use any returned shell screenshot path promptly; only the latest temporary context screenshot is retained.
If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Shell screenshots captured with `--context-id` live in the owning chat's screenshot folder; screenshots without a chat context remain temporary.
For direct app launches without coordinates:

View file

@ -60,7 +60,7 @@ Commands:
observe --json [--screenshot] [--context-id ID]
Return structured state, optionally with a fresh screenshot.
screenshot [PATH] [--context-id ID]
Capture the Desktop to PATH, or to the temporary context screenshot directory.
Capture the Desktop to PATH, or to the chat screenshot directory when context-id is set.
active-window Print the active window name.
geometry PATTERN Print the first matching visible window geometry.
wait-window PATTERN Wait for a visible matching window and print its id.

View file

@ -258,6 +258,7 @@ const model = {
_desktopFrameHost: null,
_desktopFrameLoadHandler: null,
_desktopKeepaliveHost: null,
_desktopDisplaySizes: {},
_desktopIntentionalShutdown: false,
async init(element = null) {
@ -1499,7 +1500,7 @@ const model = {
this.stopXpraDesktopPrime();
this._desktopPrimeAttempts = 0;
}
if (this.applyXpraDesktopFrameMode(options.frame || null)) return;
if (this.applyXpraDesktopFrameMode(options.frame || null, options)) return;
if (this._desktopPrimeAttempts >= XPRA_DESKTOP_PRIME_ATTEMPTS) return;
this._desktopPrimeAttempts += 1;
if (this._desktopPrimeTimer) globalThis.clearTimeout(this._desktopPrimeTimer);
@ -1540,8 +1541,12 @@ const model = {
const windows = Object.values(client.id_to_window || {});
if (!client.connected || !windows.length) return false;
const width = Math.round(container.clientWidth || remoteWindow.innerWidth || 0);
const height = Math.round(container.clientHeight || remoteWindow.innerHeight || 0);
const token = options.token || this.session?.desktop?.token || "";
const displaySize = options.displaySize || this.desktopDisplaySizeForToken(token);
const viewportWidth = Math.round(container.clientWidth || remoteWindow.innerWidth || 0);
const viewportHeight = Math.round(container.clientHeight || remoteWindow.innerHeight || 0);
const width = Math.round(displaySize?.width || viewportWidth || 0);
const height = Math.round(displaySize?.height || viewportHeight || 0);
if (width > 0 && height > 0) {
client.desktop_width = width;
client.desktop_height = height;
@ -1574,6 +1579,26 @@ const model = {
}
},
desktopDisplaySizeForToken(token = "") {
const key = String(token || "").trim();
const size = key ? this._desktopDisplaySizes?.[key] : null;
const width = Math.round(Number(size?.width || 0));
const height = Math.round(Number(size?.height || 0));
return width > 0 && height > 0 ? { width, height } : null;
},
rememberDesktopDisplaySize(token = "", width = 0, height = 0) {
const key = String(token || "").trim();
const normalizedWidth = Math.round(Number(width || 0));
const normalizedHeight = Math.round(Number(height || 0));
if (!key || normalizedWidth <= 0 || normalizedHeight <= 0) return null;
this._desktopDisplaySizes = {
...(this._desktopDisplaySizes || {}),
[key]: { width: normalizedWidth, height: normalizedHeight },
};
return this._desktopDisplaySizes[key];
},
installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container) {
if (!frame || !remoteWindow || !remoteDocument || !client) return null;
const store = this;
@ -1584,8 +1609,10 @@ const model = {
const metrics = () => {
const desktopWidth = Math.max(1, finite(client.desktop_width || container?.clientWidth || remoteWindow.innerWidth, 1));
const desktopHeight = Math.max(1, finite(client.desktop_height || container?.clientHeight || remoteWindow.innerHeight, 1));
const clientWidth = Math.max(1, finite(container?.clientWidth || remoteWindow.innerWidth, desktopWidth));
const clientHeight = Math.max(1, finite(container?.clientHeight || remoteWindow.innerHeight, desktopHeight));
const primaryWindow = Object.values(client.id_to_window || {})[0];
const canvas = primaryWindow?.canvas;
const clientWidth = Math.max(1, finite(canvas?.clientWidth || canvas?.width || container?.clientWidth || remoteWindow.innerWidth, desktopWidth));
const clientHeight = Math.max(1, finite(canvas?.clientHeight || canvas?.height || container?.clientHeight || remoteWindow.innerHeight, desktopHeight));
return {
desktopWidth,
desktopHeight,
@ -1683,8 +1710,10 @@ const model = {
},
fitXpraDesktopWindowElement(xpraWindow, width, height) {
const cssWidth = `${Math.max(1, Number(width || 0))}px`;
const cssHeight = `${Math.max(1, Number(height || 0))}px`;
const normalizedWidth = Math.max(1, Math.round(Number(width || 0)));
const normalizedHeight = Math.max(1, Math.round(Number(height || 0)));
const cssWidth = `${normalizedWidth}px`;
const cssHeight = `${normalizedHeight}px`;
const windowElement = xpraWindow?.div;
const canvas = xpraWindow?.canvas;
windowElement?.style?.setProperty("left", "0px", "important");
@ -1698,6 +1727,12 @@ const model = {
canvas?.style?.setProperty("height", cssHeight, "important");
canvas?.style?.setProperty("display", "block", "important");
canvas?.style?.setProperty("margin", "0", "important");
if (canvas) {
if (canvas.width !== normalizedWidth) canvas.width = normalizedWidth;
if (canvas.height !== normalizedHeight) canvas.height = normalizedHeight;
canvas.setAttribute("width", String(normalizedWidth));
canvas.setAttribute("height", String(normalizedHeight));
}
},
installXpraDesktopWheelBridge(remoteWindow, xpraWindow) {
@ -2139,6 +2174,11 @@ const model = {
const response = await fetch(`/desktop/resize?${params.toString()}`, { credentials: "same-origin" });
if (response.ok) {
const result = await response.json().catch(() => ({}));
const displaySize = this.rememberDesktopDisplaySize(
token,
result?.width || width,
result?.height || height,
);
this._desktopResizeKey = key;
const activeFrame = this.desktopFrame(frame);
const activeTarget = activeFrame?.parentElement || activeFrame;
@ -2153,7 +2193,7 @@ const model = {
}
}
if (result?.reload) this.reloadDesktopFrame(activeFrame || frame);
this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame });
this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame, token, displaySize });
}
} catch (error) {
console.warn("Desktop resize skipped", error);

View file

@ -2477,7 +2477,7 @@ async def test_browser_runtime_remounts_initial_changed_viewport():
@pytest.mark.anyio
async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeypatch, tmp_path):
async def test_browser_runtime_screenshot_file_defaults_to_chat_scoped_artifact(monkeypatch, tmp_path):
screenshot_calls = []
def fake_get_abs_path(*parts):
@ -2512,15 +2512,15 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
result = await core.screenshot_file(5, quality=500)
assert "path" not in result
assert "a0_path" not in result
assert Path(result["path"]).read_bytes() == b"image-bytes"
assert result["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/browser/browser-5-")
assert result["context_id"] == "ctx/id"
assert result["mime"] == "image/jpeg"
assert result["ephemeral"] is True
assert result["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX)
assert result["ephemeral"] is False
assert result["chat_scoped"] is True
assert result["vision_load"] == {
"tool_name": "vision_load",
"tool_args": {"paths": [result["ephemeral_ref"]]},
"tool_args": {"paths": [result["a0_path"]]},
}
assert "image" not in result
assert not list((tmp_path / "tmp" / "browser" / "screenshots").rglob("*.jpg"))
@ -2528,7 +2528,6 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
assert screenshot_calls[-1]["quality"] == 95
assert screenshot_calls[-1]["full_page"] is False
assert "path" not in screenshot_calls[-1]
assert ephemeral_images.consume_image(result["ephemeral_ref"], context_id="ctx/id").data_url == "data:image/jpeg;base64,aW1hZ2UtYnl0ZXM="
png_path = tmp_path / "custom.png"
png_result = await core.screenshot_file(5, quality=1, full_page=True, path=str(png_path))
@ -2543,9 +2542,27 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
@pytest.mark.anyio
async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
async def test_vision_load_materializes_ephemeral_browser_refs(monkeypatch, tmp_path):
monkeypatch.setitem(sys.modules, "helpers.tool", SimpleNamespace(Response=_TestResponse, Tool=_TestTool))
history_stub = ModuleType("helpers.history")
class _RawMessage(dict):
def __init__(self, raw_content, preview):
super().__init__(raw_content=raw_content, preview=preview)
history_stub.RawMessage = _RawMessage
monkeypatch.setitem(sys.modules, "helpers.history", history_stub)
monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
import tools.vision_load as vision_load_module
def fake_get_abs_path(*parts):
return str(tmp_path.joinpath(*parts))
def fake_normalize_a0_path(path):
return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path)
monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
monkeypatch.setattr(
vision_load_module.plugins,
"get_plugin_config",
@ -2561,7 +2578,7 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)),
hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)),
)
ref = ephemeral_images.put_image(
ref = vision_load_module.ephemeral_images.put_image(
context_id="ctx-vision",
mime="image/jpeg",
data=SMALL_JPEG_10X10,
@ -2580,10 +2597,13 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
response = await tool.execute(paths=[ref])
await tool.after_execution(response)
assert ephemeral_images.get_image(ref, context_id="ctx-vision") is None
assert vision_load_module.ephemeral_images.get_image(ref, context_id="ctx-vision") is None
assert tool.loaded_paths == ["browser-shot.jpg"]
raw_message = messages[0][1]["content"]
assert raw_message.raw_content[0]["image_url"]["url"] == f"data:image/jpeg;base64,{SMALL_JPEG_10X10}"
stored_ref = raw_message["raw_content"][0]["image_url"]["url"]
assert stored_ref.startswith("/a0/usr/chats/ctx-vision/screenshots/browser/browser-shot-")
stored_path = tmp_path / stored_ref.removeprefix("/a0/")
assert stored_path.read_bytes() == __import__("base64").b64decode(SMALL_JPEG_10X10)
assert updates[-1]["result"] == "1 images loaded, 0 skipped"

View file

@ -12,8 +12,8 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from helpers import ephemeral_images
from plugins._a0_connector.helpers import ws_runtime
from plugins._browser.helpers import connector_runtime as connector_runtime_module
from plugins._browser.helpers.connector_runtime import (
ConnectorBrowserRuntime,
_agent_uses_local_chat_model,
@ -330,7 +330,15 @@ def test_connector_runtime_adds_docker_recovery_to_host_errors():
assert "/browser container" in message
def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path):
def test_host_browser_artifacts_become_chat_scoped_files(monkeypatch, tmp_path):
def fake_get_abs_path(*parts):
return str(tmp_path.joinpath(*parts))
def fake_normalize_a0_path(path):
return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
monkeypatch.setattr(connector_runtime_module.chat_media.files, "get_abs_path", fake_get_abs_path)
monkeypatch.setattr(connector_runtime_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host"))
result = runtime._materialize_artifact(
@ -352,19 +360,15 @@ def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path):
inner = result[0]["result"]
assert "artifact" not in inner
assert "path" not in inner
assert "a0_path" not in inner
assert Path(inner["path"]).read_bytes() == b"fake"
assert inner["a0_path"].startswith("/a0/usr/chats/ctx-host/screenshots/browser/shot-")
assert inner["context_id"] == "ctx-host"
assert inner["ephemeral"] is True
assert inner["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX)
assert inner["vision_load"]["tool_args"]["paths"] == [inner["ephemeral_ref"]]
assert ephemeral_images.consume_image(inner["ephemeral_ref"], context_id="ctx-host").data_url == "data:image/jpeg;base64,ZmFrZQ=="
assert not list(tmp_path.rglob("shot.jpg"))
assert inner["ephemeral"] is False
assert inner["chat_scoped"] is True
assert inner["vision_load"]["tool_args"]["paths"] == [inner["a0_path"]]
def test_host_browser_artifact_materialization_rejects_oversized_payload(monkeypatch, tmp_path):
import plugins._browser.helpers.connector_runtime as connector_runtime_module
monkeypatch.setattr(connector_runtime_module, "MAX_ARTIFACT_SIZE_BYTES", 2)
runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host"))

View file

@ -264,6 +264,14 @@ def test_desktop_plugin_owns_routes_runtime_surface_and_state_paths():
assert "DESKTOP_RUNTIME_INSTALL_MESSAGE" in desktop_store
assert "openDesktopWhenRuntimeReady" in desktop_store
assert "isDesktopRuntimeInstalling" in desktop_store
assert "_desktopDisplaySizes: {}" in desktop_store
assert "desktopDisplaySizeForToken(token" in desktop_store
assert "rememberDesktopDisplaySize(token" in desktop_store
assert "options.displaySize || this.desktopDisplaySizeForToken(token)" in desktop_store
assert "result?.width || width" in desktop_store
assert "canvas.width = normalizedWidth" in desktop_store
assert "canvas.height = normalizedHeight" in desktop_store
assert "canvas?.clientWidth || canvas?.width" in desktop_store
assert "Installing Agent Zero Desktop runtime dependencies" in desktop_session
assert "__a0XpraOffsetWarnPatched" in desktop_store
assert "window does not fit in canvas, offsets" in desktop_store

View file

@ -191,7 +191,8 @@ def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp
def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeypatch):
monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path)
monkeypatch.setattr(desktop_state, "BASE_DIR", tmp_path)
monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path / "tmp" / "desktop" / "screenshots")
capabilities = {"xwd": "/usr/bin/xwd"}
env = {"DISPLAY": ":120"}
@ -222,7 +223,7 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp
monkeypatch.setattr(desktop_state, "run", fake_run)
monkeypatch.setitem(sys.modules, "PIL", pil_module)
monkeypatch.setitem(sys.modules, "PIL.Image", image_module)
stale_path = tmp_path / "ctx_id" / "stale.png"
stale_path = tmp_path / "tmp" / "desktop" / "screenshots" / "ctx_id" / "stale.png"
stale_path.parent.mkdir(parents=True)
stale_path.write_bytes(b"stale")
@ -236,12 +237,14 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp
path = Path(screenshot["path"])
assert screenshot["ok"] is True
assert screenshot["ephemeral"] is True
assert screenshot["ephemeral"] is False
assert screenshot["chat_scoped"] is True
assert screenshot["context_id"] == "ctx_id"
assert path.parent == tmp_path / "ctx_id"
assert screenshot["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/desktop/desktop-")
assert path.parent == tmp_path / "usr" / "chats" / "ctx_id" / "screenshots" / "desktop"
assert path.name.startswith("desktop-")
assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path)
assert not stale_path.exists()
assert stale_path.exists()
def test_desktop_state_default_screenshot_returns_ephemeral_ref(tmp_path, monkeypatch):

View file

@ -699,3 +699,35 @@ def test_computer_use_remote_start_session_reports_backend_features_and_windows_
assert "backend=windows/windows" in message
assert "features=uia-tree-snapshot, uia-structural-targeting" in message
assert "host-computer-use-windows" in message
def test_computer_use_remote_capture_artifact_is_chat_scoped(monkeypatch, tmp_path: Path):
    """A base64 capture artifact resolves to a file under the chat's screenshots tree.

    The returned display ref must point below
    ``/a0/usr/chats/ctx-computer/screenshots/computer-use/`` and the capture id
    must equal the stem of the stored file.
    """
    module = _load_computer_use_remote_tool(monkeypatch)

    # Redirect the chat_media path helpers so artifacts are written below tmp_path.
    monkeypatch.setattr(
        module.chat_media.files,
        "get_abs_path",
        lambda *parts: str(tmp_path.joinpath(*parts)),
    )
    monkeypatch.setattr(
        module.chat_media.files,
        "normalize_a0_path",
        lambda path: "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/"),
    )

    # Build the tool without running its constructor; only the agent context is set.
    tool = object.__new__(module.ComputerUseRemote)
    tool.agent = types.SimpleNamespace(context=types.SimpleNamespace(id="ctx-computer"))

    artifact = {
        "filename": "capture.png",
        "mime": "image/png",
        "encoding": "base64",
        "data": "ZmFrZQ==",  # base64 for b"fake"
    }
    display_ref, capture_id = tool._resolve_capture_ref({"artifact": artifact})

    assert display_ref.startswith("/a0/usr/chats/ctx-computer/screenshots/computer-use/capture-")
    stored_path = tmp_path / display_ref.removeprefix("/a0/")
    assert stored_path.read_bytes() == b"fake"
    assert capture_id == stored_path.stem

View file

@ -0,0 +1,123 @@
import types
from types import SimpleNamespace
import sys
from pathlib import Path
import pytest
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from helpers import images
class _TestResponse(SimpleNamespace):
    """Namespace-based response stub carrying ``message`` and ``break_loop``.

    Any additional keyword arguments become attributes as well.
    """

    def __init__(self, message="", break_loop=False, **kwargs):
        # Keep the named fields first so attribute order matches the call order.
        fields = {"message": message, "break_loop": break_loop, **kwargs}
        super().__init__(**fields)
class _TestTool:
    """Bare-bones tool base used in place of the real ``Tool`` class in tests.

    It only records its constructor arguments; unknown keyword arguments are
    accepted and discarded.
    """

    def __init__(
        self,
        agent=None,
        name="",
        method=None,
        args=None,
        message="",
        loop_data=None,
        **kwargs,
    ):
        self.agent, self.name, self.method = agent, name, method
        # A falsy ``args`` (None, empty mapping) is replaced by a fresh dict.
        self.args = args if args else {}
        self.message, self.loop_data = message, loop_data
def _install_tool_stub(monkeypatch):
    """Register stub ``helpers.tool`` / ``helpers.history`` modules for the test.

    Both stubs are placed in ``sys.modules`` through ``monkeypatch``, and any
    cached ``tools.vision_load`` entry is removed so a later import of that
    module is executed again.
    """

    class _RawMessage(dict):
        # Dict-backed message exposing ``raw_content`` and ``preview`` as keys.
        def __init__(self, raw_content, preview):
            super().__init__(raw_content=raw_content, preview=preview)

    tool_stub = types.ModuleType("helpers.tool")
    tool_stub.Response = _TestResponse
    tool_stub.Tool = _TestTool

    history_stub = types.ModuleType("helpers.history")
    history_stub.RawMessage = _RawMessage

    for module_name, stub in (
        ("helpers.tool", tool_stub),
        ("helpers.history", history_stub),
    ):
        monkeypatch.setitem(sys.modules, module_name, stub)
    monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
def test_prepare_content_keeps_missing_local_image_refs_strict():
    """``images.prepare_content`` must raise ``FileNotFoundError`` for this local image ref.

    The path is expected not to exist on the test machine.
    """
    missing_path = "/tmp/a0-missing-desktop-screenshot.png"
    content = [{"type": "image_url", "image_url": {"url": missing_path}}]
    with pytest.raises(FileNotFoundError):
        images.prepare_content(content)
@pytest.mark.anyio
async def test_vision_load_materializes_local_image_to_chat_artifact(monkeypatch, tmp_path):
    """Check that ``vision_load`` stores a local image as a chat-scoped artifact.

    The source file is removed between ``execute`` and ``after_execution``; the
    history message must then reference a file under
    ``/a0/usr/chats/ctx-vision/images/vision-load/`` that holds the same bytes.
    """
    _install_tool_stub(monkeypatch)
    import tools.vision_load as vision_load_module

    async def direct_call(func, *args, **kwargs):
        # Call the wrapped helper directly and hand back its result.
        return func(*args, **kwargs)

    # Redirect the chat_media path helpers so artifacts are written below tmp_path.
    monkeypatch.setattr(
        vision_load_module.chat_media.files,
        "get_abs_path",
        lambda *parts: str(tmp_path.joinpath(*parts)),
    )
    monkeypatch.setattr(
        vision_load_module.chat_media.files,
        "normalize_a0_path",
        lambda path: "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/"),
    )
    monkeypatch.setattr(
        vision_load_module.plugins,
        "get_plugin_config",
        lambda *args, **kwargs: {"chat_model": {"max_embeds": 10}},
    )
    monkeypatch.setattr(
        vision_load_module.runtime,
        "call_development_function",
        direct_call,
    )

    image_path = tmp_path / "sample-image.png"
    image_path.write_bytes(b"png-data")
    image_ref = str(image_path)

    # Recorders for everything the tool reports back to the agent and its log.
    tool_results, messages, updates = [], [], []
    agent = SimpleNamespace(
        context=SimpleNamespace(id="ctx-vision"),
        agent_name="Agent 0",
        hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)),
        hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)),
    )
    tool = vision_load_module.VisionLoad(
        agent=agent,
        name="vision_load",
        method=None,
        args={"paths": [image_ref]},
        message="",
        loop_data=None,
    )
    tool.log = SimpleNamespace(
        id="vision-log",
        update=lambda **kwargs: updates.append(kwargs),
    )

    response = await tool.execute(paths=[image_ref])
    # Remove the original before history is written; only the stored copy remains.
    image_path.unlink()
    await tool.after_execution(response)

    raw_message = messages[0][1]["content"]
    stored_ref = raw_message["raw_content"][0]["image_url"]["url"]
    assert stored_ref.startswith("/a0/usr/chats/ctx-vision/images/vision-load/sample-image-")
    stored_path = tmp_path / stored_ref.removeprefix("/a0/")
    assert stored_path.read_bytes() == b"png-data"
    assert updates[-1]["result"] == "1 images loaded, 0 skipped"

View file

@ -1,6 +1,6 @@
from helpers.print_style import PrintStyle
from helpers.tool import Tool, Response
from helpers import runtime, files, plugins, ephemeral_images
from helpers import runtime, files, plugins, ephemeral_images, images, chat_media
from mimetypes import guess_type
from helpers import history
@ -27,7 +27,7 @@ class VisionLoad(Tool):
else []
)
for path, display_path in limited_paths:
for idx, (path, display_path) in enumerate(limited_paths):
if not path:
continue
if ephemeral_images.is_ref(path):
@ -38,12 +38,16 @@ class VisionLoad(Tool):
if image is None:
continue
display = image.display_name or display_path
self.images_dict[display] = image.data_url
self.loaded_paths.append(display)
stored_ref = self._store_ephemeral_image(image)
if stored_ref:
self.images_dict[display] = stored_ref
self.loaded_paths.append(display)
continue
if self._is_data_image_url(path):
self.images_dict[display_path] = path
self.loaded_paths.append(display_path)
stored_ref = self._store_data_url(path, preferred_name=f"vision-load-{idx + 1}.png")
if stored_ref:
self.images_dict[display_path] = stored_ref
self.loaded_paths.append(display_path)
continue
if not await runtime.call_development_function(files.exists, str(path)):
continue
@ -51,8 +55,12 @@ class VisionLoad(Tool):
if path not in self.images_dict:
mime_type, _ = guess_type(str(path))
if mime_type and mime_type.startswith("image/"):
self.images_dict[display_path] = str(path)
self.loaded_paths.append(display_path)
try:
stored_ref = self._store_local_image(path, preferred_name=files.basename(path))
self.images_dict[display_path] = stored_ref
self.loaded_paths.append(display_path)
except (FileNotFoundError, OSError, ValueError):
continue
return Response(message="dummy", break_loop=False)
@ -65,6 +73,48 @@ class VisionLoad(Tool):
def _context_id(self) -> str:
    """Return the agent context's id as a stripped string.

    Yields ``""`` when the agent has no ``context``, the context has no
    ``id``, or the id is falsy.
    """
    context = getattr(self.agent, "context", None)
    raw_id = getattr(context, "id", "")
    return str(raw_id or "").strip()
def _store_ephemeral_image(self, image: ephemeral_images.EphemeralImage) -> str:
    """Save an ephemeral image through ``chat_media`` and return its ``a0_path``.

    Source and category are derived from the image's ref and display name.
    When no context id is available the image's data URL is returned instead.
    """
    chat_id = self._context_id()
    if chat_id:
        origin = chat_media.infer_source(image.ref, image.display_name)
        bucket = chat_media.category_for_source(origin)
        stored = chat_media.save_image_base64(
            context_id=chat_id,
            data=image.data,
            mime_type=image.mime,
            category=bucket,
            source=origin,
            preferred_name=image.display_name,
        )
        return stored.a0_path
    # No chat to attach the artifact to: hand back the inline data URL.
    return image.data_url
def _store_data_url(self, data_url: str, *, preferred_name: str = "") -> str:
    """Save a ``data:`` image URL through ``chat_media`` and return its ``a0_path``.

    Source and category are derived from the URL and ``preferred_name``.
    When no context id is available the data URL is returned unchanged.
    """
    chat_id = self._context_id()
    if chat_id:
        origin = chat_media.infer_source(data_url, preferred_name)
        bucket = chat_media.category_for_source(origin)
        stored = chat_media.save_image_data_url(
            context_id=chat_id,
            data_url=data_url,
            category=bucket,
            source=origin,
            preferred_name=preferred_name,
        )
        return stored.a0_path
    # No chat to attach the artifact to: keep the inline data URL.
    return data_url
def _store_local_image(self, path: str, *, preferred_name: str = "") -> str:
    """Materialize a local image path through ``chat_media`` and return the result.

    When no context id is available the image is converted to a data URL
    with ``images.to_data_url`` instead.
    """
    chat_id = self._context_id()
    if chat_id:
        origin = chat_media.infer_source(path, preferred_name)
        return chat_media.materialize_image_ref(
            context_id=chat_id,
            url=path,
            source=origin,
            preferred_name=preferred_name,
        )
    # No chat to attach the artifact to: inline the file as a data URL.
    return images.to_data_url(path)
@staticmethod
def _is_data_image_url(value: str) -> bool:
normalized = str(value or "").strip().lower()