Fix durable screenshot artifacts and Xpra sizing

Materialize browser, desktop, computer-use, and vision-load screenshots into chat-scoped artifacts so historical image refs survive temporary screenshot pruning. Keep history serialization free of rescue assumptions, document durable screenshot behavior in tool prompts/skills, and size Xpra canvases from backend-normalized display dimensions to prevent stretched desktop views. Verified with focused pytest coverage plus live Docker checks for browser screenshot persistence and Xpra canvas dimensions.
2026-06-02 07:11:56 +00:00 · 2026-05-30 17:45:19 +02:00 · 2026-05-30 17:45:19 +02:00 · edd58a42d2
commit edd58a42d2
parent 45e4bd892c
18 changed files with 688 additions and 81 deletions
--- a/helpers/chat_media.py
+++ b/helpers/chat_media.py
@ -0,0 +1,244 @@
+from __future__ import annotations
+
+import time
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+
+from helpers import files, media_artifacts
+
+
+DEFAULT_MAX_IMAGE_BYTES = media_artifacts.DEFAULT_MAX_ARTIFACT_SIZE_BYTES
+ImageCategory = Literal["images", "screenshots"]
+
+
+@dataclass(frozen=True)
+class ChatImage:
+    path: str
+    a0_path: str
+    mime: str
+    size: int
+
+
+def screenshot_dir(context_id: str, source: str) -> Path:
+    return artifact_dir(context_id, category="screenshots", source=source)
+
+
+def artifact_dir(
+    context_id: str,
+    *,
+    category: ImageCategory = "images",
+    source: str = "vision-load",
+) -> Path:
+    context_segment = files.safe_file_name(str(context_id or "default")).strip("._") or "default"
+    safe_category = files.safe_file_name(category).strip("._") or "images"
+    safe_source = files.safe_file_name(source).strip("._") or "vision-load"
+
+    return Path(files.get_abs_path("usr/chats", context_segment)) / safe_category / safe_source
+
+
+def save_image_bytes(
+    *,
+    context_id: str,
+    payload: bytes,
+    mime_type: str = "image/png",
+    category: ImageCategory = "images",
+    source: str = "vision-load",
+    preferred_name: str = "",
+    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
+) -> ChatImage:
+    data = bytes(payload or b"")
+    if not data:
+        raise media_artifacts.EmptyBase64Data("image payload is empty")
+    if max_bytes is not None and len(data) > max_bytes:
+        raise media_artifacts.ArtifactTooLarge(len(data), max_bytes)
+
+    safe_mime = media_artifacts.normalize_mime(
+        mime_type,
+        default="image/png",
+        required_prefix="image/",
+    )
+    default_extension = media_artifacts.guess_extension(safe_mime, ".png")
+    default_filename = f"{source or 'image'}{default_extension}"
+    filename = media_artifacts.safe_filename(
+        preferred_name,
+        default=default_filename,
+        default_extension=default_extension,
+    )
+    filename_path = Path(filename)
+    stem = filename_path.stem or Path(default_filename).stem or "image"
+    suffix = filename_path.suffix or default_extension
+    timestamp = time.strftime("%Y%m%d-%H%M%S")
+    path = artifact_dir(context_id, category=category, source=source) / (
+        f"{stem}-{timestamp}-{uuid.uuid4().hex[:8]}{suffix}"
+    )
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_bytes(data)
+    return ChatImage(
+        path=str(path),
+        a0_path=files.normalize_a0_path(str(path)),
+        mime=safe_mime,
+        size=len(data),
+    )
+
+
+def save_image_base64(
+    *,
+    context_id: str,
+    data: str,
+    mime_type: str = "image/png",
+    category: ImageCategory = "images",
+    source: str = "vision-load",
+    preferred_name: str = "",
+    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
+) -> ChatImage:
+    payload = media_artifacts.decode_base64_payload(data, max_bytes=max_bytes)
+    return save_image_bytes(
+        context_id=context_id,
+        payload=payload.payload,
+        mime_type=mime_type,
+        category=category,
+        source=source,
+        preferred_name=preferred_name,
+        max_bytes=max_bytes,
+    )
+
+
+def save_image_file(
+    *,
+    context_id: str,
+    path: str | Path,
+    category: ImageCategory = "images",
+    source: str = "vision-load",
+    preferred_name: str = "",
+    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
+) -> ChatImage:
+    image_path = Path(path)
+    payload = image_path.read_bytes()
+    mime = media_artifacts.normalize_mime(
+        _guess_image_mime(image_path),
+        default="image/png",
+        required_prefix="image/",
+    )
+    return save_image_bytes(
+        context_id=context_id,
+        payload=payload,
+        mime_type=mime,
+        category=category,
+        source=source,
+        preferred_name=preferred_name or image_path.name,
+        max_bytes=max_bytes,
+    )
+
+
+def save_image_data_url(
+    *,
+    context_id: str,
+    data_url: str,
+    category: ImageCategory = "images",
+    source: str = "vision-load",
+    preferred_name: str = "",
+    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
+) -> ChatImage:
+    header, encoded = _split_image_data_url(data_url)
+    mime = header.removeprefix("data:").split(";", 1)[0] or "image/png"
+    return save_image_base64(
+        context_id=context_id,
+        data=encoded,
+        mime_type=mime,
+        category=category,
+        source=source,
+        preferred_name=preferred_name,
+        max_bytes=max_bytes,
+    )
+
+
+def materialize_image_ref(
+    *,
+    context_id: str,
+    url: str,
+    source: str = "",
+    preferred_name: str = "",
+    max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES,
+) -> str:
+    value = str(url or "").strip()
+    if not value or not str(context_id or "").strip():
+        return value
+
+    resolved_source = source or infer_source(value, preferred_name)
+    category = category_for_source(resolved_source)
+    if _is_data_image_url(value):
+        saved = save_image_data_url(
+            context_id=context_id,
+            data_url=value,
+            category=category,
+            source=resolved_source,
+            preferred_name=preferred_name,
+            max_bytes=max_bytes,
+        )
+        return saved.a0_path
+
+    from helpers import images
+
+    source_path = images.resolve_ref(value)
+    if is_chat_scoped_path(context_id=context_id, path=source_path):
+        return files.normalize_a0_path(str(source_path))
+    saved = save_image_file(
+        context_id=context_id,
+        path=source_path,
+        category=category,
+        source=resolved_source,
+        preferred_name=preferred_name or source_path.name,
+        max_bytes=max_bytes,
+    )
+    return saved.a0_path
+
+
+def is_chat_scoped_path(*, context_id: str, path: str | Path) -> bool:
+    if not str(context_id or "").strip():
+        return False
+    try:
+        target = Path(path).resolve(strict=False)
+        root = artifact_dir(context_id, category="images", source="vision-load").parents[1].resolve(strict=False)
+        return target == root or root in target.parents
+    except OSError:
+        return False
+
+
+def infer_source(value: str = "", preferred_name: str = "") -> str:
+    raw = f"{value or ''} {preferred_name or ''}".lower()
+    if "computer-use" in raw or "computer_use" in raw or "_a0_connector/computer_use" in raw:
+        return "computer-use"
+    if "/desktop/screenshots/" in raw or "\\desktop\\screenshots\\" in raw or "desktop-" in raw:
+        return "desktop"
+    if (
+        "/browser/screenshots/" in raw
+        or "\\browser\\screenshots\\" in raw
+        or "host-browser" in raw
+        or "browser-" in raw
+    ):
+        return "browser"
+    return "vision-load"
+
+
+def category_for_source(source: str) -> ImageCategory:
+    return "screenshots" if source in {"desktop", "browser", "computer-use"} else "images"
+
+
+def _guess_image_mime(path: Path) -> str:
+    import mimetypes
+
+    return mimetypes.guess_type(path.name)[0] or "image/png"
+
+
+def _is_data_image_url(value: str) -> bool:
+    normalized = str(value or "").strip().lower()
+    return normalized.startswith("data:image/") and ";base64," in normalized
+
+
+def _split_image_data_url(data_url: str) -> tuple[str, str]:
+    value = str(data_url or "").strip()
+    if not _is_data_image_url(value) or "," not in value:
+        raise ValueError("image data URL must be data:image/*;base64,...")
+    return value.split(",", 1)
--- a/plugins/_a0_connector/tools/computer_use_remote.py
+++ b/plugins/_a0_connector/tools/computer_use_remote.py
@ -6,7 +6,7 @@ from pathlib import Path
 import uuid
 from typing import Any

-from helpers import history, media_artifacts
+from helpers import chat_media, history, media_artifacts
 from helpers.print_style import PrintStyle
 from helpers.tool import Response, Tool
 from helpers.ws import NAMESPACE
@ -744,7 +744,15 @@ class ComputerUseRemote(Tool):
        except FileNotFoundError as exc:
            path_error = exc
        else:
-            return display_path, image_path.stem
+            saved = chat_media.save_image_file(
+                context_id=self.agent.context.id,
+                path=image_path,
+                category="screenshots",
+                source="computer-use",
+                preferred_name=Path(display_path).name or image_path.name,
+                max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES,
+            )
+            return saved.a0_path, Path(saved.path).stem

        artifact = data.get("artifact")
        if isinstance(artifact, dict) and str(artifact.get("encoding", "")).strip().lower() == "base64":
@ -764,7 +772,16 @@ class ComputerUseRemote(Tool):
                    default=f"computer-use-{uuid.uuid4().hex}.png",
                    default_extension=".png",
                )
-                return f"data:{mime};base64,{encoded}", Path(filename).stem
+                saved = chat_media.save_image_base64(
+                    context_id=self.agent.context.id,
+                    data=encoded,
+                    mime_type=mime,
+                    category="screenshots",
+                    source="computer-use",
+                    preferred_name=filename,
+                    max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES,
+                )
+                return saved.a0_path, Path(saved.path).stem

        if path_error is not None:
            raise path_error
--- a/plugins/_browser/helpers/connector_runtime.py
+++ b/plugins/_browser/helpers/connector_runtime.py
@ -9,7 +9,7 @@ from pathlib import Path
 from typing import Any
 from urllib.parse import urlparse

-from helpers import ephemeral_images, media_artifacts
+from helpers import chat_media, media_artifacts

 try:
    from helpers.ws import NAMESPACE
@ -451,12 +451,16 @@ class ConnectorBrowserRuntime:
            default=f"host-browser-{uuid.uuid4().hex}.jpg",
            default_extension=".jpg",
        )
+        mime = str(artifact.get("mime") or result.get("mime") or "image/jpeg")
        try:
-            ref = ephemeral_images.put_image(
+            saved = chat_media.save_image_base64(
                context_id=self.context_id,
-                mime=str(artifact.get("mime") or result.get("mime") or "image/jpeg"),
                data=data,
-                name=filename,
+                mime_type=mime,
+                category="screenshots",
+                source="browser",
+                preferred_name=filename,
+                max_bytes=MAX_ARTIFACT_SIZE_BYTES,
            )
        except Exception as exc:
            raise RuntimeError("Host browser artifact could not be decoded.") from exc
@ -466,11 +470,14 @@ class ConnectorBrowserRuntime:
        materialized.pop("a0_path", None)
        materialized.pop("host_path", None)
        materialized.setdefault("context_id", self.context_id)
-        materialized["ephemeral"] = True
-        materialized["ephemeral_ref"] = ref
+        materialized["path"] = saved.path
+        materialized["a0_path"] = saved.a0_path
+        materialized["mime"] = saved.mime
+        materialized["ephemeral"] = False
+        materialized["chat_scoped"] = True
        materialized["vision_load"] = {
            "tool_name": "vision_load",
-            "tool_args": {"paths": [ref]},
+            "tool_args": {"paths": [saved.a0_path]},
        }
        return materialized

--- a/plugins/_browser/helpers/runtime.py
+++ b/plugins/_browser/helpers/runtime.py
@ -15,7 +15,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

-from helpers import ephemeral_images, files
+from helpers import chat_media, files
 from helpers.defer import DeferredTask
 from helpers.errors import RepairableException
 from helpers.print_style import PrintStyle
@ -1558,23 +1558,27 @@ class _BrowserRuntimeCore:
                quality=max(20, min(95, int(quality))),
                full_page=bool(full_page),
            )
-            ref = ephemeral_images.put_image_bytes(
+            saved = chat_media.save_image_bytes(
                context_id=self.context_id,
-                mime="image/jpeg",
                payload=image,
-                name=f"browser-{resolved_id}.jpg",
+                mime_type="image/jpeg",
+                category="screenshots",
+                source="browser",
+                preferred_name=f"browser-{resolved_id}.jpg",
            )
            return {
                "browser_id": resolved_id,
                "context_id": self.context_id,
+                "path": saved.path,
+                "a0_path": saved.a0_path,
                "mime": "image/jpeg",
-                "ephemeral": True,
-                "ephemeral_ref": ref,
+                "ephemeral": False,
+                "chat_scoped": True,
                "state": await self._state(resolved_id),
                "vision_load": {
                    "tool_name": "vision_load",
                    "tool_args": {
-                        "paths": [ref],
+                        "paths": [saved.a0_path],
                    },
                },
            }
--- a/plugins/_browser/prompts/agent.system.tool.browser.md
+++ b/plugins/_browser/prompts/agent.system.tool.browser.md
@ -20,7 +20,7 @@ Workflow:
 - For same-page controls that are easier to identify structurally, `click`, `type`, `submit`, `type_submit`, `scroll`, `select_option`, `set_checked`, and `upload_file` may use `selector` instead of `ref`; the tool resolves the selector through `content` first.
 - `click` with `x`/`y` and no `ref` is treated as a coordinate mouse click. `type` with text and no `ref` types into the currently focused element. `key_chord` accepts either `["Control", "A"]` or `"CTRL+A"`.
 - `navigate` reuses an existing `browser_id` and is preferred for serial browsing.
- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are ephemeral refs rather than conserved files.
+- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are saved as chat-scoped artifacts; explicit `path` requests remain user-owned files.
 - Keep the tab set small; close pages after extracting what you need.

 `multi` is only a browser action: use `tool_name: "browser"` with `tool_args.action: "multi"`. Never use `tool_name: "multi"`.
--- a/plugins/_browser/skills/browser-automation/SKILL.md
+++ b/plugins/_browser/skills/browser-automation/SKILL.md
@ -32,7 +32,7 @@ Screenshots are explicit only; the browser does not automatically load images in
 2. Call `vision_load` with the returned `vision_load.tool_args.paths` value.
 3. Reason from the latest loaded screenshot.

-Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is an ephemeral ref consumed by `vision_load`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used.
+Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is saved as a chat-scoped artifact and returned through `vision_load.tool_args.paths`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used.

 ## Forms And Files

--- a/plugins/_browser/skills/browser-form-workflows/SKILL.md
+++ b/plugins/_browser/skills/browser-form-workflows/SKILL.md
@ -11,7 +11,7 @@ Start with `browser:content` to capture current refs, then use `browser:detail`

 Use `select_option`, `set_checked`, `upload_file`, `type`, `type_submit`, and `submit` for form interaction. Use coordinates only when no stable ref exists or the UI is intentionally canvas-like.

-Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return ephemeral refs for `vision_load`.
+Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return chat-scoped artifact paths for `vision_load`.

 Verify after submission with `browser:content`, `browser:state`, or another explicit `browser:screenshot` plus `vision_load`.

--- a/plugins/_desktop/helpers/desktop_state.py
+++ b/plugins/_desktop/helpers/desktop_state.py
@ -37,6 +37,19 @@ def context_screenshot_dir(context_id: str = "") -> Path:
    return SCREENSHOT_DIR / _safe_context_id(context_id)


+def chat_screenshot_dir(context_id: str = "") -> Path:
+    return BASE_DIR / "usr" / "chats" / _safe_context_id(context_id) / "screenshots" / "desktop"
+
+
+def normalize_a0_path(path: str | Path) -> str:
+    candidate = Path(path)
+    try:
+        relative = candidate.resolve(strict=False).relative_to(BASE_DIR.resolve(strict=False))
+    except ValueError:
+        return str(candidate)
+    return "/a0/" + str(relative).replace(os.sep, "/")
+
+
 def _safe_context_id(context_id: str = "") -> str:
    raw = str(context_id or os.environ.get("A0_DESKTOP_CONTEXT_ID") or "default")
    return _SAFE_CONTEXT_RE.sub("_", raw).strip("._") or "default"
@ -118,9 +131,11 @@ def capture_screenshot(
        return {"ok": False, "path": "", "format": "", "captured_at": "", "error": message}

    explicit_path = path is not None and str(path).strip() != ""
-    ephemeral_ref = not explicit_path and str(transport or "").strip().lower() != "path"
-    screenshot_dir = context_screenshot_dir(context_id)
-    if not explicit_path:
+    transport_mode = str(transport or "").strip().lower()
+    chat_scoped = bool(not explicit_path and transport_mode == "path" and str(context_id or "").strip())
+    ephemeral_ref = not explicit_path and transport_mode != "path"
+    screenshot_dir = chat_screenshot_dir(context_id) if chat_scoped else context_screenshot_dir(context_id)
+    if not explicit_path and not chat_scoped:
        prune_context_screenshots(context_id=context_id)
        screenshot_dir.mkdir(parents=True, exist_ok=True)
    timestamp = time.strftime("%Y%m%d-%H%M%S")
@ -138,15 +153,17 @@ def capture_screenshot(
        return {"ok": False, "path": "", "format": "", "captured_at": "", "error": detail}

    if target.suffix.lower() == ".xwd":
-        if not explicit_path:
+        if not explicit_path and not chat_scoped:
            prune_context_screenshots(context_id=context_id, keep_path=raw_path)
        return {
            "ok": True,
            "path": str(raw_path),
+            "a0_path": normalize_a0_path(raw_path),
            "format": "xwd",
            "captured_at": iso_now(),
            "recent": True,
-            "ephemeral": not explicit_path,
+            "ephemeral": not explicit_path and not chat_scoped,
+            "chat_scoped": chat_scoped,
            "context_id": safe_context,
            "error": "",
        }
@ -167,17 +184,19 @@ def capture_screenshot(
                width=width,
                height=height,
            )
-        if not explicit_path:
+        if not explicit_path and not chat_scoped:
            prune_context_screenshots(context_id=context_id, keep_path=target)
        return {
            "ok": True,
            "path": str(target),
+            "a0_path": normalize_a0_path(target),
            "format": target.suffix.lower().lstrip(".") or "png",
            "width": width,
            "height": height,
            "captured_at": iso_now(),
            "recent": True,
-            "ephemeral": not explicit_path,
+            "ephemeral": not explicit_path and not chat_scoped,
+            "chat_scoped": chat_scoped,
            "context_id": safe_context,
            "error": "",
        }
@ -193,17 +212,19 @@ def capture_screenshot(
                    width=converted["width"],
                    height=converted["height"],
                )
-            if not explicit_path:
+            if not explicit_path and not chat_scoped:
                prune_context_screenshots(context_id=context_id, keep_path=target)
            return {
                "ok": True,
                "path": str(target),
+                "a0_path": normalize_a0_path(target),
                "format": target.suffix.lower().lstrip(".") or "png",
                "width": converted["width"],
                "height": converted["height"],
                "captured_at": iso_now(),
                "recent": True,
-                "ephemeral": not explicit_path,
+                "ephemeral": not explicit_path and not chat_scoped,
+                "chat_scoped": chat_scoped,
                "context_id": safe_context,
                "error": "",
            }
@ -226,10 +247,12 @@ def capture_screenshot(
        return {
            "ok": True,
            "path": str(raw_path),
+            "a0_path": normalize_a0_path(raw_path),
            "format": "xwd",
            "captured_at": iso_now(),
            "recent": True,
-            "ephemeral": not explicit_path,
+            "ephemeral": not explicit_path and not chat_scoped,
+            "chat_scoped": chat_scoped,
            "context_id": safe_context,
            "error": message,
        }
@ -575,8 +598,36 @@ def parse_xprop(output: str) -> dict[str, str]:


 def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
+    chat_dir = chat_screenshot_dir(context_id)
+    chat_latest = _latest_screenshot_from_dir(
+        chat_dir,
+        context_id=context_id,
+        ephemeral=False,
+        chat_scoped=True,
+        prune_older=False,
+    )
+    if chat_latest.get("ok"):
+        return chat_latest
+
    prune_context_screenshots(context_id=context_id, max_age_seconds=RECENT_SCREENSHOT_SECONDS)
    screenshot_dir = context_screenshot_dir(context_id)
+    return _latest_screenshot_from_dir(
+        screenshot_dir,
+        context_id=context_id,
+        ephemeral=True,
+        chat_scoped=False,
+        prune_older=True,
+    )
+
+
+def _latest_screenshot_from_dir(
+    screenshot_dir: Path,
+    *,
+    context_id: str = "",
+    ephemeral: bool,
+    chat_scoped: bool,
+    prune_older: bool,
+) -> dict[str, Any]:
    if not screenshot_dir.exists():
        return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
    candidates = [
@ -587,17 +638,20 @@ def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
    if not candidates:
        return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
    latest = max(candidates, key=lambda item: item.stat().st_mtime)
-    for candidate in candidates:
-        if candidate != latest:
-            candidate.unlink(missing_ok=True)
+    if prune_older:
+        for candidate in candidates:
+            if candidate != latest:
+                candidate.unlink(missing_ok=True)
    age = max(0.0, time.time() - latest.stat().st_mtime)
    return {
        "ok": True,
        "path": str(latest),
+        "a0_path": normalize_a0_path(latest),
        "format": latest.suffix.lower().lstrip("."),
        "captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(latest.stat().st_mtime)),
        "recent": age <= RECENT_SCREENSHOT_SECONDS,
-        "ephemeral": True,
+        "ephemeral": ephemeral,
+        "chat_scoped": chat_scoped,
        "context_id": _safe_context_id(context_id),
    }

@ -660,7 +714,8 @@ def compact_prompt_context(state: dict[str, Any] | None = None) -> str:
    screenshot = state.get("screenshot") or {}
    if screenshot.get("recent") and screenshot.get("path"):
        ephemeral = " ephemeral" if screenshot.get("ephemeral") else ""
-        lines.append(f"- recent_screenshot={screenshot['path']}{ephemeral}")
+        screenshot_ref = screenshot.get("a0_path") or screenshot["path"]
+        lines.append(f"- recent_screenshot={screenshot_ref}{ephemeral}")
    context_id = str(state.get("context_id") or "").strip()
    if context_id:
        lines.append(f"- screenshot_context={context_id}")
--- a/plugins/_desktop/skills/linux-desktop/SKILL.md
+++ b/plugins/_desktop/skills/linux-desktop/SKILL.md
@ -38,7 +38,7 @@ The Desktop is an observe-act-verify control surface. Use this decision hierarch
 3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands.
 4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation.
 5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output.
-6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations return temporary context paths. Do not report from an earlier screenshot path.
+6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations with `--context-id` return chat-scoped screenshot paths. Do not report from an earlier screenshot path.

 Keep these standing rules:

@ -68,7 +68,7 @@ $DESKTOP key ctrl+s

 The script targets the persistent `agent-zero-desktop` X display, sets `DISPLAY`, `XAUTHORITY`, and `HOME` to the XFCE profile, then uses `xdotool` for input. Startup normally prepares this session. If `check` fails during explicit Desktop work, report that the Desktop runtime is not ready instead of installing packages ad hoc.

-If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Use any returned shell screenshot path promptly; only the latest temporary context screenshot is retained.
+If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Shell screenshots captured with `--context-id` live in the owning chat's screenshot folder; screenshots without a chat context remain temporary.

 For direct app launches without coordinates:

--- a/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh
+++ b/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh
@ -60,7 +60,7 @@ Commands:
  observe --json [--screenshot] [--context-id ID]
                              Return structured state, optionally with a fresh screenshot.
  screenshot [PATH] [--context-id ID]
-                              Capture the Desktop to PATH, or to the temporary context screenshot directory.
+                              Capture the Desktop to PATH, or to the chat screenshot directory when --context-id is set (otherwise the temporary context screenshot directory).
  active-window               Print the active window name.
  geometry PATTERN            Print the first matching visible window geometry.
  wait-window PATTERN         Wait for a visible matching window and print its id.
--- a/plugins/_desktop/webui/desktop-store.js
+++ b/plugins/_desktop/webui/desktop-store.js
@ -258,6 +258,7 @@ const model = {
  _desktopFrameHost: null,
  _desktopFrameLoadHandler: null,
  _desktopKeepaliveHost: null,
+  _desktopDisplaySizes: {},
  _desktopIntentionalShutdown: false,

  async init(element = null) {
@ -1499,7 +1500,7 @@ const model = {
      this.stopXpraDesktopPrime();
      this._desktopPrimeAttempts = 0;
    }
-    if (this.applyXpraDesktopFrameMode(options.frame || null)) return;
+    if (this.applyXpraDesktopFrameMode(options.frame || null, options)) return;
    if (this._desktopPrimeAttempts >= XPRA_DESKTOP_PRIME_ATTEMPTS) return;
    this._desktopPrimeAttempts += 1;
    if (this._desktopPrimeTimer) globalThis.clearTimeout(this._desktopPrimeTimer);
@ -1540,8 +1541,12 @@ const model = {
      const windows = Object.values(client.id_to_window || {});
      if (!client.connected || !windows.length) return false;

-      const width = Math.round(container.clientWidth || remoteWindow.innerWidth || 0);
-      const height = Math.round(container.clientHeight || remoteWindow.innerHeight || 0);
+      const token = options.token || this.session?.desktop?.token || "";
+      const displaySize = options.displaySize || this.desktopDisplaySizeForToken(token);
+      const viewportWidth = Math.round(container.clientWidth || remoteWindow.innerWidth || 0);
+      const viewportHeight = Math.round(container.clientHeight || remoteWindow.innerHeight || 0);
+      const width = Math.round(displaySize?.width || viewportWidth || 0);
+      const height = Math.round(displaySize?.height || viewportHeight || 0);
      if (width > 0 && height > 0) {
        client.desktop_width = width;
        client.desktop_height = height;
@ -1574,6 +1579,26 @@ const model = {
    }
  },

+  desktopDisplaySizeForToken(token = "") {
+    const key = String(token || "").trim();
+    const size = key ? this._desktopDisplaySizes?.[key] : null;
+    const width = Math.round(Number(size?.width || 0));
+    const height = Math.round(Number(size?.height || 0));
+    return width > 0 && height > 0 ? { width, height } : null;
+  },
+
+  rememberDesktopDisplaySize(token = "", width = 0, height = 0) {
+    const key = String(token || "").trim();
+    const normalizedWidth = Math.round(Number(width || 0));
+    const normalizedHeight = Math.round(Number(height || 0));
+    if (!key || normalizedWidth <= 0 || normalizedHeight <= 0) return null;
+    this._desktopDisplaySizes = {
+      ...(this._desktopDisplaySizes || {}),
+      [key]: { width: normalizedWidth, height: normalizedHeight },
+    };
+    return this._desktopDisplaySizes[key];
+  },
+
  installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container) {
    if (!frame || !remoteWindow || !remoteDocument || !client) return null;
    const store = this;
@ -1584,8 +1609,10 @@ const model = {
    const metrics = () => {
      const desktopWidth = Math.max(1, finite(client.desktop_width || container?.clientWidth || remoteWindow.innerWidth, 1));
      const desktopHeight = Math.max(1, finite(client.desktop_height || container?.clientHeight || remoteWindow.innerHeight, 1));
-      const clientWidth = Math.max(1, finite(container?.clientWidth || remoteWindow.innerWidth, desktopWidth));
-      const clientHeight = Math.max(1, finite(container?.clientHeight || remoteWindow.innerHeight, desktopHeight));
+      const primaryWindow = Object.values(client.id_to_window || {})[0];
+      const canvas = primaryWindow?.canvas;
+      const clientWidth = Math.max(1, finite(canvas?.clientWidth || canvas?.width || container?.clientWidth || remoteWindow.innerWidth, desktopWidth));
+      const clientHeight = Math.max(1, finite(canvas?.clientHeight || canvas?.height || container?.clientHeight || remoteWindow.innerHeight, desktopHeight));
      return {
        desktopWidth,
        desktopHeight,
@ -1683,8 +1710,10 @@ const model = {
  },

  fitXpraDesktopWindowElement(xpraWindow, width, height) {
-    const cssWidth = `${Math.max(1, Number(width || 0))}px`;
-    const cssHeight = `${Math.max(1, Number(height || 0))}px`;
+    const normalizedWidth = Math.max(1, Math.round(Number(width || 0)));
+    const normalizedHeight = Math.max(1, Math.round(Number(height || 0)));
+    const cssWidth = `${normalizedWidth}px`;
+    const cssHeight = `${normalizedHeight}px`;
    const windowElement = xpraWindow?.div;
    const canvas = xpraWindow?.canvas;
    windowElement?.style?.setProperty("left", "0px", "important");
@ -1698,6 +1727,12 @@ const model = {
    canvas?.style?.setProperty("height", cssHeight, "important");
    canvas?.style?.setProperty("display", "block", "important");
    canvas?.style?.setProperty("margin", "0", "important");
+    if (canvas) {
+      if (canvas.width !== normalizedWidth) canvas.width = normalizedWidth;
+      if (canvas.height !== normalizedHeight) canvas.height = normalizedHeight;
+      canvas.setAttribute("width", String(normalizedWidth));
+      canvas.setAttribute("height", String(normalizedHeight));
+    }
  },

  installXpraDesktopWheelBridge(remoteWindow, xpraWindow) {
@ -2139,6 +2174,11 @@ const model = {
        const response = await fetch(`/desktop/resize?${params.toString()}`, { credentials: "same-origin" });
        if (response.ok) {
          const result = await response.json().catch(() => ({}));
+          const displaySize = this.rememberDesktopDisplaySize(
+            token,
+            result?.width || width,
+            result?.height || height,
+          );
          this._desktopResizeKey = key;
          const activeFrame = this.desktopFrame(frame);
          const activeTarget = activeFrame?.parentElement || activeFrame;
@ -2153,7 +2193,7 @@ const model = {
            }
          }
          if (result?.reload) this.reloadDesktopFrame(activeFrame || frame);
-          this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame });
+          this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame, token, displaySize });
        }
      } catch (error) {
        console.warn("Desktop resize skipped", error);
--- a/tests/test_browser_agent_regressions.py
+++ b/tests/test_browser_agent_regressions.py
@ -2477,7 +2477,7 @@ async def test_browser_runtime_remounts_initial_changed_viewport():


@pytest.mark.anyio
-async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeypatch, tmp_path):
+async def test_browser_runtime_screenshot_file_defaults_to_chat_scoped_artifact(monkeypatch, tmp_path):
    screenshot_calls = []

    def fake_get_abs_path(*parts):
@ -2512,15 +2512,15 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp

    result = await core.screenshot_file(5, quality=500)

-    assert "path" not in result
-    assert "a0_path" not in result
+    assert Path(result["path"]).read_bytes() == b"image-bytes"
+    assert result["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/browser/browser-5-")
    assert result["context_id"] == "ctx/id"
    assert result["mime"] == "image/jpeg"
-    assert result["ephemeral"] is True
-    assert result["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX)
+    assert result["ephemeral"] is False
+    assert result["chat_scoped"] is True
    assert result["vision_load"] == {
        "tool_name": "vision_load",
-        "tool_args": {"paths": [result["ephemeral_ref"]]},
+        "tool_args": {"paths": [result["a0_path"]]},
    }
    assert "image" not in result
    assert not list((tmp_path / "tmp" / "browser" / "screenshots").rglob("*.jpg"))
@ -2528,7 +2528,6 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp
    assert screenshot_calls[-1]["quality"] == 95
    assert screenshot_calls[-1]["full_page"] is False
    assert "path" not in screenshot_calls[-1]
-    assert ephemeral_images.consume_image(result["ephemeral_ref"], context_id="ctx/id").data_url == "data:image/jpeg;base64,aW1hZ2UtYnl0ZXM="

    png_path = tmp_path / "custom.png"
    png_result = await core.screenshot_file(5, quality=1, full_page=True, path=str(png_path))
@ -2543,9 +2542,27 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp


@pytest.mark.anyio
-async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
+async def test_vision_load_materializes_ephemeral_browser_refs(monkeypatch, tmp_path):
+    monkeypatch.setitem(sys.modules, "helpers.tool", SimpleNamespace(Response=_TestResponse, Tool=_TestTool))
+    history_stub = ModuleType("helpers.history")
+
+    class _RawMessage(dict):
+        def __init__(self, raw_content, preview):
+            super().__init__(raw_content=raw_content, preview=preview)
+
+    history_stub.RawMessage = _RawMessage
+    monkeypatch.setitem(sys.modules, "helpers.history", history_stub)
+    monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
    import tools.vision_load as vision_load_module

+    def fake_get_abs_path(*parts):
+        return str(tmp_path.joinpath(*parts))
+
+    def fake_normalize_a0_path(path):
+        return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
+
+    monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path)
+    monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
    monkeypatch.setattr(
        vision_load_module.plugins,
        "get_plugin_config",
@ -2561,7 +2578,7 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
        hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)),
        hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)),
    )
-    ref = ephemeral_images.put_image(
+    ref = vision_load_module.ephemeral_images.put_image(
        context_id="ctx-vision",
        mime="image/jpeg",
        data=SMALL_JPEG_10X10,
@ -2580,10 +2597,13 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch):
    response = await tool.execute(paths=[ref])
    await tool.after_execution(response)

-    assert ephemeral_images.get_image(ref, context_id="ctx-vision") is None
+    assert vision_load_module.ephemeral_images.get_image(ref, context_id="ctx-vision") is None
    assert tool.loaded_paths == ["browser-shot.jpg"]
    raw_message = messages[0][1]["content"]
-    assert raw_message.raw_content[0]["image_url"]["url"] == f"data:image/jpeg;base64,{SMALL_JPEG_10X10}"
+    stored_ref = raw_message["raw_content"][0]["image_url"]["url"]
+    assert stored_ref.startswith("/a0/usr/chats/ctx-vision/screenshots/browser/browser-shot-")
+    stored_path = tmp_path / stored_ref.removeprefix("/a0/")
+    assert stored_path.read_bytes() == __import__("base64").b64decode(SMALL_JPEG_10X10)
    assert updates[-1]["result"] == "1 images loaded, 0 skipped"


--- a/tests/test_host_browser_connector.py
+++ b/tests/test_host_browser_connector.py
@ -12,8 +12,8 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

-from helpers import ephemeral_images
 from plugins._a0_connector.helpers import ws_runtime
+from plugins._browser.helpers import connector_runtime as connector_runtime_module
 from plugins._browser.helpers.connector_runtime import (
    ConnectorBrowserRuntime,
    _agent_uses_local_chat_model,
@ -330,7 +330,15 @@ def test_connector_runtime_adds_docker_recovery_to_host_errors():
    assert "/browser container" in message


-def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path):
+def test_host_browser_artifacts_become_chat_scoped_files(monkeypatch, tmp_path):
+    def fake_get_abs_path(*parts):
+        return str(tmp_path.joinpath(*parts))
+
+    def fake_normalize_a0_path(path):
+        return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
+
+    monkeypatch.setattr(connector_runtime_module.chat_media.files, "get_abs_path", fake_get_abs_path)
+    monkeypatch.setattr(connector_runtime_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
    runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host"))

    result = runtime._materialize_artifact(
@ -352,19 +360,15 @@ def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path):

    inner = result[0]["result"]
    assert "artifact" not in inner
-    assert "path" not in inner
-    assert "a0_path" not in inner
+    assert Path(inner["path"]).read_bytes() == b"fake"
+    assert inner["a0_path"].startswith("/a0/usr/chats/ctx-host/screenshots/browser/shot-")
    assert inner["context_id"] == "ctx-host"
-    assert inner["ephemeral"] is True
-    assert inner["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX)
-    assert inner["vision_load"]["tool_args"]["paths"] == [inner["ephemeral_ref"]]
-    assert ephemeral_images.consume_image(inner["ephemeral_ref"], context_id="ctx-host").data_url == "data:image/jpeg;base64,ZmFrZQ=="
-    assert not list(tmp_path.rglob("shot.jpg"))
+    assert inner["ephemeral"] is False
+    assert inner["chat_scoped"] is True
+    assert inner["vision_load"]["tool_args"]["paths"] == [inner["a0_path"]]


 def test_host_browser_artifact_materialization_rejects_oversized_payload(monkeypatch, tmp_path):
-    import plugins._browser.helpers.connector_runtime as connector_runtime_module
-
    monkeypatch.setattr(connector_runtime_module, "MAX_ARTIFACT_SIZE_BYTES", 2)
    runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host"))

--- a/tests/test_office_canvas_setup.py
+++ b/tests/test_office_canvas_setup.py
@ -264,6 +264,14 @@ def test_desktop_plugin_owns_routes_runtime_surface_and_state_paths():
    assert "DESKTOP_RUNTIME_INSTALL_MESSAGE" in desktop_store
    assert "openDesktopWhenRuntimeReady" in desktop_store
    assert "isDesktopRuntimeInstalling" in desktop_store
+    assert "_desktopDisplaySizes: {}" in desktop_store
+    assert "desktopDisplaySizeForToken(token" in desktop_store
+    assert "rememberDesktopDisplaySize(token" in desktop_store
+    assert "options.displaySize || this.desktopDisplaySizeForToken(token)" in desktop_store
+    assert "result?.width || width" in desktop_store
+    assert "canvas.width = normalizedWidth" in desktop_store
+    assert "canvas.height = normalizedHeight" in desktop_store
+    assert "canvas?.clientWidth || canvas?.width" in desktop_store
    assert "Installing Agent Zero Desktop runtime dependencies" in desktop_session
    assert "__a0XpraOffsetWarnPatched" in desktop_store
    assert "window does not fit in canvas, offsets" in desktop_store
--- a/tests/test_office_desktop_state.py
+++ b/tests/test_office_desktop_state.py
@ -191,7 +191,8 @@ def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp


 def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeypatch):
-    monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path)
+    monkeypatch.setattr(desktop_state, "BASE_DIR", tmp_path)
+    monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path / "tmp" / "desktop" / "screenshots")
    capabilities = {"xwd": "/usr/bin/xwd"}
    env = {"DISPLAY": ":120"}

@ -222,7 +223,7 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp
    monkeypatch.setattr(desktop_state, "run", fake_run)
    monkeypatch.setitem(sys.modules, "PIL", pil_module)
    monkeypatch.setitem(sys.modules, "PIL.Image", image_module)
-    stale_path = tmp_path / "ctx_id" / "stale.png"
+    stale_path = tmp_path / "tmp" / "desktop" / "screenshots" / "ctx_id" / "stale.png"
    stale_path.parent.mkdir(parents=True)
    stale_path.write_bytes(b"stale")

@ -236,12 +237,14 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp

    path = Path(screenshot["path"])
    assert screenshot["ok"] is True
-    assert screenshot["ephemeral"] is True
+    assert screenshot["ephemeral"] is False
+    assert screenshot["chat_scoped"] is True
    assert screenshot["context_id"] == "ctx_id"
-    assert path.parent == tmp_path / "ctx_id"
+    assert screenshot["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/desktop/desktop-")
+    assert path.parent == tmp_path / "usr" / "chats" / "ctx_id" / "screenshots" / "desktop"
    assert path.name.startswith("desktop-")
    assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path)
-    assert not stale_path.exists()
+    assert stale_path.exists()


 def test_desktop_state_default_screenshot_returns_ephemeral_ref(tmp_path, monkeypatch):
--- a/tests/test_tool_action_contracts.py
+++ b/tests/test_tool_action_contracts.py
@ -699,3 +699,35 @@ def test_computer_use_remote_start_session_reports_backend_features_and_windows_
    assert "backend=windows/windows" in message
    assert "features=uia-tree-snapshot, uia-structural-targeting" in message
    assert "host-computer-use-windows" in message
+
+
+def test_computer_use_remote_capture_artifact_is_chat_scoped(monkeypatch, tmp_path: Path):
+    """A base64 capture artifact resolves to a file under the chat's computer-use screenshots."""
+    module = _load_computer_use_remote_tool(monkeypatch)
+    chat_files = module.chat_media.files
+
+    # Route path resolution through tmp_path so the artifact lands in the sandbox.
+    monkeypatch.setattr(
+        chat_files,
+        "get_abs_path",
+        lambda *parts: str(tmp_path.joinpath(*parts)),
+    )
+    monkeypatch.setattr(
+        chat_files,
+        "normalize_a0_path",
+        lambda path: "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/"),
+    )
+
+    # Bypass __init__; only the agent's context id is supplied.
+    tool = object.__new__(module.ComputerUseRemote)
+    tool.agent = types.SimpleNamespace(context=types.SimpleNamespace(id="ctx-computer"))
+
+    artifact = {
+        "filename": "capture.png",
+        "mime": "image/png",
+        "encoding": "base64",
+        "data": "ZmFrZQ==",
+    }
+    display_ref, capture_id = tool._resolve_capture_ref({"artifact": artifact})
+
+    assert display_ref.startswith("/a0/usr/chats/ctx-computer/screenshots/computer-use/capture-")
+    stored_path = tmp_path / display_ref.removeprefix("/a0/")
+    assert stored_path.read_bytes() == b"fake"
+    assert capture_id == stored_path.stem
--- a/tests/test_vision_load_image_refs.py
+++ b/tests/test_vision_load_image_refs.py
@ -0,0 +1,123 @@
+import types
+from types import SimpleNamespace
+import sys
+from pathlib import Path
+
+import pytest
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from helpers import images
+
+
+class _TestResponse(SimpleNamespace):
+    """Namespace-based stand-in installed as ``helpers.tool.Response`` in these tests."""
+
+    def __init__(self, message="", break_loop=False, **kwargs):
+        # Keep message/break_loop first so attribute order matches the call signature.
+        fields = {"message": message, "break_loop": break_loop, **kwargs}
+        super().__init__(**fields)
+
+
+class _TestTool:
+    """Stand-in installed as ``helpers.tool.Tool``; it only records its constructor arguments."""
+
+    def __init__(
+        self,
+        agent=None,
+        name="",
+        method=None,
+        args=None,
+        message="",
+        loop_data=None,
+        **kwargs,
+    ):
+        # Any extra keyword arguments are accepted and ignored.
+        self.agent, self.name, self.method = agent, name, method
+        self.args = args if args else {}
+        self.message, self.loop_data = message, loop_data
+
+
+def _install_tool_stub(monkeypatch):
+    """Register stub ``helpers.tool`` / ``helpers.history`` modules and evict ``tools.vision_load``."""
+
+    class _RawMessage(dict):
+        """Dict stand-in for ``RawMessage`` holding ``raw_content`` and ``preview`` keys."""
+
+        def __init__(self, raw_content, preview):
+            super().__init__(raw_content=raw_content, preview=preview)
+
+    tool_stub = types.ModuleType("helpers.tool")
+    tool_stub.Response = _TestResponse
+    tool_stub.Tool = _TestTool
+
+    history_stub = types.ModuleType("helpers.history")
+    history_stub.RawMessage = _RawMessage
+
+    for module_name, stub in (("helpers.tool", tool_stub), ("helpers.history", history_stub)):
+        monkeypatch.setitem(sys.modules, module_name, stub)
+    # Remove any cached module so a later `import tools.vision_load` runs again.
+    monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False)
+
+
+def test_prepare_content_keeps_missing_local_image_refs_strict():
+    """``images.prepare_content`` raises FileNotFoundError for this local image ref."""
+    # The path is assumed not to exist on the machine running the test.
+    content = [
+        {
+            "type": "image_url",
+            "image_url": {"url": "/tmp/a0-missing-desktop-screenshot.png"},
+        }
+    ]
+
+    with pytest.raises(FileNotFoundError):
+        images.prepare_content(content)
+
+
+@pytest.mark.anyio
+async def test_vision_load_materializes_local_image_to_chat_artifact(monkeypatch, tmp_path):
+    """VisionLoad stores a copy of a local image under the chat's images/vision-load directory.
+
+    The message recorded via ``hist_add_message`` must reference the stored copy
+    (an ``/a0/usr/chats/<context>/...`` path), and that copy must still be readable
+    after the original file has been deleted.
+    """
+    # Stubs must be installed before the import so vision_load binds to them.
+    _install_tool_stub(monkeypatch)
+    import tools.vision_load as vision_load_module
+
+    # Resolve project paths inside tmp_path instead of the real tree.
+    def fake_get_abs_path(*parts):
+        return str(tmp_path.joinpath(*parts))
+
+    # Express a tmp_path-based absolute path as an "/a0/..." path with forward slashes.
+    def fake_normalize_a0_path(path):
+        return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/")
+
+    monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path)
+    monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path)
+    monkeypatch.setattr(
+        vision_load_module.plugins,
+        "get_plugin_config",
+        lambda *args, **kwargs: {"chat_model": {"max_embeds": 10}},
+    )
+
+    # Replace runtime.call_development_function with a direct in-process call.
+    async def direct_call(func, *args, **kwargs):
+        return func(*args, **kwargs)
+
+    monkeypatch.setattr(
+        vision_load_module.runtime,
+        "call_development_function",
+        direct_call,
+    )
+
+    image_path = tmp_path / "sample-image.png"
+    image_path.write_bytes(b"png-data")
+
+    # Capture everything the tool reports back through the agent and its log.
+    tool_results = []
+    messages = []
+    updates = []
+    agent = SimpleNamespace(
+        context=SimpleNamespace(id="ctx-vision"),
+        agent_name="Agent 0",
+        hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)),
+        hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)),
+    )
+    tool = vision_load_module.VisionLoad(
+        agent=agent,
+        name="vision_load",
+        method=None,
+        args={"paths": [str(image_path)]},
+        message="",
+        loop_data=None,
+    )
+    tool.log = SimpleNamespace(id="vision-log", update=lambda **kwargs: updates.append(kwargs))
+
+    response = await tool.execute(paths=[str(image_path)])
+    # Delete the source before after_execution so the assertions below can only
+    # pass if the stored copy is independent of the original file.
+    image_path.unlink()
+    await tool.after_execution(response)
+
+    raw_message = messages[0][1]["content"]
+    stored_ref = raw_message["raw_content"][0]["image_url"]["url"]
+    assert stored_ref.startswith("/a0/usr/chats/ctx-vision/images/vision-load/sample-image-")
+    # Map the "/a0/..." reference back into tmp_path to read the stored bytes.
+    stored_path = tmp_path / stored_ref.removeprefix("/a0/")
+    assert stored_path.read_bytes() == b"png-data"
+    assert updates[-1]["result"] == "1 images loaded, 0 skipped"
--- a/tools/vision_load.py
+++ b/tools/vision_load.py
@ -1,6 +1,6 @@
 from helpers.print_style import PrintStyle
 from helpers.tool import Tool, Response
-from helpers import runtime, files, plugins, ephemeral_images
+from helpers import runtime, files, plugins, ephemeral_images, images, chat_media
 from mimetypes import guess_type
 from helpers import history

@ -27,7 +27,7 @@ class VisionLoad(Tool):
            else []
        )

-        for path, display_path in limited_paths:
+        for idx, (path, display_path) in enumerate(limited_paths):
            if not path:
                continue
            if ephemeral_images.is_ref(path):
@ -38,12 +38,16 @@ class VisionLoad(Tool):
                if image is None:
                    continue
                display = image.display_name or display_path
-                self.images_dict[display] = image.data_url
-                self.loaded_paths.append(display)
+                stored_ref = self._store_ephemeral_image(image)
+                if stored_ref:
+                    self.images_dict[display] = stored_ref
+                    self.loaded_paths.append(display)
                continue
            if self._is_data_image_url(path):
-                self.images_dict[display_path] = path
-                self.loaded_paths.append(display_path)
+                stored_ref = self._store_data_url(path, preferred_name=f"vision-load-{idx + 1}.png")
+                if stored_ref:
+                    self.images_dict[display_path] = stored_ref
+                    self.loaded_paths.append(display_path)
                continue
            if not await runtime.call_development_function(files.exists, str(path)):
                continue
@ -51,8 +55,12 @@ class VisionLoad(Tool):
            if path not in self.images_dict:
                mime_type, _ = guess_type(str(path))
                if mime_type and mime_type.startswith("image/"):
-                    self.images_dict[display_path] = str(path)
-                    self.loaded_paths.append(display_path)
+                    try:
+                        stored_ref = self._store_local_image(path, preferred_name=files.basename(path))
+                        self.images_dict[display_path] = stored_ref
+                        self.loaded_paths.append(display_path)
+                    except (FileNotFoundError, OSError, ValueError):
+                        continue

        return Response(message="dummy", break_loop=False)

@ -65,6 +73,48 @@ class VisionLoad(Tool):
    def _context_id(self) -> str:
        return str(getattr(getattr(self.agent, "context", None), "id", "") or "").strip()

+    def _store_ephemeral_image(self, image: ephemeral_images.EphemeralImage) -> str:
+        """Save an ephemeral image through ``chat_media`` and return its reference.
+
+        Returns the saved artifact's ``a0_path``. When the agent has no context
+        id, nothing is saved and the image's inline ``data_url`` is returned.
+        """
+        chat_id = self._context_id()
+        if chat_id:
+            origin = chat_media.infer_source(image.ref, image.display_name)
+            stored = chat_media.save_image_base64(
+                context_id=chat_id,
+                data=image.data,
+                mime_type=image.mime,
+                category=chat_media.category_for_source(origin),
+                source=origin,
+                preferred_name=image.display_name,
+            )
+            return stored.a0_path
+        return image.data_url
+
+    def _store_data_url(self, data_url: str, *, preferred_name: str = "") -> str:
+        """Save a ``data:`` image URL through ``chat_media`` and return its reference.
+
+        Returns the saved artifact's ``a0_path``. When the agent has no context
+        id, nothing is saved and ``data_url`` is returned unchanged.
+        """
+        chat_id = self._context_id()
+        if chat_id:
+            origin = chat_media.infer_source(data_url, preferred_name)
+            stored = chat_media.save_image_data_url(
+                context_id=chat_id,
+                data_url=data_url,
+                category=chat_media.category_for_source(origin),
+                source=origin,
+                preferred_name=preferred_name,
+            )
+            return stored.a0_path
+        return data_url
+
+    def _store_local_image(self, path: str, *, preferred_name: str = "") -> str:
+        """Return a reference for a local image file.
+
+        With a context id, the result of ``chat_media.materialize_image_ref`` is
+        returned. Without one, the file is converted with ``images.to_data_url``.
+        """
+        chat_id = self._context_id()
+        if chat_id:
+            origin = chat_media.infer_source(path, preferred_name)
+            return chat_media.materialize_image_ref(
+                context_id=chat_id,
+                url=path,
+                source=origin,
+                preferred_name=preferred_name,
+            )
+        return images.to_data_url(path)
+
    @staticmethod
    def _is_data_image_url(value: str) -> bool:
        normalized = str(value or "").strip().lower()