diff --git a/helpers/chat_media.py b/helpers/chat_media.py new file mode 100644 index 000000000..d93af7cd4 --- /dev/null +++ b/helpers/chat_media.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import time +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +from helpers import files, media_artifacts + + +DEFAULT_MAX_IMAGE_BYTES = media_artifacts.DEFAULT_MAX_ARTIFACT_SIZE_BYTES +ImageCategory = Literal["images", "screenshots"] + + +@dataclass(frozen=True) +class ChatImage: + path: str + a0_path: str + mime: str + size: int + + +def screenshot_dir(context_id: str, source: str) -> Path: + return artifact_dir(context_id, category="screenshots", source=source) + + +def artifact_dir( + context_id: str, + *, + category: ImageCategory = "images", + source: str = "vision-load", +) -> Path: + context_segment = files.safe_file_name(str(context_id or "default")).strip("._") or "default" + safe_category = files.safe_file_name(category).strip("._") or "images" + safe_source = files.safe_file_name(source).strip("._") or "vision-load" + + return Path(files.get_abs_path("usr/chats", context_segment)) / safe_category / safe_source + + +def save_image_bytes( + *, + context_id: str, + payload: bytes, + mime_type: str = "image/png", + category: ImageCategory = "images", + source: str = "vision-load", + preferred_name: str = "", + max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES, +) -> ChatImage: + data = bytes(payload or b"") + if not data: + raise media_artifacts.EmptyBase64Data("image payload is empty") + if max_bytes is not None and len(data) > max_bytes: + raise media_artifacts.ArtifactTooLarge(len(data), max_bytes) + + safe_mime = media_artifacts.normalize_mime( + mime_type, + default="image/png", + required_prefix="image/", + ) + default_extension = media_artifacts.guess_extension(safe_mime, ".png") + default_filename = f"{source or 'image'}{default_extension}" + filename = media_artifacts.safe_filename( + preferred_name, + default=default_filename, + default_extension=default_extension, + ) + filename_path = Path(filename) + stem = filename_path.stem or Path(default_filename).stem or "image" + suffix = filename_path.suffix or default_extension + timestamp = time.strftime("%Y%m%d-%H%M%S") + path = artifact_dir(context_id, category=category, source=source) / ( + f"{stem}-{timestamp}-{uuid.uuid4().hex[:8]}{suffix}" + ) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(data) + return ChatImage( + path=str(path), + a0_path=files.normalize_a0_path(str(path)), + mime=safe_mime, + size=len(data), + ) + + +def save_image_base64( + *, + context_id: str, + data: str, + mime_type: str = "image/png", + category: ImageCategory = "images", + source: str = "vision-load", + preferred_name: str = "", + max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES, +) -> ChatImage: + payload = media_artifacts.decode_base64_payload(data, max_bytes=max_bytes) + return save_image_bytes( + context_id=context_id, + payload=payload.payload, + mime_type=mime_type, + category=category, + source=source, + preferred_name=preferred_name, + max_bytes=max_bytes, + ) + + +def save_image_file( + *, + context_id: str, + path: str | Path, + category: ImageCategory = "images", + source: str = "vision-load", + preferred_name: str = "", + max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES, +) -> ChatImage: + image_path = Path(path) + payload = image_path.read_bytes() + mime = media_artifacts.normalize_mime( + _guess_image_mime(image_path), + default="image/png", + required_prefix="image/", + ) + return save_image_bytes( + context_id=context_id, + payload=payload, + mime_type=mime, + category=category, + source=source, + preferred_name=preferred_name or image_path.name, + max_bytes=max_bytes, + ) + + +def save_image_data_url( + *, + context_id: str, + data_url: str, + category: ImageCategory = "images", + source: str = "vision-load", + preferred_name: str = "", + max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES, +) -> ChatImage: + header, encoded = _split_image_data_url(data_url) + mime = header.removeprefix("data:").split(";", 1)[0] or "image/png" + return save_image_base64( + context_id=context_id, + data=encoded, + mime_type=mime, + category=category, + source=source, + preferred_name=preferred_name, + max_bytes=max_bytes, + ) + + +def materialize_image_ref( + *, + context_id: str, + url: str, + source: str = "", + preferred_name: str = "", + max_bytes: int | None = DEFAULT_MAX_IMAGE_BYTES, +) -> str: + value = str(url or "").strip() + if not value or not str(context_id or "").strip(): + return value + + resolved_source = source or infer_source(value, preferred_name) + category = category_for_source(resolved_source) + if _is_data_image_url(value): + saved = save_image_data_url( + context_id=context_id, + data_url=value, + category=category, + source=resolved_source, + preferred_name=preferred_name, + max_bytes=max_bytes, + ) + return saved.a0_path + + from helpers import images + + source_path = images.resolve_ref(value) + if is_chat_scoped_path(context_id=context_id, path=source_path): + return files.normalize_a0_path(str(source_path)) + saved = save_image_file( + context_id=context_id, + path=source_path, + category=category, + source=resolved_source, + preferred_name=preferred_name or source_path.name, + max_bytes=max_bytes, + ) + return saved.a0_path + + +def is_chat_scoped_path(*, context_id: str, path: str | Path) -> bool: + if not str(context_id or "").strip(): + return False + try: + target = Path(path).resolve(strict=False) + root = artifact_dir(context_id, category="images", source="vision-load").parents[1].resolve(strict=False) + return target == root or root in target.parents + except OSError: + return False + + +def infer_source(value: str = "", preferred_name: str = "") -> str: + raw = f"{value or ''} {preferred_name or ''}".lower() + if "computer-use" in raw or "computer_use" in raw or "_a0_connector/computer_use" in raw: + return "computer-use" + if "/desktop/screenshots/" in raw or "\\desktop\\screenshots\\" in raw or "desktop-" in raw: + return "desktop" + if ( + "/browser/screenshots/" in raw + or "\\browser\\screenshots\\" in raw + or "host-browser" in raw + or "browser-" in raw + ): + return "browser" + return "vision-load" + + +def category_for_source(source: str) -> ImageCategory: + return "screenshots" if source in {"desktop", "browser", "computer-use"} else "images" + + +def _guess_image_mime(path: Path) -> str: + import mimetypes + + return mimetypes.guess_type(path.name)[0] or "image/png" + + +def _is_data_image_url(value: str) -> bool: + normalized = str(value or "").strip().lower() + return normalized.startswith("data:image/") and ";base64," in normalized + + +def _split_image_data_url(data_url: str) -> tuple[str, str]: + value = str(data_url or "").strip() + if not _is_data_image_url(value) or "," not in value: + raise ValueError("image data URL must be data:image/*;base64,...") + return value.split(",", 1) diff --git a/plugins/_a0_connector/tools/computer_use_remote.py b/plugins/_a0_connector/tools/computer_use_remote.py index 692c40b62..32f73e9d6 100644 --- a/plugins/_a0_connector/tools/computer_use_remote.py +++ b/plugins/_a0_connector/tools/computer_use_remote.py @@ -6,7 +6,7 @@ from pathlib import Path import uuid from typing import Any -from helpers import history, media_artifacts +from helpers import chat_media, history, media_artifacts from helpers.print_style import PrintStyle from helpers.tool import Response, Tool from helpers.ws import NAMESPACE @@ -744,7 +744,15 @@ class ComputerUseRemote(Tool): except FileNotFoundError as exc: path_error = exc else: - return display_path, image_path.stem + saved = chat_media.save_image_file( + context_id=self.agent.context.id, + path=image_path, + category="screenshots", + source="computer-use", + preferred_name=Path(display_path).name or image_path.name, + max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES, + ) + return saved.a0_path, Path(saved.path).stem artifact = data.get("artifact") if isinstance(artifact, dict) and str(artifact.get("encoding", "")).strip().lower() == "base64": @@ -764,7 +772,16 @@ class ComputerUseRemote(Tool): default=f"computer-use-{uuid.uuid4().hex}.png", default_extension=".png", ) - return f"data:{mime};base64,{encoded}", Path(filename).stem + saved = chat_media.save_image_base64( + context_id=self.agent.context.id, + data=encoded, + mime_type=mime, + category="screenshots", + source="computer-use", + preferred_name=filename, + max_bytes=MAX_CAPTURE_ARTIFACT_SIZE_BYTES, + ) + return saved.a0_path, Path(saved.path).stem if path_error is not None: raise path_error diff --git a/plugins/_browser/helpers/connector_runtime.py b/plugins/_browser/helpers/connector_runtime.py index 0c7ac4250..82107aa7b 100644 --- a/plugins/_browser/helpers/connector_runtime.py +++ b/plugins/_browser/helpers/connector_runtime.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Any from urllib.parse import urlparse -from helpers import ephemeral_images, media_artifacts +from helpers import chat_media, media_artifacts try: from helpers.ws import NAMESPACE @@ -451,12 +451,16 @@ class ConnectorBrowserRuntime: default=f"host-browser-{uuid.uuid4().hex}.jpg", default_extension=".jpg", ) + mime = str(artifact.get("mime") or result.get("mime") or "image/jpeg") try: - ref = ephemeral_images.put_image( + saved = chat_media.save_image_base64( context_id=self.context_id, - mime=str(artifact.get("mime") or result.get("mime") or "image/jpeg"), data=data, - name=filename, + mime_type=mime, + category="screenshots", + source="browser", + preferred_name=filename, + max_bytes=MAX_ARTIFACT_SIZE_BYTES, ) except Exception as exc: raise RuntimeError("Host browser artifact could not be decoded.") from exc @@ -466,11 +470,14 @@ class ConnectorBrowserRuntime: materialized.pop("a0_path", None) materialized.pop("host_path", None) materialized.setdefault("context_id", self.context_id) - materialized["ephemeral"] = True - materialized["ephemeral_ref"] = ref + materialized["path"] = saved.path + materialized["a0_path"] = saved.a0_path + materialized["mime"] = saved.mime + materialized["ephemeral"] = False + materialized["chat_scoped"] = True materialized["vision_load"] = { "tool_name": "vision_load", - "tool_args": {"paths": [ref]}, + "tool_args": {"paths": [saved.a0_path]}, } return materialized diff --git a/plugins/_browser/helpers/runtime.py b/plugins/_browser/helpers/runtime.py index 3282d13ca..e7455d81a 100644 --- a/plugins/_browser/helpers/runtime.py +++ b/plugins/_browser/helpers/runtime.py @@ -15,7 +15,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any -from helpers import ephemeral_images, files +from helpers import chat_media, files from helpers.defer import DeferredTask from helpers.errors import RepairableException from helpers.print_style import PrintStyle @@ -1558,23 +1558,27 @@ class _BrowserRuntimeCore: quality=max(20, min(95, int(quality))), full_page=bool(full_page), ) - ref = ephemeral_images.put_image_bytes( + saved = chat_media.save_image_bytes( context_id=self.context_id, - mime="image/jpeg", payload=image, - name=f"browser-{resolved_id}.jpg", + mime_type="image/jpeg", + category="screenshots", + source="browser", + preferred_name=f"browser-{resolved_id}.jpg", ) return { "browser_id": resolved_id, "context_id": self.context_id, + "path": saved.path, + "a0_path": saved.a0_path, "mime": "image/jpeg", - "ephemeral": True, - "ephemeral_ref": ref, + "ephemeral": False, + "chat_scoped": True, "state": await self._state(resolved_id), "vision_load": { "tool_name": "vision_load", "tool_args": { - "paths": [ref], + "paths": [saved.a0_path], }, }, } diff --git a/plugins/_browser/prompts/agent.system.tool.browser.md b/plugins/_browser/prompts/agent.system.tool.browser.md index f9ed07b35..258a5233a 100644 --- a/plugins/_browser/prompts/agent.system.tool.browser.md +++ b/plugins/_browser/prompts/agent.system.tool.browser.md @@ -20,7 +20,7 @@ Workflow: - For same-page controls that are easier to identify structurally, `click`, `type`, `submit`, `type_submit`, `scroll`, `select_option`, `set_checked`, and `upload_file` may use `selector` instead of `ref`; the tool resolves the selector through `content` first. - `click` with `x`/`y` and no `ref` is treated as a coordinate mouse click. `type` with text and no `ref` types into the currently focused element. `key_chord` accepts either `["Control", "A"]` or `"CTRL+A"`. - `navigate` reuses an existing `browser_id` and is preferred for serial browsing. -- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are ephemeral refs rather than conserved files. +- Screenshots are explicit only; the browser does not automatically load screenshots. Call `vision_load` with the returned `vision_load.tool_args.paths` value before reasoning visually. When no `path` is requested, browser screenshots are saved as chat-scoped artifacts; explicit `path` requests remain user-owned files. - Keep the tab set small; close pages after extracting what you need. `multi` is only a browser action: use `tool_name: "browser"` with `tool_args.action: "multi"`. Never use `tool_name: "multi"`. diff --git a/plugins/_browser/skills/browser-automation/SKILL.md b/plugins/_browser/skills/browser-automation/SKILL.md index a31949e10..b79d0e42b 100644 --- a/plugins/_browser/skills/browser-automation/SKILL.md +++ b/plugins/_browser/skills/browser-automation/SKILL.md @@ -32,7 +32,7 @@ Screenshots are explicit only; the browser does not automatically load images in 2. Call `vision_load` with the returned `vision_load.tool_args.paths` value. 3. Reason from the latest loaded screenshot. -Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is an ephemeral ref consumed by `vision_load`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used. +Screenshot args include `quality`, `full_page`, and optional `path`. Without `path`, the screenshot is saved as a chat-scoped artifact and returned through `vision_load.tool_args.paths`; with `path`, PNG is used when `path` ends with `.png`, otherwise JPEG is used. ## Forms And Files diff --git a/plugins/_browser/skills/browser-form-workflows/SKILL.md b/plugins/_browser/skills/browser-form-workflows/SKILL.md index aae826cfb..4dffd483d 100644 --- a/plugins/_browser/skills/browser-form-workflows/SKILL.md +++ b/plugins/_browser/skills/browser-form-workflows/SKILL.md @@ -11,7 +11,7 @@ Start with `browser:content` to capture current refs, then use `browser:detail` Use `select_option`, `set_checked`, `upload_file`, `type`, `type_submit`, and `submit` for form interaction. Use coordinates only when no stable ref exists or the UI is intentionally canvas-like. -Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return ephemeral refs for `vision_load`. +Use `browser:screenshot` plus `vision_load` when layout, visual validation, captcha-like UI, canvas content, or hidden state matters. Browser screenshots are not automatically loaded into model-visible history; no-path screenshots return chat-scoped artifact paths for `vision_load`. Verify after submission with `browser:content`, `browser:state`, or another explicit `browser:screenshot` plus `vision_load`. diff --git a/plugins/_desktop/helpers/desktop_state.py b/plugins/_desktop/helpers/desktop_state.py index 44852923b..825a7d6cb 100644 --- a/plugins/_desktop/helpers/desktop_state.py +++ b/plugins/_desktop/helpers/desktop_state.py @@ -37,6 +37,19 @@ def context_screenshot_dir(context_id: str = "") -> Path: return SCREENSHOT_DIR / _safe_context_id(context_id) +def chat_screenshot_dir(context_id: str = "") -> Path: + return BASE_DIR / "usr" / "chats" / _safe_context_id(context_id) / "screenshots" / "desktop" + + +def normalize_a0_path(path: str | Path) -> str: + candidate = Path(path) + try: + relative = candidate.resolve(strict=False).relative_to(BASE_DIR.resolve(strict=False)) + except ValueError: + return str(candidate) + return "/a0/" + str(relative).replace(os.sep, "/") + + def _safe_context_id(context_id: str = "") -> str: raw = str(context_id or os.environ.get("A0_DESKTOP_CONTEXT_ID") or "default") return _SAFE_CONTEXT_RE.sub("_", raw).strip("._") or "default" @@ -118,9 +131,11 @@ def capture_screenshot( return {"ok": False, "path": "", "format": "", "captured_at": "", "error": message} explicit_path = path is not None and str(path).strip() != "" - ephemeral_ref = not explicit_path and str(transport or "").strip().lower() != "path" - screenshot_dir = context_screenshot_dir(context_id) - if not explicit_path: + transport_mode = str(transport or "").strip().lower() + chat_scoped = bool(not explicit_path and transport_mode == "path" and str(context_id or "").strip()) + ephemeral_ref = not explicit_path and transport_mode != "path" + screenshot_dir = chat_screenshot_dir(context_id) if chat_scoped else context_screenshot_dir(context_id) + if not explicit_path and not chat_scoped: prune_context_screenshots(context_id=context_id) screenshot_dir.mkdir(parents=True, exist_ok=True) timestamp = time.strftime("%Y%m%d-%H%M%S") @@ -138,15 +153,17 @@ def capture_screenshot( return {"ok": False, "path": "", "format": "", "captured_at": "", "error": detail} if target.suffix.lower() == ".xwd": - if not explicit_path: + if not explicit_path and not chat_scoped: prune_context_screenshots(context_id=context_id, keep_path=raw_path) return { "ok": True, "path": str(raw_path), + "a0_path": normalize_a0_path(raw_path), "format": "xwd", "captured_at": iso_now(), "recent": True, - "ephemeral": not explicit_path, + "ephemeral": not explicit_path and not chat_scoped, + "chat_scoped": chat_scoped, "context_id": safe_context, "error": "", } @@ -167,17 +184,19 @@ def capture_screenshot( width=width, height=height, ) - if not explicit_path: + if not explicit_path and not chat_scoped: prune_context_screenshots(context_id=context_id, keep_path=target) return { "ok": True, "path": str(target), + "a0_path": normalize_a0_path(target), "format": target.suffix.lower().lstrip(".") or "png", "width": width, "height": height, "captured_at": iso_now(), "recent": True, - "ephemeral": not explicit_path, + "ephemeral": not explicit_path and not chat_scoped, + "chat_scoped": chat_scoped, "context_id": safe_context, "error": "", } @@ -193,17 +212,19 @@ def capture_screenshot( width=converted["width"], height=converted["height"], ) - if not explicit_path: + if not explicit_path and not chat_scoped: prune_context_screenshots(context_id=context_id, keep_path=target) return { "ok": True, "path": str(target), + "a0_path": normalize_a0_path(target), "format": target.suffix.lower().lstrip(".") or "png", "width": converted["width"], "height": converted["height"], "captured_at": iso_now(), "recent": True, - "ephemeral": not explicit_path, + "ephemeral": not explicit_path and not chat_scoped, + "chat_scoped": chat_scoped, "context_id": safe_context, "error": "", } @@ -226,10 +247,12 @@ def capture_screenshot( return { "ok": True, "path": str(raw_path), + "a0_path": normalize_a0_path(raw_path), "format": "xwd", "captured_at": iso_now(), "recent": True, - "ephemeral": not explicit_path, + "ephemeral": not explicit_path and not chat_scoped, + "chat_scoped": chat_scoped, "context_id": safe_context, "error": message, } @@ -575,8 +598,36 @@ def parse_xprop(output: str) -> dict[str, str]: def latest_screenshot(*, context_id: str = "") -> dict[str, Any]: + chat_dir = chat_screenshot_dir(context_id) + chat_latest = _latest_screenshot_from_dir( + chat_dir, + context_id=context_id, + ephemeral=False, + chat_scoped=True, + prune_older=False, + ) + if chat_latest.get("ok"): + return chat_latest + prune_context_screenshots(context_id=context_id, max_age_seconds=RECENT_SCREENSHOT_SECONDS) screenshot_dir = context_screenshot_dir(context_id) + return _latest_screenshot_from_dir( + screenshot_dir, + context_id=context_id, + ephemeral=True, + chat_scoped=False, + prune_older=True, + ) + + +def _latest_screenshot_from_dir( + screenshot_dir: Path, + *, + context_id: str = "", + ephemeral: bool, + chat_scoped: bool, + prune_older: bool, +) -> dict[str, Any]: if not screenshot_dir.exists(): return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False} candidates = [ @@ -587,17 +638,20 @@ def latest_screenshot(*, context_id: str = "") -> dict[str, Any]: if not candidates: return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False} latest = max(candidates, key=lambda item: item.stat().st_mtime) - for candidate in candidates: - if candidate != latest: - candidate.unlink(missing_ok=True) + if prune_older: + for candidate in candidates: + if candidate != latest: + candidate.unlink(missing_ok=True) age = max(0.0, time.time() - latest.stat().st_mtime) return { "ok": True, "path": str(latest), + "a0_path": normalize_a0_path(latest), "format": latest.suffix.lower().lstrip("."), "captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(latest.stat().st_mtime)), "recent": age <= RECENT_SCREENSHOT_SECONDS, - "ephemeral": True, + "ephemeral": ephemeral, + "chat_scoped": chat_scoped, "context_id": _safe_context_id(context_id), } @@ -660,7 +714,8 @@ def compact_prompt_context(state: dict[str, Any] | None = None) -> str: screenshot = state.get("screenshot") or {} if screenshot.get("recent") and screenshot.get("path"): ephemeral = " ephemeral" if screenshot.get("ephemeral") else "" - lines.append(f"- recent_screenshot={screenshot['path']}{ephemeral}") + screenshot_ref = screenshot.get("a0_path") or screenshot["path"] + lines.append(f"- recent_screenshot={screenshot_ref}{ephemeral}") context_id = str(state.get("context_id") or "").strip() if context_id: lines.append(f"- screenshot_context={context_id}") diff --git a/plugins/_desktop/skills/linux-desktop/SKILL.md b/plugins/_desktop/skills/linux-desktop/SKILL.md index a9b189762..54931bae4 100644 --- a/plugins/_desktop/skills/linux-desktop/SKILL.md +++ b/plugins/_desktop/skills/linux-desktop/SKILL.md @@ -38,7 +38,7 @@ The Desktop is an observe-act-verify control surface. Use this decision hierarch 3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands. 4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation. 5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output. -6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations return temporary context paths. Do not report from an earlier screenshot path. +6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations with `--context-id` return chat-scoped screenshot paths. Do not report from an earlier screenshot path. Keep these standing rules: @@ -68,7 +68,7 @@ $DESKTOP key ctrl+s The script targets the persistent `agent-zero-desktop` X display, sets `DISPLAY`, `XAUTHORITY`, and `HOME` to the XFCE profile, then uses `xdotool` for input. Startup normally prepares this session. If `check` fails during explicit Desktop work, report that the Desktop runtime is not ready instead of installing packages ad hoc. -If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Use any returned shell screenshot path promptly; only the latest temporary context screenshot is retained. +If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Shell screenshots captured with `--context-id` live in the owning chat's screenshot folder; screenshots without a chat context remain temporary. For direct app launches without coordinates: diff --git a/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh b/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh index cff3fb288..0abbf7e64 100755 --- a/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh +++ b/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh @@ -60,7 +60,7 @@ Commands: observe --json [--screenshot] [--context-id ID] Return structured state, optionally with a fresh screenshot. screenshot [PATH] [--context-id ID] - Capture the Desktop to PATH, or to the temporary context screenshot directory. + Capture the Desktop to PATH, or to the chat screenshot directory when context-id is set. active-window Print the active window name. geometry PATTERN Print the first matching visible window geometry. wait-window PATTERN Wait for a visible matching window and print its id. diff --git a/plugins/_desktop/webui/desktop-store.js b/plugins/_desktop/webui/desktop-store.js index e106f5f12..75f9d4371 100644 --- a/plugins/_desktop/webui/desktop-store.js +++ b/plugins/_desktop/webui/desktop-store.js @@ -258,6 +258,7 @@ const model = { _desktopFrameHost: null, _desktopFrameLoadHandler: null, _desktopKeepaliveHost: null, + _desktopDisplaySizes: {}, _desktopIntentionalShutdown: false, async init(element = null) { @@ -1499,7 +1500,7 @@ const model = { this.stopXpraDesktopPrime(); this._desktopPrimeAttempts = 0; } - if (this.applyXpraDesktopFrameMode(options.frame || null)) return; + if (this.applyXpraDesktopFrameMode(options.frame || null, options)) return; if (this._desktopPrimeAttempts >= XPRA_DESKTOP_PRIME_ATTEMPTS) return; this._desktopPrimeAttempts += 1; if (this._desktopPrimeTimer) globalThis.clearTimeout(this._desktopPrimeTimer); @@ -1540,8 +1541,12 @@ const model = { const windows = Object.values(client.id_to_window || {}); if (!client.connected || !windows.length) return false; - const width = Math.round(container.clientWidth || remoteWindow.innerWidth || 0); - const height = Math.round(container.clientHeight || remoteWindow.innerHeight || 0); + const token = options.token || this.session?.desktop?.token || ""; + const displaySize = options.displaySize || this.desktopDisplaySizeForToken(token); + const viewportWidth = Math.round(container.clientWidth || remoteWindow.innerWidth || 0); + const viewportHeight = Math.round(container.clientHeight || remoteWindow.innerHeight || 0); + const width = Math.round(displaySize?.width || viewportWidth || 0); + const height = Math.round(displaySize?.height || viewportHeight || 0); if (width > 0 && height > 0) { client.desktop_width = width; client.desktop_height = height; @@ -1574,6 +1579,26 @@ const model = { } }, + desktopDisplaySizeForToken(token = "") { + const key = String(token || "").trim(); + const size = key ? this._desktopDisplaySizes?.[key] : null; + const width = Math.round(Number(size?.width || 0)); + const height = Math.round(Number(size?.height || 0)); + return width > 0 && height > 0 ? { width, height } : null; + }, + + rememberDesktopDisplaySize(token = "", width = 0, height = 0) { + const key = String(token || "").trim(); + const normalizedWidth = Math.round(Number(width || 0)); + const normalizedHeight = Math.round(Number(height || 0)); + if (!key || normalizedWidth <= 0 || normalizedHeight <= 0) return null; + this._desktopDisplaySizes = { + ...(this._desktopDisplaySizes || {}), + [key]: { width: normalizedWidth, height: normalizedHeight }, + }; + return this._desktopDisplaySizes[key]; + }, + installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container) { if (!frame || !remoteWindow || !remoteDocument || !client) return null; const store = this; @@ -1584,8 +1609,10 @@ const model = { const metrics = () => { const desktopWidth = Math.max(1, finite(client.desktop_width || container?.clientWidth || remoteWindow.innerWidth, 1)); const desktopHeight = Math.max(1, finite(client.desktop_height || container?.clientHeight || remoteWindow.innerHeight, 1)); - const clientWidth = Math.max(1, finite(container?.clientWidth || remoteWindow.innerWidth, desktopWidth)); - const clientHeight = Math.max(1, finite(container?.clientHeight || remoteWindow.innerHeight, desktopHeight)); + const primaryWindow = Object.values(client.id_to_window || {})[0]; + const canvas = primaryWindow?.canvas; + const clientWidth = Math.max(1, finite(canvas?.clientWidth || canvas?.width || container?.clientWidth || remoteWindow.innerWidth, desktopWidth)); + const clientHeight = Math.max(1, finite(canvas?.clientHeight || canvas?.height || container?.clientHeight || remoteWindow.innerHeight, desktopHeight)); return { desktopWidth, desktopHeight, @@ -1683,8 +1710,10 @@ const model = { }, fitXpraDesktopWindowElement(xpraWindow, width, height) { - const cssWidth = `${Math.max(1, Number(width || 0))}px`; - const cssHeight = `${Math.max(1, Number(height || 0))}px`; + const normalizedWidth = Math.max(1, Math.round(Number(width || 0))); + const normalizedHeight = Math.max(1, Math.round(Number(height || 0))); + const cssWidth = `${normalizedWidth}px`; + const cssHeight = `${normalizedHeight}px`; const windowElement = xpraWindow?.div; const canvas = xpraWindow?.canvas; windowElement?.style?.setProperty("left", "0px", "important"); @@ -1698,6 +1727,12 @@ const model = { canvas?.style?.setProperty("height", cssHeight, "important"); canvas?.style?.setProperty("display", "block", "important"); canvas?.style?.setProperty("margin", "0", "important"); + if (canvas) { + if (canvas.width !== normalizedWidth) canvas.width = normalizedWidth; + if (canvas.height !== normalizedHeight) canvas.height = normalizedHeight; + canvas.setAttribute("width", String(normalizedWidth)); + canvas.setAttribute("height", String(normalizedHeight)); + } }, installXpraDesktopWheelBridge(remoteWindow, xpraWindow) { @@ -2139,6 +2174,11 @@ const model = { const response = await fetch(`/desktop/resize?${params.toString()}`, { credentials: "same-origin" }); if (response.ok) { const result = await response.json().catch(() => ({})); + const displaySize = this.rememberDesktopDisplaySize( + token, + result?.width || width, + result?.height || height, + ); this._desktopResizeKey = key; const activeFrame = this.desktopFrame(frame); const activeTarget = activeFrame?.parentElement || activeFrame; @@ -2153,7 +2193,7 @@ const model = { } } if (result?.reload) this.reloadDesktopFrame(activeFrame || frame); - this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame }); + this.primeXpraDesktopFrame({ reset: true, frame: activeFrame || frame, token, displaySize }); } } catch (error) { console.warn("Desktop resize skipped", error); diff --git a/tests/test_browser_agent_regressions.py b/tests/test_browser_agent_regressions.py index ab3987567..160e1bde8 100644 --- a/tests/test_browser_agent_regressions.py +++ b/tests/test_browser_agent_regressions.py @@ -2477,7 +2477,7 @@ async def test_browser_runtime_remounts_initial_changed_viewport(): @pytest.mark.anyio -async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeypatch, tmp_path): +async def test_browser_runtime_screenshot_file_defaults_to_chat_scoped_artifact(monkeypatch, tmp_path): screenshot_calls = [] def fake_get_abs_path(*parts): @@ -2512,15 +2512,15 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp result = await core.screenshot_file(5, quality=500) - assert "path" not in result - assert "a0_path" not in result + assert Path(result["path"]).read_bytes() == b"image-bytes" + assert result["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/browser/browser-5-") assert result["context_id"] == "ctx/id" assert result["mime"] == "image/jpeg" - assert result["ephemeral"] is True - assert result["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX) + assert result["ephemeral"] is False + assert result["chat_scoped"] is True assert result["vision_load"] == { "tool_name": "vision_load", - "tool_args": {"paths": [result["ephemeral_ref"]]}, + "tool_args": {"paths": [result["a0_path"]]}, } assert "image" not in result assert not list((tmp_path / "tmp" / "browser" / "screenshots").rglob("*.jpg")) @@ -2528,7 +2528,6 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp assert screenshot_calls[-1]["quality"] == 95 assert screenshot_calls[-1]["full_page"] is False assert "path" not in screenshot_calls[-1] - assert ephemeral_images.consume_image(result["ephemeral_ref"], context_id="ctx/id").data_url == "data:image/jpeg;base64,aW1hZ2UtYnl0ZXM=" png_path = tmp_path / "custom.png" png_result = await core.screenshot_file(5, quality=1, full_page=True, path=str(png_path)) @@ -2543,9 +2542,27 @@ async def test_browser_runtime_screenshot_file_defaults_to_ephemeral_ref(monkeyp @pytest.mark.anyio -async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch): +async def test_vision_load_materializes_ephemeral_browser_refs(monkeypatch, tmp_path): + monkeypatch.setitem(sys.modules, "helpers.tool", SimpleNamespace(Response=_TestResponse, Tool=_TestTool)) + history_stub = ModuleType("helpers.history") + + class _RawMessage(dict): + def __init__(self, raw_content, preview): + super().__init__(raw_content=raw_content, preview=preview) + + history_stub.RawMessage = _RawMessage + monkeypatch.setitem(sys.modules, "helpers.history", history_stub) + monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False) import tools.vision_load as vision_load_module + def fake_get_abs_path(*parts): + return str(tmp_path.joinpath(*parts)) + + def fake_normalize_a0_path(path): + return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/") + + monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path) + monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path) monkeypatch.setattr( vision_load_module.plugins, "get_plugin_config", @@ -2561,7 +2578,7 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch): hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)), hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)), ) - ref = ephemeral_images.put_image( + ref = vision_load_module.ephemeral_images.put_image( context_id="ctx-vision", mime="image/jpeg", data=SMALL_JPEG_10X10, @@ -2580,10 +2597,13 @@ async def test_vision_load_consumes_ephemeral_browser_refs(monkeypatch): response = await tool.execute(paths=[ref]) await tool.after_execution(response) - assert ephemeral_images.get_image(ref, context_id="ctx-vision") is None + assert vision_load_module.ephemeral_images.get_image(ref, context_id="ctx-vision") is None assert tool.loaded_paths == ["browser-shot.jpg"] raw_message = messages[0][1]["content"] - assert raw_message.raw_content[0]["image_url"]["url"] == f"data:image/jpeg;base64,{SMALL_JPEG_10X10}" + stored_ref = raw_message["raw_content"][0]["image_url"]["url"] + assert stored_ref.startswith("/a0/usr/chats/ctx-vision/screenshots/browser/browser-shot-") + stored_path = tmp_path / stored_ref.removeprefix("/a0/") + assert stored_path.read_bytes() == __import__("base64").b64decode(SMALL_JPEG_10X10) assert updates[-1]["result"] == "1 images loaded, 0 skipped" diff --git a/tests/test_host_browser_connector.py b/tests/test_host_browser_connector.py index 452a11d7e..20ff61fb5 100644 --- a/tests/test_host_browser_connector.py +++ b/tests/test_host_browser_connector.py @@ -12,8 +12,8 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1] if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) -from helpers import ephemeral_images from plugins._a0_connector.helpers import ws_runtime +from plugins._browser.helpers import connector_runtime as connector_runtime_module from plugins._browser.helpers.connector_runtime import ( ConnectorBrowserRuntime, _agent_uses_local_chat_model, @@ -330,7 +330,15 @@ def test_connector_runtime_adds_docker_recovery_to_host_errors(): assert "/browser container" in message -def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path): +def test_host_browser_artifacts_become_chat_scoped_files(monkeypatch, tmp_path): + def fake_get_abs_path(*parts): + return str(tmp_path.joinpath(*parts)) + + def fake_normalize_a0_path(path): + return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/") + + monkeypatch.setattr(connector_runtime_module.chat_media.files, "get_abs_path", fake_get_abs_path) + monkeypatch.setattr(connector_runtime_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path) runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host")) result = runtime._materialize_artifact( @@ -352,19 +360,15 @@ def test_host_browser_artifacts_become_context_scoped_ephemeral_refs(tmp_path): inner = result[0]["result"] assert "artifact" not in inner - assert "path" not in inner - assert "a0_path" not in inner + assert Path(inner["path"]).read_bytes() == b"fake" + assert inner["a0_path"].startswith("/a0/usr/chats/ctx-host/screenshots/browser/shot-") assert inner["context_id"] == "ctx-host" - assert inner["ephemeral"] is True - assert inner["ephemeral_ref"].startswith(ephemeral_images.REF_PREFIX) - assert inner["vision_load"]["tool_args"]["paths"] == [inner["ephemeral_ref"]] - assert ephemeral_images.consume_image(inner["ephemeral_ref"], context_id="ctx-host").data_url == "data:image/jpeg;base64,ZmFrZQ==" - assert not list(tmp_path.rglob("shot.jpg")) + assert inner["ephemeral"] is False + assert inner["chat_scoped"] is True + assert inner["vision_load"]["tool_args"]["paths"] == [inner["a0_path"]] def test_host_browser_artifact_materialization_rejects_oversized_payload(monkeypatch, tmp_path): - import plugins._browser.helpers.connector_runtime as connector_runtime_module - monkeypatch.setattr(connector_runtime_module, "MAX_ARTIFACT_SIZE_BYTES", 2) runtime = ConnectorBrowserRuntime("ctx-host", _agent("ctx-host")) diff --git a/tests/test_office_canvas_setup.py b/tests/test_office_canvas_setup.py index 61c2dc7b2..535237acf 100644 --- a/tests/test_office_canvas_setup.py +++ b/tests/test_office_canvas_setup.py @@ -264,6 +264,14 @@ def test_desktop_plugin_owns_routes_runtime_surface_and_state_paths(): assert "DESKTOP_RUNTIME_INSTALL_MESSAGE" in desktop_store assert "openDesktopWhenRuntimeReady" in desktop_store assert "isDesktopRuntimeInstalling" in desktop_store + assert "_desktopDisplaySizes: {}" in desktop_store + assert "desktopDisplaySizeForToken(token" in desktop_store + assert "rememberDesktopDisplaySize(token" in desktop_store + assert "options.displaySize || this.desktopDisplaySizeForToken(token)" in desktop_store + assert "result?.width || width" in desktop_store + assert "canvas.width = normalizedWidth" in desktop_store + assert "canvas.height = normalizedHeight" in desktop_store + assert "canvas?.clientWidth || canvas?.width" in desktop_store assert "Installing Agent Zero Desktop runtime dependencies" in desktop_session assert "__a0XpraOffsetWarnPatched" in desktop_store assert "window does not fit in canvas, offsets" in desktop_store diff --git a/tests/test_office_desktop_state.py b/tests/test_office_desktop_state.py index e2ab36454..bf4f18bea 100644 --- a/tests/test_office_desktop_state.py +++ b/tests/test_office_desktop_state.py @@ -191,7 +191,8 @@ def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeypatch): - monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path) + monkeypatch.setattr(desktop_state, "BASE_DIR", tmp_path) + monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path / "tmp" / "desktop" / "screenshots") capabilities = {"xwd": "/usr/bin/xwd"} env = {"DISPLAY": ":120"} @@ -222,7 +223,7 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp monkeypatch.setattr(desktop_state, "run", fake_run) monkeypatch.setitem(sys.modules, "PIL", pil_module) monkeypatch.setitem(sys.modules, "PIL.Image", image_module) - stale_path = tmp_path / "ctx_id" / "stale.png" + stale_path = tmp_path / "tmp" / "desktop" / "screenshots" / "ctx_id" / "stale.png" stale_path.parent.mkdir(parents=True) stale_path.write_bytes(b"stale") @@ -236,12 +237,14 @@ def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeyp path = Path(screenshot["path"]) assert screenshot["ok"] is True - assert screenshot["ephemeral"] is True + assert screenshot["ephemeral"] is False + assert screenshot["chat_scoped"] is True assert screenshot["context_id"] == "ctx_id" - assert path.parent == tmp_path / "ctx_id" + assert screenshot["a0_path"].startswith("/a0/usr/chats/ctx_id/screenshots/desktop/desktop-") + assert path.parent == tmp_path / "usr" / "chats" / "ctx_id" / "screenshots" / "desktop" assert path.name.startswith("desktop-") assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path) - assert not stale_path.exists() + assert stale_path.exists() def test_desktop_state_default_screenshot_returns_ephemeral_ref(tmp_path, monkeypatch): diff --git a/tests/test_tool_action_contracts.py b/tests/test_tool_action_contracts.py index c38a9c711..de98f259f 100644 --- a/tests/test_tool_action_contracts.py +++ b/tests/test_tool_action_contracts.py @@ -699,3 +699,35 @@ def test_computer_use_remote_start_session_reports_backend_features_and_windows_ assert "backend=windows/windows" in message assert "features=uia-tree-snapshot, uia-structural-targeting" in message assert "host-computer-use-windows" in message + + +def test_computer_use_remote_capture_artifact_is_chat_scoped(monkeypatch, tmp_path: Path): + module = _load_computer_use_remote_tool(monkeypatch) + + def fake_get_abs_path(*parts): + return str(tmp_path.joinpath(*parts)) + + def fake_normalize_a0_path(path): + return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/") + + monkeypatch.setattr(module.chat_media.files, "get_abs_path", fake_get_abs_path) + monkeypatch.setattr(module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path) + + tool = object.__new__(module.ComputerUseRemote) + tool.agent = types.SimpleNamespace(context=types.SimpleNamespace(id="ctx-computer")) + + display_ref, capture_id = tool._resolve_capture_ref( + { + "artifact": { + "filename": "capture.png", + "mime": "image/png", + "encoding": "base64", + "data": "ZmFrZQ==", + }, + } + ) + + assert display_ref.startswith("/a0/usr/chats/ctx-computer/screenshots/computer-use/capture-") + stored_path = tmp_path / display_ref.removeprefix("/a0/") + assert stored_path.read_bytes() == b"fake" + assert capture_id == stored_path.stem diff --git a/tests/test_vision_load_image_refs.py b/tests/test_vision_load_image_refs.py new file mode 100644 index 000000000..2842a5e1e --- /dev/null +++ b/tests/test_vision_load_image_refs.py @@ -0,0 +1,123 @@ +import types +from types import SimpleNamespace +import sys +from pathlib import Path + +import pytest + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from helpers import images + + +class _TestResponse(SimpleNamespace): + def __init__(self, message="", break_loop=False, **kwargs): + super().__init__(message=message, break_loop=break_loop, **kwargs) + + +class _TestTool: + def __init__( + self, + agent=None, + name="", + method=None, + args=None, + message="", + loop_data=None, + **kwargs, + ): + self.agent = agent + self.name = name + self.method = method + self.args = args or {} + self.message = message + self.loop_data = loop_data + + +def _install_tool_stub(monkeypatch): + tool_stub = types.ModuleType("helpers.tool") + tool_stub.Response = _TestResponse + tool_stub.Tool = _TestTool + history_stub = types.ModuleType("helpers.history") + + class _RawMessage(dict): + def __init__(self, raw_content, preview): + super().__init__(raw_content=raw_content, preview=preview) + + history_stub.RawMessage = _RawMessage + monkeypatch.setitem(sys.modules, "helpers.tool", tool_stub) + monkeypatch.setitem(sys.modules, "helpers.history", history_stub) + monkeypatch.delitem(sys.modules, "tools.vision_load", raising=False) + + +def test_prepare_content_keeps_missing_local_image_refs_strict(): + missing_path = "/tmp/a0-missing-desktop-screenshot.png" + + with pytest.raises(FileNotFoundError): + images.prepare_content( + [{"type": "image_url", "image_url": {"url": missing_path}}] + ) + + +@pytest.mark.anyio +async def test_vision_load_materializes_local_image_to_chat_artifact(monkeypatch, tmp_path): + _install_tool_stub(monkeypatch) + import tools.vision_load as vision_load_module + + def fake_get_abs_path(*parts): + return str(tmp_path.joinpath(*parts)) + + def fake_normalize_a0_path(path): + return "/a0/" + str(Path(path).relative_to(tmp_path)).replace("\\", "/") + + monkeypatch.setattr(vision_load_module.chat_media.files, "get_abs_path", fake_get_abs_path) + monkeypatch.setattr(vision_load_module.chat_media.files, "normalize_a0_path", fake_normalize_a0_path) + monkeypatch.setattr( + vision_load_module.plugins, + "get_plugin_config", + lambda *args, **kwargs: {"chat_model": {"max_embeds": 10}}, + ) + + async def direct_call(func, *args, **kwargs): + return func(*args, **kwargs) + + monkeypatch.setattr( + vision_load_module.runtime, + "call_development_function", + direct_call, + ) + + image_path = tmp_path / "sample-image.png" + image_path.write_bytes(b"png-data") + + tool_results = [] + messages = [] + updates = [] + agent = SimpleNamespace( + context=SimpleNamespace(id="ctx-vision"), + agent_name="Agent 0", + hist_add_tool_result=lambda *args, **kwargs: tool_results.append((args, kwargs)), + hist_add_message=lambda *args, **kwargs: messages.append((args, kwargs)), + ) + tool = vision_load_module.VisionLoad( + agent=agent, + name="vision_load", + method=None, + args={"paths": [str(image_path)]}, + message="", + loop_data=None, + ) + tool.log = SimpleNamespace(id="vision-log", update=lambda **kwargs: updates.append(kwargs)) + + response = await tool.execute(paths=[str(image_path)]) + image_path.unlink() + await tool.after_execution(response) + + raw_message = messages[0][1]["content"] + stored_ref = raw_message["raw_content"][0]["image_url"]["url"] + assert stored_ref.startswith("/a0/usr/chats/ctx-vision/images/vision-load/sample-image-") + stored_path = tmp_path / stored_ref.removeprefix("/a0/") + assert stored_path.read_bytes() == b"png-data" + assert updates[-1]["result"] == "1 images loaded, 0 skipped" diff --git a/tools/vision_load.py b/tools/vision_load.py index 90f2133e6..7358afbde 100644 --- a/tools/vision_load.py +++ b/tools/vision_load.py @@ -1,6 +1,6 @@ from helpers.print_style import PrintStyle from helpers.tool import Tool, Response -from helpers import runtime, files, plugins, ephemeral_images +from helpers import runtime, files, plugins, ephemeral_images, images, chat_media from mimetypes import guess_type from helpers import history @@ -27,7 +27,7 @@ class VisionLoad(Tool): else [] ) - for path, display_path in limited_paths: + for idx, (path, display_path) in enumerate(limited_paths): if not path: continue if ephemeral_images.is_ref(path): @@ -38,12 +38,16 @@ class VisionLoad(Tool): if image is None: continue display = image.display_name or display_path - self.images_dict[display] = image.data_url - self.loaded_paths.append(display) + stored_ref = self._store_ephemeral_image(image) + if stored_ref: + self.images_dict[display] = stored_ref + self.loaded_paths.append(display) continue if self._is_data_image_url(path): - self.images_dict[display_path] = path - self.loaded_paths.append(display_path) + stored_ref = self._store_data_url(path, preferred_name=f"vision-load-{idx + 1}.png") + if stored_ref: + self.images_dict[display_path] = stored_ref + self.loaded_paths.append(display_path) continue if not await runtime.call_development_function(files.exists, str(path)): continue @@ -51,8 +55,12 @@ class VisionLoad(Tool): if path not in self.images_dict: mime_type, _ = guess_type(str(path)) if mime_type and mime_type.startswith("image/"): - self.images_dict[display_path] = str(path) - self.loaded_paths.append(display_path) + try: + stored_ref = self._store_local_image(path, preferred_name=files.basename(path)) + self.images_dict[display_path] = stored_ref + self.loaded_paths.append(display_path) + except (FileNotFoundError, OSError, ValueError): + continue return Response(message="dummy", break_loop=False) @@ -65,6 +73,48 @@ class VisionLoad(Tool): def _context_id(self) -> str: return str(getattr(getattr(self.agent, "context", None), "id", "") or "").strip() + def _store_ephemeral_image(self, image: ephemeral_images.EphemeralImage) -> str: + context_id = self._context_id() + if not context_id: + return image.data_url + source = chat_media.infer_source(image.ref, image.display_name) + category = chat_media.category_for_source(source) + saved = chat_media.save_image_base64( + context_id=context_id, + data=image.data, + mime_type=image.mime, + category=category, + source=source, + preferred_name=image.display_name, + ) + return saved.a0_path + + def _store_data_url(self, data_url: str, *, preferred_name: str = "") -> str: + context_id = self._context_id() + if not context_id: + return data_url + source = chat_media.infer_source(data_url, preferred_name) + category = chat_media.category_for_source(source) + saved = chat_media.save_image_data_url( + context_id=context_id, + data_url=data_url, + category=category, + source=source, + preferred_name=preferred_name, + ) + return saved.a0_path + + def _store_local_image(self, path: str, *, preferred_name: str = "") -> str: + context_id = self._context_id() + if not context_id: + return images.to_data_url(path) + return chat_media.materialize_image_ref( + context_id=context_id, + url=path, + source=chat_media.infer_source(path, preferred_name), + preferred_name=preferred_name, + ) + @staticmethod def _is_data_image_url(value: str) -> bool: normalized = str(value or "").strip().lower()