diff --git a/plugins/_desktop/extensions/python/message_loop_prompts_after/_55_include_desktop_state.py b/plugins/_desktop/extensions/python/message_loop_prompts_after/_55_include_desktop_state.py index 3fe197ad5..3efa731e6 100644 --- a/plugins/_desktop/extensions/python/message_loop_prompts_after/_55_include_desktop_state.py +++ b/plugins/_desktop/extensions/python/message_loop_prompts_after/_55_include_desktop_state.py @@ -7,7 +7,8 @@ from plugins._desktop.helpers import prompt_context class IncludeDesktopState(Extension): async def execute(self, loop_data: LoopData = LoopData(), **kwargs): - context = prompt_context.build_context() + context_id = str(getattr(getattr(self.agent, "context", None), "id", "") or "") + context = prompt_context.build_context(context_id=context_id) if not context: loop_data.extras_temporary.pop("desktop_state", None) return diff --git a/plugins/_desktop/helpers/desktop_state.py b/plugins/_desktop/helpers/desktop_state.py index bf8662a37..44852923b 100644 --- a/plugins/_desktop/helpers/desktop_state.py +++ b/plugins/_desktop/helpers/desktop_state.py @@ -6,11 +6,14 @@ import os import re import shutil import subprocess +import sys import time from pathlib import Path from typing import Any PROJECT_ROOT = Path(__file__).resolve().parents[3] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) SESSION_ID = "agent-zero-desktop" @@ -20,9 +23,10 @@ STATE_DIR = BASE_DIR / "usr" / "plugins" / PLUGIN_NAME RETIRED_STATE_DIR = BASE_DIR / "usr" / PLUGIN_NAME SESSION_DIR = STATE_DIR / "sessions" PROFILE_DIR = STATE_DIR / "profiles" -SCREENSHOT_DIR = STATE_DIR / "screenshots" +SCREENSHOT_DIR = Path(os.environ.get("A0_DESKTOP_SCREENSHOT_DIR") or BASE_DIR / "tmp" / "desktop" / "screenshots") RECENT_SCREENSHOT_SECONDS = 600 _SAFE_CONTEXT_RE = re.compile(r"[^a-zA-Z0-9_.-]+") +_SCREENSHOT_SUFFIXES = {".png", ".jpg", ".jpeg", ".xwd"} def session_manifest_path(session_id: str = SESSION_ID) -> Path: @@ -47,6 +51,7 @@ def collect_state( include_screenshot: bool = False, screenshot_path: str | Path | None = None, context_id: str = "", + screenshot_transport: str = "ephemeral", ) -> dict[str, Any]: errors: list[str] = [] env_info = resolve_environment(errors=errors) @@ -72,9 +77,11 @@ def collect_state( path=screenshot_path, errors=errors, context_id=context_id, + transport=screenshot_transport, ) return stable_state( + context_id=context_id, display=display, profile_dir=profile_dir, size=size, @@ -94,6 +101,7 @@ def capture_screenshot( path: str | Path | None = None, errors: list[str] | None = None, context_id: str = "", + transport: str = "ephemeral", ) -> dict[str, Any]: local_errors = errors if errors is not None else [] capabilities = capabilities or collect_capabilities() @@ -109,12 +117,18 @@ def capture_screenshot( local_errors.append(message) return {"ok": False, "path": "", "format": "", "captured_at": "", "error": message} + explicit_path = path is not None and str(path).strip() != "" + ephemeral_ref = not explicit_path and str(transport or "").strip().lower() != "path" screenshot_dir = context_screenshot_dir(context_id) - screenshot_dir.mkdir(parents=True, exist_ok=True) + if not explicit_path: + prune_context_screenshots(context_id=context_id) + screenshot_dir.mkdir(parents=True, exist_ok=True) timestamp = time.strftime("%Y%m%d-%H%M%S") - target = Path(path) if path else screenshot_dir / f"desktop-{timestamp}.png" + millis = int((time.time() % 1) * 1000) + target = Path(path) if explicit_path else screenshot_dir / f"desktop-{timestamp}-{millis:03d}.png" target.parent.mkdir(parents=True, exist_ok=True) raw_path = target.with_suffix(".xwd") + safe_context = _safe_context_id(context_id) result = run([xwd, "-root", "-silent", "-out", str(raw_path)], env=env, timeout=8) if result.returncode != 0: @@ -124,12 +138,16 @@ def capture_screenshot( return {"ok": False, "path": "", "format": "", "captured_at": "", "error": detail} if target.suffix.lower() == ".xwd": + if not explicit_path: + prune_context_screenshots(context_id=context_id, keep_path=raw_path) return { "ok": True, "path": str(raw_path), "format": "xwd", "captured_at": iso_now(), "recent": True, + "ephemeral": not explicit_path, + "context_id": safe_context, "error": "", } @@ -141,6 +159,16 @@ def capture_screenshot( width = int(image.width) height = int(image.height) raw_path.unlink(missing_ok=True) + if ephemeral_ref: + return ephemeral_screenshot_result( + target, + context_id=context_id, + image_format=target.suffix.lower().lstrip(".") or "png", + width=width, + height=height, + ) + if not explicit_path: + prune_context_screenshots(context_id=context_id, keep_path=target) return { "ok": True, "path": str(target), @@ -149,12 +177,24 @@ def capture_screenshot( "height": height, "captured_at": iso_now(), "recent": True, + "ephemeral": not explicit_path, + "context_id": safe_context, "error": "", } except Exception as exc: try: converted = convert_xwd_to_image(raw_path, target) raw_path.unlink(missing_ok=True) + if ephemeral_ref: + return ephemeral_screenshot_result( + target, + context_id=context_id, + image_format=target.suffix.lower().lstrip(".") or "png", + width=converted["width"], + height=converted["height"], + ) + if not explicit_path: + prune_context_screenshots(context_id=context_id, keep_path=target) return { "ok": True, "path": str(target), @@ -163,17 +203,34 @@ def capture_screenshot( "height": converted["height"], "captured_at": iso_now(), "recent": True, + "ephemeral": not explicit_path, + "context_id": safe_context, "error": "", } except Exception as fallback_exc: message = f"Pillow could not convert the XWD screenshot: {exc}; fallback parser failed: {fallback_exc}" local_errors.append(message) + if ephemeral_ref: + raw_path.unlink(missing_ok=True) + target.unlink(missing_ok=True) + return { + "ok": False, + "path": "", + "format": "", + "captured_at": iso_now(), + "recent": False, + "ephemeral": True, + "context_id": safe_context, + "error": message, + } return { "ok": True, "path": str(raw_path), "format": "xwd", "captured_at": iso_now(), "recent": True, + "ephemeral": not explicit_path, + "context_id": safe_context, "error": message, } @@ -518,17 +575,21 @@ def parse_xprop(output: str) -> dict[str, str]: def latest_screenshot(*, context_id: str = "") -> dict[str, Any]: + prune_context_screenshots(context_id=context_id, max_age_seconds=RECENT_SCREENSHOT_SECONDS) screenshot_dir = context_screenshot_dir(context_id) if not screenshot_dir.exists(): return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False} candidates = [ path for path in screenshot_dir.iterdir() - if path.is_file() and path.suffix.lower() in {".png", ".jpg", ".jpeg", ".xwd"} + if path.is_file() and path.suffix.lower() in _SCREENSHOT_SUFFIXES ] if not candidates: return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False} latest = max(candidates, key=lambda item: item.stat().st_mtime) + for candidate in candidates: + if candidate != latest: + candidate.unlink(missing_ok=True) age = max(0.0, time.time() - latest.stat().st_mtime) return { "ok": True, @@ -536,6 +597,8 @@ def latest_screenshot(*, context_id: str = "") -> dict[str, Any]: "format": latest.suffix.lower().lstrip("."), "captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(latest.stat().st_mtime)), "recent": age <= RECENT_SCREENSHOT_SECONDS, + "ephemeral": True, + "context_id": _safe_context_id(context_id), } @@ -543,6 +606,7 @@ def stable_state( *, display: str, profile_dir: str, + context_id: str = "", size: dict[str, int] | None = None, pointer: dict[str, int] | None = None, active_window: dict[str, Any] | None = None, @@ -554,6 +618,7 @@ def stable_state( clean_errors = [str(error) for error in errors or [] if str(error)] return { "ok": not clean_errors, + "context_id": _safe_context_id(context_id), "display": display, "profile_dir": profile_dir, "size": size or {"width": 0, "height": 0}, @@ -594,9 +659,15 @@ def compact_prompt_context(state: dict[str, Any] | None = None) -> str: lines.append("- visible=" + "; ".join(visible)) screenshot = state.get("screenshot") or {} if screenshot.get("recent") and screenshot.get("path"): - lines.append(f"- recent_screenshot={screenshot['path']}") + ephemeral = " ephemeral" if screenshot.get("ephemeral") else "" + lines.append(f"- recent_screenshot={screenshot['path']}{ephemeral}") + context_id = str(state.get("context_id") or "").strip() + if context_id: + lines.append(f"- screenshot_context={context_id}") + context_arg = f" --context-id {context_id}" if context_id else "" lines.append( - "- next=plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot " + "- next=plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot" + f"{context_arg} " "before any coordinate action; prefer focus/key/paste/save/app-native helpers first." ) lines.append( @@ -667,6 +738,75 @@ def image_height(path: Path) -> int: return 0 +def ephemeral_screenshot_result( + path: Path, + *, + context_id: str = "", + image_format: str = "png", + width: int = 0, + height: int = 0, +) -> dict[str, Any]: + from helpers import ephemeral_images + + mime = "image/jpeg" if image_format.lower() in {"jpg", "jpeg"} else "image/png" + safe_context = _safe_context_id(context_id) + ref = ephemeral_images.put_image_bytes( + context_id=str(context_id or "").strip(), + mime=mime, + payload=path.read_bytes(), + name=path.name, + ) + path.unlink(missing_ok=True) + prune_context_screenshots(context_id=context_id) + return { + "ok": True, + "path": "", + "format": image_format, + "mime": mime, + "width": width, + "height": height, + "captured_at": iso_now(), + "recent": True, + "ephemeral": True, + "ephemeral_ref": ref, + "context_id": safe_context, + "vision_load": { + "tool_name": "vision_load", + "tool_args": {"paths": [ref]}, + }, + "error": "", + } + + +def prune_context_screenshots( + *, + context_id: str = "", + keep_path: Path | None = None, + max_age_seconds: float | None = None, +) -> None: + screenshot_dir = context_screenshot_dir(context_id) + if not screenshot_dir.exists(): + return + keep = keep_path.resolve(strict=False) if keep_path else None + now = time.time() + for candidate in screenshot_dir.iterdir(): + if not candidate.is_file() or candidate.suffix.lower() not in _SCREENSHOT_SUFFIXES: + continue + if keep is not None and candidate.resolve(strict=False) == keep: + continue + if max_age_seconds is not None: + try: + if now - candidate.stat().st_mtime <= max_age_seconds: + continue + except OSError: + pass + candidate.unlink(missing_ok=True) + try: + screenshot_dir.rmdir() + except OSError: + pass + + def iso_now() -> str: return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) @@ -696,6 +836,7 @@ def main(argv: list[str] | None = None) -> int: payload = collect_state( include_screenshot=bool(args.screenshot), context_id=str(args.context_id or ""), + screenshot_transport="path", ) print(json.dumps(payload, sort_keys=True)) return 0 if payload.get("ok") else 1 @@ -709,6 +850,7 @@ def main(argv: list[str] | None = None) -> int: path=args.path, errors=errors, context_id=str(args.context_id or ""), + transport="path", ) if args.json: print(json.dumps(payload, sort_keys=True)) diff --git a/plugins/_desktop/helpers/prompt_context.py b/plugins/_desktop/helpers/prompt_context.py index 5790fbafa..9350f4262 100644 --- a/plugins/_desktop/helpers/prompt_context.py +++ b/plugins/_desktop/helpers/prompt_context.py @@ -3,12 +3,12 @@ from __future__ import annotations from plugins._desktop.helpers import desktop_state -def build_context() -> str: +def build_context(context_id: str = "") -> str: if not desktop_state.session_manifest_exists(): return "" try: return desktop_state.compact_prompt_context( - desktop_state.collect_state(include_screenshot=False), + desktop_state.collect_state(include_screenshot=False, context_id=context_id), ) except Exception as exc: return ( diff --git a/plugins/_desktop/hooks.py b/plugins/_desktop/hooks.py index 6a027e48e..d122508e0 100644 --- a/plugins/_desktop/hooks.py +++ b/plugins/_desktop/hooks.py @@ -89,7 +89,7 @@ def cleanup_stale_runtime_state(force: bool = False) -> dict[str, Any]: errors: list[str] = [] _migrate_retired_plugin_state(migrated, warnings, errors) - _migrate_unscoped_screenshots(migrated, warnings, errors) + _remove_persisted_screenshots(warnings, errors) retired_packages = _installed_packages(RETIRED_RUNTIME_PACKAGES) if retired_packages: @@ -142,33 +142,18 @@ def _migrate_retired_plugin_state( ) -def _migrate_unscoped_screenshots( - migrated: list[str], +def _remove_persisted_screenshots( warnings: list[str], errors: list[str], ) -> None: screenshots_dir = STATE_DIR / "screenshots" if not screenshots_dir.exists(): return - legacy_screenshots = [ - path - for path in screenshots_dir.iterdir() - if path.is_file() and path.suffix.lower() in {".png", ".jpg", ".jpeg", ".xwd"} - ] - if not legacy_screenshots: - return - - context_dir = screenshots_dir / "default" - context_dir.mkdir(parents=True, exist_ok=True) - for screenshot in legacy_screenshots: - state_migration.migrate_retired_state_tree( - source=screenshot, - destination=context_dir / screenshot.name, - owner="Desktop screenshot", - migrated=migrated, - warnings=warnings, - errors=errors, - ) + try: + shutil.rmtree(screenshots_dir) + warnings.append(f"Removed retired persistent Desktop screenshots: {screenshots_dir}") + except Exception as exc: + errors.append(f"Failed to remove retired persistent Desktop screenshots at {screenshots_dir}: {exc}") def _begin_runtime_preparation() -> None: diff --git a/plugins/_desktop/skills/linux-desktop/SKILL.md b/plugins/_desktop/skills/linux-desktop/SKILL.md index ebc0099b1..0485b4334 100644 --- a/plugins/_desktop/skills/linux-desktop/SKILL.md +++ b/plugins/_desktop/skills/linux-desktop/SKILL.md @@ -30,7 +30,7 @@ The Desktop is an observe-act-verify control surface. Use this decision hierarch 3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands. 4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation. 5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output. -6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Do not report from an earlier screenshot path. +6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations return temporary context paths. Do not report from an earlier screenshot path. Keep these standing rules: @@ -60,7 +60,7 @@ $DESKTOP key ctrl+s The script targets the persistent `agent-zero-desktop` X display, sets `DISPLAY`, `XAUTHORITY`, and `HOME` to the XFCE profile, then uses `xdotool` for input. Startup normally prepares this session. If `check` fails during explicit Desktop work, report that the Desktop runtime is not ready instead of installing packages ad hoc. -If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. +If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Use any returned shell screenshot path promptly; only the latest temporary context screenshot is retained. For direct app launches without coordinates: diff --git a/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh b/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh index 75c0f9dff..cff3fb288 100755 --- a/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh +++ b/plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh @@ -60,7 +60,7 @@ Commands: observe --json [--screenshot] [--context-id ID] Return structured state, optionally with a fresh screenshot. screenshot [PATH] [--context-id ID] - Capture the Desktop to PATH, or to the default screenshot directory. + Capture the Desktop to PATH, or to the temporary context screenshot directory. active-window Print the active window name. geometry PATTERN Print the first matching visible window geometry. wait-window PATTERN Wait for a visible matching window and print its id. diff --git a/tests/test_office_desktop_state.py b/tests/test_office_desktop_state.py index 83742c582..e2ab36454 100644 --- a/tests/test_office_desktop_state.py +++ b/tests/test_office_desktop_state.py @@ -185,11 +185,66 @@ def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp assert screenshot["ok"] is True assert screenshot["path"] == str(tmp_path / "shot.png") assert screenshot["format"] == "png" + assert screenshot["ephemeral"] is False assert (tmp_path / "shot.png").read_bytes() == b"png" assert not (tmp_path / "shot.xwd").exists() -def test_desktop_state_default_screenshot_path_is_context_scoped(tmp_path, monkeypatch): +def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeypatch): + monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path) + capabilities = {"xwd": "/usr/bin/xwd"} + env = {"DISPLAY": ":120"} + + def fake_run(command, *, env, timeout): + raw_path = Path(command[command.index("-out") + 1]) + raw_path.write_bytes(b"xwd") + return _completed(command) + + image_module = types.ModuleType("PIL.Image") + + class FakeImage: + width = 320 + height = 240 + + def __enter__(self): + return self + + def __exit__(self, *_args): + return False + + def save(self, target): + Path(target).write_bytes(b"png") + + image_module.open = lambda _path: FakeImage() + pil_module = types.ModuleType("PIL") + pil_module.Image = image_module + + monkeypatch.setattr(desktop_state, "run", fake_run) + monkeypatch.setitem(sys.modules, "PIL", pil_module) + monkeypatch.setitem(sys.modules, "PIL.Image", image_module) + stale_path = tmp_path / "ctx_id" / "stale.png" + stale_path.parent.mkdir(parents=True) + stale_path.write_bytes(b"stale") + + screenshot = desktop_state.capture_screenshot( + env, + capabilities, + errors=[], + context_id="ctx/id", + transport="path", + ) + + path = Path(screenshot["path"]) + assert screenshot["ok"] is True + assert screenshot["ephemeral"] is True + assert screenshot["context_id"] == "ctx_id" + assert path.parent == tmp_path / "ctx_id" + assert path.name.startswith("desktop-") + assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path) + assert not stale_path.exists() + + +def test_desktop_state_default_screenshot_returns_ephemeral_ref(tmp_path, monkeypatch): monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path) capabilities = {"xwd": "/usr/bin/xwd"} env = {"DISPLAY": ":120"} @@ -229,11 +284,13 @@ def test_desktop_state_default_screenshot_path_is_context_scoped(tmp_path, monke context_id="ctx/id", ) - path = Path(screenshot["path"]) assert screenshot["ok"] is True - assert path.parent == tmp_path / "ctx_id" - assert path.name.startswith("desktop-") - assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path) + assert screenshot["path"] == "" + assert screenshot["ephemeral"] is True + assert screenshot["ephemeral_ref"].startswith("a0-ephemeral-image://") + assert screenshot["vision_load"]["tool_args"]["paths"] == [screenshot["ephemeral_ref"]] + assert screenshot["context_id"] == "ctx_id" + assert not (tmp_path / "ctx_id").exists() def test_xwd_fallback_parser_handles_truecolor_pixels(tmp_path, monkeypatch): diff --git a/tests/test_office_document_store.py b/tests/test_office_document_store.py index cd8324f9b..9a2366e01 100644 --- a/tests/test_office_document_store.py +++ b/tests/test_office_document_store.py @@ -1631,7 +1631,8 @@ def test_desktop_cleanup_moves_retired_state_to_plugin_state(tmp_path, monkeypat assert result["ok"] is True assert (plugin_state / "profiles" / "agent-zero-desktop" / "profile.txt").read_text(encoding="utf-8") == "profile\n" assert (plugin_state / "sessions" / "agent-zero-desktop.json").read_text(encoding="utf-8") == "{}\n" - assert (plugin_state / "screenshots" / "default" / "desktop.png").read_bytes() == b"png" + assert not (plugin_state / "screenshots").exists() + assert any("Removed retired persistent Desktop screenshots" in warning for warning in result["warnings"]) assert not retired_state.exists()