Make Desktop screenshots ephemeral by default

Route in-process Xpra/Desktop screenshot observations through context-scoped ephemeral image refs with vision_load payloads, matching the privacy posture of computer-use and browser screenshots. Keep desktopctl shell observations path-based with aggressive pruning so image payloads are not printed into shell logs, and preserve explicit screenshot paths as durable user-owned artifacts.
This commit is contained in:
Alessandro 2026-05-22 10:21:28 +02:00
parent 430c48d1a5
commit c1bdde057c
8 changed files with 226 additions and 40 deletions

View file

@ -7,7 +7,8 @@ from plugins._desktop.helpers import prompt_context
class IncludeDesktopState(Extension):
async def execute(self, loop_data: LoopData = LoopData(), **kwargs):
context = prompt_context.build_context()
context_id = str(getattr(getattr(self.agent, "context", None), "id", "") or "")
context = prompt_context.build_context(context_id=context_id)
if not context:
loop_data.extras_temporary.pop("desktop_state", None)
return

View file

@ -6,11 +6,14 @@ import os
import re
import shutil
import subprocess
import sys
import time
from pathlib import Path
from typing import Any
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
SESSION_ID = "agent-zero-desktop"
@ -20,9 +23,10 @@ STATE_DIR = BASE_DIR / "usr" / "plugins" / PLUGIN_NAME
RETIRED_STATE_DIR = BASE_DIR / "usr" / PLUGIN_NAME
SESSION_DIR = STATE_DIR / "sessions"
PROFILE_DIR = STATE_DIR / "profiles"
SCREENSHOT_DIR = STATE_DIR / "screenshots"
SCREENSHOT_DIR = Path(os.environ.get("A0_DESKTOP_SCREENSHOT_DIR") or BASE_DIR / "tmp" / "desktop" / "screenshots")
RECENT_SCREENSHOT_SECONDS = 600
_SAFE_CONTEXT_RE = re.compile(r"[^a-zA-Z0-9_.-]+")
_SCREENSHOT_SUFFIXES = {".png", ".jpg", ".jpeg", ".xwd"}
def session_manifest_path(session_id: str = SESSION_ID) -> Path:
@ -47,6 +51,7 @@ def collect_state(
include_screenshot: bool = False,
screenshot_path: str | Path | None = None,
context_id: str = "",
screenshot_transport: str = "ephemeral",
) -> dict[str, Any]:
errors: list[str] = []
env_info = resolve_environment(errors=errors)
@ -72,9 +77,11 @@ def collect_state(
path=screenshot_path,
errors=errors,
context_id=context_id,
transport=screenshot_transport,
)
return stable_state(
context_id=context_id,
display=display,
profile_dir=profile_dir,
size=size,
@ -94,6 +101,7 @@ def capture_screenshot(
path: str | Path | None = None,
errors: list[str] | None = None,
context_id: str = "",
transport: str = "ephemeral",
) -> dict[str, Any]:
local_errors = errors if errors is not None else []
capabilities = capabilities or collect_capabilities()
@ -109,12 +117,18 @@ def capture_screenshot(
local_errors.append(message)
return {"ok": False, "path": "", "format": "", "captured_at": "", "error": message}
explicit_path = path is not None and str(path).strip() != ""
ephemeral_ref = not explicit_path and str(transport or "").strip().lower() != "path"
screenshot_dir = context_screenshot_dir(context_id)
screenshot_dir.mkdir(parents=True, exist_ok=True)
if not explicit_path:
prune_context_screenshots(context_id=context_id)
screenshot_dir.mkdir(parents=True, exist_ok=True)
timestamp = time.strftime("%Y%m%d-%H%M%S")
target = Path(path) if path else screenshot_dir / f"desktop-{timestamp}.png"
millis = int((time.time() % 1) * 1000)
target = Path(path) if explicit_path else screenshot_dir / f"desktop-{timestamp}-{millis:03d}.png"
target.parent.mkdir(parents=True, exist_ok=True)
raw_path = target.with_suffix(".xwd")
safe_context = _safe_context_id(context_id)
result = run([xwd, "-root", "-silent", "-out", str(raw_path)], env=env, timeout=8)
if result.returncode != 0:
@ -124,12 +138,16 @@ def capture_screenshot(
return {"ok": False, "path": "", "format": "", "captured_at": "", "error": detail}
if target.suffix.lower() == ".xwd":
if not explicit_path:
prune_context_screenshots(context_id=context_id, keep_path=raw_path)
return {
"ok": True,
"path": str(raw_path),
"format": "xwd",
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"context_id": safe_context,
"error": "",
}
@ -141,6 +159,16 @@ def capture_screenshot(
width = int(image.width)
height = int(image.height)
raw_path.unlink(missing_ok=True)
if ephemeral_ref:
return ephemeral_screenshot_result(
target,
context_id=context_id,
image_format=target.suffix.lower().lstrip(".") or "png",
width=width,
height=height,
)
if not explicit_path:
prune_context_screenshots(context_id=context_id, keep_path=target)
return {
"ok": True,
"path": str(target),
@ -149,12 +177,24 @@ def capture_screenshot(
"height": height,
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"context_id": safe_context,
"error": "",
}
except Exception as exc:
try:
converted = convert_xwd_to_image(raw_path, target)
raw_path.unlink(missing_ok=True)
if ephemeral_ref:
return ephemeral_screenshot_result(
target,
context_id=context_id,
image_format=target.suffix.lower().lstrip(".") or "png",
width=converted["width"],
height=converted["height"],
)
if not explicit_path:
prune_context_screenshots(context_id=context_id, keep_path=target)
return {
"ok": True,
"path": str(target),
@ -163,17 +203,34 @@ def capture_screenshot(
"height": converted["height"],
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"context_id": safe_context,
"error": "",
}
except Exception as fallback_exc:
message = f"Pillow could not convert the XWD screenshot: {exc}; fallback parser failed: {fallback_exc}"
local_errors.append(message)
if ephemeral_ref:
raw_path.unlink(missing_ok=True)
target.unlink(missing_ok=True)
return {
"ok": False,
"path": "",
"format": "",
"captured_at": iso_now(),
"recent": False,
"ephemeral": True,
"context_id": safe_context,
"error": message,
}
return {
"ok": True,
"path": str(raw_path),
"format": "xwd",
"captured_at": iso_now(),
"recent": True,
"ephemeral": not explicit_path,
"context_id": safe_context,
"error": message,
}
@ -518,17 +575,21 @@ def parse_xprop(output: str) -> dict[str, str]:
def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
prune_context_screenshots(context_id=context_id, max_age_seconds=RECENT_SCREENSHOT_SECONDS)
screenshot_dir = context_screenshot_dir(context_id)
if not screenshot_dir.exists():
return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
candidates = [
path
for path in screenshot_dir.iterdir()
if path.is_file() and path.suffix.lower() in {".png", ".jpg", ".jpeg", ".xwd"}
if path.is_file() and path.suffix.lower() in _SCREENSHOT_SUFFIXES
]
if not candidates:
return {"ok": False, "path": "", "format": "", "captured_at": "", "recent": False}
latest = max(candidates, key=lambda item: item.stat().st_mtime)
for candidate in candidates:
if candidate != latest:
candidate.unlink(missing_ok=True)
age = max(0.0, time.time() - latest.stat().st_mtime)
return {
"ok": True,
@ -536,6 +597,8 @@ def latest_screenshot(*, context_id: str = "") -> dict[str, Any]:
"format": latest.suffix.lower().lstrip("."),
"captured_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(latest.stat().st_mtime)),
"recent": age <= RECENT_SCREENSHOT_SECONDS,
"ephemeral": True,
"context_id": _safe_context_id(context_id),
}
@ -543,6 +606,7 @@ def stable_state(
*,
display: str,
profile_dir: str,
context_id: str = "",
size: dict[str, int] | None = None,
pointer: dict[str, int] | None = None,
active_window: dict[str, Any] | None = None,
@ -554,6 +618,7 @@ def stable_state(
clean_errors = [str(error) for error in errors or [] if str(error)]
return {
"ok": not clean_errors,
"context_id": _safe_context_id(context_id),
"display": display,
"profile_dir": profile_dir,
"size": size or {"width": 0, "height": 0},
@ -594,9 +659,15 @@ def compact_prompt_context(state: dict[str, Any] | None = None) -> str:
lines.append("- visible=" + "; ".join(visible))
screenshot = state.get("screenshot") or {}
if screenshot.get("recent") and screenshot.get("path"):
lines.append(f"- recent_screenshot={screenshot['path']}")
ephemeral = " ephemeral" if screenshot.get("ephemeral") else ""
lines.append(f"- recent_screenshot={screenshot['path']}{ephemeral}")
context_id = str(state.get("context_id") or "").strip()
if context_id:
lines.append(f"- screenshot_context={context_id}")
context_arg = f" --context-id {context_id}" if context_id else ""
lines.append(
"- next=plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot "
"- next=plugins/_desktop/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot"
f"{context_arg} "
"before any coordinate action; prefer focus/key/paste/save/app-native helpers first."
)
lines.append(
@ -667,6 +738,75 @@ def image_height(path: Path) -> int:
return 0
def ephemeral_screenshot_result(
    path: Path,
    *,
    context_id: str = "",
    image_format: str = "png",
    width: int = 0,
    height: int = 0,
) -> dict[str, Any]:
    """Move an on-disk screenshot into the ephemeral image store.

    The bytes of ``path`` are handed to ``ephemeral_images.put_image_bytes``
    and the file is then deleted, so the returned payload carries an
    ephemeral reference instead of a filesystem path.

    Args:
        path: Screenshot file to read and then remove.
        context_id: Context the image is registered under. The raw
            (stripped) value is passed to the store; the sanitised form is
            reported back under ``"context_id"``.
        image_format: File format label; ``jpg``/``jpeg`` map to
            ``image/jpeg``, anything else is reported as ``image/png``.
        width: Image width in pixels, echoed in the result.
        height: Image height in pixels, echoed in the result.

    Returns:
        A screenshot result dict with an empty ``"path"``, the
        ``"ephemeral_ref"`` returned by the store, and a ready-made
        ``"vision_load"`` tool call that references it.

    Raises:
        Whatever ``path.read_bytes()`` or ``put_image_bytes`` raises. The
        temporary file is removed before the exception propagates.
    """
    # Function-scope import, kept local exactly as in the original block.
    from helpers import ephemeral_images

    mime = "image/jpeg" if image_format.lower() in {"jpg", "jpeg"} else "image/png"
    safe_context = _safe_context_id(context_id)
    try:
        ref = ephemeral_images.put_image_bytes(
            context_id=str(context_id or "").strip(),
            mime=mime,
            payload=path.read_bytes(),
            name=path.name,
        )
    finally:
        # The screenshot must never outlive this call on disk, even when
        # reading it or registering it with the store fails.
        path.unlink(missing_ok=True)
    # Drop any other leftover screenshots (and the directory, if it is now
    # empty) for this context.
    prune_context_screenshots(context_id=context_id)
    return {
        "ok": True,
        "path": "",
        "format": image_format,
        "mime": mime,
        "width": width,
        "height": height,
        "captured_at": iso_now(),
        "recent": True,
        "ephemeral": True,
        "ephemeral_ref": ref,
        "context_id": safe_context,
        "vision_load": {
            "tool_name": "vision_load",
            "tool_args": {"paths": [ref]},
        },
        "error": "",
    }
def prune_context_screenshots(
    *,
    context_id: str = "",
    keep_path: Path | None = None,
    max_age_seconds: float | None = None,
) -> None:
    """Delete screenshot files from a context's screenshot directory.

    Only regular files whose suffix is in ``_SCREENSHOT_SUFFIXES`` are
    considered. A file is spared when it resolves to ``keep_path``, or when
    ``max_age_seconds`` is given and the file's mtime is within that age.
    Afterwards the directory itself is removed if that succeeds; a failure
    to remove it is ignored.

    Args:
        context_id: Context whose screenshot directory is pruned.
        keep_path: Optional file that must survive the prune.
        max_age_seconds: Optional age limit; when ``None`` every
            non-kept screenshot is deleted regardless of age.
    """
    directory = context_screenshot_dir(context_id)
    if not directory.exists():
        return

    protected = keep_path.resolve(strict=False) if keep_path else None
    reference_time = time.time()

    def _within_age_limit(entry: Path) -> bool:
        # Without an age limit nothing counts as fresh; a file whose stat
        # fails is treated as stale so it still gets removed.
        if max_age_seconds is None:
            return False
        try:
            return reference_time - entry.stat().st_mtime <= max_age_seconds
        except OSError:
            return False

    for entry in directory.iterdir():
        is_screenshot = entry.is_file() and entry.suffix.lower() in _SCREENSHOT_SUFFIXES
        if not is_screenshot:
            continue
        if protected is not None and entry.resolve(strict=False) == protected:
            continue
        if _within_age_limit(entry):
            continue
        entry.unlink(missing_ok=True)

    # rmdir raises OSError when the directory is not empty (or already
    # gone); that failure is swallowed so surviving files keep it in place.
    try:
        directory.rmdir()
    except OSError:
        pass
def iso_now() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_now = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_now)
@ -696,6 +836,7 @@ def main(argv: list[str] | None = None) -> int:
payload = collect_state(
include_screenshot=bool(args.screenshot),
context_id=str(args.context_id or ""),
screenshot_transport="path",
)
print(json.dumps(payload, sort_keys=True))
return 0 if payload.get("ok") else 1
@ -709,6 +850,7 @@ def main(argv: list[str] | None = None) -> int:
path=args.path,
errors=errors,
context_id=str(args.context_id or ""),
transport="path",
)
if args.json:
print(json.dumps(payload, sort_keys=True))

View file

@ -3,12 +3,12 @@ from __future__ import annotations
from plugins._desktop.helpers import desktop_state
def build_context() -> str:
def build_context(context_id: str = "") -> str:
if not desktop_state.session_manifest_exists():
return ""
try:
return desktop_state.compact_prompt_context(
desktop_state.collect_state(include_screenshot=False),
desktop_state.collect_state(include_screenshot=False, context_id=context_id),
)
except Exception as exc:
return (

View file

@ -89,7 +89,7 @@ def cleanup_stale_runtime_state(force: bool = False) -> dict[str, Any]:
errors: list[str] = []
_migrate_retired_plugin_state(migrated, warnings, errors)
_migrate_unscoped_screenshots(migrated, warnings, errors)
_remove_persisted_screenshots(warnings, errors)
retired_packages = _installed_packages(RETIRED_RUNTIME_PACKAGES)
if retired_packages:
@ -142,33 +142,18 @@ def _migrate_retired_plugin_state(
)
def _migrate_unscoped_screenshots(
migrated: list[str],
def _remove_persisted_screenshots(
warnings: list[str],
errors: list[str],
) -> None:
screenshots_dir = STATE_DIR / "screenshots"
if not screenshots_dir.exists():
return
legacy_screenshots = [
path
for path in screenshots_dir.iterdir()
if path.is_file() and path.suffix.lower() in {".png", ".jpg", ".jpeg", ".xwd"}
]
if not legacy_screenshots:
return
context_dir = screenshots_dir / "default"
context_dir.mkdir(parents=True, exist_ok=True)
for screenshot in legacy_screenshots:
state_migration.migrate_retired_state_tree(
source=screenshot,
destination=context_dir / screenshot.name,
owner="Desktop screenshot",
migrated=migrated,
warnings=warnings,
errors=errors,
)
try:
shutil.rmtree(screenshots_dir)
warnings.append(f"Removed retired persistent Desktop screenshots: {screenshots_dir}")
except Exception as exc:
errors.append(f"Failed to remove retired persistent Desktop screenshots at {screenshots_dir}: {exc}")
def _begin_runtime_preparation() -> None:

View file

@ -30,7 +30,7 @@ The Desktop is an observe-act-verify control surface. Use this decision hierarch
3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands.
4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation.
5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output.
6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Do not report from an earlier screenshot path.
6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Agent-facing Desktop screenshots are ephemeral refs; `desktopctl` shell observations return temporary context paths. Do not report from an earlier screenshot path.
Keep these standing rules:
@ -60,7 +60,7 @@ $DESKTOP key ctrl+s
The script targets the persistent `agent-zero-desktop` X display, sets `DISPLAY`, `XAUTHORITY`, and `HOME` to the XFCE profile, then uses `xdotool` for input. Startup normally prepares this session. If `check` fails during explicit Desktop work, report that the Desktop runtime is not ready instead of installing packages ad hoc.
If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback.
If `observe --json --screenshot` shows a reachable display, visible Desktop/window entries, and a fresh screenshot, the Desktop is usable even when `active_window` is `null`; a bare XFCE desktop can have no active application window. Treat missing screenshots, missing display, or unavailable `xdotool`/`xwd` as blockers and stop with the specific readiness message instead of repeating clicks or inventing a fallback. Use any returned shell screenshot path promptly; only the latest temporary context screenshot is retained.
For direct app launches without coordinates:

View file

@ -60,7 +60,7 @@ Commands:
observe --json [--screenshot] [--context-id ID]
Return structured state, optionally with a fresh screenshot.
screenshot [PATH] [--context-id ID]
Capture the Desktop to PATH, or to the default screenshot directory.
Capture the Desktop to PATH, or to the temporary context screenshot directory.
active-window Print the active window name.
geometry PATTERN Print the first matching visible window geometry.
wait-window PATTERN Wait for a visible matching window and print its id.

View file

@ -185,11 +185,66 @@ def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp
assert screenshot["ok"] is True
assert screenshot["path"] == str(tmp_path / "shot.png")
assert screenshot["format"] == "png"
assert screenshot["ephemeral"] is False
assert (tmp_path / "shot.png").read_bytes() == b"png"
assert not (tmp_path / "shot.xwd").exists()
def test_desktop_state_default_screenshot_path_is_context_scoped(tmp_path, monkeypatch):
def test_desktop_state_shell_screenshot_path_is_context_scoped(tmp_path, monkeypatch):
# Checks that capture_screenshot(..., transport="path") writes into the
# per-context directory under SCREENSHOT_DIR, returns that path, and removes
# an older screenshot that was already in the directory.
# Redirect the module-level screenshot root to the pytest temp directory.
monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path)
capabilities = {"xwd": "/usr/bin/xwd"}
env = {"DISPLAY": ":120"}
# Stand-in for desktop_state.run: writes placeholder bytes to the file named
# after "-out" in the command and returns the result of the _completed helper.
def fake_run(command, *, env, timeout):
raw_path = Path(command[command.index("-out") + 1])
raw_path.write_bytes(b"xwd")
return _completed(command)
# Fake PIL.Image module so the conversion step needs no real Pillow install.
image_module = types.ModuleType("PIL.Image")
# Minimal image object: usable as a context manager, exposes width/height,
# and save() writes placeholder PNG bytes to the target path.
class FakeImage:
width = 320
height = 240
def __enter__(self):
return self
def __exit__(self, *_args):
return False
def save(self, target):
Path(target).write_bytes(b"png")
image_module.open = lambda _path: FakeImage()
pil_module = types.ModuleType("PIL")
pil_module.Image = image_module
monkeypatch.setattr(desktop_state, "run", fake_run)
monkeypatch.setitem(sys.modules, "PIL", pil_module)
monkeypatch.setitem(sys.modules, "PIL.Image", image_module)
# Pre-existing screenshot in the sanitised context directory; asserted to be
# gone after the capture.
stale_path = tmp_path / "ctx_id" / "stale.png"
stale_path.parent.mkdir(parents=True)
stale_path.write_bytes(b"stale")
# No explicit path is given and transport="path" is requested, so the result
# is expected to carry a filesystem path rather than an ephemeral ref.
screenshot = desktop_state.capture_screenshot(
env,
capabilities,
errors=[],
context_id="ctx/id",
transport="path",
)
path = Path(screenshot["path"])
assert screenshot["ok"] is True
# Default-path captures are still flagged ephemeral even with path transport.
assert screenshot["ephemeral"] is True
# The context id "ctx/id" is reported in its sanitised form "ctx_id".
assert screenshot["context_id"] == "ctx_id"
assert path.parent == tmp_path / "ctx_id"
assert path.name.startswith("desktop-")
assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path)
assert not stale_path.exists()
def test_desktop_state_default_screenshot_returns_ephemeral_ref(tmp_path, monkeypatch):
monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path)
capabilities = {"xwd": "/usr/bin/xwd"}
env = {"DISPLAY": ":120"}
@ -229,11 +284,13 @@ def test_desktop_state_default_screenshot_path_is_context_scoped(tmp_path, monke
context_id="ctx/id",
)
path = Path(screenshot["path"])
assert screenshot["ok"] is True
assert path.parent == tmp_path / "ctx_id"
assert path.name.startswith("desktop-")
assert desktop_state.latest_screenshot(context_id="ctx/id")["path"] == str(path)
assert screenshot["path"] == ""
assert screenshot["ephemeral"] is True
assert screenshot["ephemeral_ref"].startswith("a0-ephemeral-image://")
assert screenshot["vision_load"]["tool_args"]["paths"] == [screenshot["ephemeral_ref"]]
assert screenshot["context_id"] == "ctx_id"
assert not (tmp_path / "ctx_id").exists()
def test_xwd_fallback_parser_handles_truecolor_pixels(tmp_path, monkeypatch):

View file

@ -1631,7 +1631,8 @@ def test_desktop_cleanup_moves_retired_state_to_plugin_state(tmp_path, monkeypat
assert result["ok"] is True
assert (plugin_state / "profiles" / "agent-zero-desktop" / "profile.txt").read_text(encoding="utf-8") == "profile\n"
assert (plugin_state / "sessions" / "agent-zero-desktop.json").read_text(encoding="utf-8") == "{}\n"
assert (plugin_state / "screenshots" / "default" / "desktop.png").read_bytes() == b"png"
assert not (plugin_state / "screenshots").exists()
assert any("Removed retired persistent Desktop screenshots" in warning for warning in result["warnings"])
assert not retired_state.exists()