diff --git a/plugins/_office/api/office_session.py b/plugins/_office/api/office_session.py index 5647c6430..2d8ffbdda 100644 --- a/plugins/_office/api/office_session.py +++ b/plugins/_office/api/office_session.py @@ -61,6 +61,8 @@ class OfficeSession(ApiHandler): return self._desktop_save(input) if action == "desktop_sync": return self._desktop_sync(input) + if action == "desktop_state": + return self._desktop_state(input) return {"ok": False, "error": f"Unsupported office session action: {action}"} async def _open_document(self, doc: dict, input: dict, request: Request) -> dict: @@ -190,6 +192,10 @@ class OfficeSession(ApiHandler): file_id=str(input.get("file_id") or ""), ) + def _desktop_state(self, input: dict) -> dict: + include_screenshot = bool(input.get("include_screenshot") is True) + return libreoffice_desktop.get_manager().state(include_screenshot=include_screenshot) + def _origin(self, request: Request) -> str: origin = request.headers.get("Origin") or request.host_url.rstrip("/") return origin.rstrip("/") diff --git a/plugins/_office/helpers/canvas_context.py b/plugins/_office/helpers/canvas_context.py index 4656baf7b..cd4f0b604 100644 --- a/plugins/_office/helpers/canvas_context.py +++ b/plugins/_office/helpers/canvas_context.py @@ -2,13 +2,15 @@ from __future__ import annotations from typing import Any +from plugins._office.helpers import desktop_state from plugins._office.helpers import document_store def build_context(max_items: int = 6) -> str: documents = document_store.get_open_documents(limit=max_items) + desktop_context = build_desktop_context() if not documents: - return "" + return desktop_context lines = [ "These document artifacts have active canvas sessions. 
Content is omitted; load skill `office-artifacts` for edit workflow, then use document_artifact:read before content-sensitive edits.", @@ -18,6 +20,8 @@ def build_context(max_items: int = 6) -> str: lines.append( "Use document_artifact:edit with file_id or path for saved edits; tool results refresh the document canvas." ) + if desktop_context: + lines.extend(["", desktop_context]) return "\n".join(lines) @@ -29,3 +33,18 @@ def format_document_line(doc: dict[str, Any]) -> str: f"size={doc.get('size', 0)} bytes, last_modified={doc.get('last_modified', '')}, " f"open_sessions={doc.get('open_sessions', 1)})" ) + + +def build_desktop_context() -> str: + if not desktop_state.session_manifest_exists(): + return "" + try: + return desktop_state.compact_prompt_context( + desktop_state.collect_state(include_screenshot=False), + ) + except Exception as exc: + return ( + "[DESKTOP STATE]\n" + f"- unavailable={exc}\n" + "- next=Open the Desktop canvas manually, then run plugins/_office/skills/linux-desktop/scripts/desktopctl.sh observe --json." 
def collect_state(*, include_screenshot: bool = False, screenshot_path: str | Path | None = None) -> dict[str, Any]:
    """Collect a structured snapshot of the Desktop session.

    Resolves the display environment, probes which X11 helper tools are
    installed, then gathers display size, pointer position, the active window
    and the visible windows.

    Args:
        include_screenshot: When true, capture a fresh screenshot through
            ``capture_screenshot``. When false, only report the newest file
            found by ``latest_screenshot``.
        screenshot_path: Destination forwarded to ``capture_screenshot``;
            ignored unless ``include_screenshot`` is true.

    Returns:
        The dictionary built by ``stable_state``. Its ``ok`` flag is false as
        soon as any step recorded an error message.
    """
    errors: list[str] = []
    env_info = resolve_environment(errors=errors)
    display = env_info["display"]
    profile_dir = env_info["profile_dir"]
    env = display_env(display=display, profile_dir=profile_dir)

    capabilities = collect_capabilities()
    for name in ("xdotool", "xrandr", "xwininfo", "xprop"):
        if not capabilities.get(name):
            errors.append(f"{name} is not installed; install Office runtime dependencies through the _office plugin hook.")

    size = collect_display_size(env, capabilities, errors)
    pointer = collect_pointer(env, capabilities, errors)
    active_window = collect_active_window(env, capabilities, errors)
    windows = collect_windows(env, capabilities, errors)

    # Only scan the screenshot directory when its result is actually used; a
    # fresh capture replaces it entirely, so scanning first was wasted work.
    if include_screenshot:
        screenshot = capture_screenshot(env, capabilities, path=screenshot_path, errors=errors)
    else:
        screenshot = latest_screenshot()

    return stable_state(
        display=display,
        profile_dir=profile_dir,
        size=size,
        pointer=pointer,
        active_window=active_window,
        windows=windows,
        screenshot=screenshot,
        capabilities=capabilities,
        errors=errors,
    )
def convert_xwd_to_image(raw_path: Path, target: Path) -> dict[str, int]:
    """Decode an XWD dump with a pure-Python reader and save it as ``target``.

    The pixel data is read using the header's channel masks, each channel is
    rescaled to 8 bits, and the resulting RGB image is written with Pillow in
    the format implied by ``target``'s suffix.

    Args:
        raw_path: Path of the XWD file to read.
        target: Path of the image file to write.

    Returns:
        ``{"width": ..., "height": ...}`` of the decoded image.

    Raises:
        ValueError: If the header is rejected by ``parse_xwd_header``, the
            dimensions are not positive, the pixel data is truncated, or any
            of the red/green/blue masks is empty.
    """
    from PIL import Image

    data = raw_path.read_bytes()
    # The header's own endianness is only needed while parsing it.
    header, _ = parse_xwd_header(data)
    width = header["pixmap_width"]
    height = header["pixmap_height"]
    bytes_per_line = header["bytes_per_line"]
    bits_per_pixel = header["bits_per_pixel"]
    image_byte_order = "little" if header["byte_order"] == 0 else "big"
    # Skip the colour table that follows the header (12 bytes per entry).
    color_table_size = header["ncolors"] * 12
    pixel_offset = header["header_size"] + color_table_size
    bytes_per_pixel = max((bits_per_pixel + 7) // 8, 1)
    # NOTE(review): this widens the pixel stride whenever the scanline length
    # happens to be an exact multiple of the width. Presumably it guards
    # against dumps whose bits_per_pixel understates the stride; confirm it
    # is not triggered by scanline padding on very narrow images.
    if width > 0 and bytes_per_line % width == 0:
        bytes_per_pixel = max(bytes_per_pixel, bytes_per_line // width)
    if width <= 0 or height <= 0 or bytes_per_line <= 0:
        raise ValueError("invalid XWD dimensions")
    if pixel_offset + (height * bytes_per_line) > len(data):
        raise ValueError("truncated XWD pixel data")

    red_mask = header["red_mask"]
    green_mask = header["green_mask"]
    blue_mask = header["blue_mask"]
    red_shift, red_bits = mask_shift_and_bits(red_mask)
    green_shift, green_bits = mask_shift_and_bits(green_mask)
    blue_shift, blue_bits = mask_shift_and_bits(blue_mask)
    if min(red_bits, green_bits, blue_bits) <= 0:
        raise ValueError("unsupported XWD visual masks")

    # A flat byte buffer keeps memory at three bytes per pixel instead of one
    # Python tuple per pixel, which matters for full-desktop captures.
    rgb = bytearray(width * height * 3)
    cursor = 0
    for row in range(height):
        row_start = pixel_offset + (row * bytes_per_line)
        for column in range(width):
            start = row_start + (column * bytes_per_pixel)
            pixel_bytes = data[start : start + bytes_per_pixel]
            if len(pixel_bytes) < bytes_per_pixel:
                raise ValueError("truncated XWD pixel")
            pixel = int.from_bytes(pixel_bytes, image_byte_order, signed=False)
            rgb[cursor] = scale_channel((pixel & red_mask) >> red_shift, red_bits)
            rgb[cursor + 1] = scale_channel((pixel & green_mask) >> green_shift, green_bits)
            rgb[cursor + 2] = scale_channel((pixel & blue_mask) >> blue_shift, blue_bits)
            cursor += 3

    image = Image.frombytes("RGB", (width, height), bytes(rgb))
    image.save(target)
    return {"width": width, "height": height}
def resolve_environment(*, errors: list[str] | None = None, session_id: str = SESSION_ID) -> dict[str, str]:
    """Resolve the X display, profile directory and manifest path of a session.

    The display is taken from the first non-empty source among
    ``A0_DESKTOP_DISPLAY``, the manifest's ``display`` entry and ``DISPLAY``.
    The profile directory is taken from ``A0_DESKTOP_PROFILE``,
    ``A0_DESKTOP_HOME``, the manifest's ``profile_dir`` entry, ``HOME`` and
    finally ``PROFILE_DIR / session_id``.

    Args:
        errors: Optional list that receives human-readable problems. Nothing
            is raised for a missing or malformed manifest.
        session_id: Session whose manifest should be consulted.

    Returns:
        A dictionary with the keys ``display`` (empty when unavailable),
        ``profile_dir`` and ``manifest``.
    """
    local_errors = errors if errors is not None else []
    manifest = session_manifest_path(session_id)
    payload: dict[str, Any] = {}
    if manifest.exists():
        try:
            loaded = json.loads(manifest.read_text(encoding="utf-8"))
        except Exception as exc:  # best effort: a broken manifest must not abort observation
            local_errors.append(f"Desktop session manifest is unreadable: {exc}")
        else:
            # Valid JSON that is not an object (a list, a string, ...) has no
            # ``.get`` and would otherwise crash every lookup below.
            if isinstance(loaded, dict):
                payload = loaded
            else:
                local_errors.append(
                    "Desktop session manifest is unreadable: expected a JSON object, "
                    f"got {type(loaded).__name__}"
                )
    elif not (os.environ.get("A0_DESKTOP_DISPLAY") or os.environ.get("DISPLAY")):
        local_errors.append(f"Desktop session manifest not found at {manifest}; open the Desktop canvas before GUI control.")

    display_value = ""
    for candidate in (
        os.environ.get("A0_DESKTOP_DISPLAY"),
        payload.get("display"),
        os.environ.get("DISPLAY"),
    ):
        # Booleans are never a display; an integer 0 is (display ":0").
        if candidate is None or isinstance(candidate, bool):
            continue
        text = str(candidate).strip()
        if text:
            display_value = text
            break

    if not display_value:
        display = ""
        local_errors.append("Desktop DISPLAY is unavailable; the persistent Desktop session is not running.")
    elif ":" in display_value:
        # Already a display name such as ":120" or "localhost:10.0"; keep it
        # unchanged instead of prefixing a second colon.
        display = display_value
    else:
        # A bare display number, e.g. "120" from the manifest.
        display = f":{display_value}"

    # NOTE(review): HOME is consulted before the per-session default and is
    # normally set, so PROFILE_DIR / session_id is rarely reached; confirm
    # that precedence is intended.
    profile_dir = str(
        os.environ.get("A0_DESKTOP_PROFILE")
        or os.environ.get("A0_DESKTOP_HOME")
        or payload.get("profile_dir")
        or os.environ.get("HOME")
        or PROFILE_DIR / session_id
    )

    return {
        "display": display,
        "profile_dir": profile_dir,
        "manifest": str(manifest),
    }
def collect_windows(env: dict[str, str], capabilities: dict[str, str], errors: list[str]) -> list[dict[str, Any]]:
    """Describe every visible, named window on the Desktop display.

    Runs ``xdotool search --onlyvisible --name .`` and passes each distinct
    window id, in the order reported, to ``collect_window``.

    Args:
        env: Environment used for the subprocess calls.
        capabilities: Mapping of tool name to executable path.
        errors: Receives the command output when the search fails with one.

    Returns:
        One description per distinct window id, or an empty list when
        xdotool is unavailable or the search did not succeed.
    """
    xdotool = capabilities.get("xdotool")
    if not xdotool:
        return []

    search = run([xdotool, "search", "--onlyvisible", "--name", "."], env=env, timeout=4)
    if search.returncode != 0:
        failure = command_output(search)
        if failure:
            errors.append(failure)
        return []

    # dict.fromkeys drops repeated ids while keeping first-seen order.
    stripped_ids = (line.strip() for line in search.stdout.splitlines())
    distinct_ids = dict.fromkeys(ident for ident in stripped_ids if ident)
    return [collect_window(env, capabilities, ident, errors) for ident in distinct_ids]
# A double-quoted xprop string; a backslash escapes the character after it.
_XPROP_QUOTED = re.compile(r'"((?:[^"\\]|\\.)*)"')
# The two escapes that are undone: \" and \\.
_XPROP_ESCAPED = re.compile(r'\\(["\\])')


def parse_xprop(output: str) -> dict[str, str]:
    """Parse ``xprop`` output into a flat mapping of property values.

    Lines without ``=`` (for example "not found" notices) are ignored. The
    type suffix in parentheses is removed from the property name.
    ``WM_CLASS`` is split into ``WM_CLASS_NAME`` (first string) and
    ``WM_CLASS_CLASS`` (last string). Other string properties keep their last
    quoted string, with ``\\"`` and ``\\\\`` unescaped so that titles
    containing quotes are returned whole. Anything else yields its first
    integer, or the raw text when there is none.

    Args:
        output: Text printed by ``xprop``.

    Returns:
        Mapping of property name to value, always as strings.
    """
    values: dict[str, str] = {}
    for line in output.splitlines():
        if "=" not in line:
            continue
        key, raw_value = line.split("=", 1)
        key = key.strip().split("(", 1)[0]
        raw_value = raw_value.strip()
        quoted = [_XPROP_ESCAPED.sub(r"\1", item) for item in _XPROP_QUOTED.findall(raw_value)]
        if key == "WM_CLASS" and quoted:
            values["WM_CLASS_NAME"] = quoted[0]
            values["WM_CLASS_CLASS"] = quoted[-1]
            continue
        if quoted:
            values[key] = quoted[-1]
            continue
        match = re.search(r"-?\d+", raw_value)
        values[key] = match.group(0) if match else raw_value
    return values
collect_capabilities(), + "errors": clean_errors, + } + + +def compact_prompt_context(state: dict[str, Any] | None = None) -> str: + state = state if state is not None else collect_state(include_screenshot=False) + if not state.get("display"): + return "" + lines = ["[DESKTOP STATE]"] + size = state.get("size") or {} + pointer = state.get("pointer") or {} + lines.append( + f"- display={state.get('display', '')} size={size.get('width', 0)}x{size.get('height', 0)} " + f"pointer={pointer.get('x', 0)},{pointer.get('y', 0)}" + ) + active = state.get("active_window") or {} + if active: + lines.append( + f"- active={active.get('title', '') or ''} " + f"class={active.get('class', '') or active.get('name', '')}" + ) + visible = [] + for window in state.get("windows") or []: + title = window.get("title") or "" + window_class = window.get("class") or window.get("name") or "" + visible.append(f"{title} ({window_class})" if window_class else title) + if len(visible) >= 5: + break + if visible: + lines.append("- visible=" + "; ".join(visible)) + screenshot = state.get("screenshot") or {} + if screenshot.get("recent") and screenshot.get("path"): + lines.append(f"- recent_screenshot={screenshot['path']}") + lines.append( + "- next=plugins/_office/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot " + "before any coordinate action; prefer focus/key/paste/save/app-native helpers first." + ) + lines.append( + "- verify=for terminal/CLI-agent output, use the screenshot path from a fresh final " + "observe --json --screenshot captured after the response appears." 
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for observing the Desktop state.

    Sub-commands:
        state / observe: print the collected state as JSON; ``--screenshot``
            additionally captures a fresh screenshot. ``state`` is also used
            when no sub-command is given.
        screenshot [PATH]: capture a screenshot and print its path (or the
            error), or the full result as JSON with ``--json``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 when the payload reports ``ok``, 1 when it does not, 2 when no
        known command was handled.
    """
    parser = argparse.ArgumentParser(description="Observe the Agent Zero persistent Linux Desktop state.")
    subparsers = parser.add_subparsers(dest="command")

    state_parser = subparsers.add_parser("state")
    state_parser.add_argument("--json", action="store_true")
    state_parser.add_argument("--screenshot", action="store_true")

    observe_parser = subparsers.add_parser("observe")
    observe_parser.add_argument("--json", action="store_true")
    observe_parser.add_argument("--screenshot", action="store_true")

    screenshot_parser = subparsers.add_parser("screenshot")
    screenshot_parser.add_argument("path", nargs="?")
    screenshot_parser.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)
    command = args.command or "state"
    if command in {"state", "observe"}:
        # Without a sub-command no sub-parser ran, so the namespace has no
        # ``screenshot`` attribute; default it instead of raising.
        wants_screenshot = bool(getattr(args, "screenshot", False))
        payload = collect_state(include_screenshot=wants_screenshot)
        print(json.dumps(payload, sort_keys=True))
        return 0 if payload.get("ok") else 1

    if command == "screenshot":
        errors: list[str] = []
        env_info = resolve_environment(errors=errors)
        payload = capture_screenshot(
            display_env(display=env_info["display"], profile_dir=env_info["profile_dir"]),
            collect_capabilities(),
            path=args.path,
            errors=errors,
        )
        if args.json:
            print(json.dumps(payload, sort_keys=True))
        else:
            print(payload.get("path") or payload.get("error") or "")
        return 0 if payload.get("ok") else 1

    # Defensive fallback; argparse rejects unknown sub-commands before this.
    parser.print_help()
    return 2
state(self, *, include_screenshot: bool = False) -> dict[str, Any]: + with self._lock: + self._reap_dead_locked() + return desktop_state.collect_state(include_screenshot=include_screenshot) + def claim_url_intents(self, session_id: str = SYSTEM_SESSION_ID) -> list[dict[str, Any]]: session = self.get(session_id) or self.get(SYSTEM_SESSION_ID) if not session: @@ -1069,6 +1074,7 @@ fi "path": session.path, "display": session.display, "xpra_port": session.xpra_port, + "profile_dir": str(session.profile_dir), "owner_pid": os.getpid(), "pids": {name: process.pid for name, process in session.processes.items()}, } diff --git a/plugins/_office/hooks.py b/plugins/_office/hooks.py index cae79008f..fb6b58b4b 100644 --- a/plugins/_office/hooks.py +++ b/plugins/_office/hooks.py @@ -60,9 +60,13 @@ RUNTIME_PACKAGES = ( "libglib2.0-bin", "xfce4-terminal", "x11-xserver-utils", + "x11-utils", + "x11-apps", "xdotool", + "xclip", "xauth", "dbus-x11", + "python3-pil", "fonts-dejavu", "fonts-liberation", "fonts-crosextra-caladea", diff --git a/plugins/_office/skills/linux-desktop/SKILL.md b/plugins/_office/skills/linux-desktop/SKILL.md index 0c9fb7e57..2d69b4a4b 100644 --- a/plugins/_office/skills/linux-desktop/SKILL.md +++ b/plugins/_office/skills/linux-desktop/SKILL.md @@ -23,14 +23,24 @@ Use the Desktop as a full Linux GUI when the user explicitly needs a visual work ## Operating Model -1. Prefer `document_artifact` for creating, reading, and editing Markdown, ODT, ODS, ODP, DOCX, XLSX, and PPTX files. -2. Treat Markdown as first-class. For writing, notes, reports, and drafts with no explicit binary Office requirement, create Markdown and use the custom Markdown editor when the user opens the canvas. -3. Treat ODF as first-class for LibreOffice office work: ODT in Writer, ODS in Calc, ODP in Impress. Use DOCX/XLSX/PPTX only for explicit Microsoft compatibility. -4. Use the Desktop only when the user asks for the Desktop, a GUI app, binary Office visual work, or visual confirmation. 
-5. Never open the Desktop/canvas automatically from a tool result if the user has not opened it. Offer the explicit Open in canvas action instead. -6. Launch common apps from the Desktop icons, the header buttons, or `scripts/desktopctl.sh`. -7. Use the external Agent Zero Browser for web browsing. Do not launch an operating-system browser in this version. -8. Verify GUI work by observing the desktop state, checking window titles, and saving the file before reporting success. +The Desktop is an observe-act-verify control surface. Use this decision hierarchy: + +1. Prefer structured tools such as `document_artifact` for deterministic file creation, reads, and edits. +2. Prefer app-native helpers for visible live edits, such as `desktopctl.sh calc-set-cell` for Calc/UNO spreadsheet changes. +3. Prefer launcher commands, window focus, keyboard shortcuts, menus, paste, and save commands. +4. Use coordinate clicks only as a last resort, and only after a fresh Desktop observation. +5. After any GUI action, verify through Desktop state, active window titles, screenshots, saved file state, or exported output. +6. For terminal or CLI-agent work, verify against a fresh final `observe --json --screenshot` captured after the command has finished or visibly returned to an input prompt. Do not report from an earlier screenshot path. + +Keep these standing rules: + +1. Treat Markdown as first-class. For writing, notes, reports, and drafts with no explicit binary Office requirement, create Markdown and use the custom Markdown editor when the user opens the canvas. +2. Treat ODF as first-class for LibreOffice office work: ODT in Writer, ODS in Calc, ODP in Impress. Use DOCX/XLSX/PPTX only for explicit Microsoft compatibility. +3. Use the Desktop only when the user asks for the Desktop, a GUI app, binary Office visual work, or visual confirmation. +4. Never open the Desktop/canvas automatically from a tool result if the user has not opened it. 
Offer the explicit Open in canvas action instead. +5. Launch common apps from the Desktop icons, the header buttons, or `scripts/desktopctl.sh`. +6. Use the external Agent Zero Browser for web browsing. Do not launch an operating-system browser in this version. +7. Verify GUI work by observing the desktop state, checking window titles, and saving the file before reporting success. If exact terminal text matters, load or inspect the screenshot path returned by the final observation, not a screenshot captured before the text appeared. ## Control Flow @@ -38,7 +48,10 @@ Use the helper script when the Desktop is already open and you need reliable app ```bash plugins/_office/skills/linux-desktop/scripts/desktopctl.sh check +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh state --json +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot plugins/_office/skills/linux-desktop/scripts/desktopctl.sh launch calc +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh wait-window LibreOffice plugins/_office/skills/linux-desktop/scripts/desktopctl.sh windows LibreOffice plugins/_office/skills/linux-desktop/scripts/desktopctl.sh focus LibreOffice plugins/_office/skills/linux-desktop/scripts/desktopctl.sh key ctrl+s @@ -55,6 +68,9 @@ plugins/_office/skills/linux-desktop/scripts/desktopctl.sh launch impress plugins/_office/skills/linux-desktop/scripts/desktopctl.sh launch terminal plugins/_office/skills/linux-desktop/scripts/desktopctl.sh launch settings plugins/_office/skills/linux-desktop/scripts/desktopctl.sh open-path /a0/usr/workdir +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh focus "LibreOffice" +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh paste-text "Text to insert" +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh key ctrl+s ``` For live spreadsheet coworking, use the Calc helper instead of hand-written UNO snippets: @@ -65,13 +81,17 @@ 
plugins/_office/skills/linux-desktop/scripts/desktopctl.sh calc-set-cell /a0/usr This opens the workbook in the visible Desktop Calc session if needed, changes the cell through LibreOffice, saves the workbook, and verifies the `.xlsx` on disk. Because the edit happens through the running LibreOffice session, the user can see the sheet update without refreshing the Desktop surface. -For coordinate actions after observing the Desktop: +For coordinate actions, clicks are explicitly last resort. First try `launch`, `open-path`, `wait-window`, `focus`, `key`, `paste-text`, `save`, or an app-native helper. If a coordinate action is still necessary, base it on a fresh screenshot observation and verify immediately afterward: ```bash +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh observe --json --screenshot plugins/_office/skills/linux-desktop/scripts/desktopctl.sh click 120 180 plugins/_office/skills/linux-desktop/scripts/desktopctl.sh dblclick 120 180 +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh right-click 120 180 +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh drag 120 180 400 180 +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh scroll down 3 plugins/_office/skills/linux-desktop/scripts/desktopctl.sh type "Text to enter" -plugins/_office/skills/linux-desktop/scripts/desktopctl.sh location +plugins/_office/skills/linux-desktop/scripts/desktopctl.sh observe --json ``` When browser automation is available, the higher-level QA flow is: @@ -82,6 +102,19 @@ When browser automation is available, the higher-level QA flow is: 4. Cross-check with `desktopctl.sh location` and `desktopctl.sh windows PATTERN`. 5. Capture the browser screenshot as visual evidence. +## Terminal And CLI Agent Verification + +Terminal apps are visual state, not structured logs. When the task depends on exact terminal output, follow this stricter loop: + +1. 
Run `desktopctl.sh observe --json --screenshot` immediately before acting to record the starting window and screenshot path. +2. Use `focus`, `paste-text` or `type`, and `key Return` to drive the terminal. Prefer CLI-native commands and keyboard input over clicks. +3. Wait until the CLI has visibly produced a response or returned to an input prompt. +4. Run a new final `desktopctl.sh observe --json --screenshot`. +5. Verify exact text only from the screenshot path returned by that final observation, or from a newer screenshot. Never use an earlier screenshot path as final evidence. +6. If the final screenshot is cropped, stale, or unreadable, capture another screenshot or report the result as unverified with that specific reason. + +For nested CLI agents, a successful proof requires both the input prompt and the nested agent's visible response in the final screenshot, or another deterministic saved transcript produced by the CLI itself. + ## Desktop Locations The Desktop exposes stable folders for common user work: diff --git a/plugins/_office/skills/linux-desktop/scripts/desktopctl.sh b/plugins/_office/skills/linux-desktop/scripts/desktopctl.sh index 05c4a2e3d..a885358cb 100755 --- a/plugins/_office/skills/linux-desktop/scripts/desktopctl.sh +++ b/plugins/_office/skills/linux-desktop/scripts/desktopctl.sh @@ -6,6 +6,8 @@ BASE_DIR="${A0_BASE_DIR:-/a0}" PROFILE_DIR="${A0_DESKTOP_PROFILE:-$BASE_DIR/tmp/_office/desktop/profiles/$SESSION}" MANIFEST="${A0_DESKTOP_MANIFEST:-$BASE_DIR/tmp/_office/desktop/sessions/$SESSION.json}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DESKTOP_STATE_HELPER="$SCRIPT_DIR/../../../helpers/desktop_state.py" +DESKTOP_STATE_PYTHON="${A0_DESKTOP_STATE_PYTHON:-$(command -v /usr/bin/python3 || command -v python3 || true)}" display_from_manifest() { if [ ! -f "$MANIFEST" ] || ! 
command -v python3 >/dev/null 2>&1; then @@ -34,6 +36,10 @@ esac export XAUTHORITY="${A0_DESKTOP_XAUTHORITY:-$PROFILE_DIR/.Xauthority}" export HOME="${A0_DESKTOP_HOME:-$PROFILE_DIR}" +export A0_DESKTOP_SESSION="$SESSION" +export A0_DESKTOP_MANIFEST="$MANIFEST" +export A0_DESKTOP_PROFILE="$PROFILE_DIR" +export A0_DESKTOP_DISPLAY="$DISPLAY" export XDG_CONFIG_HOME="${XDG_CONFIG_HOME:-$HOME/.config}" export XDG_DATA_HOME="${XDG_DATA_HOME:-$HOME/.local/share}" export XDG_CACHE_HOME="${XDG_CACHE_HOME:-$HOME/.cache}" @@ -49,9 +55,21 @@ Usage: desktopctl.sh [args] Commands: env Print the X11 environment used for the Desktop. check Verify that xdotool can reach the Desktop display. + state --json Return structured Desktop state as JSON. + observe --json [--screenshot] + Return structured state, optionally with a fresh screenshot. + screenshot [PATH] Capture the Desktop to PATH, or to the default screenshot directory. + active-window Print the active window name. + geometry PATTERN Print the first matching visible window geometry. + wait-window PATTERN Wait for a visible matching window and print its id. location Print the current X pointer location. windows [PATTERN] List visible window names matching PATTERN. focus PATTERN Focus the first visible window matching PATTERN. + scroll DIRECTION [UNITS] Scroll up, down, left, or right; UNITS defaults to 5 clicks. + drag X1 Y1 X2 Y2 Drag from X1,Y1 to X2,Y2 in Desktop coordinates. + right-click X Y Move and right-click at X,Y in Desktop coordinates. + paste-text TEXT Put TEXT on the Desktop clipboard and paste it with an app-native shortcut. + sequence FILE|- Run a newline-delimited command sequence. key KEY... Send one or more xdotool key names. type TEXT Type text into the focused window. click X Y Move and click at X,Y in Desktop coordinates. @@ -79,6 +97,14 @@ ensure_display() { fi } +desktop_state() { + if [ ! 
-f "$DESKTOP_STATE_HELPER" ]; then + echo "Desktop state helper not found: $DESKTOP_STATE_HELPER" >&2 + exit 2 + fi + "$DESKTOP_STATE_PYTHON" "$DESKTOP_STATE_HELPER" "$@" +} + run_detached() { ( "$@" >/tmp/a0-desktopctl.log 2>&1 & ) } @@ -98,6 +124,128 @@ first_window() { xdotool search --onlyvisible --name "$pattern" 2>/dev/null | head -n 1 || true } +active_window_id() { + xdotool getactivewindow 2>/dev/null || true +} + +active_window_class() { + window_id="$(active_window_id)" + if [ -z "$window_id" ]; then + return 0 + fi + if command -v xprop >/dev/null 2>&1; then + xprop -id "$window_id" WM_CLASS 2>/dev/null | awk -F'"' '/WM_CLASS/ { print $(NF - 1); exit }' + fi +} + +active_window_class_lower() { + active_window_class | tr '[:upper:]' '[:lower:]' +} + +active_window_is_terminal() { + window_class="$(active_window_class_lower)" + case "$window_class" in + *terminal*|xterm|uxterm|rxvt|urxvt|kitty|alacritty|wezterm|konsole) + return 0 + ;; + *) + return 1 + ;; + esac +} + +paste_key_for_active_window() { + printf '%s\n' "${A0_DESKTOP_PASTE_KEY:-ctrl+v}" +} + +window_geometry() { + window_id="$1" + if command -v xwininfo >/dev/null 2>&1; then + xwininfo -id "$window_id" 2>/dev/null | awk ' + /Absolute upper-left X:/ { x=$4 } + /Absolute upper-left Y:/ { y=$4 } + /Width:/ { w=$2 } + /Height:/ { h=$2 } + END { if (w != "") printf "X=%s\nY=%s\nWIDTH=%s\nHEIGHT=%s\n", x, y, w, h } + ' + else + xdotool getwindowgeometry --shell "$window_id" + fi +} + +wait_window() { + pattern="$1" + timeout="${2:-15}" + end=$((SECONDS + timeout)) + while [ "$SECONDS" -le "$end" ]; do + window_id="$(first_window "$pattern")" + if [ -n "$window_id" ]; then + printf '%s\n' "$window_id" + return 0 + fi + sleep 0.25 + done + echo "Timed out waiting for visible window: $pattern" >&2 + return 1 +} + +scroll_desktop() { + direction="$1" + units="${2:-5}" + case "$direction" in + up) button=4 ;; + down) button=5 ;; + left) button=6 ;; + right) button=7 ;; + *) + echo "scroll direction must 
be up, down, left, or right." >&2 + exit 2 + ;; + esac + xdotool click --repeat "$units" "$button" +} + +paste_text() { + text="$*" + if active_window_is_terminal; then + xdotool type --delay "${A0_DESKTOP_PASTE_TYPE_DELAY_MS:-${A0_DESKTOP_TYPE_DELAY_MS:-4}}" -- "$text" + return + fi + if command -v xclip >/dev/null 2>&1; then + printf '%s' "$text" | xclip -selection clipboard + xdotool key --clearmodifiers "$(paste_key_for_active_window)" + return + fi + xdotool type --delay "${A0_DESKTOP_TYPE_DELAY_MS:-1}" -- "$text" +} + +run_sequence_line() { + line="$1" + [ -z "$line" ] && return 0 + case "$line" in + \#*) return 0 ;; + esac + # shellcheck disable=SC2086 + "$0" $line +} + +run_sequence() { + source_file="$1" + if [ "$source_file" = "-" ]; then + while IFS= read -r line; do + run_sequence_line "$line" + done + return + fi + if [ ! -f "$source_file" ]; then + echo "sequence requires an existing FILE or - for stdin." >&2 + exit 2 + fi + while IFS= read -r line || [ -n "$line" ]; do + run_sequence_line "$line" + done < "$source_file" +} + launch_app() { app="${1:-}" soffice="${SOFFICE:-$(command -v soffice || true)}" @@ -142,6 +290,55 @@ case "$command_name" in ensure_display xdotool getmouselocation --shell ;; + state) + if [ "${1:-}" != "--json" ]; then + echo "state currently requires --json." >&2 + exit 2 + fi + desktop_state state --json + ;; + observe) + if [ "${1:-}" != "--json" ]; then + echo "observe currently requires --json." >&2 + exit 2 + fi + shift + desktop_state observe --json "$@" + ;; + screenshot) + if [ "${1:-}" = "--json" ]; then + shift + desktop_state screenshot --json "$@" + elif [ "$#" -gt 0 ]; then + desktop_state screenshot "$1" + else + desktop_state screenshot + fi + ;; + active-window) + ensure_display + window_id="$(active_window_id)" + if [ -z "$window_id" ]; then + echo "No active window." 
>&2 + exit 1 + fi + xdotool getwindowname "$window_id" + ;; + geometry) + ensure_display + pattern="${1:?geometry requires a window name pattern}" + window_id="$(first_window "$pattern")" + if [ -z "$window_id" ]; then + echo "No visible window matched: $pattern" >&2 + exit 1 + fi + window_geometry "$window_id" + ;; + wait-window) + ensure_display + pattern="${1:?wait-window requires a window name pattern}" + wait_window "$pattern" "${2:-15}" + ;; location) ensure_display xdotool getmouselocation --shell @@ -161,6 +358,36 @@ case "$command_name" in fi xdotool windowactivate --sync "$window_id" ;; + scroll) + ensure_display + scroll_desktop "${1:?scroll requires DIRECTION}" "${2:-5}" + ;; + drag) + ensure_display + x1="${1:?drag requires X1}" + y1="${2:?drag requires Y1}" + x2="${3:?drag requires X2}" + y2="${4:?drag requires Y2}" + xdotool mousemove --sync "$x1" "$y1" mousedown 1 mousemove --sync "$x2" "$y2" mouseup 1 + ;; + right-click) + ensure_display + x="${1:?right-click requires X}" + y="${2:?right-click requires Y}" + xdotool mousemove --sync "$x" "$y" click 3 + ;; + paste-text) + ensure_display + if [ "$#" -eq 0 ]; then + echo "paste-text requires TEXT." 
>&2 + exit 2 + fi + paste_text "$@" + ;; + sequence) + source_file="${1:?sequence requires FILE or -}" + run_sequence "$source_file" + ;; key) ensure_display if [ "$#" -eq 0 ]; then diff --git a/plugins/_office/webui/office-store.js b/plugins/_office/webui/office-store.js index 010580020..d2b198767 100644 --- a/plugins/_office/webui/office-store.js +++ b/plugins/_office/webui/office-store.js @@ -228,6 +228,9 @@ const model = { _desktopPrimeTimer: null, _desktopPrimeAttempts: 0, _desktopKeyboardActive: false, + _desktopBridgeReady: false, + _desktopKeyboardCaptureState: { ready: false, active: false, capture: false, focused: false }, + _desktopLastState: null, _desktopKeyboardCleanup: null, _desktopClipboardCleanup: null, _desktopStarting: null, @@ -1007,7 +1010,9 @@ const model = { } catch { target.focus?.({ preventScroll: true }); } - return Boolean(document.activeElement === target || target.contentDocument?.hasFocus?.()); + const focused = Boolean(document.activeElement === target || target.contentDocument?.hasFocus?.()); + this.updateDesktopKeyboardCaptureState(target); + return focused; }, updateDesktopMonitor() { @@ -1015,6 +1020,8 @@ const model = { this.stopDesktopMonitor(); this.stopDesktopResizeObserver(); this._desktopKeyboardActive = false; + this._desktopBridgeReady = false; + this.updateDesktopKeyboardCaptureState(); return; } const sessionId = this.session?.desktop_session_id || this.session?.session_id || ""; @@ -1216,6 +1223,7 @@ const model = { this.installXpraDesktopWheelBridge(remoteWindow, xpraWindow); if (requestRefresh && xpraWindow.wid != null) client.request_refresh?.(xpraWindow.wid); } + this.installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container); return true; } catch (error) { console.warn("Xpra desktop viewport prime skipped", error); @@ -1223,6 +1231,94 @@ const model = { } }, + installXpraDesktopAgentBridge(frame, remoteWindow, remoteDocument, client, container) { + if (!frame || !remoteWindow || 
!remoteDocument || !client) return null; + const store = this; + const finite = (value, fallback = 0) => { + const number = Number(value); + return Number.isFinite(number) ? number : fallback; + }; + const metrics = () => { + const desktopWidth = Math.max(1, finite(client.desktop_width || container?.clientWidth || remoteWindow.innerWidth, 1)); + const desktopHeight = Math.max(1, finite(client.desktop_height || container?.clientHeight || remoteWindow.innerHeight, 1)); + const clientWidth = Math.max(1, finite(container?.clientWidth || remoteWindow.innerWidth, desktopWidth)); + const clientHeight = Math.max(1, finite(container?.clientHeight || remoteWindow.innerHeight, desktopHeight)); + return { + desktopWidth, + desktopHeight, + clientWidth, + clientHeight, + scaleX: clientWidth / desktopWidth, + scaleY: clientHeight / desktopHeight, + }; + }; + const bridge = frame.__agentZeroDesktopBridge || {}; + Object.assign(bridge, { + ready: true, + state: async (options = {}) => { + const result = await callOffice("desktop_state", { + include_screenshot: options.includeScreenshot === true || options.include_screenshot === true, + }); + store._desktopLastState = result; + return result; + }, + focus: (options = {}) => store.focusDesktopFrame(frame, { ...options, arm: options.arm !== false }), + requestRefresh: () => { + for (const xpraWindow of Object.values(client.id_to_window || {})) { + if (xpraWindow?.wid != null) client.request_refresh?.(xpraWindow.wid); + } + return true; + }, + desktopToClient: (x, y) => { + const value = metrics(); + return { + x: Math.round(finite(x) * value.scaleX), + y: Math.round(finite(y) * value.scaleY), + scale_x: value.scaleX, + scale_y: value.scaleY, + }; + }, + clientToDesktop: (x, y) => { + const value = metrics(); + return { + x: Math.round(finite(x) / value.scaleX), + y: Math.round(finite(y) / value.scaleY), + scale_x: value.scaleX, + scale_y: value.scaleY, + }; + }, + diagnostics: () => store.desktopBridgeDiagnostics(frame), + }); + 
frame.agentZeroDesktop = bridge; + frame.__agentZeroDesktopBridge = bridge; + remoteWindow.agentZeroDesktop = bridge; + remoteWindow.__agentZeroDesktopBridge = bridge; + this._desktopBridgeReady = true; + this.updateDesktopKeyboardCaptureState(frame); + return bridge; + }, + + desktopBridgeDiagnostics(frame = null) { + return { + ready: this._desktopBridgeReady, + keyboard: this.updateDesktopKeyboardCaptureState(frame), + lastStateOk: this._desktopLastState?.ok ?? null, + }; + }, + + updateDesktopKeyboardCaptureState(frame = null) { + const target = this.desktopFrame(frame); + const client = target?.contentWindow?.client; + const state = { + ready: Boolean(target?.__agentZeroDesktopBridge || target?.contentWindow?.__agentZeroDesktopBridge), + active: Boolean(this._desktopKeyboardActive), + capture: Boolean(client?.capture_keyboard), + focused: Boolean(target && (document.activeElement === target || target.contentDocument?.hasFocus?.())), + }; + this._desktopKeyboardCaptureState = state; + return state; + }, + normalizeXpraDesktopWindow(xpraWindow, width, height) { if (!xpraWindow) return; const normalizedWidth = Math.max(1, Math.round(Number(width || 0))); diff --git a/tests/test_office_canvas_setup.py b/tests/test_office_canvas_setup.py index 156e3d7dd..764fc8a91 100644 --- a/tests/test_office_canvas_setup.py +++ b/tests/test_office_canvas_setup.py @@ -72,6 +72,14 @@ def test_document_canvas_uses_markdown_editor_and_official_libreoffice_desktop_f assert "primeXpraDesktopFrame" in store assert "normalizeXpraDesktopWindow" in store assert "installXpraDesktopWheelBridge" in store + assert "installXpraDesktopAgentBridge" in store + assert "agentZeroDesktop" in store + assert 'callOffice("desktop_state"' in store + assert "desktopToClient" in store + assert "clientToDesktop" in store + assert "requestRefresh" in store + assert "_desktopBridgeReady" in store + assert "_desktopKeyboardCaptureState" in store assert "installXpraDesktopKeyboardBridge" in store assert 
"focusDesktopFrame" in store assert "_desktopKeyboardActive" in store @@ -227,6 +235,10 @@ def test_official_libreoffice_desktop_route_and_packages_are_declared(): linux_desktopctl = ( PROJECT_ROOT / "plugins" / "_office" / "skills" / "linux-desktop" / "scripts" / "desktopctl.sh" ).read_text(encoding="utf-8") + desktop_state_helper = ( + PROJECT_ROOT / "plugins" / "_office" / "helpers" / "desktop_state.py" + ).read_text(encoding="utf-8") + hooks_py = (PROJECT_ROOT / "plugins" / "_office" / "hooks.py").read_text(encoding="utf-8") linux_calc_helper = ( PROJECT_ROOT / "plugins" / "_office" / "skills" / "linux-desktop" / "scripts" / "calc_set_cell.py" ).read_text(encoding="utf-8") @@ -320,10 +332,42 @@ def test_official_libreoffice_desktop_route_and_packages_are_declared(): assert "/a0/usr/projects" in linux_desktop_skill assert "desktopctl.sh" in linux_desktop_skill assert "calc-set-cell" in linux_desktop_skill + assert "Clicks are explicitly last resort" in linux_desktop_skill or "clicks are explicitly last resort" in linux_desktop_skill + assert "fresh Desktop observation" in linux_desktop_skill + assert "observe --json --screenshot" in linux_desktop_skill + assert "Terminal And CLI Agent Verification" in linux_desktop_skill + assert "Do not report from an earlier screenshot path" in linux_desktop_skill + assert "screenshot path returned by that final observation" in linux_desktop_skill assert "xdotool" in linux_desktopctl assert "agent-zero-desktop" in linux_desktopctl assert "launch_app" in linux_desktopctl + assert "paste_key_for_active_window" in linux_desktopctl + assert "active_window_is_terminal" in linux_desktopctl + assert "WM_CLASS" in linux_desktopctl + for command in ( + "state)", + "observe)", + "screenshot)", + "active-window)", + "geometry)", + "wait-window)", + "scroll)", + "drag)", + "right-click)", + "paste-text)", + "sequence)", + ): + assert command in linux_desktopctl assert "calc_set_cell.py" in linux_desktopctl + assert "collect_state" in 
desktop_state_helper + assert "compact_prompt_context" in desktop_state_helper + assert "fresh final" in desktop_state_helper + assert "xwd" in desktop_state_helper + assert "PIL" in desktop_state_helper + assert '"x11-utils"' in hooks_py + assert '"x11-apps"' in hooks_py + assert '"xclip"' in hooks_py + assert '"python3-pil"' in hooks_py assert "wait_for_document" in linux_calc_helper assert "document.store()" in linux_calc_helper assert "read_xlsx_cell" in linux_calc_helper @@ -419,6 +463,8 @@ def test_office_skills_preserve_markdown_first_and_opt_in_desktop_policy(): assert "Download and Open in canvas actions" in office_skill assert "method: \"create\"" in office_skill assert "The Desktop is opt-in" in desktop_skill + assert "coordinate clicks only as a last resort" in desktop_skill + assert "After any GUI action, verify" in desktop_skill assert "custom Markdown editor" in desktop_skill assert "Never open the Desktop/canvas automatically" in desktop_skill assert "persistent Desktop runtime during initial startup" in desktop_skill @@ -432,3 +478,19 @@ def test_office_skills_preserve_markdown_first_and_opt_in_desktop_policy(): assert "must not open the canvas automatically" in excel_skill assert '"format": "odp"' in presentation_skill assert "must not open the canvas automatically" in presentation_skill + + +def test_office_extra_prompt_includes_existing_desktop_state_without_opening_canvas(): + canvas_context = ( + PROJECT_ROOT / "plugins" / "_office" / "helpers" / "canvas_context.py" + ).read_text(encoding="utf-8") + prompt = ( + PROJECT_ROOT / "plugins" / "_office" / "prompts" / "agent.extras.office_canvas.md" + ).read_text(encoding="utf-8") + + assert "build_desktop_context" in canvas_context + assert "session_manifest_exists" in canvas_context + assert "collect_state(include_screenshot=False)" in canvas_context + assert "compact_prompt_context" in canvas_context + assert "ensure_system_desktop" not in canvas_context + assert "[DOCUMENT CANVAS]" in prompt 
diff --git a/tests/test_office_desktop_state.py b/tests/test_office_desktop_state.py new file mode 100644 index 000000000..44a3336ae --- /dev/null +++ b/tests/test_office_desktop_state.py @@ -0,0 +1,201 @@ +from __future__ import annotations + +import subprocess +import struct +import sys +import types +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from plugins._office.helpers import desktop_state + + +def _completed(command, returncode=0, stdout="", stderr=""): + return subprocess.CompletedProcess(command, returncode, stdout, stderr) + + +def test_desktop_state_collects_x11_state_from_mocked_tools(tmp_path, monkeypatch): + session_dir = tmp_path / "sessions" + profile_dir = tmp_path / "profiles" / desktop_state.SESSION_ID + session_dir.mkdir(parents=True) + profile_dir.mkdir(parents=True) + (session_dir / f"{desktop_state.SESSION_ID}.json").write_text( + '{"display": 120, "profile_dir": "%s"}' % profile_dir, + encoding="utf-8", + ) + + monkeypatch.setattr(desktop_state, "SESSION_DIR", session_dir) + monkeypatch.setattr(desktop_state, "PROFILE_DIR", tmp_path / "profiles") + monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path / "screenshots") + monkeypatch.setattr( + desktop_state.shutil, + "which", + lambda name: f"/usr/bin/{name}" + if name in {"xdotool", "xrandr", "xwininfo", "xprop", "xwd", "xclip"} + else "", + ) + + def fake_run(command, **kwargs): + name = Path(command[0]).name + if name == "xrandr": + return _completed(command, stdout="Screen 0: current 1440 x 900, maximum 1920 x 1080\n") + if name == "xdotool" and command[1:3] == ["getmouselocation", "--shell"]: + return _completed(command, stdout="X=12\nY=34\nSCREEN=0\nWINDOW=111\n") + if name == "xdotool" and command[1] == "getactivewindow": + return _completed(command, stdout="111\n") + if name == "xdotool" and command[1] == "search": + return _completed(command, 
stdout="111\n222\n") + if name == "xdotool" and command[1] == "getwindowname": + return _completed(command, stdout={"111": "LibreOffice Calc", "222": "Terminal"}[command[2]] + "\n") + if name == "xwininfo": + geometry = { + "111": (5, 7, 800, 600), + "222": (20, 30, 640, 480), + }[command[2]] + return _completed( + command, + stdout=( + f" Absolute upper-left X: {geometry[0]}\n" + f" Absolute upper-left Y: {geometry[1]}\n" + f" Width: {geometry[2]}\n" + f" Height: {geometry[3]}\n" + ), + ) + if name == "xprop": + window_id = command[2] + if window_id == "111": + return _completed( + command, + stdout='WM_CLASS(STRING) = "libreoffice", "libreoffice-calc"\n_NET_WM_PID(CARDINAL) = 4242\n', + ) + return _completed( + command, + stdout='WM_CLASS(STRING) = "xfce4-terminal", "Xfce4-terminal"\n_NET_WM_PID(CARDINAL) = 4343\n', + ) + raise AssertionError(f"unexpected command: {command}") + + monkeypatch.setattr(desktop_state.subprocess, "run", fake_run) + + state = desktop_state.collect_state() + + assert state["ok"] is True + assert state["display"] == ":120" + assert state["profile_dir"] == str(profile_dir) + assert state["size"] == {"width": 1440, "height": 900} + assert state["pointer"]["x"] == 12 + assert state["active_window"]["title"] == "LibreOffice Calc" + assert state["active_window"]["class"] == "libreoffice-calc" + assert state["active_window"]["geometry"]["width"] == 800 + assert [window["title"] for window in state["windows"]] == ["LibreOffice Calc", "Terminal"] + + +def test_desktop_state_screenshot_capture_uses_xwd_and_pillow_when_available(tmp_path, monkeypatch): + monkeypatch.setattr(desktop_state, "SCREENSHOT_DIR", tmp_path) + capabilities = {"xwd": "/usr/bin/xwd"} + env = {"DISPLAY": ":120"} + + def fake_run(command, *, env, timeout): + raw_path = Path(command[command.index("-out") + 1]) + raw_path.write_bytes(b"xwd") + return _completed(command) + + image_module = types.ModuleType("PIL.Image") + + class FakeImage: + width = 320 + height = 240 + + def 
__enter__(self): + return self + + def __exit__(self, *_args): + return False + + def save(self, target): + Path(target).write_bytes(b"png") + + image_module.open = lambda _path: FakeImage() + pil_module = types.ModuleType("PIL") + pil_module.Image = image_module + + monkeypatch.setattr(desktop_state, "run", fake_run) + monkeypatch.setitem(sys.modules, "PIL", pil_module) + monkeypatch.setitem(sys.modules, "PIL.Image", image_module) + + screenshot = desktop_state.capture_screenshot(env, capabilities, path=tmp_path / "shot.png", errors=[]) + + assert screenshot["ok"] is True + assert screenshot["path"] == str(tmp_path / "shot.png") + assert screenshot["format"] == "png" + assert (tmp_path / "shot.png").read_bytes() == b"png" + assert not (tmp_path / "shot.xwd").exists() + + +def test_xwd_fallback_parser_handles_truecolor_pixels(tmp_path, monkeypatch): + raw_path = tmp_path / "shot.xwd" + target = tmp_path / "shot.png" + header_values = [ + 100, # header_size + 7, # file_version + 2, # pixmap_format + 24, # pixmap_depth + 2, # pixmap_width + 1, # pixmap_height + 0, # xoffset + 1, # byte_order: MSBFirst for pixel bytes + 32, # bitmap_unit + 1, # bitmap_bit_order + 32, # bitmap_pad + 32, # bits_per_pixel + 8, # bytes_per_line + 4, # visual_class: TrueColor + 0x00FF0000, # red_mask + 0x0000FF00, # green_mask + 0x000000FF, # blue_mask + 8, # bits_per_rgb + 256, # colormap_entries + 0, # ncolors + 2, # window_width + 1, # window_height + 0, # window_x + 0, # window_y + 0, # window_bdrwidth + ] + raw_path.write_bytes( + struct.pack(">25I", *header_values) + + bytes.fromhex("00ff0000") + + bytes.fromhex("0000ff00") + ) + + captured: dict[str, object] = {} + image_module = types.ModuleType("PIL.Image") + + class FakeOutputImage: + def putdata(self, pixels): + captured["pixels"] = list(pixels) + + def save(self, path): + Path(path).write_bytes(b"fallback-png") + + def fake_new(mode, size): + captured["mode"] = mode + captured["size"] = size + return FakeOutputImage() + + 
image_module.new = fake_new + pil_module = types.ModuleType("PIL") + pil_module.Image = image_module + + monkeypatch.setitem(sys.modules, "PIL", pil_module) + monkeypatch.setitem(sys.modules, "PIL.Image", image_module) + + converted = desktop_state.convert_xwd_to_image(raw_path, target) + + assert converted == {"width": 2, "height": 1} + assert captured["mode"] == "RGB" + assert captured["size"] == (2, 1) + assert captured["pixels"] == [(255, 0, 0), (0, 255, 0)] + assert target.read_bytes() == b"fallback-png" diff --git a/tests/test_office_document_store.py b/tests/test_office_document_store.py index 55d1b37dd..6f6bca34e 100644 --- a/tests/test_office_document_store.py +++ b/tests/test_office_document_store.py @@ -503,6 +503,57 @@ def test_official_libreoffice_desktop_status_and_url_contract(tmp_path, monkeypa assert "printing=true" in url +def test_office_session_desktop_state_action_defaults_without_screenshot(monkeypatch): + api_module = types.ModuleType("helpers.api") + + class ApiHandler: + def __init__(self, app=None, thread_lock=None): + self.app = app + self.thread_lock = thread_lock + + api_module.ApiHandler = ApiHandler + api_module.Request = object + monkeypatch.setitem(sys.modules, "helpers.api", api_module) + monkeypatch.delitem(sys.modules, "plugins._office.api.office_session", raising=False) + + from plugins._office.api import office_session + + calls = [] + + class FakeManager: + def state(self, *, include_screenshot=False): + calls.append(include_screenshot) + return { + "ok": True, + "display": ":120", + "profile_dir": "/a0/tmp/_office/desktop/profiles/agent-zero-desktop", + "size": {"width": 1440, "height": 900}, + "pointer": {"x": 0, "y": 0, "screen": 0, "window": 0}, + "active_window": None, + "windows": [], + "screenshot": {"ok": False, "path": ""}, + "capabilities": {}, + "errors": [], + } + + monkeypatch.setattr(office_session.libreoffice_desktop, "get_manager", lambda: FakeManager()) + handler = office_session.OfficeSession(app=None, 
thread_lock=None) + request = types.SimpleNamespace(headers={}, host_url="http://localhost:32080") + + default_result = asyncio.run(handler.process({"action": "desktop_state"}, request)) + screenshot_result = asyncio.run( + handler.process({"action": "desktop_state", "include_screenshot": True}, request), + ) + + assert default_result["ok"] is True + assert screenshot_result["ok"] is True + assert calls == [False, True] + monkeypatch.delitem(sys.modules, "plugins._office.api.office_session", raising=False) + api_package = sys.modules.get("plugins._office.api") + if api_package is not None: + monkeypatch.delattr(api_package, "office_session", raising=False) + + def test_official_libreoffice_desktop_manager_opens_binary_session(office_state, tmp_path, monkeypatch): class FakeProcess: pid = 4242