From 4ff3244ce63ee336071f875de99cf6922db93daa Mon Sep 17 00:00:00 2001 From: Alessandro <155005371+3clyp50@users.noreply.github.com> Date: Sun, 26 Apr 2026 23:57:48 +0200 Subject: [PATCH] Add browser annotate mode Add Codex-inspired annotation UI to the built-in Browser surfaces, including the Annotate toggle, Cmd/Ctrl+. shortcut, selection overlay, inline comments, and batch Draft to chat / Send now actions. Wire browser_viewer_annotation through the WebSocket and runtime layers, and expose safe DOM metadata extraction for clicked elements and selected areas without leaking password/value data. Expand regression coverage for the Browser UI, annotation dispatch, runtime helper exposure, prompt formatting, and WebUI extension surface harness behavior. --- plugins/_browser/api/ws_browser.py | 25 + .../_browser/assets/browser-page-content.js | 369 +++++++++++++- plugins/_browser/helpers/runtime.py | 18 +- plugins/_browser/webui/browser-panel.html | 313 +++++++++++- plugins/_browser/webui/browser-store.js | 471 +++++++++++++++++- tests/test_browser_agent_regressions.py | 204 +++++++- tests/test_webui_extension_surfaces.py | 17 +- 7 files changed, 1382 insertions(+), 35 deletions(-) diff --git a/plugins/_browser/api/ws_browser.py b/plugins/_browser/api/ws_browser.py index b1df9146b..8430f7e7a 100644 --- a/plugins/_browser/api/ws_browser.py +++ b/plugins/_browser/api/ws_browser.py @@ -43,6 +43,8 @@ class WsBrowser(WsHandler): return await self._command(data, sid) if event == "browser_viewer_input": return await self._input(data, sid) + if event == "browser_viewer_annotation": + return await self._annotation(data, sid) return WsResult.error( code="UNKNOWN_BROWSER_EVENT", @@ -215,6 +217,29 @@ class WsBrowser(WsHandler): else None, } + async def _annotation(self, data: dict[str, Any], sid: str) -> dict[str, Any] | WsResult: + context_id = self._context_id(data) + if not context_id: + return self._error("MISSING_CONTEXT", "context_id is required", data) + runtime = await 
get_runtime(context_id, create=False) + if not runtime: + return self._error("NO_BROWSER_RUNTIME", "No browser runtime exists for this context", data) + + browser_id = data.get("browser_id") + viewer_id = str(data.get("viewer_id") or "") + payload = data.get("payload") if isinstance(data.get("payload"), dict) else {} + try: + annotation = await runtime.call("annotation_target", browser_id, payload) + except Exception as exc: + return self._error("ANNOTATION_FAILED", str(exc), data) + + return { + "annotation": annotation, + "context_id": context_id, + "browser_id": browser_id, + "viewer_id": viewer_id, + } + async def _snapshot_for_result( self, runtime: Any, diff --git a/plugins/_browser/assets/browser-page-content.js b/plugins/_browser/assets/browser-page-content.js index fec76e633..edf9d9403 100644 --- a/plugins/_browser/assets/browser-page-content.js +++ b/plugins/_browser/assets/browser-page-content.js @@ -1,7 +1,7 @@ (() => { const GLOBAL_KEY = "__spaceBrowserPageContent__"; const DOM_HELPER_KEY = "__spaceBrowserDomHelper__"; - const VERSION = "6"; + const VERSION = "7"; const BLOCK_TAGS = new Set([ "ADDRESS", "ARTICLE", @@ -2842,10 +2842,377 @@ }); } + function cssEscape(value) { + const rawValue = String(value || ""); + if (!rawValue) { + return ""; + } + + if (typeof globalThis.CSS?.escape === "function") { + return globalThis.CSS.escape(rawValue); + } + + return rawValue.replace(/[^a-zA-Z0-9_-]/gu, (character) => `\\${character}`); + } + + function getClassSummary(element) { + try { + return [...(element?.classList || [])] + .map((className) => normalizeAttributeText(className)) + .filter(Boolean) + .slice(0, 4) + .join(" "); + } catch { + return ""; + } + } + + function buildCssSelector(element) { + if (!isElementNode(element)) { + return ""; + } + + const id = normalizeAttributeText(element.getAttribute?.("id")); + if (id) { + return `#${cssEscape(id)}`; + } + + const parts = []; + let current = element; + while (isElementNode(current) && current !== 
globalThis.document?.documentElement && parts.length < 6) { + const tagName = getTagName(current).toLowerCase(); + if (!tagName) { + break; + } + + let part = tagName; + const classes = getClassSummary(current) + .split(/\s+/u) + .filter(Boolean) + .slice(0, 2); + if (classes.length && !["body", "html"].includes(tagName)) { + part += classes.map((className) => `.${cssEscape(className)}`).join(""); + } + + const parent = current.parentElement; + if (parent) { + const siblings = [...parent.children].filter((sibling) => getTagName(sibling) === getTagName(current)); + if (siblings.length > 1) { + part += `:nth-of-type(${siblings.indexOf(current) + 1})`; + } + } + + parts.unshift(part); + if (tagName === "body") { + break; + } + current = parent; + } + + return parts.join(" > "); + } + + function sanitizeAnnotationDom(value) { + return truncateText( + String(value || "") + .replace(/(<input\b(?=[^>]*\btype\s*=\s*(["'])?password\2?)[^>]*?)\s+value\s*=\s*(["'])[\s\S]*?\3/giu, "$1 value=\"[redacted]\"") + .replace(/\svalue\s*=\s*(["'])[\s\S]{0,600}?\1/giu, " value=\"[redacted]\"") + .replace(/\sdata-space-browser-live-value\s*=\s*(["'])[\s\S]{0,600}?\1/giu, "") + .replace(/\sdata-space-browser-selected-text\s*=\s*(["'])[\s\S]{0,600}?\1/giu, ""), + 1200 + ); + } + + function summarizeAnnotationElement(element) { + if (!isElementNode(element)) { + return null; + } + + const summaryData = collectReferenceSummaryData(element, { + includeLabelQuotes: false, + includeLinkUrls: true, + includeSemanticTags: true, + includeStateTags: true + }); + const rawDom = serializeElementSnapshot(element); + return { + classes: getClassSummary(element), + dom: sanitizeAnnotationDom(rawDom), + id: normalizeAttributeText(element.getAttribute?.("id")), + kind: summaryData.kind, + name: normalizeAttributeText(element.getAttribute?.("name")), + rect: getElementRectSafe(element), + role: normalizeAttributeText(element.getAttribute?.("role")).toLowerCase(), + selector: buildCssSelector(element), + semanticTags: 
Array.isArray(summaryData.semanticTags) ? summaryData.semanticTags.slice(0, 4) : [], + stateTags: Array.isArray(summaryData.state?.stateTags) ? summaryData.state.stateTags.slice(0, 8) : [], + summary: truncateText(summaryData.summary || getLabelText(element, { + includeAlt: true, + includeDescendantImageAlt: true, + includePlaceholder: true, + includeText: true + }), 240), + tagName: getTagName(element) + }; + } + + function annotationViewport() { + return { + height: Math.max(0, Number(globalThis.innerHeight || globalThis.document?.documentElement?.clientHeight || 0)), + scrollX: Number(globalThis.scrollX || globalThis.pageXOffset || 0), + scrollY: Number(globalThis.scrollY || globalThis.pageYOffset || 0), + width: Math.max(0, Number(globalThis.innerWidth || globalThis.document?.documentElement?.clientWidth || 0)) + }; + } + + function normalizeAnnotationPoint(payload = {}, viewport = annotationViewport()) { + const source = payload?.point && typeof payload.point === "object" ? payload.point : payload; + const width = Math.max(1, Number(viewport.width || 1)); + const height = Math.max(1, Number(viewport.height || 1)); + return { + x: Math.max(0, Math.min(width, Number(source?.x || 0))), + y: Math.max(0, Math.min(height, Number(source?.y || 0))) + }; + } + + function normalizeAnnotationRectPayload(payload = {}, viewport = annotationViewport()) { + const source = payload?.rect && typeof payload.rect === "object" ? 
payload.rect : payload; + const width = Math.max(1, Number(viewport.width || 1)); + const height = Math.max(1, Number(viewport.height || 1)); + const x = Math.max(0, Math.min(width, Number(source?.x || 0))); + const y = Math.max(0, Math.min(height, Number(source?.y || 0))); + return { + height: Math.max(1, Math.min(height - y, Number(source?.height || source?.h || 1))), + width: Math.max(1, Math.min(width - x, Number(source?.width || source?.w || 1))), + x, + y + }; + } + + function intersectRects(leftRect, rightRect) { + if (!leftRect || !rightRect) { + return null; + } + + const x = Math.max(Number(leftRect.x || 0), Number(rightRect.x || 0)); + const y = Math.max(Number(leftRect.y || 0), Number(rightRect.y || 0)); + const right = Math.min( + Number(leftRect.x || 0) + Number(leftRect.width || 0), + Number(rightRect.x || 0) + Number(rightRect.width || 0) + ); + const bottom = Math.min( + Number(leftRect.y || 0) + Number(leftRect.height || 0), + Number(rightRect.y || 0) + Number(rightRect.height || 0) + ); + const width = right - x; + const height = bottom - y; + if (width <= 0 || height <= 0) { + return null; + } + return { + area: width * height, + height, + width, + x, + y + }; + } + + function deepElementFromPoint(x, y) { + let element = null; + try { + element = globalThis.document?.elementFromPoint?.(x, y) || null; + } catch { + return null; + } + + let guard = 0; + while (isElementNode(element) && element.shadowRoot && guard < 8) { + guard += 1; + try { + const nestedElement = element.shadowRoot.elementFromPoint?.(x, y); + if (!nestedElement || nestedElement === element) { + break; + } + element = nestedElement; + } catch { + break; + } + } + + return element; + } + + function findAnnotationTarget(element) { + if (!isElementNode(element)) { + return null; + } + + const selector = [ + "a[href]", + "button", + "input", + "textarea", + "select", + "summary", + "[role]", + "img", + "label", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "li", 
+ "td", + "th", + "article", + "section", + "nav", + "header", + "main", + "footer" + ].join(","); + const target = element.closest?.(selector) || element; + return isElementNode(target) && !isHiddenElement(target) ? target : element; + } + + function isMeaningfulAnnotationElement(element) { + if (!isElementNode(element) || isHiddenElement(element)) { + return false; + } + + if (isInteractiveElement(element) || getTagName(element) === "IMG") { + return true; + } + + const tagName = getTagName(element); + const role = normalizeAttributeText(element.getAttribute?.("role")).toLowerCase(); + return Boolean( + role + || /^H[1-6]$/u.test(tagName) + || ["ARTICLE", "SECTION", "MAIN", "NAV", "HEADER", "FOOTER", "FORM", "LABEL", "P", "LI", "TD", "TH"].includes(tagName) + ); + } + + function collectIntersectingAnnotationElements(rect) { + const selector = [ + "a[href]", + "button", + "input", + "textarea", + "select", + "summary", + "[role]", + "img", + "label", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "p", + "li", + "td", + "th", + "article", + "section", + "main", + "nav", + "header", + "footer" + ].join(","); + let candidates = []; + try { + candidates = [...(globalThis.document?.querySelectorAll?.(selector) || [])]; + } catch { + candidates = []; + } + + const seen = new Set(); + return candidates + .map((element) => { + if (!isMeaningfulAnnotationElement(element) || seen.has(element)) { + return null; + } + seen.add(element); + const elementRect = getElementRectSafe(element); + const intersection = intersectRects(rect, elementRect); + if (!intersection || intersection.area < 48) { + return null; + } + return { + element, + elementArea: Math.max(1, Number(elementRect.width || 0) * Number(elementRect.height || 0)), + intersection + }; + }) + .filter(Boolean) + .sort((left, right) => { + if (right.intersection.area !== left.intersection.area) { + return right.intersection.area - left.intersection.area; + } + return left.elementArea - right.elementArea; + 
}) + .slice(0, 12) + .map((entry) => summarizeAnnotationElement(entry.element)) + .filter(Boolean); + } + + function annotate(payload = null) { + const request = payload && typeof payload === "object" ? payload : {}; + const viewport = annotationViewport(); + const kind = request.kind === "area" || request.rect ? "area" : "element"; + + if (kind === "area") { + const rect = normalizeAnnotationRectPayload(request, viewport); + const point = { + x: rect.x + rect.width / 2, + y: rect.y + rect.height / 2 + }; + const elements = collectIntersectingAnnotationElements(rect); + const fallbackElement = findAnnotationTarget(deepElementFromPoint(point.x, point.y)); + const fallbackTarget = fallbackElement ? summarizeAnnotationElement(fallbackElement) : null; + return { + elements, + kind, + point, + rect, + status: elements.length || fallbackTarget ? "ok" : "empty", + target: elements[0] || fallbackTarget, + viewport + }; + } + + const point = normalizeAnnotationPoint(request, viewport); + const rawElement = deepElementFromPoint(point.x, point.y); + const targetElement = findAnnotationTarget(rawElement); + const target = targetElement ? summarizeAnnotationElement(targetElement) : null; + return { + kind, + point, + rect: target?.rect || { + height: 1, + width: 1, + x: point.x, + y: point.y + }, + status: target ? 
"ok" : "empty", + target, + viewport + }; + } + globalThis[GLOBAL_KEY] = { click(referenceId) { return activateElement(referenceId); }, + annotate, capture, clear() { state.captureId = 0; diff --git a/plugins/_browser/helpers/runtime.py b/plugins/_browser/helpers/runtime.py index 036d1c715..db459de91 100644 --- a/plugins/_browser/helpers/runtime.py +++ b/plugins/_browser/helpers/runtime.py @@ -557,6 +557,22 @@ class _BrowserRuntimeCore: self.last_interacted_browser_id = resolved_id return result or {} + async def annotation_target( + self, + browser_id: int | str | None, + payload: dict[str, Any] | None = None, + ) -> dict[str, Any]: + await self.ensure_started() + resolved_id = self._resolve_browser_id(browser_id) + page = self._page(resolved_id) + await self._ensure_content_helper(page) + result = await page.evaluate( + "(payload) => globalThis.__spaceBrowserPageContent__.annotate(payload || null)", + payload or None, + ) + self.last_interacted_browser_id = resolved_id + return result or {} + async def evaluate(self, browser_id: int | str | None, script: str) -> dict[str, Any]: await self.ensure_started() resolved_id = self._resolve_browser_id(browser_id) @@ -918,7 +934,7 @@ class _BrowserRuntimeCore: async def _ensure_content_helper(self, page: Any) -> None: has_helper = await page.evaluate( - "() => Boolean(globalThis.__spaceBrowserPageContent__?.capture)" + "() => Boolean(globalThis.__spaceBrowserPageContent__?.capture && globalThis.__spaceBrowserPageContent__?.annotate)" ) if has_helper: return diff --git a/plugins/_browser/webui/browser-panel.html b/plugins/_browser/webui/browser-panel.html index e25808403..108b9c966 100644 --- a/plugins/_browser/webui/browser-panel.html +++ b/plugins/_browser/webui/browser-panel.html @@ -11,7 +11,7 @@