fix(computer-use-mcp): route target clicks through action executor (#1727)

---------

Co-authored-by-agent: Antigravity <antigravity@gemini.com>
This commit is contained in:
刘梓恒 2026-04-26 05:53:15 +08:00 committed by GitHub
parent a9359a3924
commit 2e9020a06f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 545 additions and 214 deletions

3
pnpm-lock.yaml generated
View file

@ -3989,6 +3989,9 @@ importers:
'@modelcontextprotocol/sdk':
specifier: 'catalog:'
version: 1.29.0(@cfworker/json-schema@4.1.1)(zod@4.3.6)
'@moeru/std':
specifier: 'catalog:'
version: 0.1.0-beta.17
node-pty:
specifier: 'catalog:'
version: 1.1.0

View file

@ -50,6 +50,7 @@
},
"dependencies": {
"@modelcontextprotocol/sdk": "catalog:",
"@moeru/std": "catalog:",
"node-pty": "catalog:",
"ws": "^8.20.0",
"zod": "^4.3.6"

View file

@ -51,7 +51,7 @@ export async function createComputerUseMcpServer(config = resolveComputerUseConf
},
})
const cdpCleanup = registerCdpTools({ server, runtime })
registerDesktopGroundingTools({ server, runtime })
registerDesktopGroundingTools({ server, runtime, executeAction })
registerChromeSessionTools({ server, runtime })
return {

View file

@ -1,3 +1,4 @@
import type { ComputerUseConfig } from '../types'
import type { ComputerUseServerRuntime } from './runtime'
import { describe, expect, it, vi } from 'vitest'
@ -6,7 +7,7 @@ import { RunStateManager } from '../state'
import { createDisplayInfo, createLocalExecutionTarget, createTerminalState, createTestConfig } from '../test-fixtures'
import { createExecuteAction } from './action-executor'
function createRuntimeForActionTest() {
function createRuntimeForActionTest(configOverrides: Partial<ComputerUseConfig> = {}) {
const stateManager = new RunStateManager()
const session = {
listPendingActions: vi.fn().mockReturnValue([]),
@ -51,6 +52,11 @@ function createRuntimeForActionTest() {
scroll: vi.fn(),
wait: vi.fn(),
}
const desktopSessionController = {
getSession: vi.fn().mockReturnValue(null),
ensureControlledAppInForeground: vi.fn(),
touch: vi.fn(),
}
const terminalRunner = {
describe: () => ({ kind: 'local-shell-runner' as const, notes: [] }),
execute: vi.fn(),
@ -79,6 +85,7 @@ function createRuntimeForActionTest() {
executor: 'dry-run',
approvalMode: 'never',
defaultCaptureAfter: false,
...configOverrides,
}),
session,
executor,
@ -87,6 +94,8 @@ function createRuntimeForActionTest() {
cdpBridgeManager,
stateManager,
taskMemory: {},
desktopSessionController,
chromeSessionManager: {},
} as unknown as ComputerUseServerRuntime
return {
@ -95,10 +104,255 @@ function createRuntimeForActionTest() {
executor,
cdpBridgeManager,
stateManager,
desktopSessionController,
}
}
describe('createExecuteAction', () => {
it('executes desktop_click_target through the shared policy and audit pipeline', async () => {
  // Arrange: seed a fresh observation that contains a single AX button candidate.
  const harness = createRuntimeForActionTest()
  const submitButton = {
    id: 't_0',
    source: 'ax',
    appName: 'Google Chrome',
    role: 'AXButton',
    label: 'Submit',
    bounds: { x: 100, y: 200, width: 80, height: 30 },
    confidence: 0.95,
    interactable: true,
  }
  harness.stateManager.updateGroundingSnapshot({
    snapshotId: 'dg_1',
    capturedAt: new Date().toISOString(),
    foregroundApp: 'Google Chrome',
    windows: [],
    screenshot: {
      dataBase64: '',
      mimeType: 'image/png',
      path: '',
      capturedAt: new Date().toISOString(),
    },
    targetCandidates: [submitButton],
    staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
  } as any)

  // Act: run the click through the shared action executor.
  const runAction = createExecuteAction(harness.runtime)
  const outcome = await runAction(
    { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
    'desktop_click_target',
  )

  // Assert: the OS click is expected at (140, 215), i.e. the centre of the
  // 80x30 bounds anchored at (100, 200), with default button and click count.
  const expectedPoint = { x: 140, y: 215 }
  expect(outcome.isError).not.toBe(true)
  expect(harness.executor.click).toHaveBeenCalledWith(expect.objectContaining({
    ...expectedPoint,
    button: 'left',
    clickCount: 1,
    pointerTrace: [{ ...expectedPoint, delayMs: 0 }],
  }))

  // Budget consumption, pointer tracking and the audit record all go through the session.
  expect(harness.session.consumeOperation).toHaveBeenCalledWith(1)
  expect(harness.session.setPointerPosition).toHaveBeenCalledWith(expectedPoint)
  expect(harness.session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'executed',
    toolName: 'desktop_click_target',
    action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
  }))

  // Run state remembers the clicked candidate and the completed pointer intent.
  const runState = harness.stateManager.getState()
  expect(runState.lastClickedCandidateId).toBe('t_0')
  expect(harness.stateManager.getState().lastPointerIntent).toMatchObject({
    candidateId: 't_0',
    phase: 'completed',
    executionResult: 'success',
  })

  // The click summary replaces the generic executor summary in the text content.
  const textItem = outcome.content.find(entry => entry.type === 'text')
  expect(textItem?.text).toContain('Clicked: ax AXButton "Submit"')
})
it('queues desktop_click_target without refocusing when approval is required', async () => {
  // With approvalMode 'all' the click must be parked as a pending action.
  const harness = createRuntimeForActionTest({ approvalMode: 'all' })
  const { desktopSessionController: sessionController, executor, session } = harness
  // Fresh literal on every use so no expectation shares a reference with the input.
  const clickT0 = () => ({ kind: 'desktop_click_target' as const, input: { candidateId: 't_0' } })

  // A desktop session controls Chrome while AIRI currently holds the foreground.
  sessionController.getSession.mockReturnValue({
    id: 'ds_1',
    controlledApp: 'Google Chrome',
    ownedWindows: [],
    createdAt: new Date().toISOString(),
    lastActiveAt: new Date().toISOString(),
  })
  executor.getForegroundContext.mockResolvedValue({
    available: true,
    appName: 'AIRI',
    platform: 'darwin',
  })

  // The session hands back this pending action when asked to queue one.
  session.createPendingAction.mockReturnValue({
    id: 'pa_1',
    createdAt: new Date().toISOString(),
    toolName: 'desktop_click_target',
    action: clickT0(),
    context: {
      available: true,
      appName: 'Google Chrome',
      platform: 'darwin',
    },
    policy: {
      allowed: true,
      requiresApproval: true,
      reasons: [],
      riskLevel: 'medium',
      estimatedOperationUnits: 1,
    },
  })

  const runAction = createExecuteAction(harness.runtime)
  const outcome = await runAction(clickT0(), 'desktop_click_target')

  // The caller is told that approval is pending for exactly this action.
  expect(outcome.structuredContent).toMatchObject({
    status: 'approval_required',
    pendingActionId: 'pa_1',
    action: clickT0(),
  })

  // The queued action carries the controlled app (not the actual foreground app) as context.
  expect(session.createPendingAction).toHaveBeenCalledWith(expect.objectContaining({
    toolName: 'desktop_click_target',
    action: clickT0(),
    context: expect.objectContaining({ appName: 'Google Chrome' }),
  }))

  // Nothing observable happens on the desktop while approval is outstanding.
  expect(sessionController.ensureControlledAppInForeground).not.toHaveBeenCalled()
  expect(executor.click).not.toHaveBeenCalled()
  expect(session.consumeOperation).not.toHaveBeenCalled()
})
it('uses controlled-app context for desktop_click_target policy and refocuses only during execution', async () => {
  const harness = createRuntimeForActionTest()
  const { desktopSessionController: sessionController, executor, session } = harness

  // Fresh observation of Chrome containing one clickable AX button.
  const submitButton = {
    id: 't_0',
    source: 'ax',
    appName: 'Google Chrome',
    role: 'AXButton',
    label: 'Submit',
    bounds: { x: 100, y: 200, width: 80, height: 30 },
    confidence: 0.95,
    interactable: true,
  }
  harness.stateManager.updateGroundingSnapshot({
    snapshotId: 'dg_1',
    capturedAt: new Date().toISOString(),
    foregroundApp: 'Google Chrome',
    windows: [],
    screenshot: {
      dataBase64: '',
      mimeType: 'image/png',
      path: '',
      capturedAt: new Date().toISOString(),
    },
    targetCandidates: [submitButton],
    staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
  } as any)

  // The session controls Chrome, refocusing succeeds, but AIRI is what is actually in front.
  sessionController.getSession.mockReturnValue({
    id: 'ds_1',
    controlledApp: 'Google Chrome',
    ownedWindows: [],
    createdAt: new Date().toISOString(),
    lastActiveAt: new Date().toISOString(),
  })
  sessionController.ensureControlledAppInForeground.mockResolvedValue(true)
  executor.getForegroundContext.mockResolvedValue({
    available: true,
    appName: 'AIRI',
    platform: 'darwin',
  })

  const runAction = createExecuteAction(harness.runtime)
  const outcome = await runAction(
    { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
    'desktop_click_target',
  )

  expect(outcome.isError).not.toBe(true)

  // Refocus is attempted during execution and is given the real foreground app.
  expect(sessionController.ensureControlledAppInForeground).toHaveBeenCalledWith(expect.objectContaining({
    currentForeground: expect.objectContaining({ appName: 'AIRI' }),
  }))
  expect(executor.click).toHaveBeenCalledOnce()

  // The 'executed' audit record is evaluated against the controlled app ...
  const executedRecord = expect.objectContaining({
    event: 'executed',
    context: expect.objectContaining({ appName: 'Google Chrome' }),
    policy: expect.objectContaining({ allowed: true }),
  })
  expect(session.record).toHaveBeenCalledWith(executedRecord)

  // ... while the 'requested' record still preserves what was really in the foreground.
  const requestedRecord = expect.objectContaining({
    event: 'requested',
    result: expect.objectContaining({
      actualForegroundContext: expect.objectContaining({ appName: 'AIRI' }),
    }),
  })
  expect(session.record).toHaveBeenCalledWith(requestedRecord)
})
it('returns a structured failure when controlled-app refocus fails during desktop_click_target execution', async () => {
  const harness = createRuntimeForActionTest()
  const { desktopSessionController: sessionController, executor, session } = harness

  // Fresh observation of Chrome containing one clickable AX button.
  const submitButton = {
    id: 't_0',
    source: 'ax',
    appName: 'Google Chrome',
    role: 'AXButton',
    label: 'Submit',
    bounds: { x: 100, y: 200, width: 80, height: 30 },
    confidence: 0.95,
    interactable: true,
  }
  harness.stateManager.updateGroundingSnapshot({
    snapshotId: 'dg_1',
    capturedAt: new Date().toISOString(),
    foregroundApp: 'Google Chrome',
    windows: [],
    screenshot: {
      dataBase64: '',
      mimeType: 'image/png',
      path: '',
      capturedAt: new Date().toISOString(),
    },
    targetCandidates: [submitButton],
    staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
  } as any)

  // The session controls Chrome, AIRI is in front, and bringing Chrome forward rejects.
  sessionController.getSession.mockReturnValue({
    id: 'ds_1',
    controlledApp: 'Google Chrome',
    ownedWindows: [],
    createdAt: new Date().toISOString(),
    lastActiveAt: new Date().toISOString(),
  })
  sessionController.ensureControlledAppInForeground.mockRejectedValue(new Error('Chrome session unavailable'))
  executor.getForegroundContext.mockResolvedValue({
    available: true,
    appName: 'AIRI',
    platform: 'darwin',
  })

  const runAction = createExecuteAction(harness.runtime)
  const outcome = await runAction(
    { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
    'desktop_click_target',
  )

  // The rejection surfaces as an error result carrying the original message.
  expect(outcome.isError).toBe(true)
  const textItem = outcome.content.find(entry => entry.type === 'text')
  expect(textItem?.text).toContain('Chrome session unavailable')

  // No click is sent and no budget is spent.
  expect(executor.click).not.toHaveBeenCalled()
  expect(session.consumeOperation).not.toHaveBeenCalled()

  // The failure is audited against the controlled-app context.
  const failedRecord = expect.objectContaining({
    event: 'failed',
    toolName: 'desktop_click_target',
    context: expect.objectContaining({ appName: 'Google Chrome' }),
    result: expect.objectContaining({ error: 'Chrome session unavailable' }),
  })
  expect(session.record).toHaveBeenCalledWith(failedRecord)
})
it('fails desktop_click_target before consuming budget when no observe snapshot exists', async () => {
  // No grounding snapshot is seeded, so there is nothing to resolve the candidate against.
  const harness = createRuntimeForActionTest()
  const runAction = createExecuteAction(harness.runtime)

  const outcome = await runAction(
    { kind: 'desktop_click_target', input: { candidateId: 't_missing' } },
    'desktop_click_target',
  )

  // The caller gets an error that points at the missing observation.
  expect(outcome.isError).toBe(true)
  const textItem = outcome.content.find(entry => entry.type === 'text')
  expect(textItem?.text).toContain('No desktop_observe snapshot available')

  // No click is sent and no budget is spent, but the failure is still audited.
  expect(harness.executor.click).not.toHaveBeenCalled()
  expect(harness.session.consumeOperation).not.toHaveBeenCalled()
  expect(harness.session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'failed',
    toolName: 'desktop_click_target',
    action: { kind: 'desktop_click_target', input: { candidateId: 't_missing' } },
  }))
})
it('refreshes browser surface availability for direct actions before evaluating strategy', async () => {
const { runtime, cdpBridgeManager } = createRuntimeForActionTest()

View file

@ -4,6 +4,7 @@ import type {
ActionInvocation,
ComputerUseConfig,
DesktopExecutor,
ForegroundContext,
PolicyDecision,
ScreenshotArtifact,
TerminalCommandResult,
@ -33,6 +34,7 @@ import {
maskEnvValuePreview,
readEnvValue,
} from '../utils/env-file'
import { executeDesktopClickTarget } from './desktop-grounding-actions'
import { describeExecutionTarget } from './formatters'
import { refreshRuntimeRunState } from './refresh-run-state'
import {
@ -117,10 +119,41 @@ function toTerminalStateContent(state: TerminalState) {
}
}
/**
 * Picks the foreground context that policy evaluation should see for an action.
 *
 * Every action other than `desktop_click_target` is evaluated against the
 * actual foreground context. A target click is evaluated against the active
 * desktop session's controlled app instead, whenever such a session exists and
 * the controlled app is not already the available foreground app.
 *
 * The pass-through cases return the very same `actualContext` object, so a
 * caller can use reference equality to detect whether a substitution happened.
 *
 * @param params.action - The (already normalized) action about to be evaluated.
 * @param params.actualContext - The foreground context observed on the machine.
 * @param params.runtime - Runtime that provides the desktop session controller.
 * @returns `actualContext` itself, or a new context naming the controlled app
 *   on the same platform and marked as available.
 */
function getPolicyEvaluationContext(params: {
  action: ActionInvocation
  actualContext: ForegroundContext
  runtime: ComputerUseServerRuntime
}): ForegroundContext {
  const { action, actualContext, runtime } = params

  // The session is only consulted for target clicks; all other kinds never touch it.
  const controlledApp = action.kind === 'desktop_click_target'
    ? runtime.desktopSessionController.getSession()?.controlledApp
    : undefined

  // Nothing to substitute: no controlled app, or it is already the foreground app.
  if (!controlledApp || (actualContext.available && actualContext.appName === controlledApp)) {
    return actualContext
  }

  return {
    available: true,
    appName: controlledApp,
    platform: actualContext.platform,
  }
}
export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteAction {
return async (action, toolName, options = {}) => {
const normalizedAction = normalizeConfiguredAppAction(action, runtime.config.openableApps)
const { executionTarget, context, displayInfo } = await refreshRuntimeRunState(runtime)
const { executionTarget, context: actualContext, displayInfo } = await refreshRuntimeRunState(runtime)
const context = getPolicyEvaluationContext({
action: normalizedAction,
actualContext,
runtime,
})
const actualForegroundContext = context === actualContext ? undefined : actualContext
const budget = runtime.session.getBudgetState()
const preflight = getRuntimePreflight({
@ -159,6 +192,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
executionTarget,
displayInfo,
coordinateSpace: preflight.coordinateSpace,
actualForegroundContext,
},
})
@ -255,6 +289,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
let backendResult: Record<string, unknown> = {}
let clipboardStructuredContent: Record<string, unknown> | undefined
let secretStructuredContent: Record<string, unknown> | undefined
let summaryOverride: string | undefined
switch (normalizedAction.kind) {
case 'screenshot': {
@ -515,6 +550,12 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
}
break
}
case 'desktop_click_target': {
const result = await executeDesktopClickTarget(runtime, normalizedAction.input)
backendResult = result.backendResult
summaryOverride = result.summary
break
}
}
runtime.session.consumeOperation(decision.estimatedOperationUnits)
@ -562,7 +603,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
})
return buildSuccessResponse({
summary: `${intent} ${outcome}${advisorySummary ? ` Strategy: ${advisorySummary}` : ''}`,
summary: summaryOverride ?? `${intent} ${outcome}${advisorySummary ? ` Strategy: ${advisorySummary}` : ''}`,
screenshot,
structuredContent: {
status: 'executed',

View file

@ -0,0 +1,195 @@
import type { ExecutorActionResult } from '../types'
import type { ComputerUseServerRuntime } from './runtime'
import { errorMessageFrom } from '@moeru/std'
import { decideBrowserAction } from '../browser-action-router'
import { getUnsupportedBrowserDomActions, isBrowserDomActionSupported } from '../browser-dom/capabilities'
import { resolveSnapByCandidate } from '../snap-resolver'
import { sleep } from '../utils/sleep'
/**
 * Oldest grounding snapshot, in milliseconds, that `executeDesktopClickTarget`
 * will still click against; anything older is rejected with a request to
 * re-observe.
 */
const DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS = 5 * 1000

/** Outcome of a successful `executeDesktopClickTarget` call. */
export interface DesktopClickTargetExecution {
  /** Newline-joined, human-readable report of what was clicked and how it was routed. */
  summary: string
  /** Structured details of the click (candidate, snap, route, OS input result, ...). */
  backendResult: Record<string, unknown>
}
/**
 * Clicks a target candidate taken from the most recent grounding snapshot.
 *
 * Steps, in order:
 * 1. Validate the request against run state (snapshot present, not a repeat
 *    click on the same candidate, snapshot not older than
 *    `DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS`, candidate resolvable). All of this
 *    happens before any focus change or input is sent.
 * 2. If a desktop session with a controlled app is active, ask the session
 *    controller to bring that app to the foreground.
 * 3. Publish an `executing` pointer intent, then perform the click through the
 *    browser-DOM bridge when the router selects it, falling back to an OS-level
 *    click when the bridge lacks the required actions or the bridge call throws.
 * 4. Publish the final pointer intent and build the textual summary.
 *
 * @param runtime - Server runtime providing state, session, executor and bridges.
 * @param input - Click request.
 * @param input.candidateId - Id of a candidate from the last grounding snapshot.
 * @param input.clickCount - Number of clicks; 1 is used when omitted.
 * @param input.button - Mouse button; `'left'` is used when omitted.
 * @returns The human-readable summary plus structured details of the click.
 * @throws Error when no snapshot exists, when the same candidate was already
 *   clicked, when the snapshot is too old, or when the candidate cannot be
 *   resolved. Errors from refocusing propagate as-is. Errors raised while
 *   routing or clicking are rethrown after an `error` pointer intent is published.
 */
export async function executeDesktopClickTarget(
runtime: ComputerUseServerRuntime,
input: {
candidateId: string
clickCount?: number
button?: 'left' | 'right' | 'middle'
},
): Promise<DesktopClickTargetExecution> {
const { candidateId, clickCount, button } = input
const state = runtime.stateManager.getState()
// Guard: a click is only meaningful relative to an existing observation.
if (!state.lastGroundingSnapshot) {
throw new Error('No desktop_observe snapshot available. Call desktop_observe first to get a list of target candidates.')
}
const snapshot = state.lastGroundingSnapshot
// Guard: refuse to click the same candidate twice in a row.
// NOTE(review): presumably a new desktop_observe resets lastClickedCandidateId — confirm in the state manager.
if (state.lastClickedCandidateId === candidateId) {
throw new Error(`You already clicked candidate "${candidateId}" without calling desktop_observe again. Call desktop_observe to refresh the state before clicking the same target.`)
}
// Guard: refuse to click against an observation that is too old.
// NOTE(review): an unparseable capturedAt yields NaN here; NaN > limit is false,
// so such a snapshot is treated as fresh — confirm capturedAt is always a valid timestamp.
const snapshotAge = Date.now() - new Date(snapshot.capturedAt).getTime()
if (snapshotAge > DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS) {
throw new Error(`Grounding snapshot "${snapshot.snapshotId}" is ${Math.round(snapshotAge / 1000)}s old. Call desktop_observe to get a fresh snapshot before clicking.`)
}
// Resolve the candidate to a concrete point; a 'none' source without a
// candidate id is treated as "candidate not found".
const snap = resolveSnapByCandidate(candidateId, snapshot)
if (snap.source === 'none' && !snap.candidateId) {
throw new Error(`Candidate "${candidateId}" not found in snapshot "${snapshot.snapshotId}". Available candidates: ${snapshot.targetCandidates.map(c => c.id).join(', ')}`)
}
// Validation passed: from here on the function may change window focus and send input.
const sessionCtrl = runtime.desktopSessionController
const activeSession = sessionCtrl.getSession()
if (activeSession?.controlledApp) {
// Re-read the foreground app right before clicking and let the session
// controller decide whether the controlled app has to be activated.
const currentForeground = await runtime.executor.getForegroundContext()
const wasAlreadyInFront = await sessionCtrl.ensureControlledAppInForeground({
currentForeground,
chromeSessionManager: runtime.chromeSessionManager,
activateApp: async (appName) => {
await runtime.executor.focusApp({ app: appName })
},
})
// Short pause after an activation — presumably to let the window settle before clicking.
if (!wasAlreadyInFront) {
await sleep(200)
}
// NOTE(review): touch() presumably refreshes the session's last-activity marker — confirm.
sessionCtrl.touch()
}
// May be undefined even after a successful snap resolution; handled below.
const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)
// Pointer intent published to run state before the click is attempted.
const intent = {
mode: 'execute' as const,
candidateId,
rawPoint: snap.rawPoint,
snappedPoint: snap.snappedPoint,
source: snap.source,
confidence: candidate?.confidence ?? 0,
path: [
{ x: snap.snappedPoint.x, y: snap.snappedPoint.y, delayMs: 0 },
],
phase: 'executing' as const,
}
runtime.stateManager.updatePointerIntent(intent)
// Declared outside the try block so the catch handler can report the route
// that was in effect when the failure happened.
let executionRoute = 'os_input'
let routeNote = ''
let routeReason = 'candidate not found'
// Stays undefined when the click was delivered through the browser-DOM bridge.
let osInputResult: ExecutorActionResult | undefined
// OS-level click at the snapped point; also records the pointer position on the session.
const executeOsClick = async () => {
const result = await runtime.executor.click({
x: snap.snappedPoint.x,
y: snap.snappedPoint.y,
button: button || 'left',
clickCount: clickCount ?? 1,
pointerTrace: intent.path,
})
runtime.session.setPointerPosition({ x: snap.snappedPoint.x, y: snap.snappedPoint.y })
return result
}
try {
// Ask the router whether this candidate should go through the browser-DOM
// bridge or OS input; without a candidate object only OS input is possible.
const bridgeConnected = runtime.browserDomBridge?.getStatus().connected ?? false
const routeDecision = candidate
? decideBrowserAction(candidate, bridgeConnected, button, clickCount)
: { route: 'os_input' as const, reason: 'candidate not found' }
executionRoute = routeDecision.route
routeReason = routeDecision.reason
if (routeDecision.route === 'browser_dom' && routeDecision.selector) {
// Bridge actions that must be supported for the chosen bridge method.
const requiredActions = routeDecision.bridgeMethod === 'checkCheckbox'
? ['checkCheckbox']
: ['getClickTarget', 'clickAt']
if (!isBrowserDomActionSupported(runtime.browserDomBridge, ...requiredActions)) {
// The connected transport cannot perform the action: fall back to an OS click
// and explain why in both the route reason and the route note.
executionRoute = 'os_input'
routeReason = `browser-dom extension transport does not support ${requiredActions.join(' + ')}`
routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} is unavailable on the connected extension transport (${getUnsupportedBrowserDomActions(runtime.browserDomBridge, ...requiredActions).join(', ')} unsupported), fell back to OS input`
osInputResult = await executeOsClick()
}
else {
try {
// Restrict the bridge call to the routed frame when the router named one.
const frameIds = routeDecision.frameId !== undefined ? [routeDecision.frameId] : undefined
if (routeDecision.bridgeMethod === 'checkCheckbox') {
await runtime.browserDomBridge.checkCheckbox({
selector: routeDecision.selector,
frameIds,
})
}
else {
await runtime.browserDomBridge.clickSelector({
selector: routeDecision.selector,
frameIds,
})
}
}
catch (browserError) {
// Bridge call failed: fall back to an OS click. routeReason keeps the
// router's original reason here; only routeNote describes the fallback.
executionRoute = 'os_input'
routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} failed (${errorMessageFrom(browserError) ?? 'unknown error'}), fell back to OS input`
osInputResult = await executeOsClick()
}
}
}
else {
// Router chose OS input (or produced no selector): click directly.
osInputResult = await executeOsClick()
}
// A non-empty routeNote means one of the fallbacks above was taken.
const completedIntent = {
...intent,
phase: 'completed' as const,
executionResult: routeNote ? 'fallback' as const : 'success' as const,
executionRoute: `${executionRoute} (${routeReason})`,
}
// The candidate id is passed only on success.
// NOTE(review): presumably this is what records lastClickedCandidateId — confirm in the state manager.
runtime.stateManager.updatePointerIntent(completedIntent, candidateId)
// Human-readable report; falls back to the bare id when the candidate object is missing.
const candidateDesc = candidate ? `${candidate.source} ${candidate.role} "${candidate.label}"` : candidateId
const lines = [
`Clicked: ${candidateDesc}`,
` Snap: ${snap.reason}`,
` Point: (${snap.snappedPoint.x}, ${snap.snappedPoint.y})`,
` Route: ${executionRoute} (${routeReason})`,
` Button: ${button || 'left'}, clicks: ${clickCount ?? 1}`,
]
if (routeNote) {
lines.push(`${routeNote}`)
}
// Surface staleness reported by the snap resolver in its reason text.
if (snap.reason.includes('stale')) {
lines.push(' ⚠ WARNING: Target source is stale. Consider calling desktop_observe again.')
}
return {
summary: lines.join('\n'),
backendResult: {
candidateId,
snapshotId: snapshot.snapshotId,
snap,
candidate,
executionRoute,
routeReason,
routeNote: routeNote || undefined,
osInputResult,
},
}
}
catch (error) {
// Publish a terminal intent marked as an error, then let the caller see the
// original error. No candidate id is passed on this path.
const failedIntent = {
...intent,
phase: 'completed' as const,
executionResult: 'error' as const,
executionRoute: `${executionRoute} (${routeReason})`,
}
runtime.stateManager.updatePointerIntent(failedIntent)
throw error
}
}

View file

@ -45,7 +45,7 @@ function createMockServer() {
}
function createRuntime() {
return {
const runtime = {
config: createTestConfig(),
stateManager: new RunStateManager(),
cdpBridgeManager: {
@ -57,6 +57,9 @@ function createRuntime() {
},
browserDomBridge: {},
executor: {},
session: {
setLastScreenshot: vi.fn(),
},
desktopSessionController: {
getSession: vi.fn().mockReturnValue(undefined),
getSessionInfo: vi.fn().mockReturnValue(undefined),
@ -64,6 +67,12 @@ function createRuntime() {
ensureControlledAppInForeground: vi.fn(),
},
} as unknown as ComputerUseServerRuntime
const executeAction = vi.fn().mockResolvedValue({
content: [{ type: 'text', text: 'executed' }],
})
return { runtime, executeAction }
}
describe('registerDesktopGroundingTools', () => {
@ -71,39 +80,36 @@ describe('registerDesktopGroundingTools', () => {
captureDesktopGroundingMock.mockReset()
})
it('registers desktop_click_target and handles missing candidate gracefully', async () => {
const runtime = createRuntime()
it('registers desktop_click_target through the action executor', async () => {
const { runtime, executeAction } = createRuntime()
const { server, invoke } = createMockServer()
registerDesktopGroundingTools({ server, runtime })
runtime.stateManager.updateGroundingSnapshot({
snapshotId: 'dg_1',
capturedAt: new Date().toISOString(),
foregroundApp: 'Google Chrome',
windows: [],
screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() },
targetCandidates: [],
staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
} as any)
registerDesktopGroundingTools({ server, runtime, executeAction })
const result = await invoke('desktop_click_target', {
candidateId: 't_missing',
candidateId: 't_0',
clickCount: 2,
button: 'right',
})
expect(result.isError).toBe(true)
expect(result.content).toEqual([
expect.objectContaining({ text: expect.stringContaining('Candidate "t_missing" not found in snapshot') }),
])
expect(result.isError).not.toBe(true)
expect(executeAction).toHaveBeenCalledWith({
kind: 'desktop_click_target',
input: {
candidateId: 't_0',
clickCount: 2,
button: 'right',
},
}, 'desktop_click_target')
})
it('returns observe error content when captureDesktopGrounding fails', async () => {
const runtime = createRuntime()
const { runtime, executeAction } = createRuntime()
captureDesktopGroundingMock.mockRejectedValueOnce(new Error('observe boom'))
const { server, invoke } = createMockServer()
registerDesktopGroundingTools({ server, runtime })
registerDesktopGroundingTools({ server, runtime, executeAction })
const result = await invoke('desktop_observe', {})
@ -114,7 +120,7 @@ describe('registerDesktopGroundingTools', () => {
})
it('stores grounding snapshot and returns image content', async () => {
const runtime = createRuntime()
const { runtime, executeAction } = createRuntime()
captureDesktopGroundingMock.mockResolvedValueOnce({
snapshotId: 'dg_new',
capturedAt: new Date().toISOString(),
@ -127,18 +133,29 @@ describe('registerDesktopGroundingTools', () => {
capturedAt: new Date().toISOString(),
width: 1280,
height: 720,
executionTargetMode: 'remote',
sourceHostName: 'fake-remote',
sourceDisplayId: ':99',
sourceSessionTag: 'vm-local-1',
},
targetCandidates: [],
staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
} as any)
const { server, invoke } = createMockServer()
registerDesktopGroundingTools({ server, runtime })
registerDesktopGroundingTools({ server, runtime, executeAction })
const result = await invoke('desktop_observe', {})
const state = runtime.stateManager.getState()
expect(state.lastGroundingSnapshot?.screenshot.dataBase64).toBe('ZmFrZS1wbmc=')
expect(runtime.session.setLastScreenshot).toHaveBeenCalledWith(expect.objectContaining({
path: '/tmp/shot.png',
executionTargetMode: 'remote',
sourceHostName: 'fake-remote',
sourceDisplayId: ':99',
sourceSessionTag: 'vm-local-1',
}))
expect(result.content).toEqual([
expect.objectContaining({ type: 'text' }),
expect.objectContaining({

View file

@ -14,17 +14,15 @@
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
import type { PointerIntent } from '../desktop-grounding-types'
import type { DesktopClickTargetInput } from '../types'
import type { ExecuteAction } from './action-executor'
import type { ComputerUseServerRuntime } from './runtime'
import process from 'node:process'
import { z } from 'zod'
import { decideBrowserAction } from '../browser-action-router'
import { getUnsupportedBrowserDomActions, isBrowserDomActionSupported } from '../browser-dom/capabilities'
import { captureDesktopGrounding, formatGroundingForAgent } from '../desktop-grounding'
import { resolveSnapByCandidate } from '../snap-resolver'
import { sleep } from '../utils/sleep'
import { textContent } from './content'
import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptors/register-helper'
@ -40,8 +38,9 @@ import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptor
export function registerDesktopGroundingTools(params: {
server: McpServer
runtime: ComputerUseServerRuntime
executeAction: ExecuteAction
}) {
const { server, runtime } = params
const { server, runtime, executeAction } = params
// -----------------------------------------------------------------------
// desktop_observe
@ -127,6 +126,7 @@ export function registerDesktopGroundingTools(params: {
// Also update screenshot state so desktop_get_state and other
// tools can see the latest screenshot from this observation
if (snapshot.screenshot && !snapshot.screenshot.placeholder) {
runtime.session.setLastScreenshot(snapshot.screenshot)
runtime.stateManager.updateLastScreenshot({
path: snapshot.screenshot.path || '',
width: snapshot.screenshot.width,
@ -192,187 +192,7 @@ export function registerDesktopGroundingTools(params: {
button: z.enum(['left', 'right', 'middle']).optional().describe('Mouse button (default: left)'),
},
handler: async ({ candidateId, clickCount, button }) => {
try {
const state = runtime.stateManager.getState()
// Validate: must have a recent grounding snapshot
if (!state.lastGroundingSnapshot) {
return {
content: [textContent('ERROR: No desktop_observe snapshot available. Call desktop_observe first to get a list of target candidates.')],
isError: true,
}
}
const snapshot = state.lastGroundingSnapshot
// Session: ensure the controlled app is still in foreground before clicking
const sessionCtrl = runtime.desktopSessionController
const activeSession = sessionCtrl.getSession()
if (activeSession?.controlledApp) {
const currentForeground = await runtime.executor.getForegroundContext()
const wasAlreadyInFront = await sessionCtrl.ensureControlledAppInForeground({
currentForeground,
chromeSessionManager: runtime.chromeSessionManager,
activateApp: async (appName) => {
await runtime.executor.focusApp({ app: appName })
},
})
if (!wasAlreadyInFront) {
await sleep(200)
}
sessionCtrl.touch()
}
// Validate: check for duplicate clicks on same candidate without re-observe
if (state.lastClickedCandidateId === candidateId) {
return {
content: [textContent(`WARNING: You already clicked candidate "${candidateId}" without calling desktop_observe again. Call desktop_observe to refresh the state before clicking the same target.`)],
isError: true,
}
}
// Validate: check snapshot staleness (>5s)
const snapshotAge = Date.now() - new Date(snapshot.capturedAt).getTime()
if (snapshotAge > 5000) {
return {
content: [textContent(`WARNING: Grounding snapshot "${snapshot.snapshotId}" is ${Math.round(snapshotAge / 1000)}s old. Call desktop_observe to get a fresh snapshot before clicking.`)],
isError: true,
}
}
// Resolve snap
const snap = resolveSnapByCandidate(candidateId, snapshot)
if (snap.source === 'none' && !snap.candidateId) {
return {
content: [textContent(`ERROR: Candidate "${candidateId}" not found in snapshot "${snapshot.snapshotId}". Available candidates: ${snapshot.targetCandidates.map(c => c.id).join(', ')}`)],
isError: true,
}
}
// Build pointer intent
const intent: PointerIntent = {
mode: 'execute',
candidateId,
rawPoint: snap.rawPoint,
snappedPoint: snap.snappedPoint,
source: snap.source,
confidence: snapshot.targetCandidates.find(c => c.id === candidateId)?.confidence ?? 0,
path: [
{ x: snap.snappedPoint.x, y: snap.snappedPoint.y, delayMs: 0 },
],
}
// Update RunState — pointer intent + clicked candidate (phase: executing)
intent.phase = 'executing'
runtime.stateManager.updatePointerIntent(intent)
// Route the click: browser-dom for chrome_dom candidates, OS input for everything else
const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)
const bridgeConnected = runtime.browserDomBridge?.getStatus().connected ?? false
const routeDecision = candidate
? decideBrowserAction(candidate, bridgeConnected, button, clickCount)
: { route: 'os_input' as const, reason: 'candidate not found' }
let executionRoute = routeDecision.route
let routeNote = ''
let routeReason = routeDecision.reason
if (routeDecision.route === 'browser_dom' && routeDecision.selector) {
const requiredActions = routeDecision.bridgeMethod === 'checkCheckbox'
? ['checkCheckbox']
: ['getClickTarget', 'clickAt']
if (!isBrowserDomActionSupported(runtime.browserDomBridge, ...requiredActions)) {
executionRoute = 'os_input'
routeReason = `browser-dom extension transport does not support ${requiredActions.join(' + ')}`
routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} is unavailable on the connected extension transport (${getUnsupportedBrowserDomActions(runtime.browserDomBridge, ...requiredActions).join(', ')} unsupported), fell back to OS input`
await runtime.executor.click({
x: snap.snappedPoint.x,
y: snap.snappedPoint.y,
button: button || 'left',
clickCount: clickCount ?? 1,
pointerTrace: intent.path,
})
}
else {
// Try browser-dom bridge action first, dispatching by method
try {
const frameIds = routeDecision.frameId !== undefined ? [routeDecision.frameId] : undefined
if (routeDecision.bridgeMethod === 'checkCheckbox') {
await runtime.browserDomBridge!.checkCheckbox({
selector: routeDecision.selector,
frameIds,
})
}
else {
await runtime.browserDomBridge!.clickSelector({
selector: routeDecision.selector,
frameIds,
})
}
}
catch (browserError) {
// Fallback to OS input on browser-dom failure
executionRoute = 'os_input'
routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} failed (${browserError instanceof Error ? browserError.message : String(browserError)}), fell back to OS input`
await runtime.executor.click({
x: snap.snappedPoint.x,
y: snap.snappedPoint.y,
button: button || 'left',
clickCount: clickCount ?? 1,
pointerTrace: intent.path,
})
}
}
}
else {
// OS-level click (existing path)
await runtime.executor.click({
x: snap.snappedPoint.x,
y: snap.snappedPoint.y,
button: button || 'left',
clickCount: clickCount ?? 1,
pointerTrace: intent.path,
})
}
// Phase: completed — update ghost pointer state for overlay fadeout
intent.phase = 'completed'
intent.executionResult = routeNote ? 'fallback' : 'success'
intent.executionRoute = `${executionRoute} (${routeReason})`
runtime.stateManager.updatePointerIntent(intent, candidateId)
const candidateDesc = candidate ? `${candidate.source} ${candidate.role} "${candidate.label}"` : candidateId
const lines = [
`Clicked: ${candidateDesc}`,
` Snap: ${snap.reason}`,
` Point: (${snap.snappedPoint.x}, ${snap.snappedPoint.y})`,
` Route: ${executionRoute} (${routeReason})`,
` Button: ${button || 'left'}, clicks: ${clickCount ?? 1}`,
]
if (routeNote) {
lines.push(`${routeNote}`)
}
if (snap.reason.includes('stale')) {
lines.push(' ⚠ WARNING: Target source is stale. Consider calling desktop_observe again.')
}
return {
content: [textContent(lines.join('\n'))],
}
}
catch (error) {
const message = error instanceof Error ? error.message : String(error)
return {
content: [textContent(`desktop_click_target failed: ${message}`)],
isError: true,
}
}
},
handler: async (input: DesktopClickTargetInput) =>
executeAction({ kind: 'desktop_click_target', input }, 'desktop_click_target'),
})
}