mirror of
https://github.com/moeru-ai/airi.git
synced 2026-04-28 06:29:33 +00:00
fix(computer-use-mcp): route target clicks through action executor (#1727)
--------- Co-authored-by-agent: Antigravity <antigravity@gemini.com>
This commit is contained in:
parent
a9359a3924
commit
2e9020a06f
8 changed files with 545 additions and 214 deletions
3
pnpm-lock.yaml
generated
3
pnpm-lock.yaml
generated
|
|
@ -3989,6 +3989,9 @@ importers:
|
|||
'@modelcontextprotocol/sdk':
|
||||
specifier: 'catalog:'
|
||||
version: 1.29.0(@cfworker/json-schema@4.1.1)(zod@4.3.6)
|
||||
'@moeru/std':
|
||||
specifier: 'catalog:'
|
||||
version: 0.1.0-beta.17
|
||||
node-pty:
|
||||
specifier: 'catalog:'
|
||||
version: 1.1.0
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@
|
|||
},
|
||||
"dependencies": {
|
||||
"@modelcontextprotocol/sdk": "catalog:",
|
||||
"@moeru/std": "catalog:",
|
||||
"node-pty": "catalog:",
|
||||
"ws": "^8.20.0",
|
||||
"zod": "^4.3.6"
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ export async function createComputerUseMcpServer(config = resolveComputerUseConf
|
|||
},
|
||||
})
|
||||
const cdpCleanup = registerCdpTools({ server, runtime })
|
||||
registerDesktopGroundingTools({ server, runtime })
|
||||
registerDesktopGroundingTools({ server, runtime, executeAction })
|
||||
registerChromeSessionTools({ server, runtime })
|
||||
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import type { ComputerUseConfig } from '../types'
|
||||
import type { ComputerUseServerRuntime } from './runtime'
|
||||
|
||||
import { describe, expect, it, vi } from 'vitest'
|
||||
|
|
@ -6,7 +7,7 @@ import { RunStateManager } from '../state'
|
|||
import { createDisplayInfo, createLocalExecutionTarget, createTerminalState, createTestConfig } from '../test-fixtures'
|
||||
import { createExecuteAction } from './action-executor'
|
||||
|
||||
function createRuntimeForActionTest() {
|
||||
function createRuntimeForActionTest(configOverrides: Partial<ComputerUseConfig> = {}) {
|
||||
const stateManager = new RunStateManager()
|
||||
const session = {
|
||||
listPendingActions: vi.fn().mockReturnValue([]),
|
||||
|
|
@ -51,6 +52,11 @@ function createRuntimeForActionTest() {
|
|||
scroll: vi.fn(),
|
||||
wait: vi.fn(),
|
||||
}
|
||||
const desktopSessionController = {
|
||||
getSession: vi.fn().mockReturnValue(null),
|
||||
ensureControlledAppInForeground: vi.fn(),
|
||||
touch: vi.fn(),
|
||||
}
|
||||
const terminalRunner = {
|
||||
describe: () => ({ kind: 'local-shell-runner' as const, notes: [] }),
|
||||
execute: vi.fn(),
|
||||
|
|
@ -79,6 +85,7 @@ function createRuntimeForActionTest() {
|
|||
executor: 'dry-run',
|
||||
approvalMode: 'never',
|
||||
defaultCaptureAfter: false,
|
||||
...configOverrides,
|
||||
}),
|
||||
session,
|
||||
executor,
|
||||
|
|
@ -87,6 +94,8 @@ function createRuntimeForActionTest() {
|
|||
cdpBridgeManager,
|
||||
stateManager,
|
||||
taskMemory: {},
|
||||
desktopSessionController,
|
||||
chromeSessionManager: {},
|
||||
} as unknown as ComputerUseServerRuntime
|
||||
|
||||
return {
|
||||
|
|
@ -95,10 +104,255 @@ function createRuntimeForActionTest() {
|
|||
executor,
|
||||
cdpBridgeManager,
|
||||
stateManager,
|
||||
desktopSessionController,
|
||||
}
|
||||
}
|
||||
|
||||
describe('createExecuteAction', () => {
|
||||
/**
 * Seeds `stateManager` with a freshly captured grounding snapshot (`dg_1`) that
 * contains a single interactable AX "Submit" button candidate (`t_0`) owned by
 * Google Chrome. The candidate bounds are x=100, y=200, 80x30; the tests below
 * expect the resulting click to land at (140, 215).
 */
function seedSubmitButtonSnapshot(stateManager: RunStateManager) {
  stateManager.updateGroundingSnapshot({
    snapshotId: 'dg_1',
    capturedAt: new Date().toISOString(),
    foregroundApp: 'Google Chrome',
    windows: [],
    screenshot: {
      dataBase64: '',
      mimeType: 'image/png',
      path: '',
      capturedAt: new Date().toISOString(),
    },
    targetCandidates: [
      {
        id: 't_0',
        source: 'ax',
        appName: 'Google Chrome',
        role: 'AXButton',
        label: 'Submit',
        bounds: { x: 100, y: 200, width: 80, height: 30 },
        confidence: 0.95,
        interactable: true,
      },
    ],
    staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
  // The fixture is cast exactly as the inline fixtures were; it is not claimed
  // to satisfy the full snapshot type.
  } as any)
}

/** Builds the desktop-session fixture whose controlled app is Google Chrome. */
function createChromeDesktopSession() {
  return {
    id: 'ds_1',
    controlledApp: 'Google Chrome',
    ownedWindows: [],
    createdAt: new Date().toISOString(),
    lastActiveAt: new Date().toISOString(),
  }
}

it('executes desktop_click_target through the shared policy and audit pipeline', async () => {
  const { runtime, executor, session, stateManager } = createRuntimeForActionTest()
  seedSubmitButtonSnapshot(stateManager)

  const executeAction = createExecuteAction(runtime)
  const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')

  expect(result.isError).not.toBe(true)
  expect(executor.click).toHaveBeenCalledWith(expect.objectContaining({
    x: 140,
    y: 215,
    button: 'left',
    clickCount: 1,
    pointerTrace: [{ x: 140, y: 215, delayMs: 0 }],
  }))
  expect(session.consumeOperation).toHaveBeenCalledWith(1)
  expect(session.setPointerPosition).toHaveBeenCalledWith({ x: 140, y: 215 })
  expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'executed',
    toolName: 'desktop_click_target',
    action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
  }))
  expect(stateManager.getState().lastClickedCandidateId).toBe('t_0')
  expect(stateManager.getState().lastPointerIntent).toMatchObject({
    candidateId: 't_0',
    phase: 'completed',
    executionResult: 'success',
  })
  expect(result.content.find(item => item.type === 'text')?.text).toContain('Clicked: ax AXButton "Submit"')
})

it('queues desktop_click_target without refocusing when approval is required', async () => {
  const { runtime, executor, session, desktopSessionController } = createRuntimeForActionTest({ approvalMode: 'all' })
  desktopSessionController.getSession.mockReturnValue(createChromeDesktopSession())
  // The real foreground app differs from the controlled app on purpose: the
  // pending action must still be recorded against the controlled app.
  executor.getForegroundContext.mockResolvedValue({
    available: true,
    appName: 'AIRI',
    platform: 'darwin',
  })
  session.createPendingAction.mockReturnValue({
    id: 'pa_1',
    createdAt: new Date().toISOString(),
    toolName: 'desktop_click_target',
    action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
    context: {
      available: true,
      appName: 'Google Chrome',
      platform: 'darwin',
    },
    policy: {
      allowed: true,
      requiresApproval: true,
      reasons: [],
      riskLevel: 'medium',
      estimatedOperationUnits: 1,
    },
  })

  const executeAction = createExecuteAction(runtime)
  const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')

  expect(result.structuredContent).toMatchObject({
    status: 'approval_required',
    pendingActionId: 'pa_1',
    action: {
      kind: 'desktop_click_target',
      input: { candidateId: 't_0' },
    },
  })
  expect(session.createPendingAction).toHaveBeenCalledWith(expect.objectContaining({
    toolName: 'desktop_click_target',
    action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
    context: expect.objectContaining({ appName: 'Google Chrome' }),
  }))
  expect(desktopSessionController.ensureControlledAppInForeground).not.toHaveBeenCalled()
  expect(executor.click).not.toHaveBeenCalled()
  expect(session.consumeOperation).not.toHaveBeenCalled()
})

it('uses controlled-app context for desktop_click_target policy and refocuses only during execution', async () => {
  const { runtime, executor, session, stateManager, desktopSessionController } = createRuntimeForActionTest()
  seedSubmitButtonSnapshot(stateManager)
  desktopSessionController.getSession.mockReturnValue(createChromeDesktopSession())
  desktopSessionController.ensureControlledAppInForeground.mockResolvedValue(true)
  executor.getForegroundContext.mockResolvedValue({
    available: true,
    appName: 'AIRI',
    platform: 'darwin',
  })

  const executeAction = createExecuteAction(runtime)
  const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')

  expect(result.isError).not.toBe(true)
  expect(desktopSessionController.ensureControlledAppInForeground).toHaveBeenCalledWith(expect.objectContaining({
    currentForeground: expect.objectContaining({ appName: 'AIRI' }),
  }))
  expect(executor.click).toHaveBeenCalledOnce()
  expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'executed',
    context: expect.objectContaining({ appName: 'Google Chrome' }),
    policy: expect.objectContaining({ allowed: true }),
  }))
  expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'requested',
    result: expect.objectContaining({
      actualForegroundContext: expect.objectContaining({ appName: 'AIRI' }),
    }),
  }))
})

it('returns a structured failure when controlled-app refocus fails during desktop_click_target execution', async () => {
  const { runtime, executor, session, stateManager, desktopSessionController } = createRuntimeForActionTest()
  seedSubmitButtonSnapshot(stateManager)
  desktopSessionController.getSession.mockReturnValue(createChromeDesktopSession())
  desktopSessionController.ensureControlledAppInForeground.mockRejectedValue(new Error('Chrome session unavailable'))
  executor.getForegroundContext.mockResolvedValue({
    available: true,
    appName: 'AIRI',
    platform: 'darwin',
  })

  const executeAction = createExecuteAction(runtime)
  const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')

  expect(result.isError).toBe(true)
  expect(result.content.find(item => item.type === 'text')?.text).toContain('Chrome session unavailable')
  expect(executor.click).not.toHaveBeenCalled()
  expect(session.consumeOperation).not.toHaveBeenCalled()
  expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'failed',
    toolName: 'desktop_click_target',
    context: expect.objectContaining({ appName: 'Google Chrome' }),
    result: expect.objectContaining({ error: 'Chrome session unavailable' }),
  }))
})

it('fails desktop_click_target before consuming budget when no observe snapshot exists', async () => {
  // No snapshot is seeded here: the executor must reject before any click.
  const { runtime, executor, session } = createRuntimeForActionTest()

  const executeAction = createExecuteAction(runtime)
  const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_missing' } }, 'desktop_click_target')

  expect(result.isError).toBe(true)
  expect(result.content.find(item => item.type === 'text')?.text).toContain('No desktop_observe snapshot available')
  expect(executor.click).not.toHaveBeenCalled()
  expect(session.consumeOperation).not.toHaveBeenCalled()
  expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
    event: 'failed',
    toolName: 'desktop_click_target',
    action: { kind: 'desktop_click_target', input: { candidateId: 't_missing' } },
  }))
})
|
||||
|
||||
it('refreshes browser surface availability for direct actions before evaluating strategy', async () => {
|
||||
const { runtime, cdpBridgeManager } = createRuntimeForActionTest()
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import type {
|
|||
ActionInvocation,
|
||||
ComputerUseConfig,
|
||||
DesktopExecutor,
|
||||
ForegroundContext,
|
||||
PolicyDecision,
|
||||
ScreenshotArtifact,
|
||||
TerminalCommandResult,
|
||||
|
|
@ -33,6 +34,7 @@ import {
|
|||
maskEnvValuePreview,
|
||||
readEnvValue,
|
||||
} from '../utils/env-file'
|
||||
import { executeDesktopClickTarget } from './desktop-grounding-actions'
|
||||
import { describeExecutionTarget } from './formatters'
|
||||
import { refreshRuntimeRunState } from './refresh-run-state'
|
||||
import {
|
||||
|
|
@ -117,10 +119,41 @@ function toTerminalStateContent(state: TerminalState) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Chooses the foreground context that policy evaluation should use for an action.
 *
 * For every action other than `desktop_click_target`, and whenever no desktop
 * session with a controlled app is active, the observed context is returned as-is.
 * For `desktop_click_target` with an active controlled app, the context is replaced
 * by one that names the controlled app, unless that app is already the observed
 * foreground app.
 *
 * The observed context is returned by reference when nothing is substituted;
 * the caller compares by identity to detect whether a substitution happened.
 *
 * @param params.action - The (normalized) action about to be evaluated.
 * @param params.actualContext - The foreground context actually observed.
 * @param params.runtime - Runtime providing the desktop session controller.
 * @returns `params.actualContext` itself, or a new context naming the controlled app.
 */
function getPolicyEvaluationContext(params: {
  action: ActionInvocation
  actualContext: ForegroundContext
  runtime: ComputerUseServerRuntime
}): ForegroundContext {
  const { action, actualContext, runtime } = params

  // Only target clicks are evaluated against the controlled app; the session is
  // not consulted at all for other action kinds.
  const controlledApp = action.kind === 'desktop_click_target'
    ? runtime.desktopSessionController.getSession()?.controlledApp
    : undefined

  if (!controlledApp) {
    return actualContext
  }

  const controlledAppIsForeground = actualContext.available && actualContext.appName === controlledApp
  if (controlledAppIsForeground) {
    return actualContext
  }

  return {
    available: true,
    appName: controlledApp,
    platform: actualContext.platform,
  }
}
|
||||
|
||||
export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteAction {
|
||||
return async (action, toolName, options = {}) => {
|
||||
const normalizedAction = normalizeConfiguredAppAction(action, runtime.config.openableApps)
|
||||
const { executionTarget, context, displayInfo } = await refreshRuntimeRunState(runtime)
|
||||
const { executionTarget, context: actualContext, displayInfo } = await refreshRuntimeRunState(runtime)
|
||||
const context = getPolicyEvaluationContext({
|
||||
action: normalizedAction,
|
||||
actualContext,
|
||||
runtime,
|
||||
})
|
||||
const actualForegroundContext = context === actualContext ? undefined : actualContext
|
||||
|
||||
const budget = runtime.session.getBudgetState()
|
||||
const preflight = getRuntimePreflight({
|
||||
|
|
@ -159,6 +192,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
|
|||
executionTarget,
|
||||
displayInfo,
|
||||
coordinateSpace: preflight.coordinateSpace,
|
||||
actualForegroundContext,
|
||||
},
|
||||
})
|
||||
|
||||
|
|
@ -255,6 +289,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
|
|||
let backendResult: Record<string, unknown> = {}
|
||||
let clipboardStructuredContent: Record<string, unknown> | undefined
|
||||
let secretStructuredContent: Record<string, unknown> | undefined
|
||||
let summaryOverride: string | undefined
|
||||
|
||||
switch (normalizedAction.kind) {
|
||||
case 'screenshot': {
|
||||
|
|
@ -515,6 +550,12 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
|
|||
}
|
||||
break
|
||||
}
|
||||
case 'desktop_click_target': {
|
||||
const result = await executeDesktopClickTarget(runtime, normalizedAction.input)
|
||||
backendResult = result.backendResult
|
||||
summaryOverride = result.summary
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
runtime.session.consumeOperation(decision.estimatedOperationUnits)
|
||||
|
|
@ -562,7 +603,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
|
|||
})
|
||||
|
||||
return buildSuccessResponse({
|
||||
summary: `${intent} ${outcome}${advisorySummary ? ` Strategy: ${advisorySummary}` : ''}`,
|
||||
summary: summaryOverride ?? `${intent} ${outcome}${advisorySummary ? ` Strategy: ${advisorySummary}` : ''}`,
|
||||
screenshot,
|
||||
structuredContent: {
|
||||
status: 'executed',
|
||||
|
|
|
|||
|
|
@ -0,0 +1,195 @@
|
|||
import type { ExecutorActionResult } from '../types'
|
||||
import type { ComputerUseServerRuntime } from './runtime'
|
||||
|
||||
import { errorMessageFrom } from '@moeru/std'
|
||||
|
||||
import { decideBrowserAction } from '../browser-action-router'
|
||||
import { getUnsupportedBrowserDomActions, isBrowserDomActionSupported } from '../browser-dom/capabilities'
|
||||
import { resolveSnapByCandidate } from '../snap-resolver'
|
||||
import { sleep } from '../utils/sleep'
|
||||
|
||||
// Maximum age, in milliseconds, of the last grounding snapshot that
// executeDesktopClickTarget will still click against. Older snapshots are
// rejected with an error asking the caller to run desktop_observe again.
const DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS = 5000
|
||||
|
||||
/**
 * Outcome of a successful `executeDesktopClickTarget` call.
 */
export interface DesktopClickTargetExecution {
  /** Human-readable, multi-line report of the click (target, snap, point, route, button). */
  summary: string
  /** Structured details of the click (candidate, snapshot id, route, OS input result, ...). */
  backendResult: Record<string, unknown>
}
|
||||
|
||||
/**
 * Brings the desktop session's controlled app to the foreground before a click.
 *
 * Does nothing when there is no active session or the session has no controlled
 * app. Errors from the session controller or the executor propagate to the caller.
 */
async function refocusControlledApp(runtime: ComputerUseServerRuntime): Promise<void> {
  const sessionCtrl = runtime.desktopSessionController
  const activeSession = sessionCtrl.getSession()
  if (!activeSession?.controlledApp) {
    return
  }

  const currentForeground = await runtime.executor.getForegroundContext()
  const wasAlreadyInFront = await sessionCtrl.ensureControlledAppInForeground({
    currentForeground,
    chromeSessionManager: runtime.chromeSessionManager,
    activateApp: async (appName) => {
      await runtime.executor.focusApp({ app: appName })
    },
  })
  if (!wasAlreadyInFront) {
    // NOTE(review): presumably gives the newly activated app time to settle
    // before the click is sent — confirm the 200ms value with the author.
    await sleep(200)
  }
  sessionCtrl.touch()
}

/**
 * Clicks a target candidate from the most recent `desktop_observe` snapshot.
 *
 * Steps, in order:
 * 1. Validate that a snapshot exists, that the same candidate was not just
 *    clicked, that the snapshot is fresh, and that the candidate resolves.
 * 2. Refocus the controlled app of the active desktop session, if any.
 * 3. Record an `executing` pointer intent.
 * 4. Click via the browser-DOM bridge when the router selects it and the bridge
 *    supports the required actions; otherwise, or when the bridge call fails,
 *    click via OS input at the snapped point.
 * 5. Record a `completed` pointer intent and return a summary plus details.
 *
 * @param runtime - Server runtime providing state, executor, session and bridges.
 * @param input - Candidate id plus optional click count (default 1) and mouse
 *   button (default `'left'`).
 * @returns A human-readable summary and the structured backend result.
 * @throws Error when no snapshot exists, the candidate was already clicked since
 *   the last observe, the snapshot is stale or has an unreadable timestamp, or
 *   the candidate is not in the snapshot. Refocus errors propagate unchanged.
 *   Errors from the OS click are rethrown after the pointer intent is marked
 *   as `error`.
 */
export async function executeDesktopClickTarget(
  runtime: ComputerUseServerRuntime,
  input: {
    candidateId: string
    clickCount?: number
    button?: 'left' | 'right' | 'middle'
  },
): Promise<DesktopClickTargetExecution> {
  const { candidateId, clickCount, button } = input
  // Resolve the defaults once so the click call and the summary always agree.
  const resolvedButton = button ?? 'left'
  const resolvedClickCount = clickCount ?? 1
  const state = runtime.stateManager.getState()

  if (!state.lastGroundingSnapshot) {
    throw new Error('No desktop_observe snapshot available. Call desktop_observe first to get a list of target candidates.')
  }

  const snapshot = state.lastGroundingSnapshot

  if (state.lastClickedCandidateId === candidateId) {
    throw new Error(`You already clicked candidate "${candidateId}" without calling desktop_observe again. Call desktop_observe to refresh the state before clicking the same target.`)
  }

  const snapshotAge = Date.now() - new Date(snapshot.capturedAt).getTime()
  // An unparseable capturedAt yields NaN, and `NaN > max` is false, which would
  // let the snapshot through as if it were fresh. Fail closed instead.
  if (Number.isNaN(snapshotAge)) {
    throw new TypeError(`Grounding snapshot "${snapshot.snapshotId}" has an unreadable capturedAt timestamp. Call desktop_observe to get a fresh snapshot before clicking.`)
  }
  if (snapshotAge > DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS) {
    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" is ${Math.round(snapshotAge / 1000)}s old. Call desktop_observe to get a fresh snapshot before clicking.`)
  }

  const snap = resolveSnapByCandidate(candidateId, snapshot)
  if (snap.source === 'none' && !snap.candidateId) {
    throw new Error(`Candidate "${candidateId}" not found in snapshot "${snapshot.snapshotId}". Available candidates: ${snapshot.targetCandidates.map(c => c.id).join(', ')}`)
  }

  // Refocus only after validation so a rejected request never steals focus.
  await refocusControlledApp(runtime)

  const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)
  const intent = {
    mode: 'execute' as const,
    candidateId,
    rawPoint: snap.rawPoint,
    snappedPoint: snap.snappedPoint,
    source: snap.source,
    confidence: candidate?.confidence ?? 0,
    path: [
      { x: snap.snappedPoint.x, y: snap.snappedPoint.y, delayMs: 0 },
    ],
    phase: 'executing' as const,
  }

  runtime.stateManager.updatePointerIntent(intent)

  let executionRoute = 'os_input'
  let routeNote = ''
  let routeReason = 'candidate not found'
  let osInputResult: ExecutorActionResult | undefined

  // Clicks at the snapped point through the OS executor and records the new
  // pointer position on the session.
  const executeOsClick = async () => {
    const result = await runtime.executor.click({
      x: snap.snappedPoint.x,
      y: snap.snappedPoint.y,
      button: resolvedButton,
      clickCount: resolvedClickCount,
      pointerTrace: intent.path,
    })
    runtime.session.setPointerPosition({ x: snap.snappedPoint.x, y: snap.snappedPoint.y })
    return result
  }

  try {
    const bridgeConnected = runtime.browserDomBridge?.getStatus().connected ?? false
    const routeDecision = candidate
      ? decideBrowserAction(candidate, bridgeConnected, button, clickCount)
      : { route: 'os_input' as const, reason: 'candidate not found' }

    executionRoute = routeDecision.route
    routeReason = routeDecision.reason

    if (routeDecision.route === 'browser_dom' && routeDecision.selector) {
      const requiredActions = routeDecision.bridgeMethod === 'checkCheckbox'
        ? ['checkCheckbox']
        : ['getClickTarget', 'clickAt']

      if (!isBrowserDomActionSupported(runtime.browserDomBridge, ...requiredActions)) {
        // The connected transport cannot perform the DOM action: use OS input.
        executionRoute = 'os_input'
        routeReason = `browser-dom extension transport does not support ${requiredActions.join(' + ')}`
        routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} is unavailable on the connected extension transport (${getUnsupportedBrowserDomActions(runtime.browserDomBridge, ...requiredActions).join(', ')} unsupported), fell back to OS input`
        osInputResult = await executeOsClick()
      }
      else {
        try {
          const frameIds = routeDecision.frameId !== undefined ? [routeDecision.frameId] : undefined
          if (routeDecision.bridgeMethod === 'checkCheckbox') {
            await runtime.browserDomBridge.checkCheckbox({
              selector: routeDecision.selector,
              frameIds,
            })
          }
          else {
            await runtime.browserDomBridge.clickSelector({
              selector: routeDecision.selector,
              frameIds,
            })
          }
        }
        catch (browserError) {
          // A failed bridge call is not fatal: retry the click through OS input
          // and surface the bridge error in the route note.
          executionRoute = 'os_input'
          routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} failed (${errorMessageFrom(browserError) ?? 'unknown error'}), fell back to OS input`
          osInputResult = await executeOsClick()
        }
      }
    }
    else {
      osInputResult = await executeOsClick()
    }

    const completedIntent = {
      ...intent,
      phase: 'completed' as const,
      // Any route note means the preferred route was abandoned for OS input.
      executionResult: routeNote ? 'fallback' as const : 'success' as const,
      executionRoute: `${executionRoute} (${routeReason})`,
    }
    runtime.stateManager.updatePointerIntent(completedIntent, candidateId)

    const candidateDesc = candidate ? `${candidate.source} ${candidate.role} "${candidate.label}"` : candidateId
    const lines = [
      `Clicked: ${candidateDesc}`,
      `  Snap: ${snap.reason}`,
      `  Point: (${snap.snappedPoint.x}, ${snap.snappedPoint.y})`,
      `  Route: ${executionRoute} (${routeReason})`,
      `  Button: ${resolvedButton}, clicks: ${resolvedClickCount}`,
    ]

    if (routeNote) {
      lines.push(`  ⚠ ${routeNote}`)
    }

    if (snap.reason.includes('stale')) {
      lines.push('  ⚠ WARNING: Target source is stale. Consider calling desktop_observe again.')
    }

    return {
      summary: lines.join('\n'),
      backendResult: {
        candidateId,
        snapshotId: snapshot.snapshotId,
        snap,
        candidate,
        executionRoute,
        routeReason,
        routeNote: routeNote || undefined,
        osInputResult,
      },
    }
  }
  catch (error) {
    // Mark the intent as failed before rethrowing. The candidate id is not
    // passed here, unlike on the success path.
    const failedIntent = {
      ...intent,
      phase: 'completed' as const,
      executionResult: 'error' as const,
      executionRoute: `${executionRoute} (${routeReason})`,
    }
    runtime.stateManager.updatePointerIntent(failedIntent)
    throw error
  }
}
|
||||
|
|
@ -45,7 +45,7 @@ function createMockServer() {
|
|||
}
|
||||
|
||||
function createRuntime() {
|
||||
return {
|
||||
const runtime = {
|
||||
config: createTestConfig(),
|
||||
stateManager: new RunStateManager(),
|
||||
cdpBridgeManager: {
|
||||
|
|
@ -57,6 +57,9 @@ function createRuntime() {
|
|||
},
|
||||
browserDomBridge: {},
|
||||
executor: {},
|
||||
session: {
|
||||
setLastScreenshot: vi.fn(),
|
||||
},
|
||||
desktopSessionController: {
|
||||
getSession: vi.fn().mockReturnValue(undefined),
|
||||
getSessionInfo: vi.fn().mockReturnValue(undefined),
|
||||
|
|
@ -64,6 +67,12 @@ function createRuntime() {
|
|||
ensureControlledAppInForeground: vi.fn(),
|
||||
},
|
||||
} as unknown as ComputerUseServerRuntime
|
||||
|
||||
const executeAction = vi.fn().mockResolvedValue({
|
||||
content: [{ type: 'text', text: 'executed' }],
|
||||
})
|
||||
|
||||
return { runtime, executeAction }
|
||||
}
|
||||
|
||||
describe('registerDesktopGroundingTools', () => {
|
||||
|
|
@ -71,39 +80,36 @@ describe('registerDesktopGroundingTools', () => {
|
|||
captureDesktopGroundingMock.mockReset()
|
||||
})
|
||||
|
||||
it('registers desktop_click_target and handles missing candidate gracefully', async () => {
|
||||
const runtime = createRuntime()
|
||||
it('registers desktop_click_target through the action executor', async () => {
|
||||
const { runtime, executeAction } = createRuntime()
|
||||
|
||||
const { server, invoke } = createMockServer()
|
||||
|
||||
registerDesktopGroundingTools({ server, runtime })
|
||||
|
||||
runtime.stateManager.updateGroundingSnapshot({
|
||||
snapshotId: 'dg_1',
|
||||
capturedAt: new Date().toISOString(),
|
||||
foregroundApp: 'Google Chrome',
|
||||
windows: [],
|
||||
screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() },
|
||||
targetCandidates: [],
|
||||
staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
|
||||
} as any)
|
||||
registerDesktopGroundingTools({ server, runtime, executeAction })
|
||||
|
||||
const result = await invoke('desktop_click_target', {
|
||||
candidateId: 't_missing',
|
||||
candidateId: 't_0',
|
||||
clickCount: 2,
|
||||
button: 'right',
|
||||
})
|
||||
|
||||
expect(result.isError).toBe(true)
|
||||
expect(result.content).toEqual([
|
||||
expect.objectContaining({ text: expect.stringContaining('Candidate "t_missing" not found in snapshot') }),
|
||||
])
|
||||
expect(result.isError).not.toBe(true)
|
||||
expect(executeAction).toHaveBeenCalledWith({
|
||||
kind: 'desktop_click_target',
|
||||
input: {
|
||||
candidateId: 't_0',
|
||||
clickCount: 2,
|
||||
button: 'right',
|
||||
},
|
||||
}, 'desktop_click_target')
|
||||
})
|
||||
|
||||
it('returns observe error content when captureDesktopGrounding fails', async () => {
|
||||
const runtime = createRuntime()
|
||||
const { runtime, executeAction } = createRuntime()
|
||||
captureDesktopGroundingMock.mockRejectedValueOnce(new Error('observe boom'))
|
||||
|
||||
const { server, invoke } = createMockServer()
|
||||
registerDesktopGroundingTools({ server, runtime })
|
||||
registerDesktopGroundingTools({ server, runtime, executeAction })
|
||||
|
||||
const result = await invoke('desktop_observe', {})
|
||||
|
||||
|
|
@ -114,7 +120,7 @@ describe('registerDesktopGroundingTools', () => {
|
|||
})
|
||||
|
||||
it('stores grounding snapshot and returns image content', async () => {
|
||||
const runtime = createRuntime()
|
||||
const { runtime, executeAction } = createRuntime()
|
||||
captureDesktopGroundingMock.mockResolvedValueOnce({
|
||||
snapshotId: 'dg_new',
|
||||
capturedAt: new Date().toISOString(),
|
||||
|
|
@ -127,18 +133,29 @@ describe('registerDesktopGroundingTools', () => {
|
|||
capturedAt: new Date().toISOString(),
|
||||
width: 1280,
|
||||
height: 720,
|
||||
executionTargetMode: 'remote',
|
||||
sourceHostName: 'fake-remote',
|
||||
sourceDisplayId: ':99',
|
||||
sourceSessionTag: 'vm-local-1',
|
||||
},
|
||||
targetCandidates: [],
|
||||
staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
|
||||
} as any)
|
||||
|
||||
const { server, invoke } = createMockServer()
|
||||
registerDesktopGroundingTools({ server, runtime })
|
||||
registerDesktopGroundingTools({ server, runtime, executeAction })
|
||||
|
||||
const result = await invoke('desktop_observe', {})
|
||||
const state = runtime.stateManager.getState()
|
||||
|
||||
expect(state.lastGroundingSnapshot?.screenshot.dataBase64).toBe('ZmFrZS1wbmc=')
|
||||
expect(runtime.session.setLastScreenshot).toHaveBeenCalledWith(expect.objectContaining({
|
||||
path: '/tmp/shot.png',
|
||||
executionTargetMode: 'remote',
|
||||
sourceHostName: 'fake-remote',
|
||||
sourceDisplayId: ':99',
|
||||
sourceSessionTag: 'vm-local-1',
|
||||
}))
|
||||
expect(result.content).toEqual([
|
||||
expect.objectContaining({ type: 'text' }),
|
||||
expect.objectContaining({
|
||||
|
|
|
|||
|
|
@ -14,17 +14,15 @@
|
|||
|
||||
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
|
||||
|
||||
import type { PointerIntent } from '../desktop-grounding-types'
|
||||
import type { DesktopClickTargetInput } from '../types'
|
||||
import type { ExecuteAction } from './action-executor'
|
||||
import type { ComputerUseServerRuntime } from './runtime'
|
||||
|
||||
import process from 'node:process'
|
||||
|
||||
import { z } from 'zod'
|
||||
|
||||
import { decideBrowserAction } from '../browser-action-router'
|
||||
import { getUnsupportedBrowserDomActions, isBrowserDomActionSupported } from '../browser-dom/capabilities'
|
||||
import { captureDesktopGrounding, formatGroundingForAgent } from '../desktop-grounding'
|
||||
import { resolveSnapByCandidate } from '../snap-resolver'
|
||||
import { sleep } from '../utils/sleep'
|
||||
import { textContent } from './content'
|
||||
import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptors/register-helper'
|
||||
|
|
@ -40,8 +38,9 @@ import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptor
|
|||
export function registerDesktopGroundingTools(params: {
|
||||
server: McpServer
|
||||
runtime: ComputerUseServerRuntime
|
||||
executeAction: ExecuteAction
|
||||
}) {
|
||||
const { server, runtime } = params
|
||||
const { server, runtime, executeAction } = params
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// desktop_observe
|
||||
|
|
@ -127,6 +126,7 @@ export function registerDesktopGroundingTools(params: {
|
|||
// Also update screenshot state so desktop_get_state and other
|
||||
// tools can see the latest screenshot from this observation
|
||||
if (snapshot.screenshot && !snapshot.screenshot.placeholder) {
|
||||
runtime.session.setLastScreenshot(snapshot.screenshot)
|
||||
runtime.stateManager.updateLastScreenshot({
|
||||
path: snapshot.screenshot.path || '',
|
||||
width: snapshot.screenshot.width,
|
||||
|
|
@ -192,187 +192,7 @@ export function registerDesktopGroundingTools(params: {
|
|||
button: z.enum(['left', 'right', 'middle']).optional().describe('Mouse button (default: left)'),
|
||||
},
|
||||
|
||||
// desktop_click_target: forward the validated tool input to the shared action
// executor instead of clicking inline.
//
// This object literal used to carry two `handler` keys. The later one wins at
// runtime, so the earlier inline implementation (snapshot checks, foreground
// focus, browser-dom / OS-input routing) never ran and only produced a
// duplicate-property compile error. Exactly one handler is kept here.
//
// NOTE(review): snapshot validation, routing and pointer-intent bookkeeping are
// presumably performed by the 'desktop_click_target' action inside
// executeAction — confirm against action-executor before relying on it.
handler: async (input: DesktopClickTargetInput) =>
  executeAction({ kind: 'desktop_click_target', input }, 'desktop_click_target'),
|
||||
})
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue