fix(computer-use-mcp): route target clicks through action executor (#1727)

--------- Co-authored-by-agent: Antigravity <antigravity@gemini.com>
2026-04-28 06:29:33 +00:00 · 2026-04-26 05:53:15 +08:00 · 2026-04-26 05:53:15 +08:00 · 2e9020a06f
commit 2e9020a06f
parent a9359a3924
8 changed files with 545 additions and 214 deletions
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@ -3989,6 +3989,9 @@ importers:
      '@modelcontextprotocol/sdk':
        specifier: 'catalog:'
        version: 1.29.0(@cfworker/json-schema@4.1.1)(zod@4.3.6)
+      '@moeru/std':
+        specifier: 'catalog:'
+        version: 0.1.0-beta.17
      node-pty:
        specifier: 'catalog:'
        version: 1.1.0
--- a/services/computer-use-mcp/package.json
+++ b/services/computer-use-mcp/package.json
@ -50,6 +50,7 @@
  },
  "dependencies": {
    "@modelcontextprotocol/sdk": "catalog:",
+    "@moeru/std": "catalog:",
    "node-pty": "catalog:",
    "ws": "^8.20.0",
    "zod": "^4.3.6"
--- a/services/computer-use-mcp/src/server.ts
+++ b/services/computer-use-mcp/src/server.ts
@ -51,7 +51,7 @@ export async function createComputerUseMcpServer(config = resolveComputerUseConf
    },
  })
  const cdpCleanup = registerCdpTools({ server, runtime })
-  registerDesktopGroundingTools({ server, runtime })
+  registerDesktopGroundingTools({ server, runtime, executeAction })
  registerChromeSessionTools({ server, runtime })

  return {
--- a/services/computer-use-mcp/src/server/action-executor.test.ts
+++ b/services/computer-use-mcp/src/server/action-executor.test.ts
@ -1,3 +1,4 @@
+import type { ComputerUseConfig } from '../types'
 import type { ComputerUseServerRuntime } from './runtime'

 import { describe, expect, it, vi } from 'vitest'
@ -6,7 +7,7 @@ import { RunStateManager } from '../state'
 import { createDisplayInfo, createLocalExecutionTarget, createTerminalState, createTestConfig } from '../test-fixtures'
 import { createExecuteAction } from './action-executor'

-function createRuntimeForActionTest() {
+function createRuntimeForActionTest(configOverrides: Partial<ComputerUseConfig> = {}) {
  const stateManager = new RunStateManager()
  const session = {
    listPendingActions: vi.fn().mockReturnValue([]),
@ -51,6 +52,11 @@ function createRuntimeForActionTest() {
    scroll: vi.fn(),
    wait: vi.fn(),
  }
+  const desktopSessionController = {
+    getSession: vi.fn().mockReturnValue(null),
+    ensureControlledAppInForeground: vi.fn(),
+    touch: vi.fn(),
+  }
  const terminalRunner = {
    describe: () => ({ kind: 'local-shell-runner' as const, notes: [] }),
    execute: vi.fn(),
@ -79,6 +85,7 @@ function createRuntimeForActionTest() {
      executor: 'dry-run',
      approvalMode: 'never',
      defaultCaptureAfter: false,
+      ...configOverrides,
    }),
    session,
    executor,
@ -87,6 +94,8 @@ function createRuntimeForActionTest() {
    cdpBridgeManager,
    stateManager,
    taskMemory: {},
+    desktopSessionController,
+    chromeSessionManager: {},
  } as unknown as ComputerUseServerRuntime

  return {
@ -95,10 +104,255 @@ function createRuntimeForActionTest() {
    executor,
    cdpBridgeManager,
    stateManager,
+    desktopSessionController,
  }
 }

 describe('createExecuteAction', () => {
+  /** Shape of the harness returned by createRuntimeForActionTest; used to type the fixture helpers. */
+  type ActionTestHarness = ReturnType<typeof createRuntimeForActionTest>
+
+  /**
+   * Stores a fresh grounding snapshot (`dg_1`) holding a single interactable AX
+   * "Submit" button candidate with id `t_0`. The tests below expect a click on
+   * this candidate to land at (140, 215).
+   */
+  function seedSubmitButtonSnapshot(stateManager: ActionTestHarness['stateManager']) {
+    stateManager.updateGroundingSnapshot({
+      snapshotId: 'dg_1',
+      capturedAt: new Date().toISOString(),
+      foregroundApp: 'Google Chrome',
+      windows: [],
+      screenshot: {
+        dataBase64: '',
+        mimeType: 'image/png',
+        path: '',
+        capturedAt: new Date().toISOString(),
+      },
+      targetCandidates: [
+        {
+          id: 't_0',
+          source: 'ax',
+          appName: 'Google Chrome',
+          role: 'AXButton',
+          label: 'Submit',
+          bounds: { x: 100, y: 200, width: 80, height: 30 },
+          confidence: 0.95,
+          interactable: true,
+        },
+      ],
+      staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
+    } as any)
+  }
+
+  /**
+   * Simulates an active desktop session that controls Google Chrome while the
+   * executor reports AIRI as the app that is actually in the foreground.
+   */
+  function simulateChromeSessionBehindAiri(
+    harness: Pick<ActionTestHarness, 'desktopSessionController' | 'executor'>,
+  ) {
+    harness.desktopSessionController.getSession.mockReturnValue({
+      id: 'ds_1',
+      controlledApp: 'Google Chrome',
+      ownedWindows: [],
+      createdAt: new Date().toISOString(),
+      lastActiveAt: new Date().toISOString(),
+    })
+    harness.executor.getForegroundContext.mockResolvedValue({
+      available: true,
+      appName: 'AIRI',
+      platform: 'darwin',
+    })
+  }
+
+  it('executes desktop_click_target through the shared policy and audit pipeline', async () => {
+    const { runtime, executor, session, stateManager } = createRuntimeForActionTest()
+    seedSubmitButtonSnapshot(stateManager)
+
+    const executeAction = createExecuteAction(runtime)
+    const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')
+
+    expect(result.isError).not.toBe(true)
+    expect(executor.click).toHaveBeenCalledWith(expect.objectContaining({
+      x: 140,
+      y: 215,
+      button: 'left',
+      clickCount: 1,
+      pointerTrace: [{ x: 140, y: 215, delayMs: 0 }],
+    }))
+    expect(session.consumeOperation).toHaveBeenCalledWith(1)
+    expect(session.setPointerPosition).toHaveBeenCalledWith({ x: 140, y: 215 })
+    expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
+      event: 'executed',
+      toolName: 'desktop_click_target',
+      action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
+    }))
+    expect(stateManager.getState().lastClickedCandidateId).toBe('t_0')
+    expect(stateManager.getState().lastPointerIntent).toMatchObject({
+      candidateId: 't_0',
+      phase: 'completed',
+      executionResult: 'success',
+    })
+    expect(result.content.find(item => item.type === 'text')?.text).toContain('Clicked: ax AXButton "Submit"')
+  })
+
+  it('queues desktop_click_target without refocusing when approval is required', async () => {
+    const { runtime, executor, session, desktopSessionController } = createRuntimeForActionTest({ approvalMode: 'all' })
+    simulateChromeSessionBehindAiri({ desktopSessionController, executor })
+    session.createPendingAction.mockReturnValue({
+      id: 'pa_1',
+      createdAt: new Date().toISOString(),
+      toolName: 'desktop_click_target',
+      action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
+      context: {
+        available: true,
+        appName: 'Google Chrome',
+        platform: 'darwin',
+      },
+      policy: {
+        allowed: true,
+        requiresApproval: true,
+        reasons: [],
+        riskLevel: 'medium',
+        estimatedOperationUnits: 1,
+      },
+    })
+
+    const executeAction = createExecuteAction(runtime)
+    const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')
+
+    expect(result.structuredContent).toMatchObject({
+      status: 'approval_required',
+      pendingActionId: 'pa_1',
+      action: {
+        kind: 'desktop_click_target',
+        input: { candidateId: 't_0' },
+      },
+    })
+    // Policy must be evaluated against the controlled app, not the app that
+    // happens to be in front while the request is queued.
+    expect(session.createPendingAction).toHaveBeenCalledWith(expect.objectContaining({
+      toolName: 'desktop_click_target',
+      action: { kind: 'desktop_click_target', input: { candidateId: 't_0' } },
+      context: expect.objectContaining({ appName: 'Google Chrome' }),
+    }))
+    expect(desktopSessionController.ensureControlledAppInForeground).not.toHaveBeenCalled()
+    expect(executor.click).not.toHaveBeenCalled()
+    expect(session.consumeOperation).not.toHaveBeenCalled()
+  })
+
+  it('uses controlled-app context for desktop_click_target policy and refocuses only during execution', async () => {
+    const { runtime, executor, session, stateManager, desktopSessionController } = createRuntimeForActionTest()
+    seedSubmitButtonSnapshot(stateManager)
+    simulateChromeSessionBehindAiri({ desktopSessionController, executor })
+    desktopSessionController.ensureControlledAppInForeground.mockResolvedValue(true)
+
+    const executeAction = createExecuteAction(runtime)
+    const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')
+
+    expect(result.isError).not.toBe(true)
+    expect(desktopSessionController.ensureControlledAppInForeground).toHaveBeenCalledWith(expect.objectContaining({
+      currentForeground: expect.objectContaining({ appName: 'AIRI' }),
+    }))
+    expect(executor.click).toHaveBeenCalledOnce()
+    expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
+      event: 'executed',
+      context: expect.objectContaining({ appName: 'Google Chrome' }),
+      policy: expect.objectContaining({ allowed: true }),
+    }))
+    // The real foreground app is still audited alongside the substituted context.
+    expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
+      event: 'requested',
+      result: expect.objectContaining({
+        actualForegroundContext: expect.objectContaining({ appName: 'AIRI' }),
+      }),
+    }))
+  })
+
+  it('returns a structured failure when controlled-app refocus fails during desktop_click_target execution', async () => {
+    const { runtime, executor, session, stateManager, desktopSessionController } = createRuntimeForActionTest()
+    seedSubmitButtonSnapshot(stateManager)
+    simulateChromeSessionBehindAiri({ desktopSessionController, executor })
+    desktopSessionController.ensureControlledAppInForeground.mockRejectedValue(new Error('Chrome session unavailable'))
+
+    const executeAction = createExecuteAction(runtime)
+    const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_0' } }, 'desktop_click_target')
+
+    expect(result.isError).toBe(true)
+    expect(result.content.find(item => item.type === 'text')?.text).toContain('Chrome session unavailable')
+    expect(executor.click).not.toHaveBeenCalled()
+    expect(session.consumeOperation).not.toHaveBeenCalled()
+    expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
+      event: 'failed',
+      toolName: 'desktop_click_target',
+      context: expect.objectContaining({ appName: 'Google Chrome' }),
+      result: expect.objectContaining({ error: 'Chrome session unavailable' }),
+    }))
+  })
+
+  it('fails desktop_click_target before consuming budget when no observe snapshot exists', async () => {
+    // Deliberately no snapshot is seeded here.
+    const { runtime, executor, session } = createRuntimeForActionTest()
+
+    const executeAction = createExecuteAction(runtime)
+    const result = await executeAction({ kind: 'desktop_click_target', input: { candidateId: 't_missing' } }, 'desktop_click_target')
+
+    expect(result.isError).toBe(true)
+    expect(result.content.find(item => item.type === 'text')?.text).toContain('No desktop_observe snapshot available')
+    expect(executor.click).not.toHaveBeenCalled()
+    expect(session.consumeOperation).not.toHaveBeenCalled()
+    expect(session.record).toHaveBeenCalledWith(expect.objectContaining({
+      event: 'failed',
+      toolName: 'desktop_click_target',
+      action: { kind: 'desktop_click_target', input: { candidateId: 't_missing' } },
+    }))
+  })
+
  it('refreshes browser surface availability for direct actions before evaluating strategy', async () => {
    const { runtime, cdpBridgeManager } = createRuntimeForActionTest()

--- a/services/computer-use-mcp/src/server/action-executor.ts
+++ b/services/computer-use-mcp/src/server/action-executor.ts
@ -4,6 +4,7 @@ import type {
  ActionInvocation,
  ComputerUseConfig,
  DesktopExecutor,
+  ForegroundContext,
  PolicyDecision,
  ScreenshotArtifact,
  TerminalCommandResult,
@ -33,6 +34,7 @@ import {
  maskEnvValuePreview,
  readEnvValue,
 } from '../utils/env-file'
+import { executeDesktopClickTarget } from './desktop-grounding-actions'
 import { describeExecutionTarget } from './formatters'
 import { refreshRuntimeRunState } from './refresh-run-state'
 import {
@ -117,10 +119,41 @@ function toTerminalStateContent(state: TerminalState) {
  }
 }

+/**
+ * Selects the foreground context that policy evaluation should be run against.
+ *
+ * Only `desktop_click_target` is special-cased: when a desktop session has a
+ * controlled app and that app is not the one currently reported in the
+ * foreground, a synthetic context naming the controlled app is returned.
+ * In every other case the very same `actualContext` object is handed back,
+ * so callers can detect a substitution with an identity comparison.
+ *
+ * @param params.action - The (already normalized) action being evaluated.
+ * @param params.actualContext - Foreground context as actually observed.
+ * @param params.runtime - Runtime whose desktop session controller is consulted.
+ * @returns `actualContext` itself, or a new context for the controlled app.
+ */
+function getPolicyEvaluationContext(params: {
+  action: ActionInvocation
+  actualContext: ForegroundContext
+  runtime: ComputerUseServerRuntime
+}): ForegroundContext {
+  const { action, actualContext, runtime } = params
+
+  if (action.kind === 'desktop_click_target') {
+    // The session is only looked up for target clicks.
+    const controlledApp = runtime.desktopSessionController.getSession()?.controlledApp
+    const controlledAppIsInFront = actualContext.available && actualContext.appName === controlledApp
+
+    if (controlledApp && !controlledAppIsInFront) {
+      return {
+        available: true,
+        appName: controlledApp,
+        platform: actualContext.platform,
+      }
+    }
+  }
+
+  return actualContext
+}
+
 export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteAction {
  return async (action, toolName, options = {}) => {
    const normalizedAction = normalizeConfiguredAppAction(action, runtime.config.openableApps)
-    const { executionTarget, context, displayInfo } = await refreshRuntimeRunState(runtime)
+    const { executionTarget, context: actualContext, displayInfo } = await refreshRuntimeRunState(runtime)
+    const context = getPolicyEvaluationContext({
+      action: normalizedAction,
+      actualContext,
+      runtime,
+    })
+    const actualForegroundContext = context === actualContext ? undefined : actualContext

    const budget = runtime.session.getBudgetState()
    const preflight = getRuntimePreflight({
@ -159,6 +192,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
        executionTarget,
        displayInfo,
        coordinateSpace: preflight.coordinateSpace,
+        actualForegroundContext,
      },
    })

@ -255,6 +289,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
      let backendResult: Record<string, unknown> = {}
      let clipboardStructuredContent: Record<string, unknown> | undefined
      let secretStructuredContent: Record<string, unknown> | undefined
+      let summaryOverride: string | undefined

      switch (normalizedAction.kind) {
        case 'screenshot': {
@ -515,6 +550,12 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
          }
          break
        }
+        case 'desktop_click_target': {
+          const result = await executeDesktopClickTarget(runtime, normalizedAction.input)
+          backendResult = result.backendResult
+          summaryOverride = result.summary
+          break
+        }
      }

      runtime.session.consumeOperation(decision.estimatedOperationUnits)
@ -562,7 +603,7 @@ export function createExecuteAction(runtime: ComputerUseServerRuntime): ExecuteA
      })

      return buildSuccessResponse({
-        summary: `${intent} ${outcome}${advisorySummary ? ` Strategy: ${advisorySummary}` : ''}`,
+        summary: summaryOverride ?? `${intent} ${outcome}${advisorySummary ? ` Strategy: ${advisorySummary}` : ''}`,
        screenshot,
        structuredContent: {
          status: 'executed',
--- a/services/computer-use-mcp/src/server/desktop-grounding-actions.ts
+++ b/services/computer-use-mcp/src/server/desktop-grounding-actions.ts
@ -0,0 +1,195 @@
+import type { ExecutorActionResult } from '../types'
+import type { ComputerUseServerRuntime } from './runtime'
+
+import { errorMessageFrom } from '@moeru/std'
+
+import { decideBrowserAction } from '../browser-action-router'
+import { getUnsupportedBrowserDomActions, isBrowserDomActionSupported } from '../browser-dom/capabilities'
+import { resolveSnapByCandidate } from '../snap-resolver'
+import { sleep } from '../utils/sleep'
+
+const DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS = 5000 // Grounding snapshots older than this many milliseconds are rejected by executeDesktopClickTarget.
+
+export interface DesktopClickTargetExecution { // Outcome returned by executeDesktopClickTarget.
+  summary: string // Human-readable, multi-line description of the click that was performed.
+  backendResult: Record<string, unknown> // Structured click details: candidate, snapshot id, snap, execution route and OS input result.
+}
+
+/**
+ * Clicks a target candidate taken from the most recent `desktop_observe`
+ * grounding snapshot.
+ *
+ * Steps, in order:
+ * 1. Validate preconditions: a snapshot exists, the candidate is not the one
+ *    clicked last, the snapshot has a valid timestamp no older than
+ *    `DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS`, and the candidate resolves in it.
+ * 2. When the desktop session has a controlled app, ask the session controller
+ *    to bring it to the foreground (pausing 200ms if it was not already there).
+ * 3. Record an `executing` pointer intent, then click either through the
+ *    browser-dom bridge or through OS input. OS input is also the fallback when
+ *    the bridge lacks the required actions or the bridge call throws.
+ * 4. Record the `completed` pointer intent and build the summary.
+ *
+ * @param runtime - Server runtime providing state, executor, session and bridges.
+ * @param input - Candidate id plus optional click count (default 1) and mouse
+ *   button (default `left`).
+ * @returns A human-readable summary and structured details of the click.
+ * @throws Error when a precondition fails or refocusing rejects (nothing is
+ *   clicked), or when the click itself fails (the pointer intent is first
+ *   marked as `error`, then the original error is rethrown).
+ */
+export async function executeDesktopClickTarget(
+  runtime: ComputerUseServerRuntime,
+  input: {
+    candidateId: string
+    clickCount?: number
+    button?: 'left' | 'right' | 'middle'
+  },
+): Promise<DesktopClickTargetExecution> {
+  const { candidateId, clickCount, button } = input
+  // Resolve the defaults once so the OS click and the summary cannot disagree.
+  const resolvedButton = button ?? 'left'
+  const resolvedClickCount = clickCount ?? 1
+  const state = runtime.stateManager.getState()
+
+  if (!state.lastGroundingSnapshot) {
+    throw new Error('No desktop_observe snapshot available. Call desktop_observe first to get a list of target candidates.')
+  }
+
+  const snapshot = state.lastGroundingSnapshot
+
+  if (state.lastClickedCandidateId === candidateId) {
+    throw new Error(`You already clicked candidate "${candidateId}" without calling desktop_observe again. Call desktop_observe to refresh the state before clicking the same target.`)
+  }
+
+  const snapshotAge = Date.now() - new Date(snapshot.capturedAt).getTime()
+  // An unparseable capturedAt yields NaN, and `NaN > max` is false. Without this
+  // guard such a snapshot would silently pass the staleness check below.
+  if (!Number.isFinite(snapshotAge)) {
+    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" has an invalid capturedAt timestamp. Call desktop_observe to get a fresh snapshot before clicking.`)
+  }
+  if (snapshotAge > DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS) {
+    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" is ${Math.round(snapshotAge / 1000)}s old. Call desktop_observe to get a fresh snapshot before clicking.`)
+  }
+
+  const snap = resolveSnapByCandidate(candidateId, snapshot)
+  if (snap.source === 'none' && !snap.candidateId) {
+    throw new Error(`Candidate "${candidateId}" not found in snapshot "${snapshot.snapshotId}". Available candidates: ${snapshot.targetCandidates.map(c => c.id).join(', ')}`)
+  }
+
+  // Refocus only after all validation has passed, so a rejected request never
+  // changes which app is in the foreground.
+  const sessionCtrl = runtime.desktopSessionController
+  const activeSession = sessionCtrl.getSession()
+  if (activeSession?.controlledApp) {
+    const currentForeground = await runtime.executor.getForegroundContext()
+    const wasAlreadyInFront = await sessionCtrl.ensureControlledAppInForeground({
+      currentForeground,
+      chromeSessionManager: runtime.chromeSessionManager,
+      activateApp: async (appName) => {
+        await runtime.executor.focusApp({ app: appName })
+      },
+    })
+    if (!wasAlreadyInFront) {
+      // NOTE(review): presumably gives the window time to come forward after
+      // activation before the click is sent; confirm the 200ms value.
+      await sleep(200)
+    }
+    sessionCtrl.touch()
+  }
+
+  const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)
+  const intent = {
+    mode: 'execute' as const,
+    candidateId,
+    rawPoint: snap.rawPoint,
+    snappedPoint: snap.snappedPoint,
+    source: snap.source,
+    confidence: candidate?.confidence ?? 0,
+    path: [
+      { x: snap.snappedPoint.x, y: snap.snappedPoint.y, delayMs: 0 },
+    ],
+    phase: 'executing' as const,
+  }
+
+  runtime.stateManager.updatePointerIntent(intent)
+
+  let executionRoute = 'os_input'
+  let routeNote = ''
+  let routeReason = 'candidate not found'
+  let osInputResult: ExecutorActionResult | undefined
+
+  // Performs the click through the desktop executor and records the pointer position.
+  const executeOsClick = async () => {
+    const result = await runtime.executor.click({
+      x: snap.snappedPoint.x,
+      y: snap.snappedPoint.y,
+      button: resolvedButton,
+      clickCount: resolvedClickCount,
+      pointerTrace: intent.path,
+    })
+    runtime.session.setPointerPosition({ x: snap.snappedPoint.x, y: snap.snappedPoint.y })
+    return result
+  }
+
+  try {
+    const bridgeConnected = runtime.browserDomBridge?.getStatus().connected ?? false
+    const routeDecision = candidate
+      ? decideBrowserAction(candidate, bridgeConnected, button, clickCount)
+      : { route: 'os_input' as const, reason: 'candidate not found' }
+
+    executionRoute = routeDecision.route
+    routeReason = routeDecision.reason
+
+    if (routeDecision.route === 'browser_dom' && routeDecision.selector) {
+      const requiredActions = routeDecision.bridgeMethod === 'checkCheckbox'
+        ? ['checkCheckbox']
+        : ['getClickTarget', 'clickAt']
+
+      if (!isBrowserDomActionSupported(runtime.browserDomBridge, ...requiredActions)) {
+        // The connected extension cannot perform the DOM action: use OS input instead.
+        executionRoute = 'os_input'
+        routeReason = `browser-dom extension transport does not support ${requiredActions.join(' + ')}`
+        routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} is unavailable on the connected extension transport (${getUnsupportedBrowserDomActions(runtime.browserDomBridge, ...requiredActions).join(', ')} unsupported), fell back to OS input`
+        osInputResult = await executeOsClick()
+      }
+      else {
+        try {
+          const frameIds = routeDecision.frameId !== undefined ? [routeDecision.frameId] : undefined
+          if (routeDecision.bridgeMethod === 'checkCheckbox') {
+            await runtime.browserDomBridge.checkCheckbox({
+              selector: routeDecision.selector,
+              frameIds,
+            })
+          }
+          else {
+            await runtime.browserDomBridge.clickSelector({
+              selector: routeDecision.selector,
+              frameIds,
+            })
+          }
+        }
+        catch (browserError) {
+          // The DOM route threw: fall back to OS input. routeReason keeps the
+          // browser-dom decision's reason; routeNote carries the failure detail.
+          executionRoute = 'os_input'
+          routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} failed (${errorMessageFrom(browserError) ?? 'unknown error'}), fell back to OS input`
+          osInputResult = await executeOsClick()
+        }
+      }
+    }
+    else {
+      osInputResult = await executeOsClick()
+    }
+
+    const completedIntent = {
+      ...intent,
+      phase: 'completed' as const,
+      // A non-empty routeNote means one of the fallbacks above was taken.
+      executionResult: routeNote ? 'fallback' as const : 'success' as const,
+      executionRoute: `${executionRoute} (${routeReason})`,
+    }
+    // The candidate id is passed only on success; the failure path below omits it.
+    runtime.stateManager.updatePointerIntent(completedIntent, candidateId)
+
+    const candidateDesc = candidate ? `${candidate.source} ${candidate.role} "${candidate.label}"` : candidateId
+    const lines = [
+      `Clicked: ${candidateDesc}`,
+      `  Snap: ${snap.reason}`,
+      `  Point: (${snap.snappedPoint.x}, ${snap.snappedPoint.y})`,
+      `  Route: ${executionRoute} (${routeReason})`,
+      `  Button: ${resolvedButton}, clicks: ${resolvedClickCount}`,
+    ]
+
+    if (routeNote) {
+      lines.push(`  ⚠ ${routeNote}`)
+    }
+
+    if (snap.reason.includes('stale')) {
+      lines.push('  ⚠ WARNING: Target source is stale. Consider calling desktop_observe again.')
+    }
+
+    return {
+      summary: lines.join('\n'),
+      backendResult: {
+        candidateId,
+        snapshotId: snapshot.snapshotId,
+        snap,
+        candidate,
+        executionRoute,
+        routeReason,
+        routeNote: routeNote || undefined,
+        osInputResult,
+      },
+    }
+  }
+  catch (error) {
+    // Mark the intent as errored, then let the caller see the original error.
+    const failedIntent = {
+      ...intent,
+      phase: 'completed' as const,
+      executionResult: 'error' as const,
+      executionRoute: `${executionRoute} (${routeReason})`,
+    }
+    runtime.stateManager.updatePointerIntent(failedIntent)
+    throw error
+  }
+}
--- a/services/computer-use-mcp/src/server/register-desktop-grounding-tools.test.ts
+++ b/services/computer-use-mcp/src/server/register-desktop-grounding-tools.test.ts
@ -45,7 +45,7 @@ function createMockServer() {
 }

 function createRuntime() {
-  return {
+  const runtime = {
    config: createTestConfig(),
    stateManager: new RunStateManager(),
    cdpBridgeManager: {
@ -57,6 +57,9 @@ function createRuntime() {
    },
    browserDomBridge: {},
    executor: {},
+    session: {
+      setLastScreenshot: vi.fn(),
+    },
    desktopSessionController: {
      getSession: vi.fn().mockReturnValue(undefined),
      getSessionInfo: vi.fn().mockReturnValue(undefined),
@ -64,6 +67,12 @@ function createRuntime() {
      ensureControlledAppInForeground: vi.fn(),
    },
  } as unknown as ComputerUseServerRuntime
+
+  const executeAction = vi.fn().mockResolvedValue({
+    content: [{ type: 'text', text: 'executed' }],
+  })
+
+  return { runtime, executeAction }
 }

 describe('registerDesktopGroundingTools', () => {
@ -71,39 +80,36 @@ describe('registerDesktopGroundingTools', () => {
    captureDesktopGroundingMock.mockReset()
  })

-  it('registers desktop_click_target and handles missing candidate gracefully', async () => {
-    const runtime = createRuntime()
+  it('registers desktop_click_target through the action executor', async () => {
+    const { runtime, executeAction } = createRuntime()

    const { server, invoke } = createMockServer()

-    registerDesktopGroundingTools({ server, runtime })
-
-    runtime.stateManager.updateGroundingSnapshot({
-      snapshotId: 'dg_1',
-      capturedAt: new Date().toISOString(),
-      foregroundApp: 'Google Chrome',
-      windows: [],
-      screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() },
-      targetCandidates: [],
-      staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
-    } as any)
+    registerDesktopGroundingTools({ server, runtime, executeAction })

    const result = await invoke('desktop_click_target', {
-      candidateId: 't_missing',
+      candidateId: 't_0',
+      clickCount: 2,
+      button: 'right',
    })

-    expect(result.isError).toBe(true)
-    expect(result.content).toEqual([
-      expect.objectContaining({ text: expect.stringContaining('Candidate "t_missing" not found in snapshot') }),
-    ])
+    expect(result.isError).not.toBe(true)
+    expect(executeAction).toHaveBeenCalledWith({
+      kind: 'desktop_click_target',
+      input: {
+        candidateId: 't_0',
+        clickCount: 2,
+        button: 'right',
+      },
+    }, 'desktop_click_target')
  })

  it('returns observe error content when captureDesktopGrounding fails', async () => {
-    const runtime = createRuntime()
+    const { runtime, executeAction } = createRuntime()
    captureDesktopGroundingMock.mockRejectedValueOnce(new Error('observe boom'))

    const { server, invoke } = createMockServer()
-    registerDesktopGroundingTools({ server, runtime })
+    registerDesktopGroundingTools({ server, runtime, executeAction })

    const result = await invoke('desktop_observe', {})

@ -114,7 +120,7 @@ describe('registerDesktopGroundingTools', () => {
  })

  it('stores grounding snapshot and returns image content', async () => {
-    const runtime = createRuntime()
+    const { runtime, executeAction } = createRuntime()
    captureDesktopGroundingMock.mockResolvedValueOnce({
      snapshotId: 'dg_new',
      capturedAt: new Date().toISOString(),
@ -127,18 +133,29 @@ describe('registerDesktopGroundingTools', () => {
        capturedAt: new Date().toISOString(),
        width: 1280,
        height: 720,
+        executionTargetMode: 'remote',
+        sourceHostName: 'fake-remote',
+        sourceDisplayId: ':99',
+        sourceSessionTag: 'vm-local-1',
      },
      targetCandidates: [],
      staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
    } as any)

    const { server, invoke } = createMockServer()
-    registerDesktopGroundingTools({ server, runtime })
+    registerDesktopGroundingTools({ server, runtime, executeAction })

    const result = await invoke('desktop_observe', {})
    const state = runtime.stateManager.getState()

    expect(state.lastGroundingSnapshot?.screenshot.dataBase64).toBe('ZmFrZS1wbmc=')
+    expect(runtime.session.setLastScreenshot).toHaveBeenCalledWith(expect.objectContaining({
+      path: '/tmp/shot.png',
+      executionTargetMode: 'remote',
+      sourceHostName: 'fake-remote',
+      sourceDisplayId: ':99',
+      sourceSessionTag: 'vm-local-1',
+    }))
    expect(result.content).toEqual([
      expect.objectContaining({ type: 'text' }),
      expect.objectContaining({
--- a/services/computer-use-mcp/src/server/register-desktop-grounding.ts
+++ b/services/computer-use-mcp/src/server/register-desktop-grounding.ts
@ -14,17 +14,15 @@

 import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'

-import type { PointerIntent } from '../desktop-grounding-types'
+import type { DesktopClickTargetInput } from '../types'
+import type { ExecuteAction } from './action-executor'
 import type { ComputerUseServerRuntime } from './runtime'

 import process from 'node:process'

 import { z } from 'zod'

-import { decideBrowserAction } from '../browser-action-router'
-import { getUnsupportedBrowserDomActions, isBrowserDomActionSupported } from '../browser-dom/capabilities'
 import { captureDesktopGrounding, formatGroundingForAgent } from '../desktop-grounding'
-import { resolveSnapByCandidate } from '../snap-resolver'
 import { sleep } from '../utils/sleep'
 import { textContent } from './content'
 import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptors/register-helper'
@ -40,8 +38,9 @@ import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptor
 export function registerDesktopGroundingTools(params: {
  server: McpServer
  runtime: ComputerUseServerRuntime
+  executeAction: ExecuteAction
 }) {
-  const { server, runtime } = params
+  const { server, runtime, executeAction } = params

  // -----------------------------------------------------------------------
  // desktop_observe
@ -127,6 +126,7 @@ export function registerDesktopGroundingTools(params: {
        // Also update screenshot state so desktop_get_state and other
        // tools can see the latest screenshot from this observation
        if (snapshot.screenshot && !snapshot.screenshot.placeholder) {
+          runtime.session.setLastScreenshot(snapshot.screenshot)
          runtime.stateManager.updateLastScreenshot({
            path: snapshot.screenshot.path || '',
            width: snapshot.screenshot.width,
@ -192,187 +192,7 @@ export function registerDesktopGroundingTools(params: {
      button: z.enum(['left', 'right', 'middle']).optional().describe('Mouse button (default: left)'),
    },

-    handler: async ({ candidateId, clickCount, button }) => {
-      try {
-        const state = runtime.stateManager.getState()
-
-        // Validate: must have a recent grounding snapshot
-        if (!state.lastGroundingSnapshot) {
-          return {
-            content: [textContent('ERROR: No desktop_observe snapshot available. Call desktop_observe first to get a list of target candidates.')],
-            isError: true,
-          }
-        }
-
-        const snapshot = state.lastGroundingSnapshot
-
-        // Session: ensure the controlled app is still in foreground before clicking
-        const sessionCtrl = runtime.desktopSessionController
-        const activeSession = sessionCtrl.getSession()
-        if (activeSession?.controlledApp) {
-          const currentForeground = await runtime.executor.getForegroundContext()
-          const wasAlreadyInFront = await sessionCtrl.ensureControlledAppInForeground({
-            currentForeground,
-            chromeSessionManager: runtime.chromeSessionManager,
-            activateApp: async (appName) => {
-              await runtime.executor.focusApp({ app: appName })
-            },
-          })
-          if (!wasAlreadyInFront) {
-            await sleep(200)
-          }
-          sessionCtrl.touch()
-        }
-
-        // Validate: check for duplicate clicks on same candidate without re-observe
-        if (state.lastClickedCandidateId === candidateId) {
-          return {
-            content: [textContent(`WARNING: You already clicked candidate "${candidateId}" without calling desktop_observe again. Call desktop_observe to refresh the state before clicking the same target.`)],
-            isError: true,
-          }
-        }
-
-        // Validate: check snapshot staleness (>5s)
-        const snapshotAge = Date.now() - new Date(snapshot.capturedAt).getTime()
-        if (snapshotAge > 5000) {
-          return {
-            content: [textContent(`WARNING: Grounding snapshot "${snapshot.snapshotId}" is ${Math.round(snapshotAge / 1000)}s old. Call desktop_observe to get a fresh snapshot before clicking.`)],
-            isError: true,
-          }
-        }
-
-        // Resolve snap
-        const snap = resolveSnapByCandidate(candidateId, snapshot)
-
-        if (snap.source === 'none' && !snap.candidateId) {
-          return {
-            content: [textContent(`ERROR: Candidate "${candidateId}" not found in snapshot "${snapshot.snapshotId}". Available candidates: ${snapshot.targetCandidates.map(c => c.id).join(', ')}`)],
-            isError: true,
-          }
-        }
-
-        // Build pointer intent
-        const intent: PointerIntent = {
-          mode: 'execute',
-          candidateId,
-          rawPoint: snap.rawPoint,
-          snappedPoint: snap.snappedPoint,
-          source: snap.source,
-          confidence: snapshot.targetCandidates.find(c => c.id === candidateId)?.confidence ?? 0,
-          path: [
-            { x: snap.snappedPoint.x, y: snap.snappedPoint.y, delayMs: 0 },
-          ],
-        }
-
-        // Update RunState — pointer intent + clicked candidate (phase: executing)
-        intent.phase = 'executing'
-        runtime.stateManager.updatePointerIntent(intent)
-
-        // Route the click: browser-dom for chrome_dom candidates, OS input for everything else
-        const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)
-        const bridgeConnected = runtime.browserDomBridge?.getStatus().connected ?? false
-        const routeDecision = candidate
-          ? decideBrowserAction(candidate, bridgeConnected, button, clickCount)
-          : { route: 'os_input' as const, reason: 'candidate not found' }
-
-        let executionRoute = routeDecision.route
-        let routeNote = ''
-        let routeReason = routeDecision.reason
-
-        if (routeDecision.route === 'browser_dom' && routeDecision.selector) {
-          const requiredActions = routeDecision.bridgeMethod === 'checkCheckbox'
-            ? ['checkCheckbox']
-            : ['getClickTarget', 'clickAt']
-
-          if (!isBrowserDomActionSupported(runtime.browserDomBridge, ...requiredActions)) {
-            executionRoute = 'os_input'
-            routeReason = `browser-dom extension transport does not support ${requiredActions.join(' + ')}`
-            routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} is unavailable on the connected extension transport (${getUnsupportedBrowserDomActions(runtime.browserDomBridge, ...requiredActions).join(', ')} unsupported), fell back to OS input`
-            await runtime.executor.click({
-              x: snap.snappedPoint.x,
-              y: snap.snappedPoint.y,
-              button: button || 'left',
-              clickCount: clickCount ?? 1,
-              pointerTrace: intent.path,
-            })
-          }
-          else {
-            // Try browser-dom bridge action first, dispatching by method
-            try {
-              const frameIds = routeDecision.frameId !== undefined ? [routeDecision.frameId] : undefined
-              if (routeDecision.bridgeMethod === 'checkCheckbox') {
-                await runtime.browserDomBridge!.checkCheckbox({
-                  selector: routeDecision.selector,
-                  frameIds,
-                })
-              }
-              else {
-                await runtime.browserDomBridge!.clickSelector({
-                  selector: routeDecision.selector,
-                  frameIds,
-                })
-              }
-            }
-            catch (browserError) {
-              // Fallback to OS input on browser-dom failure
-              executionRoute = 'os_input'
-              routeNote = `browser-dom ${routeDecision.bridgeMethod ?? 'click'} failed (${browserError instanceof Error ? browserError.message : String(browserError)}), fell back to OS input`
-              await runtime.executor.click({
-                x: snap.snappedPoint.x,
-                y: snap.snappedPoint.y,
-                button: button || 'left',
-                clickCount: clickCount ?? 1,
-                pointerTrace: intent.path,
-              })
-            }
-          }
-        }
-        else {
-          // OS-level click (existing path)
-          await runtime.executor.click({
-            x: snap.snappedPoint.x,
-            y: snap.snappedPoint.y,
-            button: button || 'left',
-            clickCount: clickCount ?? 1,
-            pointerTrace: intent.path,
-          })
-        }
-
-        // Phase: completed — update ghost pointer state for overlay fadeout
-        intent.phase = 'completed'
-        intent.executionResult = routeNote ? 'fallback' : 'success'
-        intent.executionRoute = `${executionRoute} (${routeReason})`
-        runtime.stateManager.updatePointerIntent(intent, candidateId)
-
-        const candidateDesc = candidate ? `${candidate.source} ${candidate.role} "${candidate.label}"` : candidateId
-
-        const lines = [
-          `Clicked: ${candidateDesc}`,
-          `  Snap: ${snap.reason}`,
-          `  Point: (${snap.snappedPoint.x}, ${snap.snappedPoint.y})`,
-          `  Route: ${executionRoute} (${routeReason})`,
-          `  Button: ${button || 'left'}, clicks: ${clickCount ?? 1}`,
-        ]
-
-        if (routeNote) {
-          lines.push(`  ⚠ ${routeNote}`)
-        }
-
-        if (snap.reason.includes('stale')) {
-          lines.push('  ⚠ WARNING: Target source is stale. Consider calling desktop_observe again.')
-        }
-
-        return {
-          content: [textContent(lines.join('\n'))],
-        }
-      }
-      catch (error) {
-        const message = error instanceof Error ? error.message : String(error)
-        return {
-          content: [textContent(`desktop_click_target failed: ${message}`)],
-          isError: true,
-        }
-      }
-    },
+    handler: async (input: DesktopClickTargetInput) =>
+      executeAction({ kind: 'desktop_click_target', input }, 'desktop_click_target'),
  })
 }