From 4f2b9e9bd88a1bb87ef0f59cdef28c634f2ee792 Mon Sep 17 00:00:00 2001 From: tanzhenxin Date: Wed, 25 Mar 2026 07:43:48 +0000 Subject: [PATCH] feat(channels): add multimodal support with image handling - Add model configuration option for channel-specific model selection - Support base64-encoded images in prompts via AcpBridge - Add media utilities for WeChat/Weixin channel - Update settings schema for model configuration Enables channels to process images and use custom models. Co-authored-by: Qwen-Coder --- packages/channels/base/src/AcpBridge.ts | 26 ++++++++- packages/channels/base/src/ChannelBase.ts | 5 +- packages/channels/base/src/types.ts | 5 ++ packages/channels/weixin/src/WeixinAdapter.ts | 46 ++++++++++++++- packages/channels/weixin/src/media.ts | 56 +++++++++++++++++++ packages/channels/weixin/src/monitor.ts | 26 +++++++-- packages/cli/src/commands/channel/start.ts | 7 ++- .../schemas/settings.schema.json | 5 ++ 8 files changed, 164 insertions(+), 12 deletions(-) create mode 100644 packages/channels/weixin/src/media.ts diff --git a/packages/channels/base/src/AcpBridge.ts b/packages/channels/base/src/AcpBridge.ts index 502f4955f..2b6155dba 100644 --- a/packages/channels/base/src/AcpBridge.ts +++ b/packages/channels/base/src/AcpBridge.ts @@ -17,6 +17,7 @@ import type { export interface AcpBridgeOptions { cliEntryPath: string; cwd: string; + model?: string; } export interface AvailableCommand { @@ -51,7 +52,12 @@ export class AcpBridge extends EventEmitter { async start(): Promise { const { cliEntryPath, cwd } = this.options; - this.child = spawn(process.execPath, [cliEntryPath, '--acp'], { + const args = [cliEntryPath, '--acp']; + if (this.options.model) { + args.push('--model', this.options.model); + } + + this.child = spawn(process.execPath, args, { cwd, stdio: ['pipe', 'pipe', 'pipe'], env: { ...process.env }, @@ -123,7 +129,11 @@ export class AcpBridge extends EventEmitter { return response.sessionId; } - async prompt(sessionId: string, text: string): Promise { + async prompt( + sessionId: string, + text: string, + options?: { imageBase64?: string; imageMimeType?: string }, + ): Promise { const conn = this.ensureConnection(); const chunks: string[] = []; @@ -132,10 +142,20 @@ export class AcpBridge extends EventEmitter { }; this.on('textChunk', onChunk); + const prompt: Array> = []; + if (options?.imageBase64 && options.imageMimeType) { + prompt.push({ + type: 'image', + data: options.imageBase64, + mimeType: options.imageMimeType, + }); + } + prompt.push({ type: 'text', text }); + try { await conn.prompt({ sessionId, - prompt: [{ type: 'text', text }], + prompt: prompt as Array<{ type: 'text'; text: string }>, }); } finally { this.off('textChunk', onChunk); diff --git a/packages/channels/base/src/ChannelBase.ts b/packages/channels/base/src/ChannelBase.ts index fe59dbeaa..caf46245e 100644 --- a/packages/channels/base/src/ChannelBase.ts +++ b/packages/channels/base/src/ChannelBase.ts @@ -74,7 +74,10 @@ export abstract class ChannelBase { this.instructedSessions.add(sessionId); } - const response = await this.bridge.prompt(sessionId, promptText); + const response = await this.bridge.prompt(sessionId, promptText, { + imageBase64: envelope.imageBase64, + imageMimeType: envelope.imageMimeType, + }); if (response) { await this.sendMessage(envelope.chatId, response); diff --git a/packages/channels/base/src/types.ts b/packages/channels/base/src/types.ts index aaa4a6508..1eb405dd3 100644 --- a/packages/channels/base/src/types.ts +++ b/packages/channels/base/src/types.ts @@ -16,6 +16,7 @@ export interface ChannelConfig { cwd: string; approvalMode?: string; instructions?: string; + model?: string; groupPolicy: GroupPolicy; // default: "disabled" groups: Record; // "*" for defaults, group IDs for overrides } @@ -30,6 +31,10 @@ export interface Envelope { isGroup: boolean; isMentioned: boolean; isReplyToBot: boolean; + /** Base64-encoded image data (e.g. from WeChat CDN download). */ + imageBase64?: string; + /** MIME type for the image (e.g. "image/jpeg", "image/png"). */ + imageMimeType?: string; } export interface SessionTarget { diff --git a/packages/channels/weixin/src/WeixinAdapter.ts b/packages/channels/weixin/src/WeixinAdapter.ts index 9254a601c..5c631eda9 100644 --- a/packages/channels/weixin/src/WeixinAdapter.ts +++ b/packages/channels/weixin/src/WeixinAdapter.ts @@ -8,7 +8,9 @@ import type { ChannelConfig, Envelope } from '@qwen-code/channel-base'; import type { AcpBridge } from '@qwen-code/channel-base'; import { loadAccount, DEFAULT_BASE_URL } from './accounts.js'; import { startPollLoop, getContextToken } from './monitor.js'; +import type { ImageCdnRef } from './monitor.js'; import { sendText } from './send.js'; +import { downloadAndDecrypt } from './media.js'; import { getConfig, sendTyping } from './api.js'; import { TypingStatus } from './types.js'; @@ -56,7 +58,7 @@ export class WeixinChannel extends ChannelBase { isReplyToBot: false, }; - this.handleInbound(envelope).catch((err) => { + this.handleInboundWithImage(envelope, msg.image).catch((err) => { const errMsg = err instanceof Error ? err.message : JSON.stringify(err, null, 2); process.stderr.write( @@ -76,17 +78,36 @@ export class WeixinChannel extends ChannelBase { ); } - override async handleInbound(envelope: Envelope): Promise { + private async handleInboundWithImage( + envelope: Envelope, + image?: ImageCdnRef, + ): Promise { // Check group gate before showing typing const groupResult = this.groupGate.check(envelope); if (!groupResult.allowed) { return; } - // Show typing indicator while agent processes + // Show typing indicator immediately — before image download await this.setTyping(envelope.chatId, true); try { + // Download image from CDN (after typing has started) + if (image) { + try { + const imageData = await downloadAndDecrypt( + image.encryptQueryParam, + image.aesKey, + ); + envelope.imageBase64 = imageData.toString('base64'); + envelope.imageMimeType = detectImageMime(imageData); + } catch (err) { + process.stderr.write( + `[Weixin:${this.name}] Failed to download image: ${err instanceof Error ? err.message : err}\n`, + ); + } + } + await super.handleInbound(envelope); } finally { await this.setTyping(envelope.chatId, false); @@ -139,3 +160,22 @@ export class WeixinChannel extends ChannelBase { } } } + +/** Detect image MIME type from magic bytes. */ +function detectImageMime(data: Buffer): string { + if (data[0] === 0x89 && data[1] === 0x50 && data[2] === 0x4e) { + return 'image/png'; + } + if (data[0] === 0x47 && data[1] === 0x49 && data[2] === 0x46) { + return 'image/gif'; + } + if ( + data[0] === 0x52 && + data[1] === 0x49 && + data[2] === 0x46 && + data[3] === 0x46 + ) { + return 'image/webp'; + } + return 'image/jpeg'; +} diff --git a/packages/channels/weixin/src/media.ts b/packages/channels/weixin/src/media.ts new file mode 100644 index 000000000..8cd7fa9eb --- /dev/null +++ b/packages/channels/weixin/src/media.ts @@ -0,0 +1,56 @@ +/** + * CDN download with AES-128-ECB decryption. + * Ported from cc-weixin/plugins/weixin/src/media.ts (download path only). + */ + +import { createDecipheriv } from 'node:crypto'; + +const CDN_BASE_URL = 'https://novac2c.cdn.weixin.qq.com/c2c'; + +function buildCdnDownloadUrl(encryptedQueryParam: string): string { + return `${CDN_BASE_URL}/download?encrypted_query_param=${encodeURIComponent(encryptedQueryParam)}`; +} + +function decryptAesEcb(ciphertext: Buffer, key: Buffer): Buffer { + const decipher = createDecipheriv('aes-128-ecb', key, null); + return Buffer.concat([decipher.update(ciphertext), decipher.final()]); +} + +/** + * Parse aes_key from CDNMedia into a raw 16-byte Buffer. + * Two encodings exist: + * - base64(raw 16 bytes) → images + * - base64(hex string of 16 bytes) → file/voice/video + */ +function parseAesKey(aesKeyBase64: string): Buffer { + const decoded = Buffer.from(aesKeyBase64, 'base64'); + if (decoded.length === 16) { + return decoded; + } + if ( + decoded.length === 32 && + /^[0-9a-fA-F]{32}$/.test(decoded.toString('ascii')) + ) { + return Buffer.from(decoded.toString('ascii'), 'hex'); + } + throw new Error( + `Invalid aes_key: expected 16 raw bytes or 32 hex chars, got ${decoded.length} bytes`, + ); +} + +/** Download encrypted media from CDN and decrypt it. */ +export async function downloadAndDecrypt( + encryptQueryParam: string, + aesKey: string, +): Promise { + const url = buildCdnDownloadUrl(encryptQueryParam); + + const resp = await fetch(url); + if (!resp.ok) { + throw new Error(`CDN download failed: HTTP ${resp.status}`); + } + + const ciphertext = Buffer.from(await resp.arrayBuffer()); + const keyBuf = parseAesKey(aesKey); + return decryptAesEcb(ciphertext, keyBuf); +} diff --git a/packages/channels/weixin/src/monitor.ts b/packages/channels/weixin/src/monitor.ts index e4a61ab71..42ab90b9d 100644 --- a/packages/channels/weixin/src/monitor.ts +++ b/packages/channels/weixin/src/monitor.ts @@ -31,10 +31,17 @@ function saveCursor(cursor: string): void { writeFileSync(cursorPath(), cursor, 'utf-8'); } +export interface ImageCdnRef { + encryptQueryParam: string; + aesKey: string; +} + export interface ParsedMessage { fromUserId: string; messageId: string; text: string; + /** CDN reference for deferred image download. */ + image?: ImageCdnRef; } export type OnMessageCallback = (msg: ParsedMessage) => Promise; @@ -131,22 +138,33 @@ async function processMessage( contextTokens.set(fromUserId, msg.context_token); } - // Extract text content + // Extract text and image CDN reference let textContent = ''; + let image: ImageCdnRef | undefined; + if (msg.item_list) { for (const item of msg.item_list) { if (item.type === MessageItemType.TEXT && item.text_item?.text) { textContent += (textContent ? '\n' : '') + item.text_item.text; + } else if (item.type === MessageItemType.IMAGE && item.image_item) { + const media = item.image_item.media; + if (media?.encrypt_query_param && media.aes_key) { + image = { + encryptQueryParam: media.encrypt_query_param, + aesKey: media.aes_key, + }; + } } - // MVP: skip media items, text only } } - if (!textContent) return; + // Need either text or image to proceed + if (!textContent && !image) return; await onMessage({ fromUserId, messageId: String(msg.message_id || ''), - text: textContent, + text: textContent || '(image)', + image, }); } diff --git a/packages/cli/src/commands/channel/start.ts b/packages/cli/src/commands/channel/start.ts index ef26d19ac..afcd6320a 100644 --- a/packages/cli/src/commands/channel/start.ts +++ b/packages/cli/src/commands/channel/start.ts @@ -104,6 +104,7 @@ export const startCommand: CommandModule = { cwd: (rawConfig['cwd'] as string) || process.cwd(), approvalMode: rawConfig['approvalMode'] as string | undefined, instructions: rawConfig['instructions'] as string | undefined, + model: rawConfig['model'] as string | undefined, groupPolicy: (rawConfig['groupPolicy'] as ChannelConfig['groupPolicy']) || 'disabled', @@ -120,7 +121,11 @@ export const startCommand: CommandModule = { writeStdoutLine(`[Channel] CLI entry: ${cliEntryPath}`); writeStdoutLine(`[Channel] Starting "${name}" (type=${config.type})...`); - const bridge = new AcpBridge({ cliEntryPath, cwd: config.cwd }); + const bridge = new AcpBridge({ + cliEntryPath, + cwd: config.cwd, + model: config.model, + }); await bridge.start(); let channel: TelegramChannel | WeixinChannel; diff --git a/packages/vscode-ide-companion/schemas/settings.schema.json b/packages/vscode-ide-companion/schemas/settings.schema.json index 8e5725ae0..cf04eea4f 100644 --- a/packages/vscode-ide-companion/schemas/settings.schema.json +++ b/packages/vscode-ide-companion/schemas/settings.schema.json @@ -8,6 +8,11 @@ "type": "object", "additionalProperties": true }, + "channels": { + "description": "Configuration for messaging channels.", + "type": "object", + "additionalProperties": true + }, "modelProviders": { "description": "Model providers configuration grouped by authType. Each authType contains an array of model configurations.", "type": "object",