diff --git a/docs/users/configuration/settings.md b/docs/users/configuration/settings.md
index 21e468c44..a6b2c87b2 100644
--- a/docs/users/configuration/settings.md
+++ b/docs/users/configuration/settings.md
@@ -96,18 +96,18 @@ Settings are organized into categories. All settings should be placed within the
 
 #### model
 
-| Setting | Type | Description | Default |
-| ---------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
-| `model.name` | string | The Qwen model to use for conversations. | `undefined` |
-| `model.maxSessionTurns` | number | Maximum number of user/model/tool turns to keep in a session. -1 means unlimited. | `-1` |
-| `model.summarizeToolOutput` | object | Enables or disables the summarization of tool output. You can specify the token budget for the summarization using the `tokenBudget` setting. Note: Currently only the `run_shell_command` tool is supported. For example `{"run_shell_command": {"tokenBudget": 2000}}` | `undefined` |
-| `model.generationConfig` | object | Advanced overrides passed to the underlying content generator. Supports request controls such as `timeout`, `maxRetries`, `disableCacheControl`, `contextWindowSize` (override model's context window size), `maxOutputTokens` (override model's maximum output tokens), and `customHeaders` (custom HTTP headers for API requests), along with fine-tuning knobs under `samplingParams` (for example `temperature`, `top_p`, `max_tokens`). Leave unset to rely on provider defaults. | `undefined` |
-| `model.chatCompression.contextPercentageThreshold` | number | Sets the threshold for chat history compression as a percentage of the model's total token limit. This is a value between 0 and 1 that applies to both automatic compression and the manual `/compress` command. For example, a value of `0.6` will trigger compression when the chat history exceeds 60% of the token limit. Use `0` to disable compression entirely. | `0.7` |
-| `model.skipNextSpeakerCheck` | boolean | Skip the next speaker check. | `false` |
-| `model.skipLoopDetection` | boolean | Disables loop detection checks. Loop detection prevents infinite loops in AI responses but can generate false positives that interrupt legitimate workflows. Enable this option if you experience frequent false positive loop detection interruptions. | `false` |
-| `model.skipStartupContext` | boolean | Skips sending the startup workspace context (environment summary and acknowledgement) at the beginning of each session. Enable this if you prefer to provide context manually or want to save tokens on startup. | `false` |
-| `model.enableOpenAILogging` | boolean | Enables logging of OpenAI API calls for debugging and analysis. When enabled, API requests and responses are logged to JSON files. | `false` |
-| `model.openAILoggingDir` | string | Custom directory path for OpenAI API logs. If not specified, defaults to `logs/openai` in the current working directory. Supports absolute paths, relative paths (resolved from current working directory), and `~` expansion (home directory). | `undefined` |
+| Setting | Type | Description | Default |
+| ---------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
+| `model.name` | string | The Qwen model to use for conversations. | `undefined` |
+| `model.maxSessionTurns` | number | Maximum number of user/model/tool turns to keep in a session. -1 means unlimited. | `-1` |
+| `model.summarizeToolOutput` | object | Enables or disables the summarization of tool output. You can specify the token budget for the summarization using the `tokenBudget` setting. Note: Currently only the `run_shell_command` tool is supported. For example `{"run_shell_command": {"tokenBudget": 2000}}` | `undefined` |
+| `model.generationConfig` | object | Advanced overrides passed to the underlying content generator. Supports request controls such as `timeout`, `maxRetries`, `disableCacheControl`, `contextWindowSize` (override model's context window size), and `customHeaders` (custom HTTP headers for API requests), along with fine-tuning knobs under `samplingParams` (for example `temperature`, `top_p`, `max_tokens`). Leave unset to rely on provider defaults. | `undefined` |
+| `model.chatCompression.contextPercentageThreshold` | number | Sets the threshold for chat history compression as a percentage of the model's total token limit. This is a value between 0 and 1 that applies to both automatic compression and the manual `/compress` command. For example, a value of `0.6` will trigger compression when the chat history exceeds 60% of the token limit. Use `0` to disable compression entirely. | `0.7` |
+| `model.skipNextSpeakerCheck` | boolean | Skip the next speaker check. | `false` |
+| `model.skipLoopDetection` | boolean | Disables loop detection checks. Loop detection prevents infinite loops in AI responses but can generate false positives that interrupt legitimate workflows. Enable this option if you experience frequent false positive loop detection interruptions. | `false` |
+| `model.skipStartupContext` | boolean | Skips sending the startup workspace context (environment summary and acknowledgement) at the beginning of each session. Enable this if you prefer to provide context manually or want to save tokens on startup. | `false` |
+| `model.enableOpenAILogging` | boolean | Enables logging of OpenAI API calls for debugging and analysis. When enabled, API requests and responses are logged to JSON files. | `false` |
+| `model.openAILoggingDir` | string | Custom directory path for OpenAI API logs. If not specified, defaults to `logs/openai` in the current working directory. Supports absolute paths, relative paths (resolved from current working directory), and `~` expansion (home directory). | `undefined` |
 
 **Example model.generationConfig:**
 
@@ -118,7 +118,6 @@ Settings are organized into categories. All settings should be placed within the
       "timeout": 60000,
       "disableCacheControl": false,
       "contextWindowSize": 128000,
-      "maxOutputTokens": 8192,
       "customHeaders": {
         "X-Request-ID": "req-123",
         "X-User-ID": "user-456"
@@ -137,10 +136,6 @@ Settings are organized into categories. All settings should be placed within the
 
 Overrides the default context window size for the selected model. Qwen Code determines the context window using built-in defaults based on model name matching, with a constant fallback value. Use this setting when a provider's effective context limit differs from Qwen Code's default. This value defines the model's assumed maximum context capacity, not a per-request token limit.
 
-**maxOutputTokens:**
-
-Overrides the default maximum output tokens for the selected model. Qwen Code determines the maximum output tokens using built-in defaults based on model name matching, with a constant fallback value of 8,192 tokens. Use this setting when a provider's effective output limit differs from Qwen Code's default. This value defines the maximum number of tokens the model can generate in a single response.
-
 **customHeaders:**
 
 Allows you to add custom HTTP headers to all API requests. This is useful for request tracing, monitoring, API gateway routing, or when different models require different headers. If `customHeaders` is defined in `modelProviders[].generationConfig.customHeaders`, it will be used directly; otherwise, headers from `model.generationConfig.customHeaders` will be used. No merging occurs between the two levels.
diff --git a/packages/core/src/config/config.test.ts b/packages/core/src/config/config.test.ts
index a2e5094bb..475e19524 100644
--- a/packages/core/src/config/config.test.ts
+++ b/packages/core/src/config/config.test.ts
@@ -1356,7 +1356,7 @@ describe('Model Switching and Config Updates', () => {
     vi.clearAllMocks();
   });
 
-  it('should update contextWindowSize and maxOutputTokens when switching models with hot-update', async () => {
+  it('should update contextWindowSize when switching models with hot-update', async () => {
     const config = new Config(baseParams);
 
     // Initialize with first model
@@ -1365,7 +1365,6 @@ describe('Model Switching and Config Updates', () => {
       ['authType']: AuthType.QWEN_OAUTH,
       ['apiKey']: 'test-key',
       ['contextWindowSize']: 1_000_000,
-      ['maxOutputTokens']: 8_192,
       ['samplingParams']: { temperature: 0.7 },
       ['disableCacheControl']: false,
     };
@@ -1375,7 +1374,6 @@ describe('Model Switching and Config Updates', () => {
       sources: {
         model: { kind: 'settings' },
         contextWindowSize: { kind: 'computed', detail: 'auto' },
-        maxOutputTokens: { kind: 'computed', detail: 'auto' },
       },
     });
 
@@ -1385,7 +1383,6 @@ describe('Model Switching and Config Updates', () => {
     const contentGenConfig = config.getContentGeneratorConfig();
     expect(contentGenConfig['model']).toBe('qwen3-coder-plus');
     expect(contentGenConfig['contextWindowSize']).toBe(1_000_000);
-    expect(contentGenConfig['maxOutputTokens']).toBe(8_192);
 
     // Switch to a different model with different token limits
    const newConfig: ContentGeneratorConfig = {
@@ -1393,7 +1390,6 @@ describe('Model Switching and Config Updates', () => {
      ['authType']: AuthType.QWEN_OAUTH,
      ['apiKey']: 'test-key',
      ['contextWindowSize']: 128_000,
-      ['maxOutputTokens']: 4_096,
      ['samplingParams']: { temperature: 0.8 },
      ['disableCacheControl']: true,
    };
@@ -1403,7 +1399,6 @@ describe('Model Switching and Config Updates', () => {
      sources: {
        model: { kind: 'programmatic', detail: 'user' },
        contextWindowSize: { kind: 'computed', detail: 'auto' },
-        maxOutputTokens: { kind: 'computed', detail: 'auto' },
        samplingParams: { kind: 'settings' },
        disableCacheControl: { kind: 'settings' },
      },
@@ -1423,7 +1418,6 @@ describe('Model Switching and Config Updates', () => {
    const updatedConfig = config.getContentGeneratorConfig();
    expect(updatedConfig['model']).toBe('qwen-max');
    expect(updatedConfig['contextWindowSize']).toBe(128_000);
-    expect(updatedConfig['maxOutputTokens']).toBe(4_096);
    expect(updatedConfig['samplingParams']?.temperature).toBe(0.8);
    expect(updatedConfig['disableCacheControl']).toBe(true);
 
@@ -1433,8 +1427,6 @@ describe('Model Switching and Config Updates', () => {
    expect(sources['model']?.detail).toBe('user');
    expect(sources['contextWindowSize']?.kind).toBe('computed');
    expect(sources['contextWindowSize']?.detail).toBe('auto');
-    expect(sources['maxOutputTokens']?.kind).toBe('computed');
-    expect(sources['maxOutputTokens']?.detail).toBe('auto');
    expect(sources['samplingParams']?.kind).toBe('settings');
    expect(sources['disableCacheControl']?.kind).toBe('settings');
  });
@@ -1448,7 +1440,6 @@ describe('Model Switching and Config Updates', () => {
      ['authType']: AuthType.QWEN_OAUTH,
      ['apiKey']: 'test-key',
      ['contextWindowSize']: 1_000_000,
-      ['maxOutputTokens']: 8_192,
    };
 
    vi.mocked(resolveContentGeneratorConfigWithSources).mockReturnValue({
@@ -1464,7 +1455,6 @@ describe('Model Switching and Config Updates', () => {
      ['authType']: AuthType.USE_GEMINI,
      ['apiKey']: 'gemini-key',
      ['contextWindowSize']: 32_000,
-      ['maxOutputTokens']: 2_048,
    };
 
    vi.mocked(resolveContentGeneratorConfigWithSources).mockReturnValue({
@@ -1493,7 +1483,7 @@ describe('Model Switching and Config Updates', () => {
    expect(refreshAuthSpy).toHaveBeenCalledWith(AuthType.USE_GEMINI);
  });
 
-  it('should handle model switch when contextWindowSize and maxOutputTokens are undefined', async () => {
+  it('should handle model switch when contextWindowSize is undefined', async () => {
    const config = new Config(baseParams);
 
    // Initialize with config that has undefined token limits
@@ -1502,7 +1492,6 @@ describe('Model Switching and Config Updates', () => {
      ['authType']: AuthType.QWEN_OAUTH,
      ['apiKey']: 'test-key',
      ['contextWindowSize']: undefined,
-      ['maxOutputTokens']: undefined,
    };
 
    vi.mocked(resolveContentGeneratorConfigWithSources).mockReturnValue({
@@ -1518,7 +1507,6 @@ describe('Model Switching and Config Updates', () => {
      ['authType']: AuthType.QWEN_OAUTH,
      ['apiKey']: 'test-key',
      ['contextWindowSize']: 128_000,
-      ['maxOutputTokens']: 4_096,
    };
 
    vi.mocked(resolveContentGeneratorConfigWithSources).mockReturnValue({
@@ -1538,6 +1526,5 @@ describe('Model Switching and Config Updates', () => {
    // Verify limits are now defined
    const updatedConfig = config.getContentGeneratorConfig();
    expect(updatedConfig['contextWindowSize']).toBe(128_000);
-    expect(updatedConfig['maxOutputTokens']).toBe(4_096);
  });
 });
diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts
index 0083733ab..ab60d5ac1 100644
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -919,7 +919,6 @@ export class Config {
      this.contentGeneratorConfig.disableCacheControl =
        config.disableCacheControl;
      this.contentGeneratorConfig.contextWindowSize = config.contextWindowSize;
-      this.contentGeneratorConfig.maxOutputTokens = config.maxOutputTokens;
 
      if ('model' in sources) {
        this.contentGeneratorConfigSources['model'] = sources['model'];
@@ -936,10 +935,6 @@ export class Config {
        this.contentGeneratorConfigSources['contextWindowSize'] =
          sources['contextWindowSize'];
      }
-      if ('maxOutputTokens' in sources) {
-        this.contentGeneratorConfigSources['maxOutputTokens'] =
-          sources['maxOutputTokens'];
-      }
 
      return;
    }
diff --git a/packages/core/src/core/contentGenerator.ts b/packages/core/src/core/contentGenerator.ts
index c18823804..93ca5795a 100644
--- a/packages/core/src/core/contentGenerator.ts
+++ b/packages/core/src/core/contentGenerator.ts
@@ -30,11 +30,7 @@ import {
   StrictMissingModelIdError,
 } from '../models/modelConfigErrors.js';
 import { PROVIDER_SOURCED_FIELDS } from '../models/modelsConfig.js';
-import {
-  tokenLimit,
-  DEFAULT_TOKEN_LIMIT,
-  DEFAULT_OUTPUT_TOKEN_LIMIT,
-} from './tokenLimits.js';
+import { tokenLimit, DEFAULT_TOKEN_LIMIT } from './tokenLimits.js';
 
 /**
  * Interface abstracting the core functionalities for generating content and counting tokens.
@@ -99,9 +95,6 @@ export type ContentGeneratorConfig = {
   // Context window size override. If set to a positive number, it will override
   // the automatic detection. Leave undefined to use automatic detection.
   contextWindowSize?: number;
-  // Maximum output tokens override. If set to a positive number, it will override
-  // the automatic detection. Leave undefined to use automatic detection.
-  maxOutputTokens?: number;
   // Custom HTTP headers to be sent with requests
   customHeaders?: Record<string, string>;
 };
@@ -206,31 +199,6 @@ export function resolveContentGeneratorConfigWithSources(
    setSource(sources, 'contextWindowSize', seedOrUnknown('contextWindowSize'));
  }
 
-  // Initialize maxOutputTokens if not set by user
-  // This ensures maxOutputTokens is always available as a model-bound property
-  if (newContentGeneratorConfig.maxOutputTokens === undefined) {
-    if (newContentGeneratorConfig.model) {
-      newContentGeneratorConfig.maxOutputTokens = tokenLimit(
-        newContentGeneratorConfig.model,
-        'output',
-      );
-      setSource(sources, 'maxOutputTokens', {
-        kind: 'computed',
-        detail: 'auto-detected from model',
-      });
-    } else {
-      // Fallback to default when model is not available
-      newContentGeneratorConfig.maxOutputTokens = DEFAULT_OUTPUT_TOKEN_LIMIT;
-      setSource(sources, 'maxOutputTokens', {
-        kind: 'computed',
-        detail: 'default fallback',
-      });
-    }
-  } else {
-    // User explicitly set maxOutputTokens
-    setSource(sources, 'maxOutputTokens', seedOrUnknown('maxOutputTokens'));
-  }
-
  // Validate required fields based on authType. This does not perform any
  // fallback resolution (resolution is handled by ModelConfigResolver).
  const validation = validateModelConfig(
diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
index e07a02a20..b98034aa8 100644
--- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
+++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts
@@ -59,7 +59,6 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      maxRetries: 2,
      model: 'qwen-max',
      authType: AuthType.QWEN_OAUTH,
-      maxOutputTokens: 8192, // Default output token limit for testing
    } as ContentGeneratorConfig;
 
    // Mock Config
@@ -736,16 +735,16 @@ describe('DashScopeOpenAICompatibleProvider', () => {
  });
 
  describe('output token limits', () => {
-    it('should limit max_tokens when it exceeds model output limit', () => {
+    it('should limit max_tokens when it exceeds model limit', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
-        max_tokens: 100000, // Exceeds the configured limit
+        max_tokens: 100000, // Exceeds the model's output limit
      };
 
      const result = provider.buildRequest(request, 'test-prompt-id');
 
-      expect(result.max_tokens).toBe(8192); // Should be limited to configured maxOutputTokens
+      expect(result.max_tokens).toBe(65536); // Should be limited to model's output limit (64K)
    });
 
    it('should limit max_tokens when it exceeds model limit for qwen-vl-max-latest', () => {
@@ -764,7 +763,7 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
-        max_tokens: 1000, // Within the 8192 limit
+        max_tokens: 1000, // Within the model's output limit
      };
 
      const result = provider.buildRequest(request, 'test-prompt-id');
@@ -796,16 +795,16 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      expect(result.max_tokens).toBeNull(); // Should remain null
    });
 
-    it('should use configured output limit for unknown models', () => {
+    it('should use default output limit for unknown models', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'unknown-model',
        messages: [{ role: 'user', content: 'Hello' }],
-        max_tokens: 10000, // Exceeds the configured 8192 limit
+        max_tokens: 10000, // Exceeds the default limit
      };
 
      const result = provider.buildRequest(request, 'test-prompt-id');
 
-      expect(result.max_tokens).toBe(8192); // Should be limited to configured maxOutputTokens
+      expect(result.max_tokens).toBe(4096); // Should be limited to default output limit (4K)
    });
 
    it('should preserve other request parameters when limiting max_tokens', () => {
@@ -824,7 +823,7 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      const result = provider.buildRequest(request, 'test-prompt-id');
 
      // max_tokens should be limited
-      expect(result.max_tokens).toBe(8192);
+      expect(result.max_tokens).toBe(65536); // Limited to model's output limit (64K)
 
      // Other parameters should be preserved
      expect(result.temperature).toBe(0.8);
@@ -850,12 +849,12 @@ describe('DashScopeOpenAICompatibleProvider', () => {
            ],
          },
        ],
-        max_tokens: 20000, // Exceeds the configured limit
+        max_tokens: 20000, // Exceeds the model's output limit
      };
 
      const result = provider.buildRequest(request, 'test-prompt-id');
 
-      expect(result.max_tokens).toBe(8192); // Should be limited to configured maxOutputTokens
+      expect(result.max_tokens).toBe(8192); // Should be limited to model's output limit (8K)
      expect(
        (result as { vl_high_resolution_images?: boolean })
          .vl_high_resolution_images,
@@ -882,7 +881,7 @@ describe('DashScopeOpenAICompatibleProvider', () => {
 
      const result = provider.buildRequest(request, 'test-prompt-id');
 
-      expect(result.max_tokens).toBe(8192); // Limited to configured maxOutputTokens
+      expect(result.max_tokens).toBe(32768); // Limited to model's output limit (32K)
      expect(
        (result as { vl_high_resolution_images?: boolean })
          .vl_high_resolution_images,
@@ -909,7 +908,7 @@ describe('DashScopeOpenAICompatibleProvider', () => {
 
      const result = provider.buildRequest(request, 'test-prompt-id');
 
-      expect(result.max_tokens).toBe(8192);
+      expect(result.max_tokens).toBe(8192); // Limited to model's output limit (8K)
      expect(
        (result as { vl_high_resolution_images?: boolean })
          .vl_high_resolution_images,
@@ -920,13 +919,13 @@ describe('DashScopeOpenAICompatibleProvider', () => {
      const request: OpenAI.Chat.ChatCompletionCreateParams = {
        model: 'qwen3-coder-plus',
        messages: [{ role: 'user', content: 'Hello' }],
-        max_tokens: 100000, // Exceeds the configured limit
+        max_tokens: 100000, // Exceeds the model's output limit
        stream: true,
      };
 
      const result = provider.buildRequest(request, 'test-prompt-id');
 
-      expect(result.max_tokens).toBe(8192); // Should be limited to configured maxOutputTokens
+      expect(result.max_tokens).toBe(65536); // Should be limited to model's output limit (64K)
      expect(result.stream).toBe(true); // Streaming should be preserved
    });
  });
diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
index acfe2abf6..dce7654f5 100644
--- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
+++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.ts
@@ -16,6 +16,7 @@ import type {
   ChatCompletionToolWithCache,
 } from './types.js';
 import { buildRuntimeFetchOptions } from '../../../utils/runtimeFetchOptions.js';
+import { tokenLimit } from '../../tokenLimits.js';
 
 export class DashScopeOpenAICompatibleProvider
   implements OpenAICompatibleProvider
@@ -318,9 +319,9 @@ export class DashScopeOpenAICompatibleProvider
   * @param request - The chat completion request parameters
   * @returns The request with max_tokens adjusted to respect the model's limits (if present)
   */
-  private applyOutputTokenLimit<T extends { max_tokens?: number | null }>(
-    request: T,
-  ): T {
+  private applyOutputTokenLimit<
+    T extends { max_tokens?: number | null; model: string },
+  >(request: T): T {
    const currentMaxTokens = request.max_tokens;
 
    // Only process if max_tokens is already present in the request
@@ -328,14 +329,9 @@ export class DashScopeOpenAICompatibleProvider
      return request; // No max_tokens parameter, return unchanged
    }
 
-    // Get output token limit from config
-    // This value is either user-configured or auto-detected during config initialization
-    const modelLimit = this.contentGeneratorConfig?.maxOutputTokens;
-    if (!modelLimit) {
-      // No limit configured or config not initialized yet
-      // In this case, we don't modify max_tokens and let the API handle it
-      return request;
-    }
+    // Dynamically calculate output token limit using tokenLimit function
+    // This ensures we always use the latest model-specific limits without relying on user configuration
+    const modelLimit = tokenLimit(request.model, 'output');
 
    // If max_tokens exceeds the model limit, cap it to the model's limit
    if (currentMaxTokens > modelLimit) {
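
For reference, a minimal sketch (not part of the patch) of the capping behavior `applyOutputTokenLimit` now follows: `max_tokens` is clamped against a per-model output limit looked up at request time rather than against the removed `maxOutputTokens` config field. The limit table and default below are illustrative placeholders mirroring the updated test expectations; the real lookup is `tokenLimit(model, 'output')` in `tokenLimits.ts`.

```ts
// Illustrative stand-in for tokenLimit(model, 'output'); real values live in tokenLimits.ts.
const OUTPUT_TOKEN_LIMITS: Record<string, number> = {
  'qwen3-coder-plus': 65_536, // 64K, per the updated test expectations
};
const DEFAULT_OUTPUT_TOKEN_LIMIT = 4_096; // fallback for unknown models, per the updated test

function applyOutputTokenLimitSketch<
  T extends { max_tokens?: number | null; model: string },
>(request: T): T {
  const currentMaxTokens = request.max_tokens;
  if (currentMaxTokens === undefined || currentMaxTokens === null) {
    return request; // no max_tokens in the request; leave it to the API
  }
  const modelLimit =
    OUTPUT_TOKEN_LIMITS[request.model] ?? DEFAULT_OUTPUT_TOKEN_LIMIT;
  // Cap max_tokens at the model's output limit
  return currentMaxTokens > modelLimit
    ? { ...request, max_tokens: modelLimit }
    : request;
}

// Example: a 100000-token request against qwen3-coder-plus is capped to 65536.
console.log(
  applyOutputTokenLimitSketch({ model: 'qwen3-coder-plus', max_tokens: 100_000 }),
);
```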