From 77b64d91d6d0761ecdf2225a612d02b67b1649fd Mon Sep 17 00:00:00 2001 From: qer Date: Tue, 26 May 2026 20:38:53 +0800 Subject: [PATCH 1/4] fix: use context token count for completion budget --- .changeset/use-context-token-budget.md | 6 + .../agent-core/src/agent/compaction/full.ts | 14 +- packages/agent-core/src/agent/turn/index.ts | 1 + .../agent-core/src/agent/turn/kosong-llm.ts | 13 +- .../agent-core/src/utils/completion-budget.ts | 31 +---- .../agent-core/test/agent/kosong-llm.test.ts | 54 ++++++++ .../test/utils/completion-budget.test.ts | 124 +++++------------- 7 files changed, 112 insertions(+), 131 deletions(-) create mode 100644 .changeset/use-context-token-budget.md diff --git a/.changeset/use-context-token-budget.md b/.changeset/use-context-token-budget.md new file mode 100644 index 0000000..620a4ba --- /dev/null +++ b/.changeset/use-context-token-budget.md @@ -0,0 +1,6 @@ +--- +"@moonshot-ai/agent-core": patch +"@moonshot-ai/kimi-code": patch +--- + +Use reported context usage when clamping completion tokens in long conversations. diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts index b4b6942..eaf1c4d 100644 --- a/packages/agent-core/src/agent/compaction/full.ts +++ b/packages/agent-core/src/agent/compaction/full.ts @@ -447,11 +447,11 @@ export class FullCompaction { const delays = retryBackoffDelays(maxAttempts); let retryCount = 0; - // Clamp the completion budget against the compaction input. Compaction - // is triggered when context is already near full, so an unbounded - // default cap is most at risk of either exceeding the model limit or - // returning empty `content` on reasoning models. The cloned provider - // is local to this call and never persisted back to agent state. + // Clamp the completion budget against the context count tracked from + // provider usage. `tokenCountWithPending` keeps real tokens for covered + // history and estimates only messages added since the last usage update. + // The cloned provider is local to this call and never persisted back to + // agent state. const completionBudget = resolveCompletionBudget({ reservedContextSize: this.agent.providerManager?.config.loopControl?.reservedContextSize, @@ -460,9 +460,7 @@ export class FullCompaction { provider: this.agent.config.provider, budget: completionBudget, capability: this.agent.config.modelCapabilities, - messages, - systemPrompt: this.agent.config.systemPrompt, - tools: this.agent.tools.loopTools, + inputTokenCount: this.tokenCountWithPending, }); for (let attempt = 1; ; attempt += 1) { diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index d8756a7..0e180b6 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -381,6 +381,7 @@ export class TurnFlow { capability: this.agent.config.modelCapabilities, generate: this.agent.generate, completionBudgetConfig, + inputTokenCount: () => this.agent.context.tokenCountWithPending, }), buildMessages: () => this.agent.context.messages, dispatchEvent: this.buildDispatchEvent(turnId), diff --git a/packages/agent-core/src/agent/turn/kosong-llm.ts b/packages/agent-core/src/agent/turn/kosong-llm.ts index 3b15d6b..a86cc0c 100644 --- a/packages/agent-core/src/agent/turn/kosong-llm.ts +++ b/packages/agent-core/src/agent/turn/kosong-llm.ts @@ -57,9 +57,11 @@ export interface KosongLLMConfig { readonly generate?: GenerateFn | undefined; /** * Completion budget config resolved from agent/provider settings. The - * final cap is computed per request from the current messages and tools. + * final cap is computed per request from the current input token count. */ readonly completionBudgetConfig?: CompletionBudgetConfig | undefined; + /** Current input token count, including pending context not yet covered by usage. */ + readonly inputTokenCount?: (() => number) | undefined; } export class KosongLLM implements LLM { @@ -70,6 +72,7 @@ export class KosongLLM implements LLM { private readonly provider: ChatProvider; private readonly generate: GenerateFn; private readonly completionBudgetConfig: CompletionBudgetConfig | undefined; + private readonly inputTokenCount: (() => number) | undefined; constructor(config: KosongLLMConfig) { this.provider = config.provider; @@ -78,6 +81,7 @@ export class KosongLLM implements LLM { this.capability = config.capability; this.generate = config.generate ?? kosongGenerate; this.completionBudgetConfig = config.completionBudgetConfig; + this.inputTokenCount = config.inputTokenCount; } async chat(params: LLMChatParams): Promise { @@ -91,16 +95,11 @@ export class KosongLLM implements LLM { // throwaway shallow clone. `effectiveProvider` is local to this call // and never written back to `this.provider`, so retries (handled at // a higher layer) keep using the same long-lived provider/client. - // The clamp must see every input the provider will serialize on the - // wire — system prompt and tool schemas included — or a near-full - // context can still slip past the limit. const effectiveProvider = applyCompletionBudget({ provider: this.provider, budget: this.completionBudgetConfig, capability: this.capability, - messages: params.messages, - systemPrompt: this.systemPrompt, - tools: params.tools, + inputTokenCount: this.inputTokenCount?.() ?? 0, }); const result = await this.generate( diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts index 361d654..9442861 100644 --- a/packages/agent-core/src/utils/completion-budget.ts +++ b/packages/agent-core/src/utils/completion-budget.ts @@ -1,15 +1,4 @@ -import type { - ChatProvider, - Message, - ModelCapability, - Tool, -} from '@moonshot-ai/kosong'; - -import { - estimateTokens, - estimateTokensForMessages, - estimateTokensForTools, -} from './tokens'; +import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong'; /** Completion-token budget for the next LLM request. */ export interface CompletionBudgetConfig { @@ -65,9 +54,7 @@ function parseEnvBudget(raw: string | undefined): EnvBudget { export function computeCompletionBudgetCap(args: { readonly budget: CompletionBudgetConfig; readonly capability: ModelCapability | undefined; - readonly messages: readonly Message[]; - readonly systemPrompt?: string; - readonly tools?: readonly Tool[]; + readonly inputTokenCount: number; }): number { const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN; const maxCtx = args.capability?.max_context_tokens ?? 0; @@ -77,11 +64,7 @@ export function computeCompletionBudgetCap(args: { args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK, ); } - const input = - estimateTokensForMessages([...args.messages]) + - estimateTokens(args.systemPrompt ?? '') + - estimateTokensForTools(args.tools ?? []); - const remaining = maxCtx - input - safetyMargin; + const remaining = maxCtx - args.inputTokenCount - safetyMargin; if (remaining <= 0) { return MIN_FLOOR; } @@ -105,18 +88,14 @@ export function applyCompletionBudget(args: { readonly provider: ChatProvider; readonly budget: CompletionBudgetConfig | undefined; readonly capability: ModelCapability | undefined; - readonly messages: readonly Message[]; - readonly systemPrompt?: string; - readonly tools?: readonly Tool[]; + readonly inputTokenCount: number; }): ChatProvider { if (args.budget === undefined) return args.provider; if (args.provider.withMaxCompletionTokens === undefined) return args.provider; const cap = computeCompletionBudgetCap({ budget: args.budget, capability: args.capability, - messages: args.messages, - systemPrompt: args.systemPrompt, - tools: args.tools, + inputTokenCount: args.inputTokenCount, }); return args.provider.withMaxCompletionTokens(cap); } diff --git a/packages/agent-core/test/agent/kosong-llm.test.ts b/packages/agent-core/test/agent/kosong-llm.test.ts index 4430101..1f70273 100644 --- a/packages/agent-core/test/agent/kosong-llm.test.ts +++ b/packages/agent-core/test/agent/kosong-llm.test.ts @@ -1,6 +1,7 @@ import { emptyUsage, type ChatProvider, + type ModelCapability, type StreamedMessagePart, type ToolCall, } from '@moonshot-ai/kosong'; @@ -83,6 +84,48 @@ describe('KosongLLM streaming tool-call deltas', () => { }); }); +describe('KosongLLM completion budget', () => { + it('uses the supplied input token count when applying the completion cap', async () => { + let appliedCap: number | undefined; + let generatedProvider: ChatProvider | undefined; + const providerWithBudget: ChatProvider = { + ...provider, + withMaxCompletionTokens(n: number) { + appliedCap = n; + return { ...this, withMaxCompletionTokens: this.withMaxCompletionTokens }; + }, + }; + const generate: GenerateFn = async (nextProvider) => { + generatedProvider = nextProvider; + return { + id: 'response-1', + message: { role: 'assistant', content: [], toolCalls: [] }, + usage: emptyUsage(), + finishReason: 'completed', + rawFinishReason: 'stop', + }; + }; + const llm = new KosongLLM({ + provider: providerWithBudget, + modelName: 'test-model', + systemPrompt: 'system', + capability: makeCapability(10000), + completionBudgetConfig: { fallback: 32000 }, + inputTokenCount: () => 3000, + generate, + }); + + await llm.chat({ + messages: [], + tools: [], + signal: new AbortController().signal, + }); + + expect(appliedCap).toBe(5976); + expect(generatedProvider).not.toBe(providerWithBudget); + }); +}); + async function collectToolCallDeltas( parts: readonly StreamedMessagePart[], ): Promise { @@ -130,3 +173,14 @@ function stripStreamIndex(toolCall: ToolCall): ToolCall { const { _streamIndex: _, ...rest } = toolCall; return rest; } + +function makeCapability(maxContextTokens: number): ModelCapability { + return { + image_in: false, + video_in: false, + audio_in: false, + thinking: false, + tool_use: true, + max_context_tokens: maxContextTokens, + }; +} diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts index 77767c6..64f7a8d 100644 --- a/packages/agent-core/test/utils/completion-budget.test.ts +++ b/packages/agent-core/test/utils/completion-budget.test.ts @@ -1,9 +1,4 @@ -import type { - ChatProvider, - Message, - ModelCapability, - Tool, -} from '@moonshot-ai/kosong'; +import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong'; import { beforeEach, describe, expect, it, vi } from 'vitest'; import { @@ -12,19 +7,6 @@ import { resolveCompletionBudget, } from '../../src/utils/completion-budget'; -function makeMessages(approxAsciiTokens: number): Message[] { - // estimateTokens treats ASCII as ~4 chars/token. Pad with a single - // string so the message lands near the requested count. - const charCount = approxAsciiTokens * 4; - return [ - { - role: 'user', - content: [{ type: 'text', text: 'a'.repeat(charCount) }], - toolCalls: [], - }, - ]; -} - function makeCapability(maxContextTokens: number): ModelCapability { return { image_in: false, @@ -36,20 +18,12 @@ function makeCapability(maxContextTokens: number): ModelCapability { }; } -function makeTool(name: string, asciiCharsInDescription: number): Tool { - return { - name, - description: 'd'.repeat(asciiCharsInDescription), - parameters: { type: 'object', properties: {} }, - }; -} - describe('computeCompletionBudgetCap', () => { it('uses fallback when context size is unknown and no hard cap is set', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 8192 }, capability: undefined, - messages: makeMessages(100), + inputTokenCount: 100, }); expect(cap).toBe(8192); }); @@ -58,7 +32,7 @@ describe('computeCompletionBudgetCap', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 10, fallback: 8192 }, capability: makeCapability(0), - messages: makeMessages(100), + inputTokenCount: 100, }); expect(cap).toBe(10); }); @@ -68,36 +42,37 @@ describe('computeCompletionBudgetCap', () => { computeCompletionBudgetCap({ budget: { hardCap: 0 }, capability: undefined, - messages: makeMessages(10), + inputTokenCount: 10, }), ).toBe(1); expect( computeCompletionBudgetCap({ budget: { hardCap: -100 }, capability: undefined, - messages: makeMessages(10), + inputTokenCount: 10, }), ).toBe(1); }); it('uses the remaining context window when no hard cap is set', () => { const maxCtx = 100000; + const inputTokenCount = 1000; const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(maxCtx), - messages: makeMessages(1000), + inputTokenCount, }); - expect(cap).toBe(maxCtx - 1001 - 1024); + expect(cap).toBe(maxCtx - inputTokenCount - 1024); }); it('clamps explicit hard cap down to the remaining context window', () => { - // max_context_tokens 10000, input ~ 1000, safetyMargin 1024 → remaining ~ 7976 + const inputTokenCount = 1000; const cap = computeCompletionBudgetCap({ budget: { hardCap: 32000 }, capability: makeCapability(10000), - messages: makeMessages(1000), + inputTokenCount, }); - expect(cap).toBeLessThanOrEqual(10000 - 1000 - 1024); + expect(cap).toBeLessThanOrEqual(10000 - inputTokenCount - 1024); expect(cap).toBeGreaterThan(7000); }); @@ -105,79 +80,51 @@ describe('computeCompletionBudgetCap', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(10000), - messages: makeMessages(11000), + inputTokenCount: 11000, }); expect(cap).toBe(1); }); it('never exceeds remaining context, even when remaining is below the historical floor', () => { - // input ~ 8900, safetyMargin 1024 → remaining ~ 75 (positive but below 256). - // The cap MUST stay <= remaining so the request does not overflow. const maxCtx = 10000; + const inputTokenCount = 8900; const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(maxCtx), - messages: makeMessages(8900), + inputTokenCount, }); - expect(cap).toBeGreaterThanOrEqual(1); - expect(cap).toBeLessThanOrEqual(maxCtx - 8900 - 1024); + expect(cap).toBe(76); }); it('respects custom safetyMargin', () => { + const inputTokenCount = 1000; const cap = computeCompletionBudgetCap({ budget: { fallback: 32000, safetyMargin: 4096 }, capability: makeCapability(20000), - messages: makeMessages(1000), + inputTokenCount, }); - // remaining = 20000 - (1000 + 1 for 'user' role) - 4096 = 14903 - expect(cap).toBe(14903); + expect(cap).toBe(20000 - inputTokenCount - 4096); }); it('keeps explicit hard cap when smaller than remaining', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 1024 }, capability: makeCapability(100000), - messages: makeMessages(1000), + inputTokenCount: 1000, }); expect(cap).toBe(1024); }); - it('counts the system prompt as input', () => { + it('uses the caller-provided real plus pending input token count', () => { const maxCtx = 10000; const safetyMargin = 1024; - const systemPrompt = 'a'.repeat(2000 * 4); // ~2000 tokens + const inputTokenCount = 3001; const cap = computeCompletionBudgetCap({ budget: { fallback: 32000, safetyMargin }, capability: makeCapability(maxCtx), - messages: makeMessages(1000), - systemPrompt, - }); - // remaining = 10000 - (1001 + 2000) - 1024 = 5975 - expect(cap).toBeLessThanOrEqual(maxCtx - 1001 - 2000 - safetyMargin); - expect(cap).toBeGreaterThan(5500); - }); - - it('counts tool schemas as input', () => { - const maxCtx = 10000; - const safetyMargin = 1024; - const tools: Tool[] = [ - makeTool('tool_a', 4000), // ~1000 tokens of description per tool - makeTool('tool_b', 4000), - ]; - const capWithTools = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin }, - capability: makeCapability(maxCtx), - messages: makeMessages(1000), - tools, - }); - const capWithoutTools = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin }, - capability: makeCapability(maxCtx), - messages: makeMessages(1000), + inputTokenCount, }); - expect(capWithTools).toBeLessThan(capWithoutTools); - // Tool descriptions add ~2000 tokens, so cap should drop by roughly that. - expect(capWithoutTools - capWithTools).toBeGreaterThan(1500); + expect(cap).toBe(5975); }); }); @@ -208,7 +155,7 @@ describe('applyCompletionBudget', () => { provider: original, budget: undefined, capability: makeCapability(10000), - messages: makeMessages(100), + inputTokenCount: 100, }); expect(result).toBe(original); expect(withMaxCompletionTokens).not.toHaveBeenCalled(); @@ -222,46 +169,43 @@ describe('applyCompletionBudget', () => { provider: opaque, budget: { hardCap: 8192 }, capability: makeCapability(10000), - messages: makeMessages(100), + inputTokenCount: 100, }); expect(result).toBe(opaque); }); it('clones the provider with the clamped cap when budget is configured', () => { + const inputTokenCount = 1000; const result = applyCompletionBudget({ provider: original, budget: { fallback: 32000 }, capability: makeCapability(10000), - messages: makeMessages(1000), + inputTokenCount, }); expect(withMaxCompletionTokens).toHaveBeenCalledOnce(); const cap = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - expect(cap).toBeLessThanOrEqual(10000 - 1000 - 1024); + expect(cap).toBeLessThanOrEqual(10000 - inputTokenCount - 1024); expect(cap).toBeGreaterThan(7000); expect(result).not.toBe(original); }); - it('forwards systemPrompt and tools to the cap computation', () => { - const tools: Tool[] = [makeTool('tool_a', 4000)]; - const systemPrompt = 'a'.repeat(4000); // ~1000 tokens + it('uses the provided input token count for the cap computation', () => { applyCompletionBudget({ provider: original, budget: { fallback: 32000 }, capability: makeCapability(10000), - messages: makeMessages(1000), - systemPrompt, - tools, + inputTokenCount: 3000, }); - const capWithExtras = withMaxCompletionTokens.mock.calls[0]?.[0] as number; + const capWithMoreInput = withMaxCompletionTokens.mock.calls[0]?.[0] as number; withMaxCompletionTokens.mockClear(); applyCompletionBudget({ provider: original, budget: { fallback: 32000 }, capability: makeCapability(10000), - messages: makeMessages(1000), + inputTokenCount: 1000, }); - const capBare = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - expect(capWithExtras).toBeLessThan(capBare); + const capWithLessInput = withMaxCompletionTokens.mock.calls[0]?.[0] as number; + expect(capWithMoreInput).toBeLessThan(capWithLessInput); }); }); From 2c6a196ed6a2ec3df8f83ed2d785b390c58882f9 Mon Sep 17 00:00:00 2001 From: qer Date: Tue, 26 May 2026 21:10:03 +0800 Subject: [PATCH 2/4] fix: include request overhead before usage --- .../agent-core/src/agent/context/index.ts | 8 +++ packages/agent-core/src/agent/turn/index.ts | 8 ++- packages/agent-core/test/agent/turn.test.ts | 58 +++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/packages/agent-core/src/agent/context/index.ts b/packages/agent-core/src/agent/context/index.ts index a9d65cb..b8d1963 100644 --- a/packages/agent-core/src/agent/context/index.ts +++ b/packages/agent-core/src/agent/context/index.ts @@ -23,6 +23,7 @@ const TOOL_OUTPUT_EMPTY_TEXT = 'Tool output is empty.'; export class ContextMemory { private _history: ContextMessage[] = []; private _tokenCount = 0; + private _hasProviderTokenUsage = false; private tokenCountCoveredMessageCount = 0; private openSteps: Map = new Map(); private pendingToolResultIds = new Set(); @@ -72,6 +73,7 @@ export class ContextMemory { this.agent.records.logRecord({ type: 'context.clear' }); this._history = []; this._tokenCount = 0; + this._hasProviderTokenUsage = false; this.tokenCountCoveredMessageCount = 0; this.openSteps.clear(); this.pendingToolResultIds.clear(); @@ -97,6 +99,7 @@ export class ContextMemory { this.openSteps.clear(); this.flushDeferredMessagesIfToolExchangeClosed(); this._tokenCount = summary.tokensAfter; + this._hasProviderTokenUsage = false; this.tokenCountCoveredMessageCount = this._history.length; this.agent.injection.onContextCompacted(summary.compactedCount); this.agent.emitStatusUpdated(); @@ -118,6 +121,10 @@ export class ContextMemory { return this._tokenCount + estimateTokensForMessages(project(pendingMessages)); } + get hasProviderTokenUsage(): boolean { + return this._hasProviderTokenUsage; + } + get history(): readonly ContextMessage[] { return this._history; } @@ -152,6 +159,7 @@ export class ContextMemory { event.usage.inputCacheCreation + event.usage.inputOther + event.usage.output; + this._hasProviderTokenUsage = true; this.tokenCountCoveredMessageCount = openStepIndex === -1 ? this._history.length : openStepIndex + 1; } diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index 0e180b6..32aaf98 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -33,6 +33,7 @@ import type { AgentEvent, TurnEndedEvent } from '../../rpc'; import type { TelemetryPropertyValue } from '../../telemetry'; import { abortable } from '../../utils/abort'; import { resolveCompletionBudget } from '../../utils/completion-budget'; +import { estimateTokens, estimateTokensForTools } from '../../utils/tokens'; import { USER_PROMPT_ORIGIN, type PromptOrigin } from '../context'; import { renderUserPromptHookBlockResult, renderUserPromptHookResult } from '../hooks'; import { canonicalTelemetryArgs, isPlainRecord } from './canonical-args'; @@ -366,6 +367,9 @@ export class TurnFlow { const model = this.agent.config.model; const provider = this.agent.config.provider.withThinking(this.agent.config.thinkingLevel); const loopControl = this.agent.providerManager?.config.loopControl; + const requestOverheadTokenCount = + estimateTokens(this.agent.config.systemPrompt) + + estimateTokensForTools(this.agent.tools.loopTools); const completionBudgetConfig = resolveCompletionBudget({ reservedContextSize: loopControl?.reservedContextSize, }); @@ -381,7 +385,9 @@ export class TurnFlow { capability: this.agent.config.modelCapabilities, generate: this.agent.generate, completionBudgetConfig, - inputTokenCount: () => this.agent.context.tokenCountWithPending, + inputTokenCount: () => + this.agent.context.tokenCountWithPending + + (this.agent.context.hasProviderTokenUsage ? 0 : requestOverheadTokenCount), }), buildMessages: () => this.agent.context.messages, dispatchEvent: this.buildDispatchEvent(turnId), diff --git a/packages/agent-core/test/agent/turn.test.ts b/packages/agent-core/test/agent/turn.test.ts index 67c84be..db92448 100644 --- a/packages/agent-core/test/agent/turn.test.ts +++ b/packages/agent-core/test/agent/turn.test.ts @@ -10,6 +10,7 @@ import { APIStatusError, APITimeoutError, type ChatProvider, + type Message, type ModelCapability, type ToolCall, } from '@moonshot-ai/kosong'; @@ -55,6 +56,52 @@ function captureLogs(): { logger: Logger; entries: CapturedLogEntry[] } { } describe('Agent turn flow', () => { + it('adds system prompt and tool schemas to the first completion budget only until usage is available', async () => { + const maxContextTokens = 50_000; + const appliedCaps: number[] = []; + const expectedCaps: number[] = []; + const generate: GenerateFn = async (provider, systemPrompt, tools, history) => { + const cap = (provider as { readonly modelParameters?: Record }) + .modelParameters?.['max_completion_tokens']; + if (typeof cap !== 'number') throw new Error('Expected max_completion_tokens to be applied'); + appliedCaps.push(cap); + + const estimatedInput = + estimateTokens(systemPrompt) + + estimateTokensForMessages(history) + + estimateTokensForTools(tools); + expectedCaps.push(maxContextTokens - estimatedInput - 1024); + + const message: Message = { + role: 'assistant', + content: [{ type: 'text', text: `answer ${String(appliedCaps.length)}` }], + toolCalls: [], + }; + return { + id: `mock-${String(appliedCaps.length)}`, + message, + usage: { + inputOther: estimatedInput, + output: estimateTokensForMessages([message]), + inputCacheRead: 0, + inputCacheCreation: 0, + }, + finishReason: 'completed', + rawFinishReason: 'stop', + }; + }; + const ctx = testAgent({ generate }); + ctx.configure({ modelCapabilities: makeCapability(maxContextTokens) }); + await ctx.rpc.setActiveTools({ names: ['Bash'] }); + + await ctx.rpc.prompt({ input: [{ type: 'text', text: 'first prompt' }] }); + await ctx.untilTurnEnd(); + await ctx.rpc.prompt({ input: [{ type: 'text', text: 'second prompt' }] }); + await ctx.untilTurnEnd(); + + expect(appliedCaps).toEqual(expectedCaps); + }); + it('tracks turn_started and turn_interrupted telemetry', async () => { const records: TelemetryRecord[] = []; const ctx = testAgent({ telemetry: recordingTelemetry(records) }); @@ -1441,6 +1488,17 @@ async function waitForFile(path: string): Promise { throw new Error(`Timed out waiting for ${path}`); } +function makeCapability(maxContextTokens: number): ModelCapability { + return { + image_in: false, + video_in: false, + audio_in: false, + thinking: false, + tool_use: true, + max_context_tokens: maxContextTokens, + }; +} + function mediaCapabilities(): ModelCapability { return { image_in: true, From 72e1548410601558246b078b479e116588917f42 Mon Sep 17 00:00:00 2001 From: qer Date: Tue, 26 May 2026 21:23:34 +0800 Subject: [PATCH 3/4] fix: avoid starving compaction budget --- .../agent-core/src/agent/compaction/full.ts | 10 ++-- .../agent-core/src/agent/context/index.ts | 8 --- packages/agent-core/src/agent/turn/index.ts | 8 +-- .../agent-core/test/agent/compaction.test.ts | 48 +++++++++++++++ packages/agent-core/test/agent/turn.test.ts | 58 ------------------- 5 files changed, 53 insertions(+), 79 deletions(-) diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts index eaf1c4d..036645e 100644 --- a/packages/agent-core/src/agent/compaction/full.ts +++ b/packages/agent-core/src/agent/compaction/full.ts @@ -447,11 +447,9 @@ export class FullCompaction { const delays = retryBackoffDelays(maxAttempts); let retryCount = 0; - // Clamp the completion budget against the context count tracked from - // provider usage. `tokenCountWithPending` keeps real tokens for covered - // history and estimates only messages added since the last usage update. - // The cloned provider is local to this call and never persisted back to - // agent state. + // Use an optimistic cap for compaction. Local tool/schema estimates can + // overcount and starve summary generation; the provider can clamp against + // the actual serialized request if the cap is too large. const completionBudget = resolveCompletionBudget({ reservedContextSize: this.agent.providerManager?.config.loopControl?.reservedContextSize, @@ -460,7 +458,7 @@ export class FullCompaction { provider: this.agent.config.provider, budget: completionBudget, capability: this.agent.config.modelCapabilities, - inputTokenCount: this.tokenCountWithPending, + inputTokenCount: 0, }); for (let attempt = 1; ; attempt += 1) { diff --git a/packages/agent-core/src/agent/context/index.ts b/packages/agent-core/src/agent/context/index.ts index b8d1963..a9d65cb 100644 --- a/packages/agent-core/src/agent/context/index.ts +++ b/packages/agent-core/src/agent/context/index.ts @@ -23,7 +23,6 @@ const TOOL_OUTPUT_EMPTY_TEXT = 'Tool output is empty.'; export class ContextMemory { private _history: ContextMessage[] = []; private _tokenCount = 0; - private _hasProviderTokenUsage = false; private tokenCountCoveredMessageCount = 0; private openSteps: Map = new Map(); private pendingToolResultIds = new Set(); @@ -73,7 +72,6 @@ export class ContextMemory { this.agent.records.logRecord({ type: 'context.clear' }); this._history = []; this._tokenCount = 0; - this._hasProviderTokenUsage = false; this.tokenCountCoveredMessageCount = 0; this.openSteps.clear(); this.pendingToolResultIds.clear(); @@ -99,7 +97,6 @@ export class ContextMemory { this.openSteps.clear(); this.flushDeferredMessagesIfToolExchangeClosed(); this._tokenCount = summary.tokensAfter; - this._hasProviderTokenUsage = false; this.tokenCountCoveredMessageCount = this._history.length; this.agent.injection.onContextCompacted(summary.compactedCount); this.agent.emitStatusUpdated(); @@ -121,10 +118,6 @@ export class ContextMemory { return this._tokenCount + estimateTokensForMessages(project(pendingMessages)); } - get hasProviderTokenUsage(): boolean { - return this._hasProviderTokenUsage; - } - get history(): readonly ContextMessage[] { return this._history; } @@ -159,7 +152,6 @@ export class ContextMemory { event.usage.inputCacheCreation + event.usage.inputOther + event.usage.output; - this._hasProviderTokenUsage = true; this.tokenCountCoveredMessageCount = openStepIndex === -1 ? this._history.length : openStepIndex + 1; } diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index 32aaf98..0e180b6 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -33,7 +33,6 @@ import type { AgentEvent, TurnEndedEvent } from '../../rpc'; import type { TelemetryPropertyValue } from '../../telemetry'; import { abortable } from '../../utils/abort'; import { resolveCompletionBudget } from '../../utils/completion-budget'; -import { estimateTokens, estimateTokensForTools } from '../../utils/tokens'; import { USER_PROMPT_ORIGIN, type PromptOrigin } from '../context'; import { renderUserPromptHookBlockResult, renderUserPromptHookResult } from '../hooks'; import { canonicalTelemetryArgs, isPlainRecord } from './canonical-args'; @@ -367,9 +366,6 @@ export class TurnFlow { const model = this.agent.config.model; const provider = this.agent.config.provider.withThinking(this.agent.config.thinkingLevel); const loopControl = this.agent.providerManager?.config.loopControl; - const requestOverheadTokenCount = - estimateTokens(this.agent.config.systemPrompt) + - estimateTokensForTools(this.agent.tools.loopTools); const completionBudgetConfig = resolveCompletionBudget({ reservedContextSize: loopControl?.reservedContextSize, }); @@ -385,9 +381,7 @@ export class TurnFlow { capability: this.agent.config.modelCapabilities, generate: this.agent.generate, completionBudgetConfig, - inputTokenCount: () => - this.agent.context.tokenCountWithPending + - (this.agent.context.hasProviderTokenUsage ? 0 : requestOverheadTokenCount), + inputTokenCount: () => this.agent.context.tokenCountWithPending, }), buildMessages: () => this.agent.context.messages, dispatchEvent: this.buildDispatchEvent(turnId), diff --git a/packages/agent-core/test/agent/compaction.test.ts b/packages/agent-core/test/agent/compaction.test.ts index c0bf654..3fdf6f9 100644 --- a/packages/agent-core/test/agent/compaction.test.ts +++ b/packages/agent-core/test/agent/compaction.test.ts @@ -201,6 +201,54 @@ describe('Agent compaction', () => { await ctx.expectResumeMatches(); }); + it('uses the model context window for compaction completion budget', async () => { + const maxContextTokens = 5_000; + let appliedCap: number | undefined; + const generate: GenerateFn = async (provider) => { + const cap = (provider as { readonly modelParameters?: Record }) + .modelParameters?.['max_completion_tokens']; + if (typeof cap !== 'number') throw new Error('Expected max_completion_tokens to be applied'); + appliedCap = cap; + + return { + id: 'mock-compaction-budget', + message: { + role: 'assistant', + content: [{ type: 'text', text: 'Budgeted summary.' }], + toolCalls: [], + }, + usage: { + inputOther: 1, + output: 4, + inputCacheRead: 0, + inputCacheCreation: 0, + }, + finishReason: 'completed', + rawFinishReason: 'stop', + }; + }; + const ctx = testAgent({ compactionStrategy: alwaysCompactOnce, generate }); + ctx.configure({ + provider: CATALOGUED_PROVIDER, + modelCapabilities: { + ...CATALOGUED_MODEL_CAPABILITIES, + max_context_tokens: maxContextTokens, + }, + }); + appendExchange(ctx, 1, 'old user one', 'old assistant one', maxContextTokens - 100); + const compacted = new Promise((resolve) => { + ctx.emitter.once('context.apply_compaction', () => { + resolve(); + }); + }); + + await ctx.rpc.beginCompaction({ instruction: 'Keep the important test facts.' }); + await compacted; + + expect(appliedCap).toBe(maxContextTokens - 1024); + expect(appliedCap).toBeGreaterThan(1); + }); + it('projects the compacted prefix before sending the summary request', async () => { const ctx = testAgent({ compactionStrategy: alwaysCompactOnce }); ctx.configure({ diff --git a/packages/agent-core/test/agent/turn.test.ts b/packages/agent-core/test/agent/turn.test.ts index db92448..67c84be 100644 --- a/packages/agent-core/test/agent/turn.test.ts +++ b/packages/agent-core/test/agent/turn.test.ts @@ -10,7 +10,6 @@ import { APIStatusError, APITimeoutError, type ChatProvider, - type Message, type ModelCapability, type ToolCall, } from '@moonshot-ai/kosong'; @@ -56,52 +55,6 @@ function captureLogs(): { logger: Logger; entries: CapturedLogEntry[] } { } describe('Agent turn flow', () => { - it('adds system prompt and tool schemas to the first completion budget only until usage is available', async () => { - const maxContextTokens = 50_000; - const appliedCaps: number[] = []; - const expectedCaps: number[] = []; - const generate: GenerateFn = async (provider, systemPrompt, tools, history) => { - const cap = (provider as { readonly modelParameters?: Record }) - .modelParameters?.['max_completion_tokens']; - if (typeof cap !== 'number') throw new Error('Expected max_completion_tokens to be applied'); - appliedCaps.push(cap); - - const estimatedInput = - estimateTokens(systemPrompt) + - estimateTokensForMessages(history) + - estimateTokensForTools(tools); - expectedCaps.push(maxContextTokens - estimatedInput - 1024); - - const message: Message = { - role: 'assistant', - content: [{ type: 'text', text: `answer ${String(appliedCaps.length)}` }], - toolCalls: [], - }; - return { - id: `mock-${String(appliedCaps.length)}`, - message, - usage: { - inputOther: estimatedInput, - output: estimateTokensForMessages([message]), - inputCacheRead: 0, - inputCacheCreation: 0, - }, - finishReason: 'completed', - rawFinishReason: 'stop', - }; - }; - const ctx = testAgent({ generate }); - ctx.configure({ modelCapabilities: makeCapability(maxContextTokens) }); - await ctx.rpc.setActiveTools({ names: ['Bash'] }); - - await ctx.rpc.prompt({ input: [{ type: 'text', text: 'first prompt' }] }); - await ctx.untilTurnEnd(); - await ctx.rpc.prompt({ input: [{ type: 'text', text: 'second prompt' }] }); - await ctx.untilTurnEnd(); - - expect(appliedCaps).toEqual(expectedCaps); - }); - it('tracks turn_started and turn_interrupted telemetry', async () => { const records: TelemetryRecord[] = []; const ctx = testAgent({ telemetry: recordingTelemetry(records) }); @@ -1488,17 +1441,6 @@ async function waitForFile(path: string): Promise { throw new Error(`Timed out waiting for ${path}`); } -function makeCapability(maxContextTokens: number): ModelCapability { - return { - image_in: false, - video_in: false, - audio_in: false, - thinking: false, - tool_use: true, - max_context_tokens: maxContextTokens, - }; -} - function mediaCapabilities(): ModelCapability { return { image_in: true, From c93400d490000dd989f3661a4a41b38f9f3cc3c1 Mon Sep 17 00:00:00 2001 From: qer Date: Tue, 26 May 2026 22:08:50 +0800 Subject: [PATCH 4/4] fix: avoid local completion budget truncation --- .changeset/use-context-token-budget.md | 2 +- .../agent-core/src/agent/compaction/full.ts | 4 - packages/agent-core/src/agent/turn/index.ts | 1 - .../agent-core/src/agent/turn/kosong-llm.ts | 7 +- .../agent-core/src/utils/completion-budget.ts | 31 ++----- .../agent-core/test/agent/compaction.test.ts | 3 +- .../agent-core/test/agent/kosong-llm.test.ts | 5 +- .../test/utils/completion-budget.test.ts | 85 +++---------------- 8 files changed, 27 insertions(+), 111 deletions(-) diff --git a/.changeset/use-context-token-budget.md b/.changeset/use-context-token-budget.md index 620a4ba..e980dca 100644 --- a/.changeset/use-context-token-budget.md +++ b/.changeset/use-context-token-budget.md @@ -3,4 +3,4 @@ "@moonshot-ai/kimi-code": patch --- -Use reported context usage when clamping completion tokens in long conversations. +Avoid overly small local completion caps that can truncate reasoning before summaries are produced. diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts index 036645e..bf0f87d 100644 --- a/packages/agent-core/src/agent/compaction/full.ts +++ b/packages/agent-core/src/agent/compaction/full.ts @@ -447,9 +447,6 @@ export class FullCompaction { const delays = retryBackoffDelays(maxAttempts); let retryCount = 0; - // Use an optimistic cap for compaction. Local tool/schema estimates can - // overcount and starve summary generation; the provider can clamp against - // the actual serialized request if the cap is too large. const completionBudget = resolveCompletionBudget({ reservedContextSize: this.agent.providerManager?.config.loopControl?.reservedContextSize, @@ -458,7 +455,6 @@ export class FullCompaction { provider: this.agent.config.provider, budget: completionBudget, capability: this.agent.config.modelCapabilities, - inputTokenCount: 0, }); for (let attempt = 1; ; attempt += 1) { diff --git a/packages/agent-core/src/agent/turn/index.ts b/packages/agent-core/src/agent/turn/index.ts index 0e180b6..d8756a7 100644 --- a/packages/agent-core/src/agent/turn/index.ts +++ b/packages/agent-core/src/agent/turn/index.ts @@ -381,7 +381,6 @@ export class TurnFlow { capability: this.agent.config.modelCapabilities, generate: this.agent.generate, completionBudgetConfig, - inputTokenCount: () => this.agent.context.tokenCountWithPending, }), buildMessages: () => this.agent.context.messages, dispatchEvent: this.buildDispatchEvent(turnId), diff --git a/packages/agent-core/src/agent/turn/kosong-llm.ts b/packages/agent-core/src/agent/turn/kosong-llm.ts index a86cc0c..ca522eb 100644 --- a/packages/agent-core/src/agent/turn/kosong-llm.ts +++ b/packages/agent-core/src/agent/turn/kosong-llm.ts @@ -57,11 +57,9 @@ export interface KosongLLMConfig { readonly generate?: GenerateFn | undefined; /** * Completion budget config resolved from agent/provider settings. The - * final cap is computed per request from the current input token count. + * final cap is applied to each request. */ readonly completionBudgetConfig?: CompletionBudgetConfig | undefined; - /** Current input token count, including pending context not yet covered by usage. */ - readonly inputTokenCount?: (() => number) | undefined; } export class KosongLLM implements LLM { @@ -72,7 +70,6 @@ export class KosongLLM implements LLM { private readonly provider: ChatProvider; private readonly generate: GenerateFn; private readonly completionBudgetConfig: CompletionBudgetConfig | undefined; - private readonly inputTokenCount: (() => number) | undefined; constructor(config: KosongLLMConfig) { this.provider = config.provider; @@ -81,7 +78,6 @@ export class KosongLLM implements LLM { this.capability = config.capability; this.generate = config.generate ?? kosongGenerate; this.completionBudgetConfig = config.completionBudgetConfig; - this.inputTokenCount = config.inputTokenCount; } async chat(params: LLMChatParams): Promise { @@ -99,7 +95,6 @@ export class KosongLLM implements LLM { provider: this.provider, budget: this.completionBudgetConfig, capability: this.capability, - inputTokenCount: this.inputTokenCount?.() ?? 0, }); const result = await this.generate( diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts index 9442861..5136ec6 100644 --- a/packages/agent-core/src/utils/completion-budget.ts +++ b/packages/agent-core/src/utils/completion-budget.ts @@ -6,12 +6,9 @@ export interface CompletionBudgetConfig { readonly hardCap?: number; /** Conservative cap for providers/models whose context window is unknown. */ readonly fallback?: number; - /** Tokens kept out of the output budget to absorb estimation drift. */ - readonly safetyMargin?: number; } const MIN_FLOOR = 1; -const DEFAULT_SAFETY_MARGIN = 1024; const DEFAULT_UNKNOWN_CONTEXT_FALLBACK = 32000; /** @@ -48,30 +45,20 @@ function parseEnvBudget(raw: string | undefined): EnvBudget { } /** - * Compute the effective `max_completion_tokens` cap. Known-context requests - * use the remaining window unless a hard cap is configured. + * Compute the effective `max_completion_tokens` cap. */ export function computeCompletionBudgetCap(args: { readonly budget: CompletionBudgetConfig; readonly capability: ModelCapability | undefined; - readonly inputTokenCount: number; }): number { - const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN; const maxCtx = args.capability?.max_context_tokens ?? 0; - if (maxCtx <= 0) { - return Math.max( - MIN_FLOOR, - args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK, - ); - } - const remaining = maxCtx - args.inputTokenCount - safetyMargin; - if (remaining <= 0) { - return MIN_FLOOR; - } - if (args.budget.hardCap === undefined) { - return Math.max(MIN_FLOOR, remaining); - } - return Math.max(MIN_FLOOR, Math.min(args.budget.hardCap, remaining)); + // The provider backend computes the safe request-specific value from the + // serialized prompt. Locally using the largest cap avoids cutting off + // thinking before the model produces a summary. + const cap = + args.budget.hardCap ?? + (maxCtx > 0 ? maxCtx : args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK); + return Math.max(MIN_FLOOR, cap); } /** @@ -88,14 +75,12 @@ export function applyCompletionBudget(args: { readonly provider: ChatProvider; readonly budget: CompletionBudgetConfig | undefined; readonly capability: ModelCapability | undefined; - readonly inputTokenCount: number; }): ChatProvider { if (args.budget === undefined) return args.provider; if (args.provider.withMaxCompletionTokens === undefined) return args.provider; const cap = computeCompletionBudgetCap({ budget: args.budget, capability: args.capability, - inputTokenCount: args.inputTokenCount, }); return args.provider.withMaxCompletionTokens(cap); } diff --git a/packages/agent-core/test/agent/compaction.test.ts b/packages/agent-core/test/agent/compaction.test.ts index 3fdf6f9..032c798 100644 --- a/packages/agent-core/test/agent/compaction.test.ts +++ b/packages/agent-core/test/agent/compaction.test.ts @@ -245,8 +245,7 @@ describe('Agent compaction', () => { await ctx.rpc.beginCompaction({ instruction: 'Keep the important test facts.' }); await compacted; - expect(appliedCap).toBe(maxContextTokens - 1024); - expect(appliedCap).toBeGreaterThan(1); + expect(appliedCap).toBe(maxContextTokens); }); it('projects the compacted prefix before sending the summary request', async () => { diff --git a/packages/agent-core/test/agent/kosong-llm.test.ts b/packages/agent-core/test/agent/kosong-llm.test.ts index 1f70273..b679a0a 100644 --- a/packages/agent-core/test/agent/kosong-llm.test.ts +++ b/packages/agent-core/test/agent/kosong-llm.test.ts @@ -85,7 +85,7 @@ describe('KosongLLM streaming tool-call deltas', () => { }); describe('KosongLLM completion budget', () => { - it('uses the supplied input token count when applying the completion cap', async () => { + it('applies the model context window as the completion cap', async () => { let appliedCap: number | undefined; let generatedProvider: ChatProvider | undefined; const providerWithBudget: ChatProvider = { @@ -111,7 +111,6 @@ describe('KosongLLM completion budget', () => { systemPrompt: 'system', capability: makeCapability(10000), completionBudgetConfig: { fallback: 32000 }, - inputTokenCount: () => 3000, generate, }); @@ -121,7 +120,7 @@ describe('KosongLLM completion budget', () => { signal: new AbortController().signal, }); - expect(appliedCap).toBe(5976); + expect(appliedCap).toBe(10000); expect(generatedProvider).not.toBe(providerWithBudget); }); }); diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts index 64f7a8d..9b75f5f 100644 --- a/packages/agent-core/test/utils/completion-budget.test.ts +++ b/packages/agent-core/test/utils/completion-budget.test.ts @@ -23,7 +23,6 @@ describe('computeCompletionBudgetCap', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 8192 }, capability: undefined, - inputTokenCount: 100, }); expect(cap).toBe(8192); }); @@ -32,7 +31,6 @@ describe('computeCompletionBudgetCap', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 10, fallback: 8192 }, capability: makeCapability(0), - inputTokenCount: 100, }); expect(cap).toBe(10); }); @@ -42,90 +40,48 @@ describe('computeCompletionBudgetCap', () => { computeCompletionBudgetCap({ budget: { hardCap: 0 }, capability: undefined, - inputTokenCount: 10, }), ).toBe(1); expect( computeCompletionBudgetCap({ budget: { hardCap: -100 }, capability: undefined, - inputTokenCount: 10, }), ).toBe(1); }); - it('uses the remaining context window when no hard cap is set', () => { + it('uses the model context window when no hard cap is set', () => { const maxCtx = 100000; - const inputTokenCount = 1000; const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(maxCtx), - inputTokenCount, }); - expect(cap).toBe(maxCtx - inputTokenCount - 1024); + expect(cap).toBe(maxCtx); }); - it('clamps explicit hard cap down to the remaining context window', () => { - const inputTokenCount = 1000; + it('uses the explicit hard cap when configured', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 32000 }, capability: makeCapability(10000), - inputTokenCount, }); - expect(cap).toBeLessThanOrEqual(10000 - inputTokenCount - 1024); - expect(cap).toBeGreaterThan(7000); + expect(cap).toBe(32000); }); - it('returns 1 when input already exceeds context minus margin', () => { + it('ignores fallback when the model context window is known', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(10000), - inputTokenCount: 11000, }); - expect(cap).toBe(1); - }); - - it('never exceeds remaining context, even when remaining is below the historical floor', () => { - const maxCtx = 10000; - const inputTokenCount = 8900; - const cap = computeCompletionBudgetCap({ - budget: { fallback: 32000 }, - capability: makeCapability(maxCtx), - inputTokenCount, - }); - expect(cap).toBe(76); - }); - - it('respects custom safetyMargin', () => { - const inputTokenCount = 1000; - const cap = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin: 4096 }, - capability: makeCapability(20000), - inputTokenCount, - }); - expect(cap).toBe(20000 - inputTokenCount - 4096); + expect(cap).toBe(10000); }); it('keeps explicit hard cap when smaller than remaining', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 1024 }, capability: makeCapability(100000), - inputTokenCount: 1000, }); expect(cap).toBe(1024); }); - - it('uses the caller-provided real plus pending input token count', () => { - const maxCtx = 10000; - const safetyMargin = 1024; - const inputTokenCount = 3001; - const cap = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin }, - capability: makeCapability(maxCtx), - inputTokenCount, - }); - expect(cap).toBe(5975); - }); }); describe('applyCompletionBudget', () => { @@ -155,7 +111,6 @@ describe('applyCompletionBudget', () => { provider: original, budget: undefined, capability: makeCapability(10000), - inputTokenCount: 100, }); expect(result).toBe(original); expect(withMaxCompletionTokens).not.toHaveBeenCalled(); @@ -169,43 +124,31 @@ describe('applyCompletionBudget', () => { provider: opaque, budget: { hardCap: 8192 }, capability: makeCapability(10000), - inputTokenCount: 100, }); expect(result).toBe(opaque); }); - it('clones the provider with the clamped cap when budget is configured', () => { - const inputTokenCount = 1000; + it('clones the provider with the model context window when budget is configured', () => { const result = applyCompletionBudget({ provider: original, budget: { fallback: 32000 }, capability: makeCapability(10000), - inputTokenCount, }); expect(withMaxCompletionTokens).toHaveBeenCalledOnce(); const cap = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - expect(cap).toBeLessThanOrEqual(10000 - inputTokenCount - 1024); - expect(cap).toBeGreaterThan(7000); + expect(cap).toBe(10000); expect(result).not.toBe(original); }); - it('uses the provided input token count for the cap computation', () => { - applyCompletionBudget({ - provider: original, - budget: { fallback: 32000 }, - capability: makeCapability(10000), - inputTokenCount: 3000, - }); - const capWithMoreInput = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - withMaxCompletionTokens.mockClear(); - applyCompletionBudget({ + it('uses the explicit hard cap when configured', () => { + const result = applyCompletionBudget({ provider: original, - budget: { fallback: 32000 }, + budget: { hardCap: 8192 }, capability: makeCapability(10000), - inputTokenCount: 1000, }); - const capWithLessInput = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - expect(capWithMoreInput).toBeLessThan(capWithLessInput); + expect(withMaxCompletionTokens).toHaveBeenCalledOnce(); + expect(withMaxCompletionTokens.mock.calls[0]?.[0]).toBe(8192); + expect(result).not.toBe(original); }); });