diff --git a/.changeset/use-context-token-budget.md b/.changeset/use-context-token-budget.md new file mode 100644 index 0000000..e980dca --- /dev/null +++ b/.changeset/use-context-token-budget.md @@ -0,0 +1,6 @@ +--- +"@moonshot-ai/agent-core": patch +"@moonshot-ai/kimi-code": patch +--- + +Avoid overly small local completion caps that can truncate reasoning before summaries are produced. diff --git a/packages/agent-core/src/agent/compaction/full.ts b/packages/agent-core/src/agent/compaction/full.ts index b4b6942..bf0f87d 100644 --- a/packages/agent-core/src/agent/compaction/full.ts +++ b/packages/agent-core/src/agent/compaction/full.ts @@ -447,11 +447,6 @@ export class FullCompaction { const delays = retryBackoffDelays(maxAttempts); let retryCount = 0; - // Clamp the completion budget against the compaction input. Compaction - // is triggered when context is already near full, so an unbounded - // default cap is most at risk of either exceeding the model limit or - // returning empty `content` on reasoning models. The cloned provider - // is local to this call and never persisted back to agent state. const completionBudget = resolveCompletionBudget({ reservedContextSize: this.agent.providerManager?.config.loopControl?.reservedContextSize, @@ -460,9 +455,6 @@ export class FullCompaction { provider: this.agent.config.provider, budget: completionBudget, capability: this.agent.config.modelCapabilities, - messages, - systemPrompt: this.agent.config.systemPrompt, - tools: this.agent.tools.loopTools, }); for (let attempt = 1; ; attempt += 1) { diff --git a/packages/agent-core/src/agent/turn/kosong-llm.ts b/packages/agent-core/src/agent/turn/kosong-llm.ts index 3b15d6b..ca522eb 100644 --- a/packages/agent-core/src/agent/turn/kosong-llm.ts +++ b/packages/agent-core/src/agent/turn/kosong-llm.ts @@ -57,7 +57,7 @@ export interface KosongLLMConfig { readonly generate?: GenerateFn | undefined; /** * Completion budget config resolved from agent/provider settings. The - * final cap is computed per request from the current messages and tools. + * final cap is applied to each request. */ readonly completionBudgetConfig?: CompletionBudgetConfig | undefined; } @@ -91,16 +91,10 @@ export class KosongLLM implements LLM { // throwaway shallow clone. `effectiveProvider` is local to this call // and never written back to `this.provider`, so retries (handled at // a higher layer) keep using the same long-lived provider/client. - // The clamp must see every input the provider will serialize on the - // wire — system prompt and tool schemas included — or a near-full - // context can still slip past the limit. const effectiveProvider = applyCompletionBudget({ provider: this.provider, budget: this.completionBudgetConfig, capability: this.capability, - messages: params.messages, - systemPrompt: this.systemPrompt, - tools: params.tools, }); const result = await this.generate( diff --git a/packages/agent-core/src/utils/completion-budget.ts b/packages/agent-core/src/utils/completion-budget.ts index 361d654..5136ec6 100644 --- a/packages/agent-core/src/utils/completion-budget.ts +++ b/packages/agent-core/src/utils/completion-budget.ts @@ -1,15 +1,4 @@ -import type { - ChatProvider, - Message, - ModelCapability, - Tool, -} from '@moonshot-ai/kosong'; - -import { - estimateTokens, - estimateTokensForMessages, - estimateTokensForTools, -} from './tokens'; +import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong'; /** Completion-token budget for the next LLM request. */ export interface CompletionBudgetConfig { @@ -17,12 +6,9 @@ export interface CompletionBudgetConfig { readonly hardCap?: number; /** Conservative cap for providers/models whose context window is unknown. */ readonly fallback?: number; - /** Tokens kept out of the output budget to absorb estimation drift. */ - readonly safetyMargin?: number; } const MIN_FLOOR = 1; -const DEFAULT_SAFETY_MARGIN = 1024; const DEFAULT_UNKNOWN_CONTEXT_FALLBACK = 32000; /** @@ -59,36 +45,20 @@ function parseEnvBudget(raw: string | undefined): EnvBudget { } /** - * Compute the effective `max_completion_tokens` cap. Known-context requests - * use the remaining window unless a hard cap is configured. + * Compute the effective `max_completion_tokens` cap. */ export function computeCompletionBudgetCap(args: { readonly budget: CompletionBudgetConfig; readonly capability: ModelCapability | undefined; - readonly messages: readonly Message[]; - readonly systemPrompt?: string; - readonly tools?: readonly Tool[]; }): number { - const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN; const maxCtx = args.capability?.max_context_tokens ?? 0; - if (maxCtx <= 0) { - return Math.max( - MIN_FLOOR, - args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK, - ); - } - const input = - estimateTokensForMessages([...args.messages]) + - estimateTokens(args.systemPrompt ?? '') + - estimateTokensForTools(args.tools ?? []); - const remaining = maxCtx - input - safetyMargin; - if (remaining <= 0) { - return MIN_FLOOR; - } - if (args.budget.hardCap === undefined) { - return Math.max(MIN_FLOOR, remaining); - } - return Math.max(MIN_FLOOR, Math.min(args.budget.hardCap, remaining)); + // The provider backend computes the safe request-specific value from the + // serialized prompt. Locally using the largest cap avoids cutting off + // thinking before the model produces a summary. + const cap = + args.budget.hardCap ?? + (maxCtx > 0 ? maxCtx : args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK); + return Math.max(MIN_FLOOR, cap); } /** @@ -105,18 +75,12 @@ export function applyCompletionBudget(args: { readonly provider: ChatProvider; readonly budget: CompletionBudgetConfig | undefined; readonly capability: ModelCapability | undefined; - readonly messages: readonly Message[]; - readonly systemPrompt?: string; - readonly tools?: readonly Tool[]; }): ChatProvider { if (args.budget === undefined) return args.provider; if (args.provider.withMaxCompletionTokens === undefined) return args.provider; const cap = computeCompletionBudgetCap({ budget: args.budget, capability: args.capability, - messages: args.messages, - systemPrompt: args.systemPrompt, - tools: args.tools, }); return args.provider.withMaxCompletionTokens(cap); } diff --git a/packages/agent-core/test/agent/compaction.test.ts b/packages/agent-core/test/agent/compaction.test.ts index c0bf654..032c798 100644 --- a/packages/agent-core/test/agent/compaction.test.ts +++ b/packages/agent-core/test/agent/compaction.test.ts @@ -201,6 +201,53 @@ describe('Agent compaction', () => { await ctx.expectResumeMatches(); }); + it('uses the model context window for compaction completion budget', async () => { + const maxContextTokens = 5_000; + let appliedCap: number | undefined; + const generate: GenerateFn = async (provider) => { + const cap = (provider as { readonly modelParameters?: Record }) + .modelParameters?.['max_completion_tokens']; + if (typeof cap !== 'number') throw new Error('Expected max_completion_tokens to be applied'); + appliedCap = cap; + + return { + id: 'mock-compaction-budget', + message: { + role: 'assistant', + content: [{ type: 'text', text: 'Budgeted summary.' }], + toolCalls: [], + }, + usage: { + inputOther: 1, + output: 4, + inputCacheRead: 0, + inputCacheCreation: 0, + }, + finishReason: 'completed', + rawFinishReason: 'stop', + }; + }; + const ctx = testAgent({ compactionStrategy: alwaysCompactOnce, generate }); + ctx.configure({ + provider: CATALOGUED_PROVIDER, + modelCapabilities: { + ...CATALOGUED_MODEL_CAPABILITIES, + max_context_tokens: maxContextTokens, + }, + }); + appendExchange(ctx, 1, 'old user one', 'old assistant one', maxContextTokens - 100); + const compacted = new Promise((resolve) => { + ctx.emitter.once('context.apply_compaction', () => { + resolve(); + }); + }); + + await ctx.rpc.beginCompaction({ instruction: 'Keep the important test facts.' }); + await compacted; + + expect(appliedCap).toBe(maxContextTokens); + }); + it('projects the compacted prefix before sending the summary request', async () => { const ctx = testAgent({ compactionStrategy: alwaysCompactOnce }); ctx.configure({ diff --git a/packages/agent-core/test/agent/kosong-llm.test.ts b/packages/agent-core/test/agent/kosong-llm.test.ts index 4430101..b679a0a 100644 --- a/packages/agent-core/test/agent/kosong-llm.test.ts +++ b/packages/agent-core/test/agent/kosong-llm.test.ts @@ -1,6 +1,7 @@ import { emptyUsage, type ChatProvider, + type ModelCapability, type StreamedMessagePart, type ToolCall, } from '@moonshot-ai/kosong'; @@ -83,6 +84,47 @@ describe('KosongLLM streaming tool-call deltas', () => { }); }); +describe('KosongLLM completion budget', () => { + it('applies the model context window as the completion cap', async () => { + let appliedCap: number | undefined; + let generatedProvider: ChatProvider | undefined; + const providerWithBudget: ChatProvider = { + ...provider, + withMaxCompletionTokens(n: number) { + appliedCap = n; + return { ...this, withMaxCompletionTokens: this.withMaxCompletionTokens }; + }, + }; + const generate: GenerateFn = async (nextProvider) => { + generatedProvider = nextProvider; + return { + id: 'response-1', + message: { role: 'assistant', content: [], toolCalls: [] }, + usage: emptyUsage(), + finishReason: 'completed', + rawFinishReason: 'stop', + }; + }; + const llm = new KosongLLM({ + provider: providerWithBudget, + modelName: 'test-model', + systemPrompt: 'system', + capability: makeCapability(10000), + completionBudgetConfig: { fallback: 32000 }, + generate, + }); + + await llm.chat({ + messages: [], + tools: [], + signal: new AbortController().signal, + }); + + expect(appliedCap).toBe(10000); + expect(generatedProvider).not.toBe(providerWithBudget); + }); +}); + async function collectToolCallDeltas( parts: readonly StreamedMessagePart[], ): Promise { @@ -130,3 +172,14 @@ function stripStreamIndex(toolCall: ToolCall): ToolCall { const { _streamIndex: _, ...rest } = toolCall; return rest; } + +function makeCapability(maxContextTokens: number): ModelCapability { + return { + image_in: false, + video_in: false, + audio_in: false, + thinking: false, + tool_use: true, + max_context_tokens: maxContextTokens, + }; +} diff --git a/packages/agent-core/test/utils/completion-budget.test.ts b/packages/agent-core/test/utils/completion-budget.test.ts index 77767c6..9b75f5f 100644 --- a/packages/agent-core/test/utils/completion-budget.test.ts +++ b/packages/agent-core/test/utils/completion-budget.test.ts @@ -1,9 +1,4 @@ -import type { - ChatProvider, - Message, - ModelCapability, - Tool, -} from '@moonshot-ai/kosong'; +import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong'; import { beforeEach, describe, expect, it, vi } from 'vitest'; import { @@ -12,19 +7,6 @@ import { resolveCompletionBudget, } from '../../src/utils/completion-budget'; -function makeMessages(approxAsciiTokens: number): Message[] { - // estimateTokens treats ASCII as ~4 chars/token. Pad with a single - // string so the message lands near the requested count. - const charCount = approxAsciiTokens * 4; - return [ - { - role: 'user', - content: [{ type: 'text', text: 'a'.repeat(charCount) }], - toolCalls: [], - }, - ]; -} - function makeCapability(maxContextTokens: number): ModelCapability { return { image_in: false, @@ -36,20 +18,11 @@ function makeCapability(maxContextTokens: number): ModelCapability { }; } -function makeTool(name: string, asciiCharsInDescription: number): Tool { - return { - name, - description: 'd'.repeat(asciiCharsInDescription), - parameters: { type: 'object', properties: {} }, - }; -} - describe('computeCompletionBudgetCap', () => { it('uses fallback when context size is unknown and no hard cap is set', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 8192 }, capability: undefined, - messages: makeMessages(100), }); expect(cap).toBe(8192); }); @@ -58,7 +31,6 @@ describe('computeCompletionBudgetCap', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 10, fallback: 8192 }, capability: makeCapability(0), - messages: makeMessages(100), }); expect(cap).toBe(10); }); @@ -68,117 +40,48 @@ describe('computeCompletionBudgetCap', () => { computeCompletionBudgetCap({ budget: { hardCap: 0 }, capability: undefined, - messages: makeMessages(10), }), ).toBe(1); expect( computeCompletionBudgetCap({ budget: { hardCap: -100 }, capability: undefined, - messages: makeMessages(10), }), ).toBe(1); }); - it('uses the remaining context window when no hard cap is set', () => { + it('uses the model context window when no hard cap is set', () => { const maxCtx = 100000; const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(maxCtx), - messages: makeMessages(1000), }); - expect(cap).toBe(maxCtx - 1001 - 1024); + expect(cap).toBe(maxCtx); }); - it('clamps explicit hard cap down to the remaining context window', () => { - // max_context_tokens 10000, input ~ 1000, safetyMargin 1024 → remaining ~ 7976 + it('uses the explicit hard cap when configured', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 32000 }, capability: makeCapability(10000), - messages: makeMessages(1000), }); - expect(cap).toBeLessThanOrEqual(10000 - 1000 - 1024); - expect(cap).toBeGreaterThan(7000); + expect(cap).toBe(32000); }); - it('returns 1 when input already exceeds context minus margin', () => { + it('ignores fallback when the model context window is known', () => { const cap = computeCompletionBudgetCap({ budget: { fallback: 32000 }, capability: makeCapability(10000), - messages: makeMessages(11000), - }); - expect(cap).toBe(1); - }); - - it('never exceeds remaining context, even when remaining is below the historical floor', () => { - // input ~ 8900, safetyMargin 1024 → remaining ~ 75 (positive but below 256). - // The cap MUST stay <= remaining so the request does not overflow. - const maxCtx = 10000; - const cap = computeCompletionBudgetCap({ - budget: { fallback: 32000 }, - capability: makeCapability(maxCtx), - messages: makeMessages(8900), }); - expect(cap).toBeGreaterThanOrEqual(1); - expect(cap).toBeLessThanOrEqual(maxCtx - 8900 - 1024); - }); - - it('respects custom safetyMargin', () => { - const cap = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin: 4096 }, - capability: makeCapability(20000), - messages: makeMessages(1000), - }); - // remaining = 20000 - (1000 + 1 for 'user' role) - 4096 = 14903 - expect(cap).toBe(14903); + expect(cap).toBe(10000); }); it('keeps explicit hard cap when smaller than remaining', () => { const cap = computeCompletionBudgetCap({ budget: { hardCap: 1024 }, capability: makeCapability(100000), - messages: makeMessages(1000), }); expect(cap).toBe(1024); }); - - it('counts the system prompt as input', () => { - const maxCtx = 10000; - const safetyMargin = 1024; - const systemPrompt = 'a'.repeat(2000 * 4); // ~2000 tokens - const cap = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin }, - capability: makeCapability(maxCtx), - messages: makeMessages(1000), - systemPrompt, - }); - // remaining = 10000 - (1001 + 2000) - 1024 = 5975 - expect(cap).toBeLessThanOrEqual(maxCtx - 1001 - 2000 - safetyMargin); - expect(cap).toBeGreaterThan(5500); - }); - - it('counts tool schemas as input', () => { - const maxCtx = 10000; - const safetyMargin = 1024; - const tools: Tool[] = [ - makeTool('tool_a', 4000), // ~1000 tokens of description per tool - makeTool('tool_b', 4000), - ]; - const capWithTools = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin }, - capability: makeCapability(maxCtx), - messages: makeMessages(1000), - tools, - }); - const capWithoutTools = computeCompletionBudgetCap({ - budget: { fallback: 32000, safetyMargin }, - capability: makeCapability(maxCtx), - messages: makeMessages(1000), - }); - expect(capWithTools).toBeLessThan(capWithoutTools); - // Tool descriptions add ~2000 tokens, so cap should drop by roughly that. - expect(capWithoutTools - capWithTools).toBeGreaterThan(1500); - }); }); describe('applyCompletionBudget', () => { @@ -208,7 +111,6 @@ describe('applyCompletionBudget', () => { provider: original, budget: undefined, capability: makeCapability(10000), - messages: makeMessages(100), }); expect(result).toBe(original); expect(withMaxCompletionTokens).not.toHaveBeenCalled(); @@ -222,46 +124,31 @@ describe('applyCompletionBudget', () => { provider: opaque, budget: { hardCap: 8192 }, capability: makeCapability(10000), - messages: makeMessages(100), }); expect(result).toBe(opaque); }); - it('clones the provider with the clamped cap when budget is configured', () => { + it('clones the provider with the model context window when budget is configured', () => { const result = applyCompletionBudget({ provider: original, budget: { fallback: 32000 }, capability: makeCapability(10000), - messages: makeMessages(1000), }); expect(withMaxCompletionTokens).toHaveBeenCalledOnce(); const cap = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - expect(cap).toBeLessThanOrEqual(10000 - 1000 - 1024); - expect(cap).toBeGreaterThan(7000); + expect(cap).toBe(10000); expect(result).not.toBe(original); }); - it('forwards systemPrompt and tools to the cap computation', () => { - const tools: Tool[] = [makeTool('tool_a', 4000)]; - const systemPrompt = 'a'.repeat(4000); // ~1000 tokens - applyCompletionBudget({ - provider: original, - budget: { fallback: 32000 }, - capability: makeCapability(10000), - messages: makeMessages(1000), - systemPrompt, - tools, - }); - const capWithExtras = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - withMaxCompletionTokens.mockClear(); - applyCompletionBudget({ + it('uses the explicit hard cap when configured', () => { + const result = applyCompletionBudget({ provider: original, - budget: { fallback: 32000 }, + budget: { hardCap: 8192 }, capability: makeCapability(10000), - messages: makeMessages(1000), }); - const capBare = withMaxCompletionTokens.mock.calls[0]?.[0] as number; - expect(capWithExtras).toBeLessThan(capBare); + expect(withMaxCompletionTokens).toHaveBeenCalledOnce(); + expect(withMaxCompletionTokens.mock.calls[0]?.[0]).toBe(8192); + expect(result).not.toBe(original); }); });