Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/use-context-token-budget.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@moonshot-ai/agent-core": patch
"@moonshot-ai/kimi-code": patch
---

Avoid overly small local completion caps that can truncate reasoning before summaries are produced.
8 changes: 0 additions & 8 deletions packages/agent-core/src/agent/compaction/full.ts
Original file line number Diff line number Diff line change
Expand Up @@ -447,11 +447,6 @@ export class FullCompaction {
const delays = retryBackoffDelays(maxAttempts);
let retryCount = 0;

// Clamp the completion budget against the compaction input. Compaction
// is triggered when context is already near full, so an unbounded
// default cap is most at risk of either exceeding the model limit or
// returning empty `content` on reasoning models. The cloned provider
// is local to this call and never persisted back to agent state.
const completionBudget = resolveCompletionBudget({
reservedContextSize:
this.agent.providerManager?.config.loopControl?.reservedContextSize,
Expand All @@ -460,9 +455,6 @@ export class FullCompaction {
provider: this.agent.config.provider,
budget: completionBudget,
capability: this.agent.config.modelCapabilities,
messages,
systemPrompt: this.agent.config.systemPrompt,
tools: this.agent.tools.loopTools,
});

for (let attempt = 1; ; attempt += 1) {
Expand Down
8 changes: 1 addition & 7 deletions packages/agent-core/src/agent/turn/kosong-llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export interface KosongLLMConfig {
readonly generate?: GenerateFn | undefined;
/**
* Completion budget config resolved from agent/provider settings. The
* final cap is computed per request from the current messages and tools.
* final cap is applied to each request.
*/
readonly completionBudgetConfig?: CompletionBudgetConfig | undefined;
}
Expand Down Expand Up @@ -91,16 +91,10 @@ export class KosongLLM implements LLM {
// throwaway shallow clone. `effectiveProvider` is local to this call
// and never written back to `this.provider`, so retries (handled at
// a higher layer) keep using the same long-lived provider/client.
// The clamp must see every input the provider will serialize on the
// wire — system prompt and tool schemas included — or a near-full
// context can still slip past the limit.
const effectiveProvider = applyCompletionBudget({
provider: this.provider,
budget: this.completionBudgetConfig,
capability: this.capability,
messages: params.messages,
systemPrompt: this.systemPrompt,
tools: params.tools,
});

const result = await this.generate(
Expand Down
54 changes: 9 additions & 45 deletions packages/agent-core/src/utils/completion-budget.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,14 @@
import type {
ChatProvider,
Message,
ModelCapability,
Tool,
} from '@moonshot-ai/kosong';

import {
estimateTokens,
estimateTokensForMessages,
estimateTokensForTools,
} from './tokens';
import type { ChatProvider, ModelCapability } from '@moonshot-ai/kosong';

/** Completion-token budget for the next LLM request. */
export interface CompletionBudgetConfig {
/** Explicit user-configured maximum. */
readonly hardCap?: number;
/** Conservative cap for providers/models whose context window is unknown. */
readonly fallback?: number;
/** Tokens kept out of the output budget to absorb estimation drift. */
readonly safetyMargin?: number;
}

const MIN_FLOOR = 1;
const DEFAULT_SAFETY_MARGIN = 1024;
const DEFAULT_UNKNOWN_CONTEXT_FALLBACK = 32000;

/**
Expand Down Expand Up @@ -59,36 +45,20 @@ function parseEnvBudget(raw: string | undefined): EnvBudget {
}

/**
* Compute the effective `max_completion_tokens` cap. Known-context requests
* use the remaining window unless a hard cap is configured.
* Compute the effective `max_completion_tokens` cap.
*/
export function computeCompletionBudgetCap(args: {
readonly budget: CompletionBudgetConfig;
readonly capability: ModelCapability | undefined;
readonly messages: readonly Message[];
readonly systemPrompt?: string;
readonly tools?: readonly Tool[];
}): number {
const safetyMargin = args.budget.safetyMargin ?? DEFAULT_SAFETY_MARGIN;
const maxCtx = args.capability?.max_context_tokens ?? 0;
if (maxCtx <= 0) {
return Math.max(
MIN_FLOOR,
args.budget.hardCap ?? args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK,
);
}
const input =
estimateTokensForMessages([...args.messages]) +
estimateTokens(args.systemPrompt ?? '') +
estimateTokensForTools(args.tools ?? []);
const remaining = maxCtx - input - safetyMargin;
if (remaining <= 0) {
return MIN_FLOOR;
}
if (args.budget.hardCap === undefined) {
return Math.max(MIN_FLOOR, remaining);
}
return Math.max(MIN_FLOOR, Math.min(args.budget.hardCap, remaining));
// The provider backend computes the safe request-specific value from the
// serialized prompt. Locally using the largest cap avoids cutting off
// thinking before the model produces a summary.
const cap =
args.budget.hardCap ??
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Bound configured hard cap by model context limit

computeCompletionBudgetCap now prefers budget.hardCap verbatim, so if KIMI_MODEL_MAX_COMPLETION_TOKENS/KIMI_MODEL_MAX_TOKENS is set above the model’s context window, every request can carry an impossible completion limit and fail with provider-side invalid-request/context-limit errors. Before this commit, explicit caps were still clamped by available context, so this change introduces hard failures for oversized but previously tolerable operator configuration.

Useful? React with 👍 / 👎.

(maxCtx > 0 ? maxCtx : args.budget.fallback ?? DEFAULT_UNKNOWN_CONTEXT_FALLBACK);
Comment on lines +58 to +60
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Reintroduce remaining-context clamp for completion budget

computeCompletionBudgetCap now returns hardCap (or full max_context_tokens) without subtracting current input tokens, so near-full conversations can send max_completion_tokens values where prompt_tokens + max_completion_tokens exceeds the model window. In that case the provider can reject the request (e.g., invalid request/context overflow) before any summary is produced, which is a regression from the previous behavior that clamped to remaining context and is especially risky in compaction flows that run when context is already close to full.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid maxing completion cap and triggering TPM throttling

Setting the cap to full max_context_tokens for known models inflates max_completion_tokens on every request, even when the prompt is short. Kimi’s gateway rate-limit accounting uses prompt_tokens + max_completion_tokens (not actual generated tokens), so this change can sharply increase measured TPM/TPD and cause avoidable 429/quota errors in normal turns. The previous input-aware clamp kept this parameter closer to feasible remaining budget and avoided this artificial rate-limit pressure.

Useful? React with 👍 / 👎.

return Math.max(MIN_FLOOR, cap);
}

/**
Expand All @@ -105,18 +75,12 @@ export function applyCompletionBudget(args: {
readonly provider: ChatProvider;
readonly budget: CompletionBudgetConfig | undefined;
readonly capability: ModelCapability | undefined;
readonly messages: readonly Message[];
readonly systemPrompt?: string;
readonly tools?: readonly Tool[];
}): ChatProvider {
if (args.budget === undefined) return args.provider;
if (args.provider.withMaxCompletionTokens === undefined) return args.provider;
const cap = computeCompletionBudgetCap({
budget: args.budget,
capability: args.capability,
messages: args.messages,
systemPrompt: args.systemPrompt,
tools: args.tools,
});
return args.provider.withMaxCompletionTokens(cap);
}
47 changes: 47 additions & 0 deletions packages/agent-core/test/agent/compaction.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,53 @@ describe('Agent compaction', () => {
await ctx.expectResumeMatches();
});

// Verifies that the compaction summary request carries a completion cap equal
// to the model's full context window (`max_context_tokens`), even though the
// history appended below is sized relative to that same window.
it('uses the model context window for compaction completion budget', async () => {
const maxContextTokens = 5_000;
// Captured by the stub generator so the assertion can run after compaction.
let appliedCap: number | undefined;
// Stub generator: inspects the provider it is handed instead of calling a model.
const generate: GenerateFn = async (provider) => {
// Read the cap from the provider's `modelParameters` bag; the cast exists only
// to reach that field, which this test does not otherwise have typed access to.
const cap = (provider as { readonly modelParameters?: Record<string, unknown> })
.modelParameters?.['max_completion_tokens'];
// Fail loudly if no numeric cap was applied, rather than letting the final
// expectation report an opaque `undefined` mismatch.
if (typeof cap !== 'number') throw new Error('Expected max_completion_tokens to be applied');
appliedCap = cap;

// Minimal canned assistant response with a text summary and no tool calls.
return {
id: 'mock-compaction-budget',
message: {
role: 'assistant',
content: [{ type: 'text', text: 'Budgeted summary.' }],
toolCalls: [],
},
usage: {
inputOther: 1,
output: 4,
inputCacheRead: 0,
inputCacheCreation: 0,
},
finishReason: 'completed',
rawFinishReason: 'stop',
};
};
const ctx = testAgent({ compactionStrategy: alwaysCompactOnce, generate });
// Override only the context window on the catalogued capabilities.
ctx.configure({
provider: CATALOGUED_PROVIDER,
modelCapabilities: {
...CATALOGUED_MODEL_CAPABILITIES,
max_context_tokens: maxContextTokens,
},
});
// NOTE(review): the last argument is presumably a token count that puts the
// history just under the context limit — confirm against `appendExchange`.
appendExchange(ctx, 1, 'old user one', 'old assistant one', maxContextTokens - 100);
// Register the one-shot listener BEFORE starting compaction so the
// 'context.apply_compaction' event cannot fire unobserved.
const compacted = new Promise<void>((resolve) => {
ctx.emitter.once('context.apply_compaction', () => {
resolve();
});
});

await ctx.rpc.beginCompaction({ instruction: 'Keep the important test facts.' });
await compacted;

// The cap must equal the whole context window, not a remaining-context value.
expect(appliedCap).toBe(maxContextTokens);
});

it('projects the compacted prefix before sending the summary request', async () => {
const ctx = testAgent({ compactionStrategy: alwaysCompactOnce });
ctx.configure({
Expand Down
53 changes: 53 additions & 0 deletions packages/agent-core/test/agent/kosong-llm.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import {
emptyUsage,
type ChatProvider,
type ModelCapability,
type StreamedMessagePart,
type ToolCall,
} from '@moonshot-ai/kosong';
Expand Down Expand Up @@ -83,6 +84,47 @@ describe('KosongLLM streaming tool-call deltas', () => {
});
});

describe('KosongLLM completion budget', () => {
  it('applies the model context window as the completion cap', async () => {
    // Context window advertised by the capability; the cap is expected to match it.
    const contextWindow = 10000;

    // Observations recorded by the doubles below.
    let capPassedToProvider: number | undefined;
    let providerSeenByGenerate: ChatProvider | undefined;

    // Provider double: records the requested cap and hands back a shallow clone,
    // so the object that reaches `generate` is a different reference.
    const budgetAwareProvider: ChatProvider = {
      ...provider,
      withMaxCompletionTokens(n: number) {
        capPassedToProvider = n;
        return { ...this, withMaxCompletionTokens: this.withMaxCompletionTokens };
      },
    };

    // Generator double: remembers which provider it was called with and
    // returns an empty, successfully completed response.
    const stubGenerate: GenerateFn = async (nextProvider) => {
      providerSeenByGenerate = nextProvider;
      return {
        id: 'response-1',
        message: { role: 'assistant', content: [], toolCalls: [] },
        usage: emptyUsage(),
        finishReason: 'completed',
        rawFinishReason: 'stop',
      };
    };

    const llm = new KosongLLM({
      provider: budgetAwareProvider,
      modelName: 'test-model',
      systemPrompt: 'system',
      capability: makeCapability(contextWindow),
      // The fallback is larger than the context window, so a passing test shows
      // the known window was chosen over the fallback.
      completionBudgetConfig: { fallback: 32000 },
      generate: stubGenerate,
    });

    const { signal } = new AbortController();
    await llm.chat({ messages: [], tools: [], signal });

    expect(capPassedToProvider).toBe(contextWindow);
    // The long-lived provider itself must not be what reaches `generate`.
    expect(providerSeenByGenerate).not.toBe(budgetAwareProvider);
  });
});

async function collectToolCallDeltas(
parts: readonly StreamedMessagePart[],
): Promise<ToolCallDelta[]> {
Expand Down Expand Up @@ -130,3 +172,14 @@ function stripStreamIndex(toolCall: ToolCall): ToolCall {
const { _streamIndex: _, ...rest } = toolCall;
return rest;
}

/**
 * Builds a `ModelCapability` fixture for tests: every media input flag and
 * `thinking` are off, `tool_use` is on, and the context window is set to
 * the supplied value.
 *
 * @param maxContextTokens - Value assigned to `max_context_tokens`.
 * @returns A fresh capability object on each call.
 */
function makeCapability(maxContextTokens: number): ModelCapability {
  const mediaInputsDisabled = {
    image_in: false,
    video_in: false,
    audio_in: false,
  };
  return {
    ...mediaInputsDisabled,
    thinking: false,
    tool_use: true,
    max_context_tokens: maxContextTokens,
  };
}
Loading
Loading