Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion EVENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,36 @@ Emitted when an assistant message finishes (one per LLM turn).
"providerID": "anthropic",
"agent": "default",
"cost": { "input": 0.003, "output": 0.012, "cache": { "read": 0, "write": 0 } },
"tokens": { "input": 1024, "output": 512, "cache": { "read": 0, "write": 0 } },
"tokens": {
"input": 1024,
"output": 512,
"reasoning": 0,
"cache": { "read": 8800, "write": 1024 }
},
"context": { "used": 10848, "limit": 200000, "ratio": 0.05424 },
"finish": "tool-calls"
}
```

`finish` values: `"tool-calls"` (model wants to call tools), `"end_turn"` (model is done), `"max_tokens"` (output truncated).

**`tokens`** (5-way breakdown, mirrors upstream `LLM.Usage`):

- `input` (number) — raw input tokens billed at the standard input rate.
- `output` (number) — output (completion) tokens.
- `reasoning` (number) — extended-thinking / reasoning tokens (0 when thinking is off).
- `cache.read` (number) — tokens served from the prompt cache (billed at cache-read rate). Distinguishes cache hits from fresh input.
- `cache.write` (number) — tokens written to the prompt cache (billed at cache-write rate).

These fields are non-overlapping: a token is counted in exactly one bucket.

**`context`** (context-window utilization):

- `used` (number) — tokens occupying the model's context window this turn: `input + cache.read + cache.write`.
- `limit` (number) — the model's total context-window size in tokens, sourced from the models.dev registry (`model.limit.context`).
- `ratio` (number) — `used / limit` (≥0; may exceed 1 if usage exceeds the model's registered limit). A value approaching or exceeding 1 signals context-exhaustion risk.
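- `null` — the entire `context` value is `null` (rather than an object with null fields) when the model's context limit is not known, e.g. an unregistered model, or a custom endpoint whose registered limit is `0`.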

### `text`

Emitted when a text block from the assistant is complete.
Expand Down
61 changes: 60 additions & 1 deletion packages/cli/src/cli/cmd/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,31 @@ function fallback(part: ToolPart) {
})
}

/**
 * Build the context-window utilization object for `message_complete` events.
 *
 * Returns `null` (meaning "limit unknown") instead of an object whenever
 * `contextLimit` is not a usable window size:
 * - `null` — `Provider.getModel` threw (unregistered model).
 * - `0` or negative — a custom model without a registered limit defaults to
 *   `limit.context = 0` (the provider's default for unregistered custom models).
 * - `NaN` / `±Infinity` — not a meaningful limit.
 *
 * Using any of these as the divisor would put `Infinity`/`NaN` into the
 * result, which `JSON.stringify` serialises as `null` *inside* the object —
 * diverging from the documented top-level `null` contract (EVENTS.md).
 *
 * The ratio is deliberately not clamped: it exceeds 1 when `contextUsed` is
 * larger than the registered limit.
 *
 * @param contextLimit - The model's context-window size in tokens, or `null`
 *   when it could not be looked up.
 * @param contextUsed - Tokens counted against the window; the caller passes
 *   the pre-summed value and it is reported back unchanged as `used`.
 * @returns `{ used, limit, ratio }` with `ratio = used / limit`, or `null`
 *   when the limit is unknown or unusable.
 *
 * @internal exported for unit-testing only
 */
export function buildContextWindow(
  contextLimit: number | null,
  contextUsed: number,
): { used: number; limit: number; ratio: number } | null {
  // `NaN <= 0` is false, so non-finite limits need their own check; otherwise
  // they would leak NaN/Infinity into the emitted JSON.
  if (contextLimit == null || !Number.isFinite(contextLimit) || contextLimit <= 0) return null
  return {
    used: contextUsed,
    limit: contextLimit,
    ratio: contextUsed / contextLimit,
  }
}

function glob(info: ToolProps<typeof GlobTool>) {
const root = info.input.path ?? ""
const title = `Glob "${info.input.pattern}"`
Expand Down Expand Up @@ -463,12 +488,46 @@ export const RunCommand = cmd({
const info = event.properties.info
if (args.format === "json") {
if (info.finish) {
// Build 5-way token breakdown mirroring upstream LLM.Usage shape.
// info.tokens already carries the full breakdown from StepFinishPart
// accumulation — reasoning and cache split are not dropped upstream.
const tokens = {
input: info.tokens.input,
output: info.tokens.output,
reasoning: info.tokens.reasoning,
Comment thread
byapparov marked this conversation as resolved.
cache: {
read: info.tokens.cache.read,
write: info.tokens.cache.write,
},
}

// Context-window utilization: used = input + cache.read + cache.write
// (all prompt tokens that occupy the model's context window this turn).
// cache.write tokens are written to the cache ON this turn — they are
// part of the prompt sent to the model and count against the context
// window, just billed at the cache-write rate. Excluding them
// undercounts utilization on the first turn of a conversation.
// limit comes from the model registry (models.dev). On lookup failure
// (or limit===0 for custom models) buildContextWindow returns null.
const contextLimit = await Provider.getModel(info.providerID, info.modelID)
.then((m) => m.limit.context)
.catch((e) => {
if (e instanceof Provider.ModelNotFoundError) return null
throw e
})
const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write
const context = buildContextWindow(contextLimit, contextUsed)

emit("message_complete", {
modelID: info.modelID,
providerID: info.providerID,
agent: info.agent,
// cost is sourced from info.cost which accumulates real per-step costs
// from StepFinishPart. Do NOT use the new step.ended event cost field
// which emits cost:0 and is reconciled later (the cost:0 trap).
cost: info.cost,
tokens: info.tokens,
tokens,
context,
finish: info.finish,
})
}
Expand Down
191 changes: 191 additions & 0 deletions packages/cli/test/cli/usage-token-breakdown.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import path from "path"
import { describe, expect, test } from "bun:test"
import { buildContextWindow } from "../../src/cli/cmd/run"

// Absolute path to EVENTS.md, located four directory levels above
// `import.meta.dir` (in Bun: the directory containing this test file).
const EVENTS_MD = path.resolve(import.meta.dir, "../../../../EVENTS.md")

/**
* Tests for issue #86 — 5-way token breakdown + context-window utilization
* in message_complete events.
*/
describe("buildContextWindow (#86)", () => {
  // ── Unknown-limit inputs: the helper must collapse to a top-level null ──

  // Regression guard: a zero limit (the custom-model default) must map to null,
  // never to an object carrying ratio = Infinity.
  test("returns null when contextLimit is 0 (custom model without registered limit)", () => {
    expect(buildContextWindow(0, 9824)).toBeNull()
  })

  test("returns null when contextLimit is null (Provider.getModel threw)", () => {
    expect(buildContextWindow(null, 9824)).toBeNull()
  })

  test("returns null when both limit and used are 0", () => {
    expect(buildContextWindow(0, 0)).toBeNull()
  })

  // ── Known-limit inputs: field-by-field checks ──

  test("computes used as the value passed in (caller sets input + cache.read + cache.write)", () => {
    // Summing the three prompt-side buckets is the call site's job (run.ts);
    // the helper only has to echo the pre-summed figure back as `used`.
    const buckets = { input: 1024, cacheRead: 8800, cacheWrite: 1024 }
    const total = buckets.input + buckets.cacheRead + buckets.cacheWrite // 10848
    const ctx = buildContextWindow(200_000, total)
    expect(ctx).not.toBeNull()
    expect(ctx?.used).toBe(10848)
  })

  test("sets limit to the contextLimit value", () => {
    const ctx = buildContextWindow(200_000, 9824)
    expect(ctx).not.toBeNull()
    expect(ctx?.limit).toBe(200_000)
  })

  test("ratio is used / limit", () => {
    const ctx = buildContextWindow(200_000, 9824)
    expect(ctx).not.toBeNull()
    expect(ctx?.ratio).toBeCloseTo(9824 / 200_000, 10)
  })

  test("ratio is between 0 and 1 for realistic values", () => {
    const ctx = buildContextWindow(128_000, 64_000)
    expect(ctx).not.toBeNull()
    expect(ctx?.ratio).toBe(0.5)
  })

  test("ratio is exactly 1 when context is fully used", () => {
    const ctx = buildContextWindow(100_000, 100_000)
    expect(ctx).not.toBeNull()
    expect(ctx?.ratio).toBe(1)
  })

  test("ratio is 0 when no tokens used (empty prompt start)", () => {
    const ctx = buildContextWindow(200_000, 0)
    expect(ctx).not.toBeNull()
    expect(ctx?.ratio).toBe(0)
  })

  // ── Serialisation behaviour ──

  test("result is JSON-serialisable without Infinity or NaN", () => {
    // JSON.stringify turns Infinity/NaN into the literal `null`, so its absence
    // from the output shows every numeric field was finite.
    const json = JSON.stringify(buildContextWindow(200_000, 9824))
    expect(json).not.toContain("null")
    const roundTripped = JSON.parse(json)
    expect(roundTripped.ratio).toBeCloseTo(9824 / 200_000, 10)
  })

  test("top-level null serialises cleanly (not as object with null ratio)", () => {
    // Documented contract: unknown limit → the whole value is null, not { ratio: null }.
    const json = JSON.stringify(buildContextWindow(0, 9824))
    expect(json).toBe("null")
  })

  test("ratio may exceed 1 when usage exceeds the registered limit (unclamped)", () => {
    // EVENTS.md describes ratio as ≥0 and possibly above 1 — it is not clamped to [0,1].
    // A stale or lowered model.limit.context can yield ratio > 1 in production.
    const ctx = buildContextWindow(100_000, 110_000)
    expect(ctx).not.toBeNull()
    expect(ctx?.ratio).toBeGreaterThan(1)
    expect(ctx?.ratio).toBeCloseTo(1.1, 10)
  })
})

describe("message_complete emit block shape (source-verified, #86)", () => {
  // Source-text checks: they confirm wiring at the emit call site that pure
  // unit tests of buildContextWindow cannot reach.
  const RUN_SRC = path.resolve(import.meta.dir, "../../src/cli/cmd/run.ts")
  const EMIT_MARKER = 'emit("message_complete"'

  /** Read run.ts from disk (fresh for every test). */
  const readRunSource = () => Bun.file(RUN_SRC).text()

  /** Load run.ts and the offset of the message_complete emit call, asserting it exists. */
  async function locateEmit() {
    const text = await readRunSource()
    const at = text.indexOf(EMIT_MARKER)
    expect(at).toBeGreaterThan(-1)
    return { text, at }
  }

  /** Slice covering up to 1500 chars before the emit call through 200 chars after its start. */
  async function emitNeighbourhood() {
    const { text, at } = await locateEmit()
    return text.slice(Math.max(0, at - 1500), at + 200)
  }

  test("emit block passes tokens with reasoning and cache read/write fields", async () => {
    const region = await emitNeighbourhood()
    for (const needle of ["reasoning", "cache", "read", "write"]) {
      expect(region).toContain(needle)
    }
  })

  test("emit block calls buildContextWindow (not inline ternary)", async () => {
    const region = await emitNeighbourhood()
    expect(region).toContain("buildContextWindow")
  })

  test("emit block includes context field", async () => {
    const { text, at } = await locateEmit()
    // The emit object literal is roughly 400 chars long; 500 reaches past its closing paren.
    expect(text.slice(at, at + 500)).toContain("context")
  })

  test("cost field is still emitted (not regressed)", async () => {
    const { text, at } = await locateEmit()
    expect(text.slice(at, at + 800)).toContain("cost:")
  })

  test("getModel catch rethrows non-ModelNotFoundError (targeted catch, not swallow-all)", async () => {
    // Only ModelNotFoundError may be silenced; anything else has to propagate.
    // Checked textually: the catch body must name ModelNotFoundError and rethrow.
    const text = await readRunSource()
    const callAt = text.indexOf("Provider.getModel(info.providerID")
    expect(callAt).toBeGreaterThan(-1)
    const afterCall = text.slice(callAt, callAt + 400)
    expect(afterCall).toContain("ModelNotFoundError")
    expect(afterCall).toContain("throw e")
  })

  test("contextUsed includes cache.write (regression: must not omit cache.write from context sum)", async () => {
    // Guards against the earlier bug where
    //   contextUsed = tokens.input + tokens.cache.read
    // left out cache.write and undercounted first-turn utilization. The fixed form is
    //   const contextUsed = tokens.input + tokens.cache.read + tokens.cache.write
    // This check confirms the declaration at the call site mentions cache.write.
    const text = await readRunSource()
    const declAt = text.indexOf("const contextUsed =")
    expect(declAt).toBeGreaterThan(-1)
    const declaration = text.slice(declAt, declAt + 100)
    expect(declaration).toContain("cache.write")
  })
})

describe("EVENTS.md documents token breakdown and context (#86)", () => {
  /**
   * Load EVENTS.md and return `span` characters starting at the first
   * occurrence of "message_complete", asserting that the marker exists.
   */
  const messageCompleteSection = async (span: number) => {
    const text = await Bun.file(EVENTS_MD).text()
    const start = text.indexOf("message_complete")
    expect(start).toBeGreaterThan(-1)
    return text.slice(start, start + span)
  }

  test("EVENTS.md message_complete section includes reasoning token field", async () => {
    const section = await messageCompleteSection(1500)
    expect(section).toContain("reasoning")
  })

  test("EVENTS.md message_complete section documents context field", async () => {
    const section = await messageCompleteSection(1500)
    for (const term of ["context", "used", "limit", "ratio"]) {
      expect(section).toContain(term)
    }
  })

  test("EVENTS.md documents null as the unknown-limit sentinel", async () => {
    // The null-sentinel bullet lies beyond the 1500-char window used above
    // (about 1593 chars in when this test was written), so a 2000-char slice is taken.
    const section = await messageCompleteSection(2000)
    // Documented contract: null means the context limit is not known.
    expect(section).toContain("null")
  })
})