From 0dc7a6d85f82b2ed1c446e922517d46f15543133 Mon Sep 17 00:00:00 2001 From: Vasyl Vdovychenko Date: Thu, 18 Jun 2026 10:07:18 -0400 Subject: [PATCH] feat(ai): admin Shadow + Models tabs (AI-076, Phase 12 slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Read-only visualization of the slice-1 shadow data on /ai-quality. - Shadow tab: shadow_runs rolled up per (feature, primary, shadow) via one GROUP BY (no N+1) — p50 latency (percentile_cont), SQL-side lexical agreement (exact-match rate, avg length-ratio, both-present rate), shadow-minus-primary latency/cost/token deltas + projected monthly cost delta (pure unit-tested ToPairDto helper). Row -> modal with paged side-by-side primary-vs-shadow response samples (limit 1..50). Caption: agreement is lexical, NOT a quality verdict (judge scoring = later slice). Empty state explains shadow is OFF by default + how to enable. - Models tab: flat view of seeded models registry, status color-coded. - 3 read-only endpoints under /admin/ai-quality (shadow/summary, shadow/samples, models), admin-auth inherited. NO mutation — promote/ rollback + table-driven routing is the next slice. 700 unit tests green (9 new); admin tsc + build clean; integration tests (empty/auth/400/seeded) run in CI. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 8 + apps/admin/src/api/client.ts | 84 +++++ apps/admin/src/pages/AiQualityPage.tsx | 300 +++++++++++++++++- .../Api/Endpoints/AdminAiQualityEndpoints.cs | 176 ++++++++++ backend/src/Contracts/Admin/AiQualityDtos.cs | 66 ++++ .../AdminShadowEndpointTests.cs | 173 ++++++++++ .../ShadowPairMapperTests.cs | 140 ++++++++ 7 files changed, 945 insertions(+), 2 deletions(-) create mode 100644 tests/TextStack.IntegrationTests/AdminShadowEndpointTests.cs create mode 100644 tests/TextStack.UnitTests/ShadowPairMapperTests.cs diff --git a/CHANGELOG.md b/CHANGELOG.md index 9589700e..f642fc20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,14 @@ ## [Unreleased] +### Phase 12 — admin Shadow + Models tabs (AI-076, slice 2) (2026-06-18) + +Makes the slice-1 shadow data visible. Two **read-only** tabs on the admin AI-quality page (`/ai-quality`): +- **Shadow** — rolls up `shadow_runs` into per-(feature, primary-model, shadow-model) pairs via a single PostgreSQL `GROUP BY` (no N+1; `percentile_cont` p50 latency, SQL-side **lexical agreement**: exact-match rate + both-present rate as fractions of all runs in the pair, avg length-ratio over rows where both responses exist). Shadow-minus-primary latency/cost/token deltas + a window-normalized **projected monthly cost delta** are computed in a pure, unit-tested `ToPairDto` helper. Row → modal with paged side-by-side primary-vs-shadow response samples (redacted at write-time; limit clamped 1..50). A caption states agreement is **lexical, not a quality verdict** — semantic judge scoring is a later slice. Empty state explains shadow is OFF by default + how to enable it. +- **Models** — flat view of the seeded `models` registry (feature, provider, model, status), status color-coded Primary/Shadow/Retired. + +Backend: 3 endpoints under `/admin/ai-quality` (`GET /shadow/summary`, `/shadow/samples`, `/models`), DTOs appended to the Ai-quality contract, admin-auth inherited. 
**Strictly read-only** — promote/rollback + table-driven routing is the next slice. Backend `AdminAiQualityEndpoints.cs` + `AiQualityDtos.cs`; admin `AiQualityPage.tsx` + `api/client.ts`. 700 unit tests green (9 new on the delta/projection math); admin tsc + build clean; integration tests (summary empty/auth/400, models-seeded) run in CI. + ### Phase 12 — ModelGateway v2 shadow routing + model registry (AI-075, slice 1) (2026-06-18) First RLOps slice: the gateway can now **shadow** a second model against the one serving production, with zero impact on the user. When a feature has an `Ai:Shadow` route configured and the call is sampled, `ModelGateway` — **after** the primary response is ready — fires the same `LlmRequest` at the shadow provider's **untraced `-raw` sibling** (so no `llm_traces` row, no recursion, no double cost-count) as a fire-and-forget background task, then persists one redacted primary-vs-shadow comparison row in **`shadow_runs`** (both responses, latency, cost, tokens, trace ids). Invariants (unit-tested): primary latency/correctness untouched; shadow never threads the caller's cancellation token (own timeout, default 15s); any shadow failure/timeout is swallowed + logged; `StreamAsync` re-yields primary deltas unchanged and shadows once only on **clean** stream completion (suppressed on mid-stream throw). A new **`models`** registry table records which (provider, model) serves each feature by lifecycle `Status` (Primary/Shadow/Retired) — seeded idempotently at startup from the current primary routes (unique natural-key index makes the seed race-safe across replicas; the seeder is guarded so a DB hiccup can't abort API boot). The `models` table is **audit/seed only** in this slice — the gateway still routes by config; table-driven hot-swap + canary/escalate/cost-cap/drift/admin-UI are later slices. **Shadow is OFF by default** (no routes, sample rate 0.0) — no paid background calls until explicitly enabled per feature. 
New: `ModelGateway` v2, `ShadowOptions`, `IShadowRunWriter`/`DbShadowRunWriter`, `ModelRegistration`/`ShadowRun` entities, `ModelRegistrySeeder`, migration `AddModelRegistryAndShadowRun`. 691 unit tests green. diff --git a/apps/admin/src/api/client.ts b/apps/admin/src/api/client.ts index 34f8db5e..1fbfd2ba 100644 --- a/apps/admin/src/api/client.ts +++ b/apps/admin/src/api/client.ts @@ -556,6 +556,66 @@ export interface CrewAbEvalResult { passed: boolean cases?: unknown[] } +// Shadow comparison +export interface ShadowPair { + featureTag: string + primaryModelId: string + shadowModelId: string + runs: number + primaryP50LatencyMs: number + shadowP50LatencyMs: number + latencyDeltaMs: number + primaryCostUsd: number + shadowCostUsd: number + costDeltaUsd: number + projectedMonthlyCostDeltaUsd: number + primaryTokensOut: number + shadowTokensOut: number + tokensOutDelta: number + exactMatchRate: number + avgLengthRatio: number + bothPresentRate: number + firstSeen: string + lastSeen: string +} +export interface ShadowSummary { + from: string + to: string + totalRuns: number + pairs: ShadowPair[] +} +export interface ShadowSample { + id: string + primaryResponse: string | null + shadowResponse: string | null + primaryLatencyMs: number + shadowLatencyMs: number + primaryCostUsd: number + shadowCostUsd: number + primaryTokensOut: number + shadowTokensOut: number + exactMatch: boolean + promptHash: string + primaryTraceId: string | null + shadowTraceId: string | null + createdAt: string +} +export interface ShadowSamplesPage { + total: number + items: ShadowSample[] +} +// Model registry +export interface ModelRegistration { + id: string + featureTag: string + providerKey: string + modelId: string + status: 'Primary' | 'Shadow' | 'Retired' + createdAt: string +} +export interface ModelsRegistry { + models: ModelRegistration[] +} async function fetchJson(path: string, init?: RequestInit): Promise { const res = await fetch(`${API_BASE}${path}`, { @@ -1198,6 +1258,30 @@ 
export const adminApi = { }) }, + getShadowSummary: async (params: { from?: string; to?: string; feature?: string }): Promise => { + const query = new URLSearchParams() + if (params.from) query.set('from', params.from) + if (params.to) query.set('to', params.to) + if (params.feature) query.set('feature', params.feature) + const qs = query.toString() + return fetchJson(`/admin/ai-quality/shadow/summary${qs ? `?${qs}` : ''}`) + }, + + getShadowSamples: async (params: { feature: string; primaryModelId: string; shadowModelId: string; limit?: number; offset?: number }): Promise => { + const query = new URLSearchParams() + query.set('feature', params.feature) + query.set('primaryModelId', params.primaryModelId) + query.set('shadowModelId', params.shadowModelId) + if (params.limit) query.set('limit', String(params.limit)) + if (params.offset) query.set('offset', String(params.offset)) + const qs = query.toString() + return fetchJson(`/admin/ai-quality/shadow/samples${qs ? `?${qs}` : ''}`) + }, + + getModels: async (): Promise => { + return fetchJson('/admin/ai-quality/models') + }, + // Podcasts generatePodcast: async (editionId: string, lang?: string, force?: boolean): Promise => { return fetchJson('/admin/podcasts', { diff --git a/apps/admin/src/pages/AiQualityPage.tsx b/apps/admin/src/pages/AiQualityPage.tsx index 91c2f2c2..73d2b0b6 100644 --- a/apps/admin/src/pages/AiQualityPage.tsx +++ b/apps/admin/src/pages/AiQualityPage.tsx @@ -11,9 +11,13 @@ import { EvalRun, CriticDefectEvalResult, CrewAbEvalResult, + ShadowSummary, + ShadowPair, + ShadowSample, + ModelRegistration, } from '../api/client' -type Tab = 'summary' | 'traces' | 'transcripts' | 'evals' +type Tab = 'summary' | 'traces' | 'transcripts' | 'evals' | 'shadow' | 'models' const KNOWN_FEATURES = ['explain', 'translate', 'distractor', 'bookmeta', 'tagsuggestion', 'eval.judge'] @@ -23,7 +27,7 @@ export function AiQualityPage() {

AI Quality

- {(['summary', 'traces', 'transcripts', 'evals'] as Tab[]).map((t) => ( + {(['summary', 'traces', 'transcripts', 'evals', 'shadow', 'models'] as Tab[]).map((t) => (
) } @@ -886,6 +892,296 @@ function EvalsTab() { ) } +// ─────────────────────────── Shadow ─────────────────────────── + +const SHADOW_PAGE = 25 + +function deltaColor(delta: number, lowerIsBetter: boolean): string { + if (delta === 0) return '#6b7280' + const good = lowerIsBetter ? delta < 0 : delta > 0 + return good ? '#059669' : '#dc2626' +} + +function fmtDelta(delta: number, suffix: string): string { + const sign = delta > 0 ? '+' : '' + return `${sign}${delta}${suffix}` +} + +function ShadowTab() { + const [data, setData] = useState(null) + const [loading, setLoading] = useState(true) + const [error, setError] = useState(null) + const [days, setDays] = useState(30) + const [feature, setFeature] = useState('') + const [selected, setSelected] = useState(null) + + useEffect(() => { + setLoading(true) + const from = new Date(Date.now() - days * 86400000).toISOString() + adminApi + .getShadowSummary({ from, feature: feature || undefined }) + .then((d) => { + setData(d) + setError(null) + }) + .catch((e) => setError(e instanceof Error ? e.message : 'Failed to load')) + .finally(() => setLoading(false)) + }, [days, feature]) + + // Feature options derived from the summary's pairs (distinct featureTags), not hardcoded. + const featureOptions = [...new Set((data?.pairs ?? []).map((p) => p.featureTag))].sort() + + return ( + <> +
+
+ +
+
+ {RANGES.map((r) => ( + + ))} +
+
+ + {error && } + + {loading ? ( +

Loading…

+ ) : !data || data.pairs.length === 0 ? ( +

+ No shadow runs in this window. Shadow routing is OFF by default — enable Ai:Shadow:Routes:{feature} + a + sample rate to start comparing. +

+ ) : ( + <> +
+ + +
+ + + + + + + + + + + + + + + + + {data.pairs.map((p) => ( + setSelected(p)} + style={{ cursor: 'pointer', borderBottom: '1px solid #f3f4f6' }} + > + + + + + + + + + + + + ))} + +
FeaturePrimaryShadowRunsp50 latency (P / S / Δ)Cost (P / S / Δ)Proj. monthly ΔExact-matchLen ratioLast seen
{p.featureTag}{p.primaryModelId}{p.shadowModelId}{p.runs.toLocaleString()} + {p.primaryP50LatencyMs} / {p.shadowP50LatencyMs} ms{' '} + + {fmtDelta(p.latencyDeltaMs, 'ms')} + + + ${p.primaryCostUsd.toFixed(4)} / ${p.shadowCostUsd.toFixed(4)}{' '} + + {p.costDeltaUsd >= 0 ? '+' : '-'}${Math.abs(p.costDeltaUsd).toFixed(4)} + + + {p.projectedMonthlyCostDeltaUsd >= 0 ? '+' : '-'}${Math.abs(p.projectedMonthlyCostDeltaUsd).toFixed(2)} + {(p.exactMatchRate * 100).toFixed(1)}%{p.avgLengthRatio.toFixed(2)}×{timeAgo(p.lastSeen)}
+

+ Agreement = lexical (exact-match + length ratio), not a quality verdict. Semantic judge scoring lands in a later + slice. +

+ + )} + + {selected && setSelected(null)} />} + + ) +} + +function ShadowSamplesModal({ pair, onClose }: { pair: ShadowPair; onClose: () => void }) { + const [items, setItems] = useState([]) + const [total, setTotal] = useState(0) + const [offset, setOffset] = useState(0) + const [loading, setLoading] = useState(true) + const [error, setError] = useState(null) + + useEffect(() => { + setLoading(true) + adminApi + .getShadowSamples({ + feature: pair.featureTag, + primaryModelId: pair.primaryModelId, + shadowModelId: pair.shadowModelId, + limit: SHADOW_PAGE, + offset, + }) + .then((d) => { + setItems(d.items) + setTotal(d.total) + setError(null) + }) + .catch((e) => setError(e instanceof Error ? e.message : 'Failed to load samples')) + .finally(() => setLoading(false)) + }, [pair, offset]) + + return ( +
+
e.stopPropagation()} style={modal}> +
+

{pair.featureTag} · shadow samples

+ +
+
+ primary {pair.primaryModelId} · shadow {pair.shadowModelId} +
+ + {error && } + + {loading ? ( +

Loading…

+ ) : items.length === 0 ? ( +

No samples for this pair.

+ ) : ( + <> +
+ {items.map((s) => ( +
+
+ {timeAgo(s.createdAt)} + + {s.exactMatch ? 'exact match' : 'differs'} + +
+
+
+
+ Primary · {s.primaryLatencyMs}ms · ${s.primaryCostUsd.toFixed(4)} · {s.primaryTokensOut} tok +
+
{s.primaryResponse ?? '—'}
+
+
+
+ Shadow · {s.shadowLatencyMs}ms · ${s.shadowCostUsd.toFixed(4)} · {s.shadowTokensOut} tok +
+
{s.shadowResponse ?? '—'}
+
+
+
+ ))} +
+ + + )} +
+
+ ) +} + +// ─────────────────────────── Models ─────────────────────────── + +function modelStatusColor(status: string): string { + if (status === 'Primary') return '#059669' + if (status === 'Shadow') return '#d97706' + return '#6b7280' // Retired +} + +function ModelsTab() { + const [models, setModels] = useState([]) + const [loading, setLoading] = useState(true) + const [error, setError] = useState(null) + + useEffect(() => { + adminApi + .getModels() + .then((d) => { + setModels(d.models) + setError(null) + }) + .catch((e) => setError(e instanceof Error ? e.message : 'Failed to load')) + .finally(() => setLoading(false)) + }, []) + + return ( + <> +

+ Registered models per feature and their routing status. Read-only. +

+ + {error && } + + {loading ? ( +

Loading…

+ ) : models.length === 0 ? ( +

No models registered yet.

+ ) : ( + + + + + + + + + + + + {models.map((m) => ( + + + + + + + + ))} + +
FeatureProviderModelStatusCreated
{m.featureTag}{m.providerKey}{m.modelId}{m.status}{timeAgo(m.createdAt)}
+ )} + + ) +} + // ─────────────────────────── shared ─────────────────────────── function Pager({ diff --git a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs index 5ce9ebf8..c36fdc68 100644 --- a/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs +++ b/backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs @@ -34,6 +34,9 @@ public static void MapAdminAiQualityEndpoints(this WebApplication app) group.MapPost("/evals/studybuddy/run", RunStudyBuddyEval); group.MapPost("/evals/criticdefects/run", RunCriticDefectEval); group.MapPost("/evals/crew-ab/run", RunCrewAbEval); + group.MapGet("/shadow/summary", GetShadowSummary); + group.MapGet("/shadow/samples", GetShadowSamples); + group.MapGet("/models", GetModels); } // Phase 7 DoD gate (AI-046): A/B the single-call baseline vs the full FieldCrew on the same brief+source over @@ -506,6 +509,179 @@ ORDER BY day return Results.Ok(summary); } + // Phase 12 (RLOps): per-pair shadow rollup. ONE GROUP BY (no N+1); percentile_cont + + // the agreement averages have no EF translation → raw SQL. Snake_case aliases map onto + // ShadowPairRow via the snake_case naming convention (PascalCase aliases break it). Window bounds default to the last 30 days and are normalized to offset 0 (UTC) before binding — Npgsql rejects non-zero-offset DateTimeOffset parameters for timestamptz; an empty/inverted window returns 400. + private static async Task GetShadowSummary( + AppDbContext db, + [FromQuery] DateTimeOffset? from, + [FromQuery] DateTimeOffset? to, + [FromQuery] string? feature, + CancellationToken ct) + { + var toUtc = (to ?? DateTimeOffset.UtcNow).ToUniversalTime(); + var fromUtc = from?.ToUniversalTime() ?? toUtc.AddDays(-30); + if (fromUtc >= toUtc) + return Results.BadRequest(new { error = "'from' must be before 'to'" }); + var feat = string.IsNullOrWhiteSpace(feature) ? 
null : feature.Trim(); + var windowDays = (toUtc - fromUtc).TotalDays; + + var rows = await db.Database.SqlQuery($""" + SELECT feature_tag AS feature_tag, + primary_model_id AS primary_model_id, + shadow_model_id AS shadow_model_id, + count(*) AS runs, + coalesce(percentile_cont(0.5) WITHIN GROUP (ORDER BY primary_latency_ms), 0) AS primary_p50, + coalesce(percentile_cont(0.5) WITHIN GROUP (ORDER BY shadow_latency_ms), 0) AS shadow_p50, + coalesce(sum(primary_cost_usd),0) AS primary_cost_usd, + coalesce(sum(shadow_cost_usd),0) AS shadow_cost_usd, + coalesce(sum(primary_tokens_out),0) AS primary_tokens_out, + coalesce(sum(shadow_tokens_out),0) AS shadow_tokens_out, + avg((primary_response IS NOT NULL AND shadow_response IS NOT NULL AND primary_response = shadow_response)::int::float8) AS exact_match_rate, + avg(CASE WHEN primary_response IS NOT NULL AND shadow_response IS NOT NULL THEN length(shadow_response)::float8/nullif(length(primary_response),0) END) AS avg_length_ratio, + avg((primary_response IS NOT NULL AND shadow_response IS NOT NULL)::int::float8) AS both_present_rate, + min(created_at) AS first_seen, + max(created_at) AS last_seen + FROM shadow_runs + WHERE created_at >= {fromUtc} AND created_at < {toUtc} + AND ({feat}::text IS NULL OR feature_tag = {feat}) + GROUP BY feature_tag, primary_model_id, shadow_model_id + ORDER BY runs DESC + """).ToListAsync(ct); + + var pairs = rows.Select(r => ToPairDto(r, windowDays)).ToList(); + + var summary = new ShadowSummaryDto( + From: fromUtc, + To: toUtc, + TotalRuns: pairs.Sum(p => p.Runs), + Pairs: pairs); + + return Results.Ok(summary); + } + + /// + /// Pure mapper from a raw shadow-pair aggregate row → the DTO, computing all deltas + /// (shadow − primary) and the 30-day cost projection. Extracted + public for unit testing. + /// windowDays is guarded to ≥1 so a sub-day window can't over-project. 
+ /// + public static ShadowPairDto ToPairDto(ShadowPairRow r, double windowDays) + { + var days = Math.Max(1.0, windowDays); + var primaryP50 = (int)Math.Round(r.PrimaryP50); + var shadowP50 = (int)Math.Round(r.ShadowP50); + var costDelta = r.ShadowCostUsd - r.PrimaryCostUsd; + + return new ShadowPairDto( + FeatureTag: r.FeatureTag, + PrimaryModelId: r.PrimaryModelId, + ShadowModelId: r.ShadowModelId, + Runs: r.Runs, + PrimaryP50LatencyMs: primaryP50, + ShadowP50LatencyMs: shadowP50, + LatencyDeltaMs: shadowP50 - primaryP50, + PrimaryCostUsd: r.PrimaryCostUsd, + ShadowCostUsd: r.ShadowCostUsd, + CostDeltaUsd: costDelta, + ProjectedMonthlyCostDeltaUsd: costDelta * 30m / (decimal)days, + PrimaryTokensOut: r.PrimaryTokensOut, + ShadowTokensOut: r.ShadowTokensOut, + TokensOutDelta: r.ShadowTokensOut - r.PrimaryTokensOut, + ExactMatchRate: r.ExactMatchRate ?? 0, + AvgLengthRatio: r.AvgLengthRatio ?? 0, + BothPresentRate: r.BothPresentRate ?? 0, + FirstSeen: r.FirstSeen, + LastSeen: r.LastSeen); + } + + // Phase 12 (RLOps): redacted side-by-side samples for one pair. Requires all three pair + // keys (a sample list only makes sense within a single comparison). EF LINQ read, newest-first. + private static async Task GetShadowSamples( + AppDbContext db, + [FromQuery] string? feature, + [FromQuery] string? primaryModelId, + [FromQuery] string? 
shadowModelId, + [FromQuery] int limit = 25, + [FromQuery] int offset = 0, + CancellationToken ct = default) + { + if (string.IsNullOrWhiteSpace(feature) + || string.IsNullOrWhiteSpace(primaryModelId) + || string.IsNullOrWhiteSpace(shadowModelId)) + return Results.BadRequest( + new { error = "feature, primaryModelId and shadowModelId are all required" }); + + limit = Math.Clamp(limit, 1, 50); + offset = Math.Max(offset, 0); + var feat = feature.Trim(); + var primary = primaryModelId.Trim(); + var shadow = shadowModelId.Trim(); + + var query = db.ShadowRuns.Where(s => + s.FeatureTag == feat + && s.PrimaryModelId == primary + && s.ShadowModelId == shadow); + + var total = await query.LongCountAsync(ct); + var items = await query + .OrderByDescending(s => s.CreatedAt) + .Skip(offset).Take(limit) + .Select(s => new ShadowSampleDto( + s.Id, + s.PrimaryResponse, + s.ShadowResponse, + s.PrimaryLatencyMs, + s.ShadowLatencyMs, + s.PrimaryCostUsd, + s.ShadowCostUsd, + s.PrimaryTokensOut, + s.ShadowTokensOut, + s.PrimaryResponse != null && s.ShadowResponse != null && s.PrimaryResponse == s.ShadowResponse, + s.PromptHash, + s.PrimaryTraceId, + s.ShadowTraceId, + s.CreatedAt)) + .ToListAsync(ct); + + return Results.Ok(new ShadowSamplesPageDto(total, items)); + } + + // Phase 12 (RLOps): the whole models registry (tiny table). Project to memory first, + // then Status.ToString() — the stored-string enum has no in-query .ToString() translation. 
+ private static async Task GetModels(AppDbContext db, CancellationToken ct) + { + var rows = await db.Models + .OrderBy(m => m.FeatureTag).ThenBy(m => m.Status) + .Select(m => new { m.Id, m.FeatureTag, m.ProviderKey, m.ModelId, m.Status, m.CreatedAt }) + .ToListAsync(ct); + + var models = rows.Select(m => new ModelRegistrationDto( + m.Id, m.FeatureTag, m.ProviderKey, m.ModelId, m.Status.ToString(), m.CreatedAt)).ToList(); + + return Results.Ok(new ModelsRegistryDto(models)); + } + + /// Raw-SQL row for the shadow-pair aggregate (public + mutable for EF SqlQuery + /// materialization, like FeatureRow). Avg columns are nullable — an all-null group yields NULL. + public sealed class ShadowPairRow + { + public string FeatureTag { get; set; } = ""; + public string PrimaryModelId { get; set; } = ""; + public string ShadowModelId { get; set; } = ""; + public long Runs { get; set; } + public double PrimaryP50 { get; set; } + public double ShadowP50 { get; set; } + public decimal PrimaryCostUsd { get; set; } + public decimal ShadowCostUsd { get; set; } + public long PrimaryTokensOut { get; set; } + public long ShadowTokensOut { get; set; } + public double? ExactMatchRate { get; set; } + public double? AvgLengthRatio { get; set; } + public double? BothPresentRate { get; set; } + public DateTimeOffset FirstSeen { get; set; } + public DateTimeOffset LastSeen { get; set; } + } + // Raw-SQL row shapes (mutable props + parameterless ctor for EF SqlQuery materialization). // MUST be public: EF's SqlQuery materializer can't construct private nested types // (fails only once rows exist), which 500'd the Summary on prod. diff --git a/backend/src/Contracts/Admin/AiQualityDtos.cs b/backend/src/Contracts/Admin/AiQualityDtos.cs index dd334151..c7000994 100644 --- a/backend/src/Contracts/Admin/AiQualityDtos.cs +++ b/backend/src/Contracts/Admin/AiQualityDtos.cs @@ -113,3 +113,69 @@ public record EvalRunDto( string? BreakdownJson, string? 
GitSha, DateTimeOffset CreatedAt); + +// ── Shadow-run comparison + models registry (Phase 12 RLOps) ────────────────── + +/// One primary↔shadow pairing rolled up over the window (from shadow_runs). +/// Deltas are shadow − primary; the monthly projection scales the window's cost delta +/// to 30 days. ExactMatchRate and BothPresentRate are fractions of ALL runs in the pair; +/// AvgLengthRatio averages shadow/primary length only over rows where BOTH responses are present (0 when there are none). +public record ShadowPairDto( + string FeatureTag, + string PrimaryModelId, + string ShadowModelId, + long Runs, + int PrimaryP50LatencyMs, + int ShadowP50LatencyMs, + int LatencyDeltaMs, + decimal PrimaryCostUsd, + decimal ShadowCostUsd, + decimal CostDeltaUsd, + decimal ProjectedMonthlyCostDeltaUsd, + long PrimaryTokensOut, + long ShadowTokensOut, + long TokensOutDelta, + double ExactMatchRate, + double AvgLengthRatio, + double BothPresentRate, + DateTimeOffset FirstSeen, + DateTimeOffset LastSeen); + +/// The shadow Summary payload: window + total runs + per-pair rollups. +public record ShadowSummaryDto( + DateTimeOffset From, + DateTimeOffset To, + long TotalRuns, + IReadOnlyList Pairs); + +/// One redacted shadow sample (primary vs shadow side by side) for the drill-in. +public record ShadowSampleDto( + Guid Id, + string? PrimaryResponse, + string? ShadowResponse, + int PrimaryLatencyMs, + int ShadowLatencyMs, + decimal PrimaryCostUsd, + decimal ShadowCostUsd, + int PrimaryTokensOut, + int ShadowTokensOut, + bool ExactMatch, + string PromptHash, + Guid? PrimaryTraceId, + Guid? ShadowTraceId, + DateTimeOffset CreatedAt); + +/// Paged shadow-sample list for one pair. +public record ShadowSamplesPageDto(long Total, IReadOnlyList Items); + +/// One row in the models registry (table models); Status is the string enum. +public record ModelRegistrationDto( + Guid Id, + string FeatureTag, + string ProviderKey, + string ModelId, + string Status, + DateTimeOffset CreatedAt); + +/// The models registry payload (whole table; tiny). 
+public record ModelsRegistryDto(IReadOnlyList Models); diff --git a/tests/TextStack.IntegrationTests/AdminShadowEndpointTests.cs b/tests/TextStack.IntegrationTests/AdminShadowEndpointTests.cs new file mode 100644 index 00000000..1ba8a9f8 --- /dev/null +++ b/tests/TextStack.IntegrationTests/AdminShadowEndpointTests.cs @@ -0,0 +1,173 @@ +using System.Net; +using System.Text.Json; + +namespace TextStack.IntegrationTests; + +/// +/// Integration tests for the admin shadow-comparison + models-registry endpoints (Phase 12 RLOps), +/// against the live API on the admin host (textstack.dev). Mirrors AdminAgentRunsEndpointTests: +/// auth + validation paths assert without seeded rows (an empty shadow_runs table still returns a +/// well-formed empty summary), and the models registry returns the seeded primary routes. Authed +/// assertions need the fixture's test user to be admin; otherwise AdminAuth → 401/403 and the test +/// is skipped rather than false-passing. +/// +/// To run: `docker compose up` (API on :8080) with `ENABLE_TEST_AUTH=true`; runs in CI. 
+/// +public class AdminShadowEndpointTests : IClassFixture +{ + private readonly AuthenticatedApiFixture _fixture; + + public AdminShadowEndpointTests(AuthenticatedApiFixture fixture) => _fixture = fixture; + + [Fact] + public async Task GetShadowSummary_NoAuth_Unauthorized() + { + var request = new HttpRequestMessage(HttpMethod.Get, "/admin/ai-quality/shadow/summary"); + request.Headers.Host = AuthenticatedApiFixture.AdminHost; + + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(response.StatusCode is HttpStatusCode.NotFound, "endpoint not deployed"); + Assert.Equal(HttpStatusCode.Unauthorized, response.StatusCode); + } + + [Fact] + public async Task GetShadowSummary_Authed_NoData_ReturnsEmptyPairs() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + // Narrow to a tiny far-past window so the table is empty regardless of real data. + var request = _fixture.CreateAdminRequest( + HttpMethod.Get, + "/admin/ai-quality/shadow/summary?from=2000-01-01T00:00:00Z&to=2000-01-02T00:00:00Z"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.OK, response.StatusCode); + + using var doc = JsonDocument.Parse( + await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken)); + var root = doc.RootElement; + + Assert.True(root.TryGetProperty("pairs", out var pairs), "summary has pairs"); + Assert.Equal(JsonValueKind.Array, pairs.ValueKind); + Assert.Empty(pairs.EnumerateArray()); + Assert.Equal(0, root.GetProperty("totalRuns").GetInt64()); + Assert.True(root.TryGetProperty("from", out _)); + Assert.True(root.TryGetProperty("to", out _)); + } + + [Fact] + public async Task 
GetShadowSummary_InvertedWindow_BadRequest() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + var request = _fixture.CreateAdminRequest( + HttpMethod.Get, + "/admin/ai-quality/shadow/summary?from=2030-01-02T00:00:00Z&to=2030-01-01T00:00:00Z"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.BadRequest, response.StatusCode); + } + + [Fact] + public async Task GetShadowSamples_NoAuth_Unauthorized() + { + var request = new HttpRequestMessage(HttpMethod.Get, "/admin/ai-quality/shadow/samples"); + request.Headers.Host = AuthenticatedApiFixture.AdminHost; + + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(response.StatusCode is HttpStatusCode.NotFound, "endpoint not deployed"); + Assert.Equal(HttpStatusCode.Unauthorized, response.StatusCode); + } + + [Fact] + public async Task GetShadowSamples_MissingPairParams_BadRequest() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + // feature only — primaryModelId + shadowModelId omitted. 
+ var request = _fixture.CreateAdminRequest( + HttpMethod.Get, "/admin/ai-quality/shadow/samples?feature=explain"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.BadRequest, response.StatusCode); + } + + [Fact] + public async Task GetShadowSamples_AllPairParams_NoData_ReturnsEmptyPage() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + var request = _fixture.CreateAdminRequest( + HttpMethod.Get, + "/admin/ai-quality/shadow/samples?feature=__none__&primaryModelId=__a__&shadowModelId=__b__"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.OK, response.StatusCode); + + using var doc = JsonDocument.Parse( + await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken)); + var root = doc.RootElement; + Assert.Equal(0, root.GetProperty("total").GetInt64()); + Assert.Empty(root.GetProperty("items").EnumerateArray()); + } + + [Fact] + public async Task GetModels_NoAuth_Unauthorized() + { + var request = new HttpRequestMessage(HttpMethod.Get, "/admin/ai-quality/models"); + request.Headers.Host = AuthenticatedApiFixture.AdminHost; + + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(response.StatusCode is HttpStatusCode.NotFound, "endpoint not deployed"); + Assert.Equal(HttpStatusCode.Unauthorized, response.StatusCode); + } + + [Fact] + public async Task 
GetModels_Authed_ReturnsSeededPrimaryRows() + { + Assert.SkipUnless(_fixture.IsAuthenticated, "auth unavailable"); + + var request = _fixture.CreateAdminRequest(HttpMethod.Get, "/admin/ai-quality/models"); + var response = await _fixture.Client.SendAsync(request, TestContext.Current.CancellationToken); + + Assert.SkipWhen(IntegrationSkip.Unavailable(response), "endpoint not deployed"); + Assert.SkipWhen( + response.StatusCode is HttpStatusCode.Unauthorized or HttpStatusCode.Forbidden, + "test user is not admin"); + Assert.Equal(HttpStatusCode.OK, response.StatusCode); + + using var doc = JsonDocument.Parse( + await response.Content.ReadAsStringAsync(TestContext.Current.CancellationToken)); + var models = doc.RootElement.GetProperty("models"); + Assert.Equal(JsonValueKind.Array, models.ValueKind); + // The startup seeder inserts the current PRIMARY routes → at least one row. + Assert.NotEmpty(models.EnumerateArray()); + foreach (var m in models.EnumerateArray()) + { + Assert.True(m.TryGetProperty("featureTag", out _)); + Assert.True(m.TryGetProperty("providerKey", out _)); + Assert.True(m.TryGetProperty("modelId", out _)); + Assert.True(m.TryGetProperty("status", out var status)); + Assert.Equal(JsonValueKind.String, status.ValueKind); + } + } +} diff --git a/tests/TextStack.UnitTests/ShadowPairMapperTests.cs b/tests/TextStack.UnitTests/ShadowPairMapperTests.cs new file mode 100644 index 00000000..17e4592f --- /dev/null +++ b/tests/TextStack.UnitTests/ShadowPairMapperTests.cs @@ -0,0 +1,140 @@ +using Api.Endpoints; + +namespace TextStack.UnitTests; + +/// +/// Unit tests for the pure shadow-pair mapper (AdminAiQualityEndpoints.ToPairDto) — the delta + +/// 30-day projection math that the shadow Summary endpoint applies after SQL aggregation. 
+///
+public class ShadowPairMapperTests
+{
+    // Row factory: identity fields and timestamps are fixed, every metric can be overridden per test.
+    private static AdminAiQualityEndpoints.ShadowPairRow Row(
+        double primaryP50 = 100, double shadowP50 = 150,
+        decimal primaryCost = 1.0m, decimal shadowCost = 2.0m,
+        long primaryTokensOut = 1000, long shadowTokensOut = 1200,
+        double? exactMatch = 0.5, double? lengthRatio = 1.1, double? bothPresent = 0.9,
+        long runs = 10)
+    {
+        return new AdminAiQualityEndpoints.ShadowPairRow
+        {
+            FeatureTag = "explain",
+            PrimaryModelId = "gpt-4.1-mini",
+            ShadowModelId = "gpt-4.1-nano",
+            Runs = runs,
+            PrimaryP50 = primaryP50,
+            ShadowP50 = shadowP50,
+            PrimaryCostUsd = primaryCost,
+            ShadowCostUsd = shadowCost,
+            PrimaryTokensOut = primaryTokensOut,
+            ShadowTokensOut = shadowTokensOut,
+            ExactMatchRate = exactMatch,
+            AvgLengthRatio = lengthRatio,
+            BothPresentRate = bothPresent,
+            FirstSeen = DateTimeOffset.UnixEpoch,
+            LastSeen = DateTimeOffset.UnixEpoch.AddDays(1),
+        };
+    }
+
+    [Fact]
+    public void ToPairDto_Deltas_AreShadowMinusPrimary()
+    {
+        var row = Row(
+            primaryP50: 100, shadowP50: 150, primaryCost: 1.0m, shadowCost: 2.5m,
+            primaryTokensOut: 1000, shadowTokensOut: 1200);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 30);
+
+        Assert.Equal(50, pair.LatencyDeltaMs);   // latency: 150 - 100
+        Assert.Equal(1.5m, pair.CostDeltaUsd);   // cost: 2.5 - 1.0
+        Assert.Equal(200, pair.TokensOutDelta);  // tokens out: 1200 - 1000
+    }
+
+    [Fact]
+    public void ToPairDto_NegativeDeltas_WhenShadowCheaperAndFaster()
+    {
+        var row = Row(
+            primaryP50: 200, shadowP50: 120, primaryCost: 3.0m, shadowCost: 1.0m,
+            primaryTokensOut: 1500, shadowTokensOut: 1000);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 30);
+
+        Assert.Equal(-80, pair.LatencyDeltaMs);
+        Assert.Equal(-2.0m, pair.CostDeltaUsd);
+        Assert.Equal(-500, pair.TokensOutDelta);
+    }
+
+    [Fact]
+    public void ToPairDto_Projection_ScalesWindowTo30Days()
+    {
+        // A 15-day window covers half a month, so a cost delta of 1.0 projects to 2.0.
+        var row = Row(primaryCost: 0m, shadowCost: 1.0m);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 15);
+
+        Assert.Equal(2.0m, pair.ProjectedMonthlyCostDeltaUsd);
+    }
+
+    [Fact]
+    public void ToPairDto_Projection_30DayWindow_IsIdentity()
+    {
+        var row = Row(primaryCost: 1.0m, shadowCost: 2.0m);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 30);
+
+        Assert.Equal(1.0m, pair.CostDeltaUsd);
+        Assert.Equal(1.0m, pair.ProjectedMonthlyCostDeltaUsd);
+    }
+
+    [Fact]
+    public void ToPairDto_WindowDaysGuard_SubDayWindow_DoesNotOverProject()
+    {
+        // Sub-day windows are clamped to 1 day: the projection is costDelta * 30, not * 300.
+        var row = Row(primaryCost: 0m, shadowCost: 1.0m);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 0.1);
+
+        Assert.Equal(30m, pair.ProjectedMonthlyCostDeltaUsd);
+    }
+
+    [Fact]
+    public void ToPairDto_WindowDaysGuard_Zero_DoesNotDivideByZero()
+    {
+        // A zero window is clamped to 1 day as well (no divide-by-zero): 2.0 * 30.
+        var row = Row(primaryCost: 0m, shadowCost: 2.0m);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 0);
+
+        Assert.Equal(60m, pair.ProjectedMonthlyCostDeltaUsd);
+    }
+
+    [Fact]
+    public void ToPairDto_NullAgreementMetrics_DefaultToZero()
+    {
+        var row = Row(exactMatch: null, lengthRatio: null, bothPresent: null);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 30);
+
+        Assert.Equal(0, pair.ExactMatchRate);
+        Assert.Equal(0, pair.AvgLengthRatio);
+        Assert.Equal(0, pair.BothPresentRate);
+    }
+
+    [Fact]
+    public void ToPairDto_RoundsP50DoublesToInt()
+    {
+        var row = Row(primaryP50: 100.4, shadowP50: 150.6);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 30);
+
+        Assert.Equal(100, pair.PrimaryP50LatencyMs);
+        Assert.Equal(151, pair.ShadowP50LatencyMs);
+        Assert.Equal(51, pair.LatencyDeltaMs); // delta of the rounded values: 151 - 100
+    }
+
+    [Fact]
+    public void ToPairDto_PassesThroughIdentityFields()
+    {
+        var row = Row(runs: 42);
+        var pair = AdminAiQualityEndpoints.ToPairDto(row, windowDays: 30);
+
+        Assert.Equal("explain", pair.FeatureTag);
+        Assert.Equal("gpt-4.1-mini", pair.PrimaryModelId);
+        Assert.Equal("gpt-4.1-nano", pair.ShadowModelId);
+        Assert.Equal(42, pair.Runs);
+        Assert.Equal(DateTimeOffset.UnixEpoch, pair.FirstSeen);
+        Assert.Equal(DateTimeOffset.UnixEpoch.AddDays(1), pair.LastSeen);
+    }
+}