Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

### Phase 12 — ContinuousEvalWorker (AI-079, slice 5a) (2026-06-18)

Automates the eval suite on a cadence so quality regressions on prod are caught without an admin clicking "run evals". `ContinuousEvalWorker` (Api host `BackgroundService`, ~10 min startup delay + hourly check) runs `EvalSuiteRunner` for the configured features when due (no `run_type='scheduled'` row newer than `Eval:Scheduled:IntervalHours`, default 24h), persists with the new **`eval_runs.run_type`** column (`scheduled`/`manual`), and emails the admin (via `ResendEmailService`, no-op if unset) when a feature's score drops ≥ `RegressionDrop` (default 0.5 on the 1–5 scale) vs the **prior scheduled** run (`EvalRegressionDetector`, pure). **OFF by default** (`Eval:Scheduled:Enabled=false`) — it spends money on judge-model calls when enabled, so it also respects an optional `eval.judge` daily cap (fail-open). Concurrency-safe: the in-process overlap guard the admin trigger used is extracted into a shared singleton `IEvalRunGate` (so a scheduled run + an admin run can't collide), plus a **Postgres advisory lock** for multi-replica. The worker never crashes the host (every tick wrapped); the advisory lock is released with `CancellationToken.None` so host shutdown can't leak it onto a pooled connection (QA P1); the admin trigger releases the gate even on a synchronous setup failure (QA P2). New `GET /admin/ai-quality/drift/eval-trend` exposes the scheduled-only score trend (consumed by the Drift tab in slice 5b). Migration `AddEvalRunType` (backfills existing rows to `manual`). 780 unit tests green. Slice 5b (DriftDetectionWorker + Drift tab) follows.

### Phase 12 — cost-aware routing + per-feature daily budget (AI-078, slice 4) (2026-06-18)

Per-feature daily USD budgets with cost-aware enforcement — the DoD "cost-aware routing cuts spend" lever.
Expand Down
4 changes: 3 additions & 1 deletion backend/src/Ai/TextStack.Ai.EvalSuite/EvalSuiteRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ public async Task<IReadOnlyList<EvalRunResult>> RunAsync(
bool persist,
IAppDbContext? db,
string? gitSha,
CancellationToken ct)
CancellationToken ct,
string runType = "manual")
{
var defs = EvalDefinitions.Build(keys);
var chatConfig = new ChatConfiguration(new LlmServiceChatClient(judgeClient, defaultFeatureTag: "eval.judge"));
Expand Down Expand Up @@ -85,6 +86,7 @@ public async Task<IReadOnlyList<EvalRunResult>> RunAsync(
N = summary.N,
BreakdownJson = Breakdown(entry.Rubric, summary),
GitSha = gitSha,
RunType = runType,
CreatedAt = DateTimeOffset.UtcNow,
});
}
Expand Down
78 changes: 60 additions & 18 deletions backend/src/Api/Endpoints/AdminAiQualityEndpoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ public static void MapAdminAiQualityEndpoints(this WebApplication app)
group.MapGet("/agent-runs", GetAgentRuns);
group.MapGet("/agent-runs/{id:guid}", GetAgentRun);
group.MapGet("/evals", GetEvals);
group.MapGet("/drift/eval-trend", GetEvalTrend);
group.MapPost("/evals/run", RunEvals);
group.MapGet("/evals/status", GetEvalStatus);
group.MapPost("/evals/toolcalls/run", RunToolCallEval);
Expand Down Expand Up @@ -236,37 +237,51 @@ private static async Task<IResult> RunToolCallEval(
}

// In-app eval runner state (one run at a time). Triggered from the admin Evals tab.
// The single-slot guard is now the shared IEvalRunGate (slice 5a) so an admin run and a
// scheduled ContinuousEvalWorker run can't collide; these fields are only display state.
private static volatile bool _evalRunning;
private static DateTimeOffset? _evalStartedAt;
private static string? _evalLastError;
private static readonly object _evalLock = new();

private static IResult RunEvals(
[FromBody] RunEvalsRequest? body,
IServiceScopeFactory scopeFactory,
IConfiguration config,
Application.Ai.IEvalRunGate gate,
ILogger<Program> logger)
{
lock (_evalLock)
if (!gate.TryEnter())
return Results.Conflict(new { error = "An eval run is already in progress" });

// The gate is now held; any synchronous failure before the background task's
// finally takes over MUST release it, else evals are blocked until restart (QA P2).
string judgeKey, judgeModelId;
string? gitSha;
IReadOnlyList<string>? features;
try
{
if (_evalRunning)
return Results.Conflict(new { error = "An eval run is already in progress" });
_evalRunning = true;
_evalStartedAt = DateTimeOffset.UtcNow;
_evalLastError = null;
}

// Judge defaults to local Ollama (free); generation always goes through the
// gateway (routes by FeatureTag → OpenAI/Ollama exactly like prod).
// The OpenAI judge runs the dedicated 'openai-judge' provider (Eval:JudgeModel,
// default gpt-4.1) — stronger + independent of the nano generation model.
var useOpenAiJudge = string.Equals(body?.Judge, "openai", StringComparison.OrdinalIgnoreCase);
var judgeKey = useOpenAiJudge ? "openai-judge" : "ollama";
var judgeModelId = useOpenAiJudge
? config["Eval:JudgeModel"] ?? "gpt-4.1"
: config["Ollama:Model"] ?? "gemma4:e2b";
var gitSha = Environment.GetEnvironmentVariable("GIT_SHA");
var features = body?.Features;
// Judge defaults to local Ollama (free); generation always goes through the
// gateway (routes by FeatureTag → OpenAI/Ollama exactly like prod).
// The OpenAI judge runs the dedicated 'openai-judge' provider (Eval:JudgeModel,
// default gpt-4.1) — stronger + independent of the nano generation model.
var useOpenAiJudge = string.Equals(body?.Judge, "openai", StringComparison.OrdinalIgnoreCase);
judgeKey = useOpenAiJudge ? "openai-judge" : "ollama";
judgeModelId = useOpenAiJudge
? config["Eval:JudgeModel"] ?? "gpt-4.1"
: config["Ollama:Model"] ?? "gemma4:e2b";
gitSha = Environment.GetEnvironmentVariable("GIT_SHA");
features = body?.Features;
}
catch
{
_evalRunning = false;
gate.Exit();
throw;
}

_ = Task.Run(async () =>
{
Expand All @@ -278,7 +293,7 @@ private static IResult RunEvals(
var gateway = sp.GetRequiredService<ILlmService>();
var judge = sp.GetRequiredKeyedService<ILlmService>(judgeKey);
var db = sp.GetRequiredService<IAppDbContext>();
await runner.RunAsync(_ => gateway, judge, judgeModelId, features, persist: true, db, gitSha, CancellationToken.None);
await runner.RunAsync(_ => gateway, judge, judgeModelId, features, persist: true, db, gitSha, CancellationToken.None, runType: "manual");
}
catch (Exception ex)
{
Expand All @@ -288,6 +303,7 @@ private static IResult RunEvals(
finally
{
_evalRunning = false;
gate.Exit();
}
});

Expand Down Expand Up @@ -417,12 +433,38 @@ private static async Task<IResult> GetEvals(
.OrderByDescending(r => r.CreatedAt)
.Take(limit)
.Select(r => new EvalRunDto(
r.Id, r.Feature, r.ModelId, r.JudgeModelId, r.Score, r.N, r.BreakdownJson, r.GitSha, r.CreatedAt))
r.Id, r.Feature, r.ModelId, r.JudgeModelId, r.Score, r.N, r.BreakdownJson, r.GitSha, r.RunType, r.CreatedAt))
.ToListAsync(ct);

return Results.Ok(runs);
}

// Phase 12 RLOps slice 5a: scheduled-only eval trend, consumed by the Drift tab (slice 5b UI).
// Returns the newest RunType='scheduled' rows, newest-first. `limit` is clamped to 1..1000;
// when `feature` is non-blank the result is narrowed to that (trimmed) feature only.
private static async Task<IResult> GetEvalTrend(
    AppDbContext db,
    [FromQuery] string? feature,
    [FromQuery] int limit = 100,
    CancellationToken ct = default)
{
    // Keep the page size inside sane bounds regardless of what the caller sent.
    var rowCap = Math.Clamp(limit, 1, 1000);

    var scheduledRuns = db.EvalRuns.Where(run => run.RunType == "scheduled");

    // Optional per-feature narrowing; blank/whitespace means "all features".
    if (!string.IsNullOrWhiteSpace(feature))
    {
        var wantedFeature = feature.Trim();
        scheduledRuns = scheduledRuns.Where(run => run.Feature == wantedFeature);
    }

    var newestFirst = scheduledRuns
        .OrderByDescending(run => run.CreatedAt)
        .Take(rowCap);

    // Project straight to the DTO; a missing git SHA is surfaced as an empty string.
    var trend = await newestFirst
        .Select(run => new ScheduledEvalPointDto(
            run.Feature, run.ModelId, (double)run.Score, run.N, run.GitSha ?? "", run.CreatedAt))
        .ToListAsync(ct);

    IReadOnlyList<ScheduledEvalPointDto> payload = trend;
    return Results.Ok(payload);
}

private static async Task<IResult> GetSummary(
AppDbContext db,
[FromQuery] DateTimeOffset? from,
Expand Down
6 changes: 6 additions & 0 deletions backend/src/Api/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
// Application layer
builder.Services.AddApplication();
// Eval suite runner, registered as a singleton.
builder.Services.AddSingleton<TextStack.Ai.EvalSuite.EvalSuiteRunner>();
// Scheduled-eval support (Phase 12 RLOps slice 5a): shared single-slot overlap gate +
// pure regression detector, consumed by both the admin trigger and ContinuousEvalWorker.
// The gate is a singleton so every consumer resolves the same instance.
builder.Services.AddSingleton<Application.Ai.IEvalRunGate, Application.Ai.EvalRunGate>();
builder.Services.AddSingleton<Application.Ai.EvalRegressionDetector>();
// Remaining eval runners, also registered as singletons.
builder.Services.AddSingleton<TextStack.Ai.EvalSuite.RagEvalRunner>();
builder.Services.AddSingleton<TextStack.Ai.EvalSuite.ToolCallEvalRunner>();
builder.Services.AddSingleton<TextStack.Ai.EvalSuite.StudyBuddyEvalRunner>();
Expand Down Expand Up @@ -222,6 +226,8 @@

// AI-058: weekly semantic concept clustering (groups vocab by meaning across all books)
builder.Services.AddHostedService<ConceptClusteringWorker>();
// Phase 12 RLOps slice 5a: scheduled continuous evals (OFF by default — Eval:Scheduled:Enabled).
builder.Services.AddHostedService<ContinuousEvalWorker>();

// Rate limiting
builder.Services.AddRateLimiter(options =>
Expand Down
Loading
Loading