diff --git a/evals/README.md b/evals/README.md index 1199241..e6605d0 100644 --- a/evals/README.md +++ b/evals/README.md @@ -28,17 +28,17 @@ Run these after any changes to the provider, mock, or shared utilities to catch # From evals/ # Run a single suite (all test cases) -npm run eval:aiconfig-create # ai-configs/aiconfig-create -npm run eval:aiconfig-update # ai-configs/aiconfig-update -npm run eval:aiconfig-tools # ai-configs/aiconfig-tools -npm run eval:aiconfig-variations # ai-configs/aiconfig-variations +npm run eval:configs-create # agentcontrol/configs-create +npm run eval:configs-update # agentcontrol/configs-update +npm run eval:agentcontrol-tools # agentcontrol/tools +npm run eval:configs-variations # agentcontrol/configs-variations npm run eval:flag-create # feature-flags/launchdarkly-flag-create # Quick smoke check — first test case only (~15-20s, ~$0.05) -npm run eval:aiconfig-create:single -npm run eval:aiconfig-update:single -npm run eval:aiconfig-tools:single -npm run eval:aiconfig-variations:single +npm run eval:configs-create:single +npm run eval:configs-update:single +npm run eval:agentcontrol-tools:single +npm run eval:configs-variations:single npm run eval:flag-create:single # Aggregate and CI operations @@ -147,7 +147,7 @@ This handles agents that call `get-foo` before AND after mutation; using `indexO ### Cross-model evaluation (`run-models.js`) -The cross-model runner evaluates all suites against one or more model aliases without touching the canonical `eval-scores.json`. Results are written to `<suite>/results.<alias>.json` (e.g., `aiconfig-create/results.haiku.json`). +The cross-model runner evaluates all suites against one or more model aliases without touching the canonical `eval-scores.json`. Results are written to `<suite>/results.<alias>.json` (e.g., `configs-create/results.haiku.json`). ```bash npm run eval:haiku # claude-haiku-4-5-20251001 @@ -222,7 +222,7 @@ Read the SKILL.md and note every MCP tool it references. 
Verify each tool exists mkdir <suite> ``` -Use the same name as the skill directory (e.g., `aiconfig-create`). Create `promptfooconfig.yaml`: +Use the same name as the skill directory (e.g., `configs-create`). Create `promptfooconfig.yaml`: ```yaml # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json @@ -264,7 +264,7 @@ Add an entry to `scripts/_manifest.js`: ```js { suite: "<suite>", - skillKey: "<category>/<skill>", // e.g. "ai-configs/aiconfig-create" + skillKey: "<category>/<skill>", // e.g. "agentcontrol/configs-create" skillDir: "skills/<category>/<skill>", readme: "skills/<category>/<skill>/README.md", }, @@ -364,7 +364,7 @@ Running `npm run eval:all` writes a summary at the repo root: "updatedAt": "2026-05-19T00:00:00Z", "lastCommit": "fc69376", "skills": { - "ai-configs/aiconfig-create": { + "agentcontrol/configs-create": { "score": 100, "passed": 4, "total": 4, @@ -377,6 +377,6 @@ Running `npm run eval:all` writes a summary at the repo root: ``` - `lastCommit` — the short git SHA at the time of the last `eval:all` run. Used by `eval:diff` to determine which suites have changed since scores were recorded. -- `skillKey` — the canonical key is `<category>/<skill>` (e.g., `ai-configs/aiconfig-create`). +- `skillKey` — the canonical key is `<category>/<skill>` (e.g., `agentcontrol/configs-create`). Run `node scripts/aggregate.js` (without `--run`) to rebuild this file from existing `<suite>/results.json` files without making any API calls. 
diff --git a/evals/aiconfig-tools/promptfooconfig.yaml b/evals/agentcontrol-tools/promptfooconfig.yaml similarity index 97% rename from evals/aiconfig-tools/promptfooconfig.yaml rename to evals/agentcontrol-tools/promptfooconfig.yaml index 90af29d..a0878bb 100644 --- a/evals/aiconfig-tools/promptfooconfig.yaml +++ b/evals/agentcontrol-tools/promptfooconfig.yaml @@ -1,13 +1,13 @@ # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json # # Run with shared defaults: -# promptfoo eval -c shared/defaults.yaml -c aiconfig-tools/promptfooconfig.yaml +# promptfoo eval -c shared/defaults.yaml -c agentcontrol-tools/promptfooconfig.yaml # -# The aiconfig-tools skill covers creating agent tool definitions and attaching +# The agentcontrol-tools skill covers creating agent tool definitions and attaching # them to config variations. Key invariant: tools must be created with # raw JSON Schema format (not OpenAI function-calling wrapper), and must be # created before being attached. -description: "End-to-end evaluation of the aiconfig-tools skill" +description: "End-to-end evaluation of the agentcontrol-tools skill" prompts: - file://../../skills/agentcontrol/tools/SKILL.md @@ -67,7 +67,7 @@ tests: - type: llm-rubric threshold: 0.75 value: | - Evaluate the aiconfig-tools workflow: + Evaluate the agentcontrol-tools workflow: 1. Did it create the tool first with create-ai-tool? 2. Did the tool schema use raw JSON Schema format (type: object, properties)? 3. Did the schema include both requested parameters (query, limit)? 
diff --git a/evals/aiconfig-create/promptfooconfig.yaml b/evals/configs-create/promptfooconfig.yaml similarity index 95% rename from evals/aiconfig-create/promptfooconfig.yaml rename to evals/configs-create/promptfooconfig.yaml index c29ca4c..6b0dbbd 100644 --- a/evals/aiconfig-create/promptfooconfig.yaml +++ b/evals/configs-create/promptfooconfig.yaml @@ -1,13 +1,13 @@ # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json # # Run with shared defaults: -# promptfoo eval -c shared/defaults.yaml -c aiconfig-create/promptfooconfig.yaml +# promptfoo eval -c shared/defaults.yaml -c configs-create/promptfooconfig.yaml # -# The aiconfig-create skill guides the agent through choosing agent vs +# The configs-create skill guides the agent through choosing agent vs # completion mode, creating a config + variation, and verifying the setup. # The recommended path is setup-ai-config (one-step); the agent may also # use create-ai-config + create-ai-config-variation for more control. -description: "End-to-end evaluation of the aiconfig-create skill" +description: "End-to-end evaluation of the configs-create skill" prompts: - file://../../skills/agentcontrol/configs-create/SKILL.md @@ -179,9 +179,9 @@ tests: # ------------------------------------------------------------------ # Test 4: Targeting reminder # After creation, the skill should remind the user they need to run - # targeting (/aiconfig-targeting) before the SDK returns enabled=True. + # targeting (/configs-targeting) before the SDK returns enabled=True. # ------------------------------------------------------------------ - - description: "Post-creation: response mentions targeting step or aiconfig-targeting" + - description: "Post-creation: response mentions targeting step or configs-targeting" vars: user_request: > Create a config called "email-assistant" for composing emails. @@ -202,7 +202,7 @@ tests: after creation. Score 1.0 if the response: 1. Confirms the config was created successfully. 2. 
Mentions that targeting must be configured before the SDK returns enabled=True, - OR references the /aiconfig-targeting skill, OR notes the fallthrough points at + OR references the /configs-targeting skill, OR notes the fallthrough points at a disabled variation. Score 0.5 if it mentions the config was created but omits the targeting warning. Score 0.0 if it neither confirms creation nor mentions targeting. diff --git a/evals/aiconfig-update/promptfooconfig.yaml b/evals/configs-update/promptfooconfig.yaml similarity index 97% rename from evals/aiconfig-update/promptfooconfig.yaml rename to evals/configs-update/promptfooconfig.yaml index 0724fe8..c979ec4 100644 --- a/evals/aiconfig-update/promptfooconfig.yaml +++ b/evals/configs-update/promptfooconfig.yaml @@ -1,11 +1,11 @@ # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json # # Run with shared defaults: -# promptfoo eval -c shared/defaults.yaml -c aiconfig-update/promptfooconfig.yaml +# promptfoo eval -c shared/defaults.yaml -c configs-update/promptfooconfig.yaml # -# The aiconfig-update skill covers updating variation model/prompts/parameters, +# The configs-update skill covers updating variation model/prompts/parameters, # updating config metadata, archiving instead of deleting, and verification. -description: "End-to-end evaluation of the aiconfig-update skill" +description: "End-to-end evaluation of the configs-update skill" prompts: - file://../../skills/agentcontrol/configs-update/SKILL.md @@ -62,7 +62,7 @@ tests: - type: llm-rubric threshold: 0.7 value: | - Evaluate the aiconfig-update workflow: + Evaluate the configs-update workflow: 1. Did it explore current state (health or get-ai-config) before mutating? 2. Did it use update-ai-config-variation to change the model? 3. Did it use correct Provider.model-id format for modelConfigKey (e.g. OpenAI.gpt-4o-mini)? 
diff --git a/evals/aiconfig-variations/promptfooconfig.yaml b/evals/configs-variations/promptfooconfig.yaml similarity index 98% rename from evals/aiconfig-variations/promptfooconfig.yaml rename to evals/configs-variations/promptfooconfig.yaml index 6edcc82..c83dadd 100644 --- a/evals/aiconfig-variations/promptfooconfig.yaml +++ b/evals/configs-variations/promptfooconfig.yaml @@ -1,12 +1,12 @@ # yaml-language-server: $schema=https://promptfoo.dev/config-schema.json # # Run with shared defaults: -# promptfoo eval -c shared/defaults.yaml -c aiconfig-variations/promptfooconfig.yaml +# promptfoo eval -c shared/defaults.yaml -c configs-variations/promptfooconfig.yaml # -# The aiconfig-variations skill covers cloning a variation to test one change +# The configs-variations skill covers cloning a variation to test one change # at a time (the primary path), creating from scratch (when explicitly asked), # and safety rules around not deleting the baseline variation. -description: "End-to-end evaluation of the aiconfig-variations skill" +description: "End-to-end evaluation of the configs-variations skill" prompts: - file://../../skills/agentcontrol/configs-variations/SKILL.md diff --git a/evals/package.json b/evals/package.json index b2e2db2..3a597e7 100644 --- a/evals/package.json +++ b/evals/package.json @@ -3,14 +3,14 @@ "private": true, "type": "commonjs", "scripts": { - "eval:aiconfig-create": "promptfoo eval -c shared/defaults.yaml -c aiconfig-create/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-create/results.json", - "eval:aiconfig-create:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-create/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", - "eval:aiconfig-update": "promptfoo eval -c shared/defaults.yaml -c aiconfig-update/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-update/results.json", - "eval:aiconfig-update:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-update/promptfooconfig.yaml 
--env-file .env --no-cache --filter-first-n 1", - "eval:aiconfig-tools": "promptfoo eval -c shared/defaults.yaml -c aiconfig-tools/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-tools/results.json", - "eval:aiconfig-tools:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-tools/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", - "eval:aiconfig-variations": "promptfoo eval -c shared/defaults.yaml -c aiconfig-variations/promptfooconfig.yaml --env-file .env --no-cache -o aiconfig-variations/results.json", - "eval:aiconfig-variations:single": "promptfoo eval -c shared/defaults.yaml -c aiconfig-variations/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", + "eval:configs-create": "promptfoo eval -c shared/defaults.yaml -c configs-create/promptfooconfig.yaml --env-file .env --no-cache -o configs-create/results.json", + "eval:configs-create:single": "promptfoo eval -c shared/defaults.yaml -c configs-create/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", + "eval:configs-update": "promptfoo eval -c shared/defaults.yaml -c configs-update/promptfooconfig.yaml --env-file .env --no-cache -o configs-update/results.json", + "eval:configs-update:single": "promptfoo eval -c shared/defaults.yaml -c configs-update/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", + "eval:agentcontrol-tools": "promptfoo eval -c shared/defaults.yaml -c agentcontrol-tools/promptfooconfig.yaml --env-file .env --no-cache -o agentcontrol-tools/results.json", + "eval:agentcontrol-tools:single": "promptfoo eval -c shared/defaults.yaml -c agentcontrol-tools/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", + "eval:configs-variations": "promptfoo eval -c shared/defaults.yaml -c configs-variations/promptfooconfig.yaml --env-file .env --no-cache -o configs-variations/results.json", + "eval:configs-variations:single": "promptfoo eval -c shared/defaults.yaml -c 
configs-variations/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", "eval:flag-create": "promptfoo eval -c shared/defaults.yaml -c launchdarkly-flag-create/promptfooconfig.yaml --env-file .env --no-cache -o launchdarkly-flag-create/results.json", "eval:flag-create:single": "promptfoo eval -c shared/defaults.yaml -c launchdarkly-flag-create/promptfooconfig.yaml --env-file .env --no-cache --filter-first-n 1", "eval:all": "node scripts/aggregate.js --run", diff --git a/evals/scripts/_manifest.js b/evals/scripts/_manifest.js index 711d87f..c0ca881 100644 --- a/evals/scripts/_manifest.js +++ b/evals/scripts/_manifest.js @@ -14,25 +14,25 @@ */ const SUITES = [ { - suite: "aiconfig-create", + suite: "configs-create", skillKey: "agentcontrol/configs-create", skillDir: "skills/agentcontrol/configs-create", readme: "skills/agentcontrol/configs-create/README.md", }, { - suite: "aiconfig-update", + suite: "configs-update", skillKey: "agentcontrol/configs-update", skillDir: "skills/agentcontrol/configs-update", readme: "skills/agentcontrol/configs-update/README.md", }, { - suite: "aiconfig-tools", + suite: "agentcontrol-tools", skillKey: "agentcontrol/tools", skillDir: "skills/agentcontrol/tools", readme: "skills/agentcontrol/tools/README.md", }, { - suite: "aiconfig-variations", + suite: "configs-variations", skillKey: "agentcontrol/configs-variations", skillDir: "skills/agentcontrol/configs-variations", readme: "skills/agentcontrol/configs-variations/README.md", diff --git a/evals/scripts/_models.js b/evals/scripts/_models.js index 3396767..7aa5781 100644 --- a/evals/scripts/_models.js +++ b/evals/scripts/_models.js @@ -30,7 +30,7 @@ function resolveModel(input) { /** * Reverse-lookup a friendly alias for a model id, falling back to the model * id itself. Used to label per-model output files like - * `aiconfig-create/results.haiku.json`. + * `configs-create/results.haiku.json`. 
*/ function aliasFor(modelId) { for (const [alias, id] of Object.entries(MODEL_ALIASES)) { diff --git a/evals/scripts/aggregate.js b/evals/scripts/aggregate.js index 7218375..1c14ece 100644 --- a/evals/scripts/aggregate.js +++ b/evals/scripts/aggregate.js @@ -6,7 +6,7 @@ * Modes: * node scripts/aggregate.js # rebuild from existing results.json * node scripts/aggregate.js --run # run every suite then aggregate - * node scripts/aggregate.js --run --only=aiconfig-create,aiconfig-update + * node scripts/aggregate.js --run --only=configs-create,configs-update * * Exits 0 on success, 1 on failure. */ diff --git a/evals/scripts/run-models.js b/evals/scripts/run-models.js index 2e5b5c8..fa55ca2 100644 --- a/evals/scripts/run-models.js +++ b/evals/scripts/run-models.js @@ -9,7 +9,7 @@ * * Usage: * node scripts/run-models.js --model=haiku - * node scripts/run-models.js --model=sonnet --only=aiconfig-create + * node scripts/run-models.js --model=sonnet --only=configs-create * node scripts/run-models.js --models=haiku,sonnet,opus * * Output: @@ -78,7 +78,7 @@ function usage() { "Examples:", " npm run eval:haiku", " npm run eval:matrix", - " node scripts/run-models.js --model=haiku --only=aiconfig-create", + " node scripts/run-models.js --model=haiku --only=configs-create", ].join("\n"); }