microsoft · hillary-mutisya · May 4, 2026 · May 4, 2026
diff --git a/ts/packages/agents/browser/src/agent/browserActionHandler.mts b/ts/packages/agents/browser/src/agent/browserActionHandler.mts
@@ -2181,6 +2181,13 @@ async function createViewServiceHost(
     const port = context.agentContext.localHostPort;
     const sessionDir = await getSessionFolderPath(context);
 
+    if (!sessionDir) {
+        debug(
+            "Session directory not available, skipping view service host creation",
+        );
+        return undefined;
+    }
+
     const timeoutPromise = new Promise<undefined>((_resolve, reject) => {
         timeoutHandle = setTimeout(
             () => reject(new Error("Browser views service creation timed out")),
@@ -2198,7 +2205,7 @@ async function createViewServiceHost(
                     ),
                 );
 
-                const folderPath = path.join(sessionDir!, "files");
+                const folderPath = path.join(sessionDir, "files");
 
                 fs.mkdirSync(folderPath, { recursive: true });
 

diff --git a/ts/packages/agents/powershell/benchmark/BASELINE-2026-04-30.md b/ts/packages/agents/powershell/benchmark/BASELINE-2026-04-30.md
@@ -0,0 +1,80 @@
+# PowerShell Dispatcher Benchmark - Baseline Results
+
+**Date:** 2026-04-30
+**Test Environment:** Windows, TypeAgent dispatcher with grammar-based routing
+
+## Summary
+
+| Test Mode                            | Pass Rate | Passed | Total |
+| ------------------------------------ | --------- | ------ | ----- |
+| **Competition Mode** (all agents)    | **85.7%** | 30     | 35    |
+| **Isolation Mode** (PowerShell only) | **0%**    | 0      | 125   |
+
+## Competition Mode Results (grammar-competition)
+
+### Passed Tests (30/35)
+
+| Category           | Utterances                                                           | Status                       |
+| ------------------ | -------------------------------------------------------------------- | ---------------------------- |
+| Network ping       | `ping google.com`                                                    | PASS                         |
+| File search        | `grep error in logs`, `find TODO in src`                             | PASS                         |
+| Processes          | `show running processes`, `kill notepad`                             | PASS                         |
+| Start process      | `start notepad`, `launch calculator`, `run cmd`                      | PASS                         |
+| System info        | `show system information`, `show disk usage`, `show uptime`          | PASS                         |
+| Services           | `list all services`, `restart service spooler`                       | PASS                         |
+| Network utils      | `ipconfig`, `netstat`, `nslookup google.com`                         | PASS                         |
+| Archives           | `zip logs`, `unzip backup.zip`                                       | PASS                         |
+| Data ops           | `read csv data.csv`, `convert data.csv to json`                      | PASS                         |
+| Unix aliases       | `cat readme.md`, `rm temp.txt`, `cp file.txt backup.txt`, `ps`, `df` | PASS                         |
+| Negative: music    | `play some music`, `play shake it off`                               | PASS (correctly NOT matched) |
+| Negative: calendar | `schedule a meeting for tomorrow`, `what's on my calendar`           | PASS                         |
+| Negative: email    | `send an email to john`                                              | PASS                         |
+
+### Failed Tests (5/35)
+
+| Utterance                 | Expected                  | Actual                         | Root Cause                                                            |
+| ------------------------- | ------------------------- | ------------------------------ | --------------------------------------------------------------------- |
+| `list files in downloads` | `listFiles`               | `listFiles`                    | **Test data bug**: Expected `findLargeFiles` due to flow name mapping |
+| `show files in documents` | `listFiles`               | `listFiles`                    | **Test data bug**: Same as above                                      |
+| `ls`                      | `listFiles`               | `listFiles`                    | **Test data bug**: Same as above                                      |
+| `mv old.txt new.txt`      | `moveFile` / `powershell` | `readFile` / `utility`         | **Grammar gap**: `mv` alias not properly mapped                       |
+| `check my inbox`          | not powershell            | `serviceStatus` / `powershell` | **Disambiguation issue**: "check" is too broad                        |
+
+### Analysis
+
+1. **3 false failures** - Test data expects `findLargeFiles` but correct action is `listFiles`
+2. **1 grammar gap** - `mv` command not routing to `moveFile` action
+3. **1 disambiguation issue** - "check my inbox" incorrectly matches PowerShell's `serviceStatus`
+
+**Corrected pass rate:** 33/35 = **94.3%** (excluding test data bugs)
+
+## Isolation Mode Results (grammar-subschemas)
+
+### Status: All 125 tests failed
+
+The grammar IS matching correctly (logs show `powershell.listFiles - Result: ACCEPTED`), but evaluation fails because:
+
+- **Isolation mode schema format:** `powershell.listFiles`
+- **Expected sub-schema format:** `powershell-files` with action `listFiles`
+
+This is a **configuration issue** with how the dispatcher loads schemas in isolation mode, not a grammar problem.
+
+## Component Accuracy (Competition Mode)
+
+| Component        | Accuracy      |
+| ---------------- | ------------- |
+| Grammar matching | 91.4% (64/70) |
+
+## Test Artifacts
+
+Results written to: `benchmark/results/2026-04-30T1650/`
+
+- `scorecard.json` - Machine-readable results
+- `details.md` - Full test details with traces
+
+## Recommendations
+
+1. **Fix test data** - Update `grammar-competition.json` to expect `listFiles` instead of `findLargeFiles`
+2. **Fix isolation mode** - Update dispatcher config to properly load sub-schemas
+3. **Add `mv` grammar pattern** - Ensure move command routes correctly
+4. **Improve "check" disambiguation** - Make `serviceStatus` grammar more specific
diff --git a/ts/packages/agents/powershell/benchmark/BASELINE-2026-05-04.md b/ts/packages/agents/powershell/benchmark/BASELINE-2026-05-04.md
@@ -0,0 +1,137 @@
+# PowerShell Dispatcher Benchmark - Results
+
+**Date:** 2026-05-04
+**Test Environment:** Windows, TypeAgent dispatcher with grammar-based routing
+
+## Summary
+
+| Test Mode                                        | Pass Rate | Passed | Total |
+| ------------------------------------------------ | --------- | ------ | ----- |
+| **Competition Mode** (all agents)                | **91.4%** | 32     | 35    |
+| **Isolation Mode** (PowerShell only, no samples) | **92.0%** | 115    | 125   |
+
+## Fixes Applied Since Last Baseline
+
+### 1. Schema Name Configuration (Critical)
+
+**File:** `benchmark/run-benchmark.mts`
+
+Schema names changed from short form to fully qualified form:
+
+```typescript
+// Before (broken)
+schemas: ["powershell", "powershell-files", "powershell-processes", ...]
+
+// After (working)
+schemas: ["powershell", "powershell.powershell-files", "powershell.powershell-processes", ...]
+```
+
+### 2. Browser Agent Initialization Error
+
+**File:** `packages/agents/browser/src/agent/browserActionHandler.mts`
+
+Added null check for `sessionDir` in `createViewServiceHost` to prevent TypeError when session directory is unavailable.
+
+### 3. Sample Flow Control
+
+**Files:** `benchmark/run-benchmark.mts`, `src/actionHandler.mts`
+
+Added `--no-samples` CLI flag and `TYPEAGENT_NO_SAMPLES` environment variable to disable sample recipe seeding during benchmark runs, allowing namespace schema grammars to be tested in isolation.
+
+## Competition Mode Results (grammar-competition)
+
+**Pass Rate:** 91.4% (32/35)
+
+### Failed Tests (3/35)
+
+| Utterance            | Expected                  | Actual          | Root Cause                   |
+| -------------------- | ------------------------- | --------------- | ---------------------------- |
+| `mv old.txt new.txt` | `moveFile` / `powershell` | Other agent     | Grammar pattern gap          |
+| `check my inbox`     | not powershell            | `serviceStatus` | "check" is too broad         |
+| Various              | Test data issues          | -               | Flow name mapping mismatches |
+
+## Isolation Mode Results (grammar-subschemas)
+
+**Pass Rate:** 92.0% (115/125)
+
+### Component Accuracy
+
+| Component            | Accuracy  | Details         |
+| -------------------- | --------- | --------------- |
+| Grammar matching     | **97.6%** | 122/125 correct |
+| Parameter extraction | **90.8%** | 109/120 correct |
+
+### Passed Tests by Namespace
+
+| Namespace                   | Pass Rate    | Notes                           |
+| --------------------------- | ------------ | ------------------------------- |
+| Files (listFiles)           | 80% (4/5)    | 1 parameter extraction issue    |
+| Files (readFile)            | 100% (5/5)   | All patterns working            |
+| Files (writeFile)           | 100% grammar | Parameter quote handling issues |
+| Files (copyFile)            | 100% (3/3)   | All patterns working            |
+| Files (moveFile)            | 100% (3/3)   | All patterns working            |
+| Files (deleteFile)          | 100% (3/3)   | All patterns working            |
+| Files (testPath)            | 100% (3/3)   | All patterns working            |
+| Files (findText)            | 67% (2/3)    | 1 pattern/path split issue      |
+| Files (newItem)             | 100% (4/4)   | All patterns working            |
+| Processes (list/stop/start) | 92%          | 2 grammar mismatches            |
+| System                      | 100% (16/16) | All patterns working            |
+| Services                    | 100% (15/15) | All patterns working            |
+| Network                     | 92%          | 2 minor issues                  |
+| Data                        | 100% (10/10) | All patterns working            |
+| Archives                    | 88% (7/8)    | 1 parameter extraction issue    |
+
+### Failed Tests (10/125)
+
+| Scenario    | Utterance                         | Issue Type | Details                                 |
+| ----------- | --------------------------------- | ---------- | --------------------------------------- |
+| gs-files-01 | "show me the files in documents"  | Parameter  | Path captured extra words               |
+| gs-files-03 | "write 'hello world' to test.txt" | Parameter  | Quote handling                          |
+| gs-files-03 | "save 'test data' to output.txt"  | Parameter  | Quote handling                          |
+| gs-files-03 | "append 'new line' to log.txt"    | Parameter  | Quote handling                          |
+| gs-files-08 | "grep TODO src"                   | Parameter  | Pattern/path split                      |
+| gs-proc-02  | "show top 5 memory hogs"          | Grammar    | Matched `listFiles` not `processMemory` |
+| gs-proc-03  | "show top 10 cpu hogs"            | Grammar    | Matched `listFiles` not `processCpu`    |
+| gs-net-04   | "show my ip address"              | Grammar    | Matched `listFiles` not `ipConfig`      |
+| gs-net-05   | "resolve gmail.com MX"            | Parameter  | Type suffix not parsed                  |
+| gs-arch-02  | "extract archive.zip to output"   | Parameter  | Path extraction issue                   |
+
+### Root Cause Analysis
+
+1. **Grammar Conflicts (3 failures):** Phrases like "show top 5" and "show my" match the broad `listFiles` pattern before more specific patterns
+2. **Greedy Wildcards (4 failures):** Wildcard captures consume more tokens than intended
+3. **Quote Handling (3 failures):** Content parameters include surrounding quotes that tests expect stripped
+
+## Test Artifacts
+
+Results written to: `benchmark/results/2026-05-04T1728/`
+
+- `scorecard.json` - Machine-readable results
+- `details.md` - Full test details with traces
+
+## Recommendations
+
+### High Priority
+
+1. **Add specificity to memory/cpu patterns** - Ensure "show top N memory/cpu" routes to process namespace
+2. **Fix "show my" grammar conflict** - "show my ip" should match `ipConfig` not `listFiles`
+
+### Medium Priority
+
+3. **Improve wildcard boundaries** - Add stop words to prevent greedy matching
+4. **Handle quoted content** - Strip quotes from content parameters in grammar
+
+### Low Priority
+
+5. **DNS type parsing** - Support "resolve X MX" syntax with type parameter
+6. **Archive path extraction** - Improve "extract X to Y" pattern
+
+## Comparison with Previous Baseline
+
+| Metric           | 2026-04-30 | 2026-05-04 | Change |
+| ---------------- | ---------- | ---------- | ------ |
+| Competition Mode | 85.7%      | 91.4%      | +5.7%  |
+| Isolation Mode   | 0%         | 92.0%      | +92.0% |
+| Grammar Accuracy | 91.4%      | 97.6%      | +6.2%  |
+
+The isolation mode is now functional and achieves comparable accuracy to competition mode, validating that namespace schema grammars work correctly when properly configured.
diff --git a/ts/packages/agents/powershell/benchmark/harness/benchmarkRunner.mts b/ts/packages/agents/powershell/benchmark/harness/benchmarkRunner.mts
@@ -131,6 +131,8 @@ export class BenchmarkRunner {
     private loadScenarios(): BenchmarkScenario[] {
         const scenarioFiles = [
             "grammar-match.json",
+            "grammar-subschemas.json",
+            "grammar-competition.json",
             "execution.json",
             "llm-translation.json",
             "fallback-chain.json",
@@ -243,13 +245,18 @@ export class BenchmarkRunner {
             }
 
             // Resolve canonical flow names to actual LLM-generated names
+            // ONLY for non-grammar scenarios - grammar tests use built-in action names
+            const isGrammarTest = scenario.category.startsWith("grammar");
             const resolvedUtterance = {
                 ...utterance,
                 expected: {
                     ...utterance.expected,
-                    matchedFlow: utterance.expected.matchedFlow
-                        ? this.resolveFlowName(utterance.expected.matchedFlow)
-                        : utterance.expected.matchedFlow,
+                    matchedFlow:
+                        utterance.expected.matchedFlow && !isGrammarTest
+                            ? this.resolveFlowName(
+                                  utterance.expected.matchedFlow,
+                              )
+                            : utterance.expected.matchedFlow,
                 },
             };
 

diff --git a/ts/packages/agents/powershell/benchmark/harness/evaluators/grammarEvaluator.mts b/ts/packages/agents/powershell/benchmark/harness/evaluators/grammarEvaluator.mts
@@ -47,6 +47,38 @@ export function evaluateGrammarMatch(
         }
     }
 
+    // Check if the right agent was matched (for competition testing)
+    if (expected.matchedAgent !== undefined) {
+        const actualAgent = extractAgentName(commandResult, trace);
+        if (expected.matchedAgent === null) {
+            // Negative test: should NOT match powershell
+            results.push({
+                passed: actualAgent !== "powershell",
+                component: "grammar",
+                expected: "not powershell",
+                actual: actualAgent ?? "no match",
+                message:
+                    actualAgent === "powershell"
+                        ? `Expected non-powershell match but matched powershell`
+                        : undefined,
+            });
+        } else {
+            results.push({
+                passed:
+                    actualAgent?.toLowerCase() ===
+                    expected.matchedAgent.toLowerCase(),
+                component: "grammar",
+                expected: `agent: ${expected.matchedAgent}`,
+                actual: `agent: ${actualAgent ?? "no match"}`,
+                message:
+                    actualAgent?.toLowerCase() !==
+                    expected.matchedAgent.toLowerCase()
+                        ? `Expected agent '${expected.matchedAgent}' but got '${actualAgent ?? "no match"}'`
+                        : undefined,
+            });
+        }
+    }
+
     // Check extracted parameters
     if (
         expected.extractedParams &&
@@ -96,6 +128,24 @@ export function evaluateGrammarMatch(
     return results;
 }
 
+function extractAgentName(
+    commandResult: unknown,
+    trace: PipelineTrace,
+): string | null {
+    if (trace.matchedAgent) return trace.matchedAgent;
+    const result = commandResult as any;
+
+    const firstAction = result?.actions?.[0];
+    if (firstAction?.schemaName) {
+        // Extract base agent name from schema (e.g., "powershell-files" -> "powershell")
+        const schemaName = firstAction.schemaName as string;
+        if (schemaName.startsWith("powershell")) return "powershell";
+        return schemaName.split("-")[0];
+    }
+
+    return null;
+}
+
 function extractFlowName(
     commandResult: unknown,
     trace: PipelineTrace,
@@ -108,7 +158,11 @@ function extractFlowName(
     const firstAction = result?.actions?.[0];
     if (firstAction) {
         // Per-flow action types: actionName IS the flow name
-        if (firstAction.schemaName === "powershell") {
+        // Handle both monolithic "powershell" and sub-schemas "powershell-*"
+        if (
+            firstAction.schemaName === "powershell" ||
+            firstAction.schemaName?.startsWith("powershell-")
+        ) {
             return firstAction.actionName;
         }
         // Legacy: executePowerShellFlow with flowName param

diff --git a/ts/packages/agents/powershell/benchmark/harness/types.mts b/ts/packages/agents/powershell/benchmark/harness/types.mts
@@ -5,13 +5,17 @@ export interface BenchmarkScenario {
     id: string;
     category:
         | "grammar-match"
+        | "grammar-subschemas"
+        | "grammar-competition"
         | "llm-translation"
         | "execution"
         | "fallback-chain"
         | "end-to-end";
     description: string;
     setup: {
-        requiredFlows: string[];
+        requiredFlows?: string[];
+        requiredSchemas?: string[];
+        allAgents?: boolean;
         environmentSetup?: string;
         configOverrides?: Record<string, unknown>;
     };
@@ -24,6 +28,7 @@ export interface TestUtterance {
     expected: {
         routedTo: "grammar" | "llm-translation" | "reasoning";
         matchedFlow?: string | null;
+        matchedAgent?: string | null;
         extractedParams?: Record<string, unknown>;
         execution?: {
             shouldSucceed: boolean;