diff --git a/package.json b/package.json
index ed50085..103b952 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.33.3",
+  "version": "4.34.0",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"
diff --git a/samples/streaming-dual-channel-mic-system/README.md b/samples/streaming-dual-channel-mic-system/README.md
new file mode 100644
index 0000000..8172f8e
--- /dev/null
+++ b/samples/streaming-dual-channel-mic-system/README.md
@@ -0,0 +1,240 @@
+# streaming-dual-channel-mic-system
+
+Browser sample that streams **microphone + system audio** as a single mixed mono
+stream to AssemblyAI's Streaming v3 endpoint, with per-word physical-channel
+attribution (`mic` / `system`) layered on top of AAI's voice diarization
+(`speaker_label`).
+
+## Run
+
+```bash
+npm install
+npm run dev
+```
+
+Then open the printed URL in Chrome, paste a streaming temporary token, and
+click **Start**.
+
+Note: the sample's `package.json` references `"assemblyai": "file:../.."` so it
+builds against the local SDK source. Run `pnpm build` (or `npm run build`) once
+at the SDK root before installing here.
+
+## Getting a temporary token
+
+API-key auth is unsupported in browsers. Mint a token from your backend:
+
+```ts
+const token = await client.streaming.createTemporaryToken({
+  expires_in_seconds: 600,
+})
+```
+
+## Swappable VAD
+
+The SDK's `channelAttribution.createVad` factory is an extension point — any
+class that implements `VadDetector` (`process(frame: Float32Array) → { active,
+energy }` plus `reset()`) can replace the default `EnergyVad`. To plug in a
+custom VAD (Silero / DNN / your own), pass a factory:
+
+```ts
+channelAttribution: {
+  createVad: (channelName) => new YourCustomVadDetector(channelName),
+}
+```
+
+The factory is called once per declared channel at transcriber construction
+time, and the channel name (`mic` / `system` / whatever you declared in
+`channels: [{ name }]`) is passed in — so factories that wrap higher-level VAD
+libraries (which manage their own audio source) can map each `VadDetector`
+instance to its corresponding channel.
+
+This sample uses the default `EnergyVad` and exposes its tuning knobs via the
+sliders described below.
+
+## Resolve unknown channels
+
+[`channelAttribution.resolveUnknownChannelsMethod`](../../src/types/streaming/index.ts)
+controls how words whose per-word VAD attribution resolved to `"unknown"` are
+filled in. Confident per-word VAD decisions (`"mic"` / `"system"`) are never
+modified by any strategy. Default: `"window"`.
+
+The sample's "Resolve unknown channels" dropdown switches between:
+
+- **`window`** (default): look at the dominant non-`"unknown"` channel among
+  ±2 neighboring words in the same turn. Ignores `speaker_label`, so it
+  works even when AAI re-uses a label for two physically distinct voices.
+  Words with no non-`"unknown"` neighbors stay `"unknown"`.
+- **`speaker-history`**: accumulate per-`speaker_label` per-channel active
+  VAD energy across the session. Fill `"unknown"` words with the speaker's
+  dominant channel when their total evidence clears
+  `speakerHistoryMinRmsEvidence` (default `0.5`) and beats runner-up by
+  `speakerHistoryDominanceRatio` (default `3`). Robust when speaker labels
+  are stable; does nothing when a speaker's evidence is split.
+- **`none`**: disable resolution. `"unknown"` words render as-is.
+
+Resolved words are flagged with `word.channelResolved = true`, and the sample
+renders them with a trailing asterisk (e.g. `[mic*/spk A]`) so you can see
+exactly when resolution fired.
+
+### EnergyVad tuning sliders
+
+The sample lets you tune the default
+[`EnergyVad`](../../src/services/streaming/energy-vad.ts) parameters
+in real time:
+
+- **Threshold ratio** (default `3`, range `1.5`–`5`, step `0.5`): the VAD
+  trips when `frameRMS > noiseFloor × thresholdRatio`. Lower values are more
+  sensitive (catch quieter speech, more false positives on background).
+  Higher values miss quiet utterance onsets/offsets.
+- **Hangover frames** (default `10` = ~200 ms, range `0`–`25`, step `5`):
+  how many frames the VAD stays "active" after the last detected speech
+  frame. Longer hangovers smooth attribution across brief silences within
+  an utterance.
+
+The slider values are baked into the `EnergyVad` instances created at start
+time via `channelAttribution.createVad`; they cannot be changed
+mid-session — Stop and Start again to apply new values.
+
+### Speaker-change log
+
+When `speakerLabels` is enabled and a turn's words include a transition in
+the composite `(channel, speaker_label)` key vs. the previous final word,
+the sample logs a line like:
+
+    [Speaker change: mic-A → system-B]
+
+This is the recommended pattern for transcript renderers that want to split
+on speaker boundaries: compare the `(channel, speaker_label)` composite key
+between consecutive words. `channel` reliably reports the physical source
+(VAD-derived); `speaker_label` is AAI's acoustic diarization on the mixed
+mono stream. Either change is a real boundary.
+
+## Platform caveats
+
+- **macOS:** `getDisplayMedia({ audio: true })` does **not** capture system
+  audio by default. Install [BlackHole](https://existential.audio/blackhole/) or
+  [Loopback.app](https://rogueamoeba.com/loopback/) and route system audio
+  through the loopback device to make it available.
+- **Windows:** sharing the whole screen via the picker exposes full system
+  audio; sharing only a tab exposes just that tab's audio.
+
+## Speakers + open mic: apply echo cancellation at capture
+
+When the user listens to system audio through **speakers** (rather than
+headphones) and their mic is open, the mic physically picks up the speaker
+playback. The two channels then carry highly-correlated audio at similar
+amplitudes, and the energy-based attribution can't reliably tell apart
+"real mic speech" from "speakers played into mic."
+
+**Transcription accuracy is unaffected** — AAI still transcribes what was
+said. What's affected is **per-word channel attribution**: words that
+actually came from system audio may be tagged as `mic`. If you don't use
+the per-word `channel` field downstream, you can ignore this. If you do —
+for instance, to render `[mic]` / `[sys]` prefixes in a transcript UI —
+apply echo cancellation **at the capture layer**, before audio reaches
+the SDK. Two examples below.
+
+### Example 1: Browser (`getUserMedia` with built-in AEC)
+
+If you're capturing in the browser (like this sample app does),
+`getUserMedia` already exposes Chrome's WebRTC AEC. Pass
+`echoCancellation: true` when requesting the mic stream:
+
+```ts
+const micStream = await navigator.mediaDevices.getUserMedia({
+  audio: {
+    echoCancellation: true, // Subtracts speaker playback from mic.
+    noiseSuppression: true, // Optional: cleaner ambient.
+    autoGainControl: true, // Optional: smooths mic level.
+  },
+})
+
+// Then hand the cleaned stream to the SDK as the mic channel.
+const capture = new DualChannelCapture({
+  micStream,
+  systemStream, // from getDisplayMedia({audio: true})
+  transcriber,
+})
+```
+
+This is what the browser sample app already does — it's why `[mic]`
+attribution works correctly even with speakers playing into the mic.
+
+### Example 2: Native / Node (swap in a DNN VAD via `createVad`)
+
+In server-side or native runtimes (Node, Electron, the Swift helpers
+behind native CLIs, etc.) there is no `getUserMedia`. The right answer is
+still to do echo cancellation **at capture** — macOS has
+`AVAudioEngine.setVoiceProcessingEnabled(true)`, Linux has PulseAudio's
+`module-echo-cancel`, telephony stacks usually have it in the codec.
+**Use what your capture layer provides.**
+
+If platform-level AEC isn't available, the next-best option is to plug
+in a DNN voice-activity detector via `channelAttribution.createVad`. A
+DNN VAD distinguishes "real speech" from "playback that the mic
+recaptured" using spectral characteristics (rather than energy), so it's
+much more robust to speaker leak. [Silero VAD](https://github.com/snakers4/silero-vad)
+is the typical choice; the [`@ricky0123/vad`](https://www.npmjs.com/package/@ricky0123/vad)
+package bundles it for browser and Node.
+
+```ts
+import { MicVAD } from "@ricky0123/vad-web"
+import type { VadDetector, VadDetectorResult } from "assemblyai"
+
+// Adapter: wrap a Silero session as the SDK's VadDetector interface.
+class SileroVad implements VadDetector {
+  constructor(private readonly speechProb: () => number) {}
+  process(frame: Float32Array): VadDetectorResult {
+    const p = this.speechProb()
+    let sumSq = 0
+    for (let i = 0; i < frame.length; i++) sumSq += frame[i] * frame[i]
+    const rms = Math.sqrt(sumSq / Math.max(1, frame.length))
+    return { active: p > 0.5, energy: rms }
+  }
+  reset(): void {}
+}
+
+// One MicVAD per channel.
+const micVad = await MicVAD.new({
+  /* ... */
+})
+const systemVad = await MicVAD.new({
+  /* ... */
+})
+
+const transcriber = client.streaming.transcriber({
+  speechModel: "u3-rt-pro",
+  sampleRate: 16_000,
+  channels: [{ name: "mic" }, { name: "system" }],
+  channelAttribution: {
+    createVad: (channelName) =>
+      new SileroVad(
+        channelName === "mic"
+          ? () => micVad.lastSpeechProb()
+          : () => systemVad.lastSpeechProb(),
+      ),
+  },
+})
+```
+
+The factory is called once per declared channel at transcriber
+construction time, so it's a clean place to wire each channel's VAD to
+its own Silero session.
+
+### Why the SDK can't ship AEC itself
+
+Echo cancellation belongs at the capture layer — the moment audio enters
+your application — because:
+
+1. Every platform (browser, macOS, iOS, Linux, Windows, telephony) has
+   its own AEC implementation tuned for its own audio stack. The SDK
+   only sees PCM after capture, by which point platform-specific delay
+   compensation and double-talk handling are out of reach.
+2. A pure-JS AEC inside the SDK would re-invent what the OS / browser
+   already does well, with worse latency and worse quality.
+3. Customers' AEC needs differ. Voice agents want aggressive AEC;
+   meeting recorders want light-touch AEC that doesn't suppress
+   overlapping speech. The capture layer is where that choice is made.
+
+Channel attribution in this SDK assumes capture-layer AEC is already
+applied when speaker leak is a real concern.
diff --git a/samples/streaming-dual-channel-mic-system/index.html b/samples/streaming-dual-channel-mic-system/index.html
new file mode 100644
index 0000000..8361bd5
--- /dev/null
+++ b/samples/streaming-dual-channel-mic-system/index.html
@@ -0,0 +1,424 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>AssemblyAI — Dual-Channel Mic + System</title>
+    <style>
+      body {
+        font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+        max-width: 760px;
+        margin: 2rem auto;
+        padding: 0 1rem;
+        line-height: 1.5;
+      }
+      button {
+        padding: 0.5rem 1rem;
+        font-size: 1rem;
+        cursor: pointer;
+      }
+      input[type="text"] {
+        width: 100%;
+        padding: 0.5rem;
+        font: inherit;
+      }
+      .panel-tabs {
+        display: flex;
+        gap: 0.25rem;
+        margin-top: 1rem;
+        margin-bottom: -1px;
+      }
+      .panel-tab-btn {
+        padding: 0.4rem 0.9rem;
+        font-size: 0.85rem;
+        border: 1px solid #d1d5db;
+        background: #f9fafb;
+        border-radius: 0.4rem 0.4rem 0 0;
+        cursor: pointer;
+      }
+      .panel-tab-btn.active {
+        background: #f3f3f3;
+        border-bottom-color: #f3f3f3;
+        font-weight: bold;
+      }
+      .panel {
+        padding: 1rem;
+        background: #f3f3f3;
+        border-radius: 0 6px 6px 6px;
+        height: 60vh;
+        max-height: 600px;
+      }
+      .panel.hidden {
+        display: none !important;
+      }
+      #output {
+        overflow-y: auto;
+        display: flex;
+        flex-direction: column;
+        gap: 0.6rem;
+      }
+      #output > * {
+        margin: 0;
+      }
+      #events {
+        overflow: auto;
+        font-size: 0.75rem;
+        background: #1f2937;
+        color: #e5e7eb;
+        border-radius: 0 6px 6px 6px;
+      }
+      #events .event-line {
+        white-space: pre;
+        padding: 0.15rem 0;
+      }
+      #events .event-line.event-Turn {
+        color: #d1fae5;
+      }
+      #events .event-line.event-Begin {
+        color: #fde68a;
+      }
+      #events .event-line.event-Termination {
+        color: #fecaca;
+      }
+      .turn-container {
+        display: flex;
+        flex-direction: column;
+        gap: 0.4rem;
+      }
+      .resolve-hint {
+        display: inline-block;
+        margin-left: 0.5rem;
+        color: #6b7280;
+        font-size: 0.8rem;
+      }
+      /* Each turn is its own bubble. Mic = right side; everything else (system, unknown, custom channels) = left, iMessage-style. */
+      .turn-row {
+        max-width: 75%;
+        padding: 0.6rem 0.85rem;
+        border-radius: 0.9rem;
+        line-height: 1.5;
+        word-wrap: break-word;
+        overflow-wrap: anywhere;
+      }
+      .turn-row.align-right {
+        align-self: flex-end;
+        background: #dbeafe;
+      }
+      .turn-row.align-left {
+        align-self: flex-start;
+        background: #e5e7eb;
+      }
+      .turn-row.partial {
+        color: #6b7280;
+        font-style: italic;
+      }
+      .turn-row.final {
+        color: inherit;
+        font-style: normal;
+      }
+      .turn-row .partial-marker {
+        display: block;
+        font-size: 0.7rem;
+        color: #9ca3af;
+        text-transform: uppercase;
+        letter-spacing: 0.05em;
+        margin-bottom: 0.35rem;
+      }
+      .word-chip {
+        display: block;
+        white-space: normal;
+        padding: 0.1rem 0;
+      }
+      .speaker-change-log {
+        align-self: center;
+        color: #b45309;
+        font-weight: bold;
+        font-size: 0.8rem;
+        padding: 0.4rem 0.7rem;
+        background: #fef3c7;
+        border-radius: 0.5rem;
+        max-width: 80%;
+        text-align: center;
+      }
+      .rollup-line {
+        align-self: center;
+        color: #6b7280;
+        font-size: 0.75rem;
+        padding: 0.15rem 0.5rem;
+      }
+      .description {
+        background: #fef9c3;
+        border: 1px solid #facc15;
+        border-radius: 0.5rem;
+        padding: 0.75rem 1rem;
+        font-size: 0.85rem;
+        margin: 1rem 0;
+      }
+      .description summary {
+        cursor: pointer;
+        font-weight: bold;
+        font-size: 0.9rem;
+        user-select: none;
+      }
+      .description summary + * {
+        margin-top: 0.5rem;
+      }
+      .description pre {
+        background: #fff;
+        padding: 0.5rem;
+        border-radius: 0.25rem;
+        margin: 0.5rem 0;
+        font-size: 0.75rem;
+        white-space: pre-wrap;
+        word-break: break-all;
+        font-family: inherit;
+      }
+      .tuning-row {
+        display: grid;
+        grid-template-columns: 1fr 1fr;
+        gap: 0.75rem 1.5rem;
+        margin: 0.5rem 0;
+      }
+      .tuning-label {
+        display: grid;
+        grid-template-columns: auto 1fr auto;
+        gap: 0.5rem;
+        align-items: center;
+        font-size: 0.9rem;
+      }
+      .tuning-value {
+        font-variant-numeric: tabular-nums;
+        color: #6b7280;
+        min-width: 2.5rem;
+        text-align: right;
+      }
+      .mic {
+        color: #2563eb;
+      }
+      .system {
+        color: #15803d;
+      }
+      .unknown {
+        color: #6b7280;
+      }
+      #meters {
+        margin-top: 1rem;
+        display: grid;
+        gap: 0.5rem;
+        min-width: 0;
+      }
+      .meter-row {
+        display: grid;
+        grid-template-columns: 2.5rem minmax(0, 1fr) 4rem 5rem;
+        gap: 0.5rem;
+        align-items: center;
+        min-width: 0;
+      }
+      .meter-label {
+        font-weight: bold;
+      }
+      .meter-bar {
+        width: 100%;
+        height: 0.75rem;
+        background: #e5e7eb;
+        border-radius: 4px;
+        overflow: hidden;
+      }
+      .meter-fill {
+        height: 100%;
+        width: 0%;
+        transition: width 60ms linear;
+      }
+      .meter-fill.mic {
+        background: #2563eb;
+      }
+      .meter-fill.system {
+        background: #15803d;
+      }
+      .meter-db {
+        font-variant-numeric: tabular-nums;
+        font-size: 0.85rem;
+        color: #6b7280;
+      }
+      .meter-flag {
+        font-size: 0.85rem;
+      }
+      .meter-flag.active {
+        color: #b45309;
+        font-weight: bold;
+      }
+      #diag {
+        margin-top: 0.25rem;
+        color: #b45309;
+        font-size: 0.85rem;
+      }
+      .controls-row {
+        display: grid;
+        grid-template-columns: minmax(0, 1fr) minmax(0, 1fr);
+        gap: 1rem;
+        align-items: start;
+        margin: 1rem 0;
+      }
+      .controls-left {
+        min-width: 0;
+      }
+      .controls-left p {
+        margin: 0 0 0.5rem;
+      }
+      .yt-embed {
+        margin: 0;
+        min-width: 0;
+        width: 100%;
+        aspect-ratio: 16 / 9;
+        background: #000;
+        border-radius: 0.5rem;
+        overflow: hidden;
+      }
+      .yt-embed iframe {
+        width: 100%;
+        height: 100%;
+        border: 0;
+      }
+    </style>
+  </head>
+  <body>
+    <h1>Dual-Channel Mic + System</h1>
+    <p>
+      Streams mic + system audio (via <code>getDisplayMedia</code>) as a single
+      mixed mono stream to AssemblyAI's Streaming v3 endpoint, with per-word
+      channel attribution. Requires a temporary token; never expose your API key
+      in the browser.
+    </p>
+    <details class="description">
+      <summary>How do I get a temporary token?</summary>
+      <p>
+        Grab an API key from your
+        <a
+          href="https://www.assemblyai.com/dashboard/api-keys"
+          target="_blank"
+          rel="noopener noreferrer"
+          >AssemblyAI dashboard</a
+        >, then run this in a terminal to mint a 60-second token (replace
+        <code>&lt;YOUR_API_KEY&gt;</code>):
+      </p>
+      <pre>
+curl -G https://streaming.assemblyai.com/v3/token \
+  -H "Authorization: &lt;YOUR_API_KEY&gt;" \
+  -d expires_in_seconds=60</pre
+      >
+      <p>Paste the <code>token</code> value from the response below.</p>
+    </details>
+    <label
+      >Temporary token
+      <input
+        id="token"
+        type="text"
+        placeholder="paste a streaming temporary token"
+      />
+    </label>
+    <p>
+      <label>
+        Resolve <code>unknown</code> channels:
+        <select id="resolve-method">
+          <option value="none">none</option>
+          <option value="window" selected>window (neighbor-based)</option>
+          <option value="speaker-history">speaker-history</option>
+        </select>
+      </label>
+      <span class="resolve-hint"
+        >Resolved words are marked with a trailing <code>*</code> (e.g.
+        <code>[mic*]</code>).</span
+      >
+    </p>
+    <p>
+      <label>
+        Max speakers (optional):
+        <input
+          id="max-speakers"
+          type="number"
+          min="1"
+          step="1"
+          placeholder="auto"
+        />
+      </label>
+      <span class="resolve-hint"
+        >Caps AAI diarization. Leave blank to let the server decide.</span
+      >
+    </p>
+    <div class="tuning-row">
+      <label class="tuning-label">
+        Threshold ratio
+        <input
+          id="threshold-ratio"
+          type="range"
+          min="1.5"
+          max="5"
+          step="0.5"
+          value="3"
+        />
+        <span id="threshold-ratio-value" class="tuning-value">3</span>
+      </label>
+      <label class="tuning-label">
+        Hangover frames
+        <input
+          id="hangover-frames"
+          type="range"
+          min="0"
+          max="25"
+          step="5"
+          value="10"
+        />
+        <span id="hangover-frames-value" class="tuning-value">10</span>
+      </label>
+    </div>
+    <div class="controls-row">
+      <div class="controls-left">
+        <p>
+          <button id="start">Start</button>
+          <button id="stop" disabled>Stop</button>
+        </p>
+        <div id="meters">
+          <div class="meter-row">
+            <span class="meter-label">mic</span>
+            <div class="meter-bar">
+              <div id="mic-fill" class="meter-fill mic"></div>
+            </div>
+            <span id="mic-db" class="meter-db">–∞ dB</span>
+            <span id="mic-flag" class="meter-flag"></span>
+          </div>
+          <div class="meter-row">
+            <span class="meter-label">sys</span>
+            <div class="meter-bar">
+              <div id="sys-fill" class="meter-fill system"></div>
+            </div>
+            <span id="sys-db" class="meter-db">–∞ dB</span>
+            <span id="sys-flag" class="meter-flag"></span>
+          </div>
+          <div id="diag"></div>
+        </div>
+      </div>
+      <div class="yt-embed">
+        <iframe
+          src="https://www.youtube-nocookie.com/embed/BsK1UyWMvuE"
+          allow="autoplay; encrypted-media; picture-in-picture"
+          allowfullscreen
+        ></iframe>
+      </div>
+    </div>
+    <div class="panel-tabs">
+      <button
+        type="button"
+        class="panel-tab-btn active"
+        data-panel-tab="output"
+      >
+        Transcript
+      </button>
+      <button type="button" class="panel-tab-btn" data-panel-tab="events">
+        Events
+      </button>
+    </div>
+    <div id="output" class="panel"></div>
+    <div id="events" class="panel hidden"></div>
+    <script type="module" src="/main.ts"></script>
+  </body>
+</html>
diff --git a/samples/streaming-dual-channel-mic-system/main.ts b/samples/streaming-dual-channel-mic-system/main.ts
new file mode 100644
index 0000000..00cf44c
--- /dev/null
+++ b/samples/streaming-dual-channel-mic-system/main.ts
@@ -0,0 +1,392 @@
+import {
+  DualChannelCapture,
+  EnergyVad,
+  StreamingTranscriber,
+  type TurnEvent,
+} from "assemblyai"
+
+// The SDK accepts an external VAD via channelAttribution.createVad. For example,
+// to plug in a Silero / DNN VAD, pass:
+//
+//   channelAttribution: {
+//     createVad: (channelName) => new YourCustomVadDetector(channelName),
+//   }
+//
+// Where YourCustomVadDetector implements the SDK's VadDetector interface
+// (process(frame: Float32Array) → { active, energy } and reset()). The default
+// is EnergyVad, which we use here.
+
+const tokenInput = document.getElementById("token") as HTMLInputElement
+const startBtn = document.getElementById("start") as HTMLButtonElement
+const stopBtn = document.getElementById("stop") as HTMLButtonElement
+const resolveMethodSelect = document.getElementById(
+  "resolve-method",
+) as HTMLSelectElement
+const thresholdRatioInput = document.getElementById(
+  "threshold-ratio",
+) as HTMLInputElement
+const thresholdRatioValue = document.getElementById(
+  "threshold-ratio-value",
+) as HTMLSpanElement
+const hangoverFramesInput = document.getElementById(
+  "hangover-frames",
+) as HTMLInputElement
+const hangoverFramesValue = document.getElementById(
+  "hangover-frames-value",
+) as HTMLSpanElement
+const maxSpeakersInput = document.getElementById(
+  "max-speakers",
+) as HTMLInputElement
+const output = document.getElementById("output") as HTMLDivElement
+const events = document.getElementById("events") as HTMLDivElement
+const panelTabBtns =
+  document.querySelectorAll<HTMLButtonElement>(".panel-tab-btn")
+
+/** Append one streaming event to the Events panel as a single JSON line. */
+function logEvent(kind: string, payload: unknown): void {
+  const line = document.createElement("div")
+  line.className = `event-line event-${kind}`
+  line.textContent = JSON.stringify({ kind, ...(payload as object) })
+  events.appendChild(line)
+  events.scrollTop = events.scrollHeight
+}
+
+// Panel tab switching.
+for (const btn of panelTabBtns) {
+  btn.addEventListener("click", () => {
+    const active = btn.dataset.panelTab
+    for (const b of panelTabBtns) b.classList.toggle("active", b === btn)
+    output.classList.toggle("hidden", active !== "output")
+    events.classList.toggle("hidden", active !== "events")
+  })
+}
+const micFill = document.getElementById("mic-fill") as HTMLDivElement
+const sysFill = document.getElementById("sys-fill") as HTMLDivElement
+const micDb = document.getElementById("mic-db") as HTMLSpanElement
+const sysDb = document.getElementById("sys-db") as HTMLSpanElement
+const micFlag = document.getElementById("mic-flag") as HTMLSpanElement
+const sysFlag = document.getElementById("sys-flag") as HTMLSpanElement
+const diag = document.getElementById("diag") as HTMLDivElement
+
+let transcriber: StreamingTranscriber | undefined
+let capture: DualChannelCapture | undefined
+let micStream: MediaStream | undefined
+let systemStream: MediaStream | undefined
+
+// Wire slider value labels up-front so the user sees what the slider is set to
+// before they press Start.
+thresholdRatioInput.addEventListener("input", () => {
+  thresholdRatioValue.textContent = thresholdRatioInput.value
+})
+hangoverFramesInput.addEventListener("input", () => {
+  hangoverFramesValue.textContent = hangoverFramesInput.value
+})
+
+function appendLine(line: string, className?: string): void {
+  const div = document.createElement("div")
+  if (className) div.className = className
+  div.textContent = line
+  output.appendChild(div)
+  output.scrollTop = output.scrollHeight
+}
+
+function rmsToDbStr(rms: number): string {
+  if (rms < 1e-6) return "–∞ dB"
+  return `${(20 * Math.log10(rms)).toFixed(1)} dB`
+}
+
+function rmsToBarPct(rms: number): number {
+  if (rms < 1e-6) return 0
+  const db = 20 * Math.log10(rms)
+  return Math.max(0, Math.min(100, ((db + 60) / 60) * 100))
+}
+
+async function start(): Promise<void> {
+  const token = tokenInput.value.trim()
+  if (!token) {
+    appendLine("Need a temporary token.")
+    return
+  }
+  startBtn.disabled = true
+  output.replaceChildren()
+
+  micStream = await navigator.mediaDevices.getUserMedia({
+    audio: {
+      echoCancellation: true,
+      noiseSuppression: true,
+      autoGainControl: true,
+    },
+  })
+  systemStream = await navigator.mediaDevices.getDisplayMedia({
+    audio: true,
+    video: true,
+  })
+  if (systemStream.getAudioTracks().length === 0) {
+    appendLine(
+      "No audio track in display capture. On macOS, install BlackHole or similar.",
+    )
+    micStream.getTracks().forEach((t) => t.stop())
+    systemStream.getTracks().forEach((t) => t.stop())
+    startBtn.disabled = false
+    return
+  }
+
+  const resolveMethod = resolveMethodSelect.value as
+    | "none"
+    | "window"
+    | "speaker-history"
+  const thresholdRatio = parseFloat(thresholdRatioInput.value)
+  const hangoverFrames = parseInt(hangoverFramesInput.value, 10)
+  const maxSpeakersRaw = maxSpeakersInput.value.trim()
+  const maxSpeakers =
+    maxSpeakersRaw === "" ? undefined : parseInt(maxSpeakersRaw, 10)
+
+  const channelAttribution: NonNullable<
+    ConstructorParameters<typeof StreamingTranscriber>[0]["channelAttribution"]
+  > = {
+    createVad: () => new EnergyVad({ thresholdRatio, hangoverFrames }),
+    resolveUnknownChannelsMethod: resolveMethod,
+  }
+
+  transcriber = new StreamingTranscriber({
+    token,
+    sampleRate: 16_000,
+    speechModel: "u3-rt-pro",
+    speakerLabels: true,
+    ...(maxSpeakers !== undefined && { maxSpeakers }),
+    continuousPartials: true,
+    channels: [{ name: "mic" }, { name: "system" }],
+    channelAttribution,
+    minTurnSilence: 400,
+    maxTurnSilence: 1000,
+    vadThreshold: 0.5,
+  })
+  transcriber.on("open", (e) => logEvent("Begin", e))
+  transcriber.on("error", (err) => {
+    appendLine(`[error] ${err.message}`)
+    logEvent("Error", { message: err.message })
+  })
+  transcriber.on("close", (code, reason) => {
+    appendLine(`[close] ${code} ${reason}`)
+    logEvent("Close", { code, reason })
+  })
+  transcriber.on("warning", (e) => logEvent("Warning", e))
+
+  // Meter state: EMA-smoothed peak RMS per channel + a "we ever saw audio?"
+  // diagnostic that flags after ~3 s of silence on the system side.
+  const ema = new Map<string, number>([
+    ["mic", 0],
+    ["system", 0],
+  ])
+  let sysSawAudio = false
+  const startedAt = performance.now()
+  const EMA_ALPHA = 0.3
+  const SYS_AUDIO_THRESHOLD = 5e-4
+
+  transcriber.on("vad", (frame) => {
+    const prev = ema.get(frame.channel) ?? 0
+    const next = prev * (1 - EMA_ALPHA) + frame.rms * EMA_ALPHA
+    ema.set(frame.channel, next)
+
+    if (frame.channel === "system" && frame.rms > SYS_AUDIO_THRESHOLD) {
+      if (!sysSawAudio) diag.textContent = ""
+      sysSawAudio = true
+    }
+
+    if (frame.channel === "mic") {
+      micFill.style.width = `${Math.min(100, rmsToBarPct(next))}%`
+      micDb.textContent = rmsToDbStr(next)
+      micFlag.textContent = frame.active ? "● speaking" : ""
+      micFlag.className = `meter-flag${frame.active ? " active" : ""}`
+    } else if (frame.channel === "system") {
+      sysFill.style.width = `${Math.min(100, rmsToBarPct(next))}%`
+      sysDb.textContent = rmsToDbStr(next)
+      sysFlag.textContent = frame.active ? "● speaking" : ""
+      sysFlag.className = `meter-flag${frame.active ? " active" : ""}`
+    }
+
+    if (
+      !sysSawAudio &&
+      performance.now() - startedAt > 3000 &&
+      diag.textContent === ""
+    ) {
+      diag.textContent =
+        "No system audio detected after 3s. Common causes: (1) didn't tick 'Share tab audio' in the picker, (2) shared a different tab than the one playing audio, (3) on macOS you need a virtual loopback driver (BlackHole) for non-tab system audio."
+    }
+  })
+
+  // Per-turn container divs. Each container holds N bubbles (one per
+  // contiguous same-channel run within the turn) plus the rollup line on
+  // end_of_turn. Keyed by turn_order so partials replace the same container.
+  const turnContainers = new Map<number, HTMLElement>()
+  // Last committed turn-level (channel, speaker_label) composite — used to
+  // detect speaker changes between consecutive *finalized turns*.
+  let lastFinalTurnComposite: string | undefined
+
+  /**
+   * Split a turn's words into contiguous same-channel runs. Words within a run
+   * share a channel value (treating `undefined` as `"unknown"`). Each run
+   * becomes its own bubble so a turn that contains both mic and system speech
+   * renders as multiple bubbles aligned to their respective sides.
+   */
+  function groupByChannel(turn: TurnEvent) {
+    const runs: Array<{
+      channel: string
+      anyResolved: boolean
+      words: TurnEvent["words"]
+    }> = []
+    for (const w of turn.words) {
+      const ch = w.channel ?? "unknown"
+      const last = runs[runs.length - 1]
+      if (last && last.channel === ch) {
+        last.words.push(w)
+        if (w.channelResolved) last.anyResolved = true
+      } else {
+        runs.push({ channel: ch, anyResolved: !!w.channelResolved, words: [w] })
+      }
+    }
+    return runs
+  }
+
+  function renderTurnContainer(
+    container: HTMLElement,
+    turn: TurnEvent,
+    isFinal: boolean,
+  ): void {
+    container.replaceChildren()
+    const runs = groupByChannel(turn)
+
+    for (let r = 0; r < runs.length; r++) {
+      const run = runs[r]
+      const bubble = document.createElement("div")
+      const align = run.channel === "mic" ? "align-right" : "align-left"
+      bubble.className = `turn-row ${align} ${isFinal ? "final" : "partial"}`
+
+      // Show the partial marker only on the first bubble of an in-progress turn.
+      if (!isFinal && r === 0) {
+        const marker = document.createElement("span")
+        marker.className = "partial-marker"
+        marker.textContent = "[partial]"
+        bubble.appendChild(marker)
+      }
+
+      for (const w of run.words) {
+        const chip = document.createElement("span")
+        chip.className = "word-chip"
+        const channelLabel = `${w.channel ?? "unknown"}${
+          w.channelResolved ? "*" : ""
+        }`
+        // Per AAI docs, `word.speaker` is only set on final words and may
+        // still be absent; `turn.speaker_label` is only populated on the
+        // final turn event. On partials we have neither, so we render the
+        // channel only.
+        const speakerForWord = w.speaker ?? turn.speaker_label
+        const speakerChip = speakerForWord ? `/spk ${speakerForWord}` : ""
+        chip.textContent = `[${channelLabel}${speakerChip}] ${w.text} `
+        bubble.appendChild(chip)
+      }
+
+      container.appendChild(bubble)
+    }
+  }
+
+  transcriber.on("turn", (turn) => {
+    logEvent("Turn", turn)
+    let container = turnContainers.get(turn.turn_order)
+    if (!container) {
+      container = document.createElement("div")
+      container.className = "turn-container"
+      output.appendChild(container)
+      turnContainers.set(turn.turn_order, container)
+    }
+
+    renderTurnContainer(container, turn, turn.end_of_turn)
+    output.scrollTop = output.scrollHeight
+
+    if (!turn.end_of_turn) return
+
+    turnContainers.delete(turn.turn_order)
+
+    // Turn-level speaker-change detection: compare this turn's primary
+    // (channel, speaker_label) to the previous finalized turn's.
+    const turnComposite =
+      turn.channel && turn.speaker_label
+        ? `${turn.channel}-${turn.speaker_label}`
+        : undefined
+    if (
+      turnComposite &&
+      lastFinalTurnComposite &&
+      lastFinalTurnComposite !== turnComposite
+    ) {
+      const log = document.createElement("div")
+      log.className = "speaker-change-log"
+      log.textContent = `[Speaker change: ${lastFinalTurnComposite} → ${turnComposite}]`
+      container.before(log)
+    }
+    if (turnComposite) lastFinalTurnComposite = turnComposite
+
+    // End-of-turn rollup line.
+    const rollupLine = document.createElement("div")
+    rollupLine.className = "rollup-line"
+    rollupLine.textContent = `-- End of turn #${turn.turn_order}: speaker_label: ${
+      turn.speaker_label ?? "?"
+    } | Most active channel: ${turn.channel ?? "unknown"}`
+    container.after(rollupLine)
+    output.scrollTop = output.scrollHeight
+  })
+
+  // Expose so stop() can reset session-scoped state.
+  ;(
+    globalThis as unknown as { __resetTurnState?: () => void }
+  ).__resetTurnState = () => {
+    turnContainers.clear()
+    lastFinalTurnComposite = undefined
+  }
+
+  capture = new DualChannelCapture({ micStream, systemStream, transcriber })
+  capture.on("error", (err) => appendLine(`[capture error] ${err.message}`))
+
+  await transcriber.connect()
+  await capture.start()
+  stopBtn.disabled = false
+  resolveMethodSelect.disabled = true
+  thresholdRatioInput.disabled = true
+  hangoverFramesInput.disabled = true
+  appendLine("Streaming…")
+}
+
+async function stop(): Promise<void> {
+  stopBtn.disabled = true
+  await capture?.stop()
+  await transcriber?.close()
+  micStream?.getTracks().forEach((t) => t.stop())
+  systemStream?.getTracks().forEach((t) => t.stop())
+  capture = undefined
+  transcriber = undefined
+  micStream = undefined
+  systemStream = undefined
+  ;(
+    globalThis as unknown as { __resetTurnState?: () => void }
+  ).__resetTurnState?.()
+  micFill.style.width = "0%"
+  sysFill.style.width = "0%"
+  micDb.textContent = "–∞ dB"
+  sysDb.textContent = "–∞ dB"
+  micFlag.textContent = ""
+  sysFlag.textContent = ""
+  startBtn.disabled = false
+  resolveMethodSelect.disabled = false
+  thresholdRatioInput.disabled = false
+  hangoverFramesInput.disabled = false
+  appendLine("Stopped.")
+}
+
+startBtn.addEventListener("click", () => {
+  void start().catch((err) => {
+    appendLine(`[start error] ${(err as Error).message}`)
+    startBtn.disabled = false
+  })
+})
+stopBtn.addEventListener("click", () => {
+  void stop()
+})
diff --git a/samples/streaming-dual-channel-mic-system/package.json b/samples/streaming-dual-channel-mic-system/package.json
new file mode 100644
index 0000000..44c2729
--- /dev/null
+++ b/samples/streaming-dual-channel-mic-system/package.json
@@ -0,0 +1,18 @@
+{
+  "name": "streaming-dual-channel-mic-system",
+  "private": true,
+  "version": "1.0.0",
+  "type": "module",
+  "scripts": {
+    "dev": "vite",
+    "build": "vite build",
+    "preview": "vite preview"
+  },
+  "devDependencies": {
+    "typescript": "^5.4.5",
+    "vite": "^5.4.10"
+  },
+  "dependencies": {
+    "assemblyai": "file:../.."
+  }
+}
diff --git a/samples/streaming-dual-channel-mic-system/tsconfig.json b/samples/streaming-dual-channel-mic-system/tsconfig.json
new file mode 100644
index 0000000..38d75d5
--- /dev/null
+++ b/samples/streaming-dual-channel-mic-system/tsconfig.json
@@ -0,0 +1,15 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "lib": ["ES2022", "DOM", "DOM.Iterable"],
+    "strict": true,
+    "skipLibCheck": true,
+    "isolatedModules": true,
+    "noEmit": true,
+    "useDefineForClassFields": true,
+    "esModuleInterop": true
+  },
+  "include": ["main.ts"]
+}
diff --git a/src/exports/streaming.ts b/src/exports/streaming.ts
index eb145dd..d0db881 100644
--- a/src/exports/streaming.ts
+++ b/src/exports/streaming.ts
@@ -1,4 +1,11 @@
 export * from "../types/asyncapi.generated";
 export * from "../types/realtime";
 export * from "../types/helpers";
+export * from "../types/streaming/dual-channel";
 export * from "../services/realtime/service";
+export * from "../services/streaming/service";
+export * from "../services/streaming/factory";
+export * from "../services/streaming/browser/dual-channel-capture";
+export * from "../services/streaming/energy-vad";
+export * from "../services/streaming/label-mapper";
+export * from "../services/streaming/resampler";
diff --git a/src/services/index.ts b/src/services/index.ts
index d0441b4..0c12913 100644
--- a/src/services/index.ts
+++ b/src/services/index.ts
@@ -8,7 +8,18 @@ import {
 } from "./realtime";
 import { TranscriptService } from "./transcripts";
 import { FileService } from "./files";
-import { StreamingTranscriber, StreamingTranscriberFactory } from "./streaming";
+import {
+  StreamingTranscriber,
+  StreamingTranscriberFactory,
+  DualChannelCapture,
+  EnergyVad,
+  LinearResampler,
+  VadTimeline,
+  attributeTurn,
+  attributeWord,
+  rollUpTurnChannel,
+  float32ToPcm16,
+} from "./streaming";
 
 const defaultBaseUrl = "https://api.assemblyai.com";
 const defaultStreamingUrl = "https://streaming.assemblyai.com";
@@ -71,4 +82,12 @@ export {
   TranscriptService,
   FileService,
   StreamingTranscriber,
+  DualChannelCapture,
+  EnergyVad,
+  LinearResampler,
+  VadTimeline,
+  attributeTurn,
+  attributeWord,
+  rollUpTurnChannel,
+  float32ToPcm16,
 };
diff --git a/src/services/streaming/browser/dual-channel-capture.ts b/src/services/streaming/browser/dual-channel-capture.ts
new file mode 100644
index 0000000..14bad56
--- /dev/null
+++ b/src/services/streaming/browser/dual-channel-capture.ts
@@ -0,0 +1,177 @@
+import { StreamingTranscriber } from "../service";
+import { BrowserOnlyError } from "../../../types/streaming/dual-channel";
+
+export { BrowserOnlyError } from "../../../types/streaming/dual-channel";
+import {
+  PCM16_ENCODER_PROCESSOR_NAME,
+  Pcm16EncoderMessage,
+  pcm16EncoderWorkletSource,
+} from "./worklets/pcm16-encoder";
+
+const DEFAULT_TARGET_RATE = 16_000;
+const DEFAULT_CHUNK_MS = 50;
+const MIC_CHANNEL = "mic";
+const SYSTEM_CHANNEL = "system";
+
+type ErrorListener = (err: Error) => void;
+
+export type DualChannelCaptureParams = {
+  /** Microphone MediaStream. Caller should set `echoCancellation: true` at `getUserMedia` time. */
+  micStream: MediaStream;
+  /** System-audio MediaStream (e.g. `getDisplayMedia({ audio: true })`). */
+  systemStream: MediaStream;
+  /**
+   * The transcriber to push tagged PCM into. MUST be constructed with
+   * `channels: [{ name: "mic" }, { name: "system" }]` so the per-channel
+   * `sendAudio` calls succeed.
+   */
+  transcriber: StreamingTranscriber;
+  /**
+   * Target sample rate sent to the transcriber. Defaults to 16000. The
+   * AudioContext runs at the device's native rate; resampling happens inside
+   * the encoder worklet (forcing `AudioContext({ sampleRate })` is unreliable
+   * across browsers).
+   */
+  targetSampleRate?: number;
+};
+
+/**
+ * Browser-only adapter that pumps two `MediaStream`s into a `StreamingTranscriber`
+ * configured for dual-channel mode. Each `MediaStream` runs through its own
+ * `pcm16-encoder` AudioWorklet (resample to `targetSampleRate`, encode to Int16
+ * PCM); each PCM chunk is forwarded via `transcriber.sendAudio(pcm, { channel })`.
+ *
+ * All dual-channel orchestration (mixing, VAD, per-word attribution) lives inside
+ * `StreamingTranscriber` — this class is a pure I/O adapter. Non-browser runtimes
+ * can replicate its job by pushing tagged PCM into `transcriber.sendAudio` directly.
+ *
+ * Caller responsibilities:
+ * - **Echo cancellation** is set at `getUserMedia` time (`audio: { echoCancellation: true }`).
+ * - **System-audio capture** is platform-dependent. Chrome's `getDisplayMedia({ audio: true })`
+ *   captures tab audio (and on Windows, full system audio when sharing the whole screen).
+ *   macOS requires a virtual loopback driver (e.g. BlackHole) to expose system audio at all.
+ * - **Token auth.** Construct the transcriber with `token` — API-key auth is unsupported in browsers.
+ * - **Stream ownership.** `stop()` tears down the AudioContext but does NOT stop the
+ *   `MediaStreamTrack`s passed in — callers own those.
+ */
+export class DualChannelCapture {
+  private readonly params: Required<
+    Omit<DualChannelCaptureParams, "targetSampleRate">
+  > & { targetSampleRate: number };
+  private errorListener?: ErrorListener;
+  private context?: AudioContext;
+  private micSource?: MediaStreamAudioSourceNode;
+  private sysSource?: MediaStreamAudioSourceNode;
+  private micEncoder?: AudioWorkletNode;
+  private sysEncoder?: AudioWorkletNode;
+  private running = false;
+
+  constructor(params: DualChannelCaptureParams) {
+    if (typeof globalThis.AudioContext === "undefined") {
+      throw new BrowserOnlyError();
+    }
+    this.params = {
+      micStream: params.micStream,
+      systemStream: params.systemStream,
+      transcriber: params.transcriber,
+      targetSampleRate: params.targetSampleRate ?? DEFAULT_TARGET_RATE,
+    };
+  }
+
+  on(event: "error", listener: ErrorListener): void {
+    if (event === "error") this.errorListener = listener;
+  }
+
+  /**
+   * Wire the capture pipeline and start pumping tagged PCM into the transcriber.
+   * The transcriber must already be connected. Returns once the worklet is
+   * registered and the audio graph is live.
+   */
+  async start(): Promise<void> {
+    if (this.running) {
+      throw new Error("DualChannelCapture already started");
+    }
+    this.context = new AudioContext();
+
+    const blob = new Blob([pcm16EncoderWorkletSource], {
+      type: "application/javascript",
+    });
+    const url = URL.createObjectURL(blob);
+    try {
+      await this.context.audioWorklet.addModule(url);
+    } finally {
+      URL.revokeObjectURL(url);
+    }
+
+    this.micSource = this.context.createMediaStreamSource(
+      this.params.micStream,
+    );
+    this.sysSource = this.context.createMediaStreamSource(
+      this.params.systemStream,
+    );
+
+    this.micEncoder = this.makeEncoder(MIC_CHANNEL);
+    this.sysEncoder = this.makeEncoder(SYSTEM_CHANNEL);
+    this.micSource.connect(this.micEncoder);
+    this.sysSource.connect(this.sysEncoder);
+
+    this.running = true;
+  }
+
+  private makeEncoder(channel: string): AudioWorkletNode {
+    const node = new AudioWorkletNode(
+      this.context!,
+      PCM16_ENCODER_PROCESSOR_NAME,
+      {
+        numberOfInputs: 1,
+        numberOfOutputs: 0,
+        channelCount: 1,
+        channelCountMode: "explicit",
+        channelInterpretation: "speakers",
+        processorOptions: {
+          targetRate: this.params.targetSampleRate,
+          chunkMs: DEFAULT_CHUNK_MS,
+        },
+      },
+    );
+    node.port.onmessage = (e: MessageEvent<Pcm16EncoderMessage>) => {
+      try {
+        this.params.transcriber.sendAudio(e.data.pcm, { channel });
+      } catch (err) {
+        this.errorListener?.(err as Error);
+      }
+    };
+    return node;
+  }
+
+  /**
+   * Tear down internal nodes and close the AudioContext. Does NOT stop the
+   * caller-provided MediaStream tracks — they remain available for preview UI,
+   * recording, etc. Idempotent.
+   */
+  async stop(): Promise<void> {
+    if (!this.running) return;
+    this.running = false;
+
+    try {
+      this.micEncoder?.port.close();
+      this.sysEncoder?.port.close();
+      this.micEncoder?.disconnect();
+      this.sysEncoder?.disconnect();
+      this.micSource?.disconnect();
+      this.sysSource?.disconnect();
+    } catch {
+      // Disconnecting already-disconnected nodes throws in some browsers; ignore.
+    }
+
+    if (this.context && this.context.state !== "closed") {
+      await this.context.close();
+    }
+
+    this.context = undefined;
+    this.micSource = undefined;
+    this.sysSource = undefined;
+    this.micEncoder = undefined;
+    this.sysEncoder = undefined;
+  }
+}
diff --git a/src/services/streaming/browser/worklets/pcm16-encoder.ts b/src/services/streaming/browser/worklets/pcm16-encoder.ts
new file mode 100644
index 0000000..3ed4294
--- /dev/null
+++ b/src/services/streaming/browser/worklets/pcm16-encoder.ts
@@ -0,0 +1,70 @@
+/**
+ * AudioWorklet processor that ingests mono Float32 audio at the AudioContext's
+ * native sample rate, resamples to `targetRate` (linear interpolation, stateful
+ * across `process()` calls), packs to little-endian Int16 PCM, and posts
+ * fixed-size chunks via `port.postMessage` with a running `samplesSent` counter.
+ *
+ * `samplesSent` is in **target-rate samples**, so the main thread can derive a
+ * stream-relative timestamp = `samplesSent / targetRate * 1000` (ms) — the same
+ * frame AAI uses for `StreamingWord.start` / `.end`.
+ *
+ * Defined as a string so it can be registered via a Blob URL — the SDK ships as
+ * a single ESM file, so a separate `.js` worklet asset isn't viable.
+ */
+export const pcm16EncoderWorkletSource = `
+class Pcm16EncoderProcessor extends AudioWorkletProcessor {
+  constructor(options) {
+    super();
+    const opts = (options && options.processorOptions) || {};
+    this.targetRate = opts.targetRate || 16000;
+    this.chunkMs = opts.chunkMs || 50;
+    this.ratio = sampleRate / this.targetRate;
+    this.chunkSize = Math.round(this.targetRate * this.chunkMs / 1000);
+    this.buffer = new Int16Array(this.chunkSize);
+    this.bufferIdx = 0;
+    this.samplesSent = 0;
+    this.lastSample = 0;
+    this.fractional = 0;
+  }
+
+  process(inputs) {
+    const input = inputs[0];
+    if (!input || input.length === 0 || !input[0] || input[0].length === 0) {
+      return true;
+    }
+    const mono = input[0];
+    let pos = this.fractional;
+    while (pos < mono.length) {
+      const i = Math.floor(pos);
+      const frac = pos - i;
+      const a = i === 0 ? this.lastSample : mono[i - 1];
+      const b = mono[i];
+      const sample = a + (b - a) * frac;
+      const clamped = sample < -1 ? -1 : sample > 1 ? 1 : sample;
+      this.buffer[this.bufferIdx++] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
+      if (this.bufferIdx === this.chunkSize) {
+        const out = new Int16Array(this.chunkSize);
+        out.set(this.buffer);
+        this.samplesSent += this.chunkSize;
+        this.port.postMessage(
+          { pcm: out.buffer, samplesSent: this.samplesSent },
+          [out.buffer],
+        );
+        this.bufferIdx = 0;
+      }
+      pos += this.ratio;
+    }
+    this.lastSample = mono[mono.length - 1];
+    this.fractional = pos - mono.length;
+    return true;
+  }
+}
+registerProcessor("aai-pcm16-encoder", Pcm16EncoderProcessor);
+`;
+
+export const PCM16_ENCODER_PROCESSOR_NAME = "aai-pcm16-encoder";
+
+export type Pcm16EncoderMessage = {
+  pcm: ArrayBuffer;
+  samplesSent: number;
+};
diff --git a/src/services/streaming/energy-vad.ts b/src/services/streaming/energy-vad.ts
new file mode 100644
index 0000000..44b9fe9
--- /dev/null
+++ b/src/services/streaming/energy-vad.ts
@@ -0,0 +1,75 @@
+import {
+  VadDetector,
+  VadDetectorResult,
+} from "../../types/streaming/dual-channel";
+
+export type EnergyVadParams = {
+  /** Threshold = noiseFloor * thresholdRatio. Default 3.0 (~ +9.5 dB above noise). */
+  thresholdRatio?: number;
+  /** EMA smoothing for the noise-floor estimate when frame is non-speech. Default 0.05. */
+  noiseFloorAlpha?: number;
+  /** Hangover in frames: stay "active" this many frames after the last speech frame. Default 10 (\~200 ms at 20 ms frames). */
+  hangoverFrames?: number;
+  /** Initial noise floor estimate. Default 1e-4. Adaptive after the first non-speech frame. */
+  initialNoiseFloor?: number;
+};
+
+/**
+ * Energy-based VAD with adaptive noise-floor tracking and hangover. Pure JS,
+ * no dependencies. Suitable for the "which physical channel is speaking" task
+ * because the channels are already physically separated at capture — the harder
+ * problem (speech vs. non-speech in the wild) is one a customer can swap in a
+ * DNN VAD for via the `createVad` parameter.
+ *
+ * Tuning notes:
+ * - thresholdRatio below 2 will treat anything above noise as speech (too sensitive).
+ * - thresholdRatio above 6 will miss quiet utterance onsets/offsets.
+ * - noiseFloorAlpha above 0.1 makes the floor track quickly (good for non-stationary
+ *   background) but risks slowly adapting *up* to a sustained low voice.
+ */
+export class EnergyVad implements VadDetector {
+  private readonly thresholdRatio: number;
+  private readonly noiseFloorAlpha: number;
+  private readonly hangoverFrames: number;
+  private readonly initialNoiseFloor: number;
+  private noiseFloor: number;
+  private hangoverRemaining = 0;
+
+  constructor(params: EnergyVadParams = {}) {
+    this.thresholdRatio = params.thresholdRatio ?? 3.0;
+    this.noiseFloorAlpha = params.noiseFloorAlpha ?? 0.05;
+    this.hangoverFrames = params.hangoverFrames ?? 10;
+    this.initialNoiseFloor = params.initialNoiseFloor ?? 1e-4;
+    this.noiseFloor = this.initialNoiseFloor;
+  }
+
+  process(frame: Float32Array): VadDetectorResult {
+    let sumSq = 0;
+    for (let i = 0; i < frame.length; i++) {
+      sumSq += frame[i] * frame[i];
+    }
+    const rms = frame.length > 0 ? Math.sqrt(sumSq / frame.length) : 0;
+
+    const threshold = this.noiseFloor * this.thresholdRatio;
+    let active = rms > threshold;
+
+    if (active) {
+      this.hangoverRemaining = this.hangoverFrames;
+    } else if (this.hangoverRemaining > 0) {
+      this.hangoverRemaining--;
+      active = true;
+      // While in hangover, do not update noise floor — RMS may still reflect tail energy.
+    } else {
+      this.noiseFloor =
+        this.noiseFloor * (1 - this.noiseFloorAlpha) +
+        rms * this.noiseFloorAlpha;
+    }
+
+    return { active, energy: rms };
+  }
+
+  reset(): void {
+    this.noiseFloor = this.initialNoiseFloor;
+    this.hangoverRemaining = 0;
+  }
+}
diff --git a/src/services/streaming/index.ts b/src/services/streaming/index.ts
index 6a6ee0e..bfac48e 100644
--- a/src/services/streaming/index.ts
+++ b/src/services/streaming/index.ts
@@ -1,2 +1,6 @@
 export * from "./factory";
 export * from "./service";
+export * from "./browser/dual-channel-capture";
+export * from "./energy-vad";
+export * from "./label-mapper";
+export * from "./resampler";
diff --git a/src/services/streaming/label-mapper.ts b/src/services/streaming/label-mapper.ts
new file mode 100644
index 0000000..bea85a7
--- /dev/null
+++ b/src/services/streaming/label-mapper.ts
@@ -0,0 +1,128 @@
+import { StreamingWord, TurnEvent } from "../../types/streaming";
+import { Channel, VadFrame } from "../../types/streaming/dual-channel";
+
+export type LabelMapperParams = {
+  /** Per-word energy ratio above which a channel is declared dominant. */
+  dominanceRatio: number;
+};
+
+/**
+ * Append-only ring buffer of VAD frames in stream-relative ms order.
+ * `pushFrame` is O(1) amortized; `framesInWindow` is O(n) over kept frames,
+ * which is fine for the per-word lookups we do (a 30 s window at 50 frames/s
+ * per channel × 2 channels = 3000 entries, scanned once per word).
+ *
+ * Runtime-agnostic — no DOM or Web Audio dependencies.
+ */
+export class VadTimeline {
+  private frames: VadFrame[] = [];
+  private head = 0;
+
+  constructor(private readonly windowMs: number) {}
+
+  pushFrame(frame: VadFrame): void {
+    this.frames.push(frame);
+    const cutoff = frame.ts - this.windowMs;
+    while (
+      this.head < this.frames.length &&
+      this.frames[this.head].ts < cutoff
+    ) {
+      this.head++;
+    }
+    if (this.head > 1024 && this.head * 2 > this.frames.length) {
+      this.frames = this.frames.slice(this.head);
+      this.head = 0;
+    }
+  }
+
+  framesInWindow(startMs: number, endMs: number): VadFrame[] {
+    const out: VadFrame[] = [];
+    for (let i = this.head; i < this.frames.length; i++) {
+      const f = this.frames[i];
+      if (f.ts < startMs) continue;
+      if (f.ts > endMs) break;
+      out.push(f);
+    }
+    return out;
+  }
+
+  clear(): void {
+    this.frames = [];
+    this.head = 0;
+  }
+}
+
+/**
+ * Sum per-channel active RMS over a window. Returns a Map from channel name
+ * to total score. Channels with zero score are omitted.
+ */
+function scoreChannels(frames: VadFrame[]): Map<string, number> {
+  const scores = new Map<string, number>();
+  for (const f of frames) {
+    if (!f.active) continue;
+    scores.set(f.channel, (scores.get(f.channel) ?? 0) + f.rms);
+  }
+  return scores;
+}
+
+/**
+ * Decide which channel was dominant during a word's `[start, end]` window.
+ *
+ * - If no channel has any active VAD energy → `"unknown"`.
+ * - If the top channel beats the runner-up by at least `dominanceRatio` → top channel.
+ * - Else: top channel wins on absolute score; exact ties → `"unknown"`.
+ */
+export function attributeWord(
+  word: StreamingWord,
+  timeline: VadTimeline,
+  params: LabelMapperParams,
+): Channel {
+  const scores = scoreChannels(timeline.framesInWindow(word.start, word.end));
+  if (scores.size === 0) return "unknown";
+  const sorted = [...scores.entries()].sort((a, b) => b[1] - a[1]);
+  if (sorted.length === 1) return sorted[0][0];
+  const [topName, topScore] = sorted[0];
+  const [runnerName, runnerScore] = sorted[1];
+  if (topScore >= params.dominanceRatio * runnerScore) return topName;
+  if (topScore > runnerScore) return topName;
+  if (runnerScore > topScore) return runnerName;
+  return "unknown";
+}
+
+/**
+ * Duration-weighted majority of word channels. `"unknown"` if there are no
+ * words, every word resolved to `"unknown"`, or two channels tie exactly.
+ */
+export function rollUpTurnChannel(words: StreamingWord[]): Channel {
+  const totals = new Map<string, number>();
+  for (const w of words) {
+    if (!w.channel || w.channel === "unknown") continue;
+    const dur = Math.max(0, w.end - w.start);
+    totals.set(w.channel, (totals.get(w.channel) ?? 0) + dur);
+  }
+  if (totals.size === 0) return "unknown";
+  const sorted = [...totals.entries()].sort((a, b) => b[1] - a[1]);
+  if (sorted.length === 1) return sorted[0][0];
+  const [topName, topMs] = sorted[0];
+  const [, runnerMs] = sorted[1];
+  if (topMs === runnerMs) return "unknown";
+  return topName;
+}
+
+/**
+ * Mutate `turn` in place: write `turn.words[i].channel` for every word and set
+ * `turn.channel` to the duration-weighted rollup.
+ *
+ * Returns `void` because the transcriber owns the `TurnEvent` ref and forwards
+ * the same object to the customer listener — no need to allocate a copy.
+ */
+export function attributeTurn(
+  turn: TurnEvent,
+  timeline: VadTimeline,
+  params: LabelMapperParams,
+): void {
+  for (const w of turn.words) {
+    w.channel = attributeWord(w, timeline, params);
+  }
+  turn.channel = rollUpTurnChannel(turn.words);
+}
diff --git a/src/services/streaming/resampler.ts b/src/services/streaming/resampler.ts
new file mode 100644
index 0000000..f9e993f
--- /dev/null
+++ b/src/services/streaming/resampler.ts
@@ -0,0 +1,69 @@
+/**
+ * Linear-interpolation resampler for streaming Float32 audio. Stateful across
+ * `process()` calls so chunk boundaries don't introduce phase discontinuities:
+ * the last input sample and a fractional read position are carried over.
+ *
+ * Linear interpolation is good enough for ASR ingest — the downstream
+ * StreamingTranscriber band-limits at the target rate anyway, and a polyphase
+ * filter would be overkill in the AudioWorklet hot path. If a customer needs
+ * higher quality they can supply their own VadDetector + bypass the encoder.
+ */
+export class LinearResampler {
+  private readonly ratio: number;
+  private lastSample = 0;
+  private fractional = 0;
+
+  constructor(
+    private readonly sourceRate: number,
+    private readonly targetRate: number,
+  ) {
+    if (sourceRate <= 0 || targetRate <= 0) {
+      throw new Error("sourceRate and targetRate must be positive");
+    }
+    this.ratio = sourceRate / targetRate;
+  }
+
+  process(input: Float32Array): Float32Array {
+    if (this.sourceRate === this.targetRate) {
+      return input;
+    }
+
+    // Worst-case output length; we'll slice to actual.
+    const out = new Float32Array(Math.ceil(input.length / this.ratio) + 1);
+    let outIdx = 0;
+    let pos = this.fractional;
+
+    while (pos < input.length) {
+      const i = Math.floor(pos);
+      const frac = pos - i;
+      const a = i === 0 ? this.lastSample : input[i - 1];
+      const b = input[i];
+      out[outIdx++] = a + (b - a) * frac;
+      pos += this.ratio;
+    }
+
+    this.lastSample = input[input.length - 1] ?? this.lastSample;
+    this.fractional = pos - input.length;
+    return out.subarray(0, outIdx);
+  }
+
+  reset(): void {
+    this.lastSample = 0;
+    this.fractional = 0;
+  }
+}
+
+/** Convert Float32 PCM (-1..1) to little-endian Int16 PCM. */
+export function float32ToPcm16(input: Float32Array): ArrayBuffer {
+  const out = new ArrayBuffer(input.length * 2);
+  const view = new DataView(out);
+  for (let i = 0; i < input.length; i++) {
+    const clamped = Math.max(-1, Math.min(1, input[i]));
+    view.setInt16(
+      i * 2,
+      clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff,
+      true,
+    );
+  }
+  return out;
+}
diff --git a/src/services/streaming/service.ts b/src/services/streaming/service.ts
index 636c561..7930d47 100644
--- a/src/services/streaming/service.ts
+++ b/src/services/streaming/service.ts
@@ -6,6 +6,7 @@ import {
 import { ErrorEvent, MessageEvent, CloseEvent } from "ws";
 import { conditions } from "#conditions";
 import {
+  ChannelAttributionParams,
   StreamingEvents,
   StreamingListeners,
   StreamingTranscriberParams,
@@ -18,12 +19,62 @@ import {
   StreamingForceEndpoint,
   WarningEvent,
 } from "../..";
+import type { VadDetector, VadFrame } from "../../types/streaming/dual-channel";
+import { EnergyVad } from "./energy-vad";
+import { attributeTurn, rollUpTurnChannel, VadTimeline } from "./label-mapper";
 import { StreamingError, StreamingErrorMessages } from "../../utils/errors";
 import { StreamingErrorTypeCodes } from "../../utils/errors/streaming";
 
+/**
+ * Options for `sendAudio`. In dual-channel mode (when `channels` is configured
+ * on the transcriber), `channel` is required and must match one of the declared
+ * channel names; in single-channel mode it is ignored.
+ */
+export type SendAudioOptions = {
+  channel?: string;
+};
+
+/**
+ * View any `AudioData` (ArrayBuffer / ArrayBufferView / typed array) as a
+ * little-endian Int16 sample sequence without copying. Callers must guarantee
+ * the underlying byte length is even.
+ */
+function toInt16View(audio: AudioData): Int16Array {
+  // AudioData is ArrayBufferLike per the public type, but in practice callers
+  // pass ArrayBuffer or a typed-array view. Handle both without copying.
+  if (audio instanceof Int16Array) return audio;
+  if (ArrayBuffer.isView(audio)) {
+    const view = audio as ArrayBufferView;
+    return new Int16Array(
+      view.buffer,
+      view.byteOffset,
+      Math.floor(view.byteLength / 2),
+    );
+  }
+  return new Int16Array(audio as ArrayBuffer);
+}
+
 const defaultStreamingUrl = "wss://streaming.assemblyai.com/v3/ws";
 const terminateSessionMessage = `{"type":"Terminate"}`;
 
+/**
+ * Per-send chunk cap in milliseconds for the dual-channel mixer. The streaming
+ * server rejects audio messages longer than 1000 ms (`Input Duration Error`).
+ * If a backlog accumulates (e.g. when a browser tab is backgrounded and
+ * `setInterval` is throttled to ~1 Hz), `flushMix` loops and emits multiple
+ * sends each ≤ this cap until the buffers drain.
+ */
+const MAX_CHUNK_MS = 200;
+
+/**
+ * Per-send minimum chunk size in milliseconds. The streaming server also
+ * rejects audio messages shorter than 50 ms with the same
+ * `Input Duration Error`, so the mixer waits until both per-channel buffers
+ * have at least this much accumulated before emitting. Final-flush (close
+ * path) bypasses this floor so the trailing partial buffer still gets sent.
+ */
+const MIN_CHUNK_MS = 50;
+
 type BufferLike =
   | string
   | Buffer
@@ -51,6 +102,25 @@ export class StreamingTranscriber {
   private listeners: StreamingListeners = {};
   private sessionTerminatedResolve?: () => void;
 
+  // Dual-channel mode state (allocated only when params.channels is set).
+  private isDualChannel = false;
+  private channelNames?: string[];
+  private channelBuffers?: Map<string, number[]>;
+  private channelSamplesReceived?: Map<string, number>;
+  private channelVadFloatBuffers?: Map<string, Float32Array>;
+  private channelVadBufferIdx?: Map<string, number>;
+  private channelVads?: Map<string, VadDetector>;
+  private timeline?: VadTimeline;
+  private flushTimer?: ReturnType<typeof setInterval>;
+  private attributionParams?: Required<ChannelAttributionParams>;
+  private vadFrameSamples = 0;
+  private minChunkSamples = 0;
+  private maxChunkSamples = 0;
+  // For resolveUnknownChannelsMethod === "speaker-history": per-speaker_label
+  // cumulative active-VAD RMS per channel. Allocated only when that method is
+  // configured.
+  private speakerHistory?: Map<string, Map<string, number>>;
+
   constructor(params: StreamingTranscriberParams) {
     this.params = {
       ...params,
@@ -63,6 +133,58 @@ export class StreamingTranscriber {
     if (!(this.token || this.apiKey)) {
       throw new Error("API key or temporary token is required.");
     }
+
+    if (params.channels) {
+      if (params.channels.length !== 2) {
+        throw new Error(
+          "StreamingTranscriber.channels must have exactly 2 entries.",
+        );
+      }
+      const names = params.channels.map((c) => c.name);
+      if (new Set(names).size !== names.length) {
+        throw new Error("StreamingTranscriber.channels names must be unique.");
+      }
+      this.isDualChannel = true;
+      this.channelNames = names;
+      const att = params.channelAttribution ?? {};
+      this.attributionParams = {
+        dominanceRatio: att.dominanceRatio ?? 4,
+        timelineWindowMs: att.timelineWindowMs ?? 30_000,
+        createVad: att.createVad ?? (() => new EnergyVad()),
+        flushIntervalMs: att.flushIntervalMs ?? 50,
+        resolveUnknownChannelsMethod:
+          att.resolveUnknownChannelsMethod ?? "window",
+        resolutionWindowWords: att.resolutionWindowWords ?? 2,
+        speakerHistoryMinRmsEvidence: att.speakerHistoryMinRmsEvidence ?? 0.5,
+        speakerHistoryDominanceRatio: att.speakerHistoryDominanceRatio ?? 3,
+      };
+      if (
+        this.attributionParams.resolveUnknownChannelsMethod ===
+        "speaker-history"
+      ) {
+        this.speakerHistory = new Map();
+      }
+      // 20 ms VAD frames at the transcriber's target sample rate.
+      this.vadFrameSamples = Math.max(1, Math.round(params.sampleRate * 0.02));
+      this.minChunkSamples = Math.max(
+        1,
+        Math.round(params.sampleRate * (MIN_CHUNK_MS / 1000)),
+      );
+      this.maxChunkSamples = Math.max(
+        this.minChunkSamples,
+        Math.round(params.sampleRate * (MAX_CHUNK_MS / 1000)),
+      );
+      this.channelBuffers = new Map(names.map((n) => [n, [] as number[]]));
+      this.channelSamplesReceived = new Map(names.map((n) => [n, 0]));
+      this.channelVadFloatBuffers = new Map(
+        names.map((n) => [n, new Float32Array(this.vadFrameSamples)]),
+      );
+      this.channelVadBufferIdx = new Map(names.map((n) => [n, 0]));
+      this.channelVads = new Map(
+        names.map((n) => [n, this.attributionParams!.createVad(n)]),
+      );
+      this.timeline = new VadTimeline(this.attributionParams.timelineWindowMs);
+    }
   }
 
   private connectionUrl(): URL {
@@ -283,6 +405,7 @@ export class StreamingTranscriber {
     listener: (event: LLMGatewayResponseEvent) => void,
   ): void;
   on(event: "warning", listener: (event: WarningEvent) => void): void;
+  on(event: "vad", listener: (event: VadFrame) => void): void;
   on(event: "error", listener: (error: Error) => void): void;
   on(event: "close", listener: (code: number, reason: string) => void): void;
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -323,6 +446,13 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
             reason = StreamingErrorMessages[code as StreamingErrorTypeCodes];
           }
         }
+        // Stop the flush timer when the socket is gone (server-initiated close,
+        // network drop, etc.) — otherwise subsequent ticks call send() on a
+        // closed socket and spam the error listener.
+        if (this.flushTimer) {
+          clearInterval(this.flushTimer);
+          this.flushTimer = undefined;
+        }
         this.listeners.close?.(code, reason);
       };
 
@@ -351,6 +481,22 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
             break;
           }
           case "Turn": {
+            if (this.isDualChannel && this.timeline && this.attributionParams) {
+              attributeTurn(message, this.timeline, {
+                dominanceRatio: this.attributionParams.dominanceRatio,
+              });
+              switch (this.attributionParams.resolveUnknownChannelsMethod) {
+                case "window":
+                  this.resolveUnknownChannelsByWindow(message);
+                  break;
+                case "speaker-history":
+                  this.resolveUnknownChannelsBySpeakerHistory(message);
+                  break;
+                case "none":
+                  // Leave "unknown" words as-is.
+                  break;
+              }
+            }
             this.listeners.turn?.(message);
             break;
           }
@@ -379,6 +525,11 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
     });
   }
 
+  /**
+   * Returns a WritableStream that pumps PCM chunks into `sendAudio`. Single-channel
+   * only — in dual-channel mode use `sendAudio(pcm, { channel })` directly, since
+   * `WritableStream` has no place to carry a channel tag.
+   */
   stream(): WritableStream<AudioData> {
     return new WritableStream<AudioData>({
       write: (chunk: AudioData) => {
@@ -387,8 +538,231 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
     });
   }
 
-  sendAudio(audio: AudioData) {
-    this.send(audio);
+  /**
+   * Send PCM audio.
+   *
+   * In single-channel mode, `audio` is forwarded directly to the WebSocket and
+   * `options` is ignored.
+   *
+   * In dual-channel mode (when `channels` is configured), `options.channel` is
+   * REQUIRED and must match one of the declared channel names. Per-channel PCM is
+   * fed into that channel's VAD, accumulated into a per-channel ring buffer, and
+   * a scheduled flush (`channelAttribution.flushIntervalMs`, default 50ms) mixes
+   * the buffers into mono before sending to the WebSocket.
+   */
+  sendAudio(audio: AudioData, options?: SendAudioOptions) {
+    if (!this.isDualChannel) {
+      this.send(audio);
+      return;
+    }
+    if (!options?.channel) {
+      throw new Error(
+        "StreamingTranscriber is in dual-channel mode; sendAudio requires { channel }.",
+      );
+    }
+    if (!this.channelNames!.includes(options.channel)) {
+      throw new Error(
+        `Unknown channel "${options.channel}"; declared channels: ${this.channelNames!.join(", ")}.`,
+      );
+    }
+    this.ingestChannelAudio(options.channel, audio);
+  }
+
+  private ingestChannelAudio(name: string, audio: AudioData) {
+    const samples = toInt16View(audio);
+    const buf = this.channelBuffers!.get(name)!;
+    const vadBuf = this.channelVadFloatBuffers!.get(name)!;
+    let vadIdx = this.channelVadBufferIdx!.get(name)!;
+    let received = this.channelSamplesReceived!.get(name)!;
+    const vad = this.channelVads!.get(name)!;
+    const sampleRate = this.params.sampleRate;
+    const frameSize = this.vadFrameSamples;
+
+    for (let i = 0; i < samples.length; i++) {
+      const s = samples[i];
+      buf.push(s);
+      vadBuf[vadIdx++] = s / 0x8000;
+      received++;
+      if (vadIdx === frameSize) {
+        const result = vad.process(vadBuf);
+        const frame: VadFrame = {
+          ts: (received / sampleRate) * 1000,
+          channel: name,
+          active: result.active,
+          rms: result.energy,
+        };
+        this.timeline!.pushFrame(frame);
+        this.listeners.vad?.(frame);
+        vadIdx = 0;
+      }
+    }
+
+    this.channelVadBufferIdx!.set(name, vadIdx);
+    this.channelSamplesReceived!.set(name, received);
+
+    if (!this.flushTimer) this.startFlushTimer();
+  }
+
+  private startFlushTimer() {
+    this.flushTimer = setInterval(
+      () => this.flushMix(),
+      this.attributionParams!.flushIntervalMs,
+    );
+  }
+
+  private flushMix(force = false) {
+    if (!this.channelNames || !this.channelBuffers) return;
+    const bufs = this.channelNames.map((n) => this.channelBuffers!.get(n)!);
+    const divisor = bufs.length;
+    // Loop so a backlog (e.g. accumulated while a browser tab was throttled in
+    // the background) drains as multiple sends, each capped at MAX_CHUNK_MS.
+    // Without the cap a single message could exceed the server's 1000 ms input
+    // duration limit and be rejected with code 3007.
+    for (;;) {
+      let mixLen = Infinity;
+      for (const b of bufs) if (b.length < mixLen) mixLen = b.length;
+      if (!Number.isFinite(mixLen) || mixLen === 0) return;
+      // The streaming server rejects audio messages shorter than 50 ms with
+      // `Input Duration Error`. Wait until both per-channel buffers have at
+      // least minChunkSamples worth queued before emitting. The `force` path
+      // (final flush on close) bypasses this so the trailing partial buffer
+      // still gets through.
+      if (!force && mixLen < this.minChunkSamples) return;
+      if (mixLen > this.maxChunkSamples) mixLen = this.maxChunkSamples;
+      const out = new Int16Array(mixLen);
+      for (let i = 0; i < mixLen; i++) {
+        let sum = 0;
+        for (let c = 0; c < divisor; c++) sum += bufs[c][i];
+        const avg = Math.round(sum / divisor);
+        out[i] = avg < -32768 ? -32768 : avg > 32767 ? 32767 : avg;
+      }
+      for (const b of bufs) b.splice(0, mixLen);
+      try {
+        this.send(out.buffer);
+      } catch (err) {
+        this.listeners.error?.(err as Error);
+        return;
+      }
+    }
+  }
+
+  /**
+   * Fill in words whose per-word VAD attribution was `"unknown"` by looking
+   * at the dominant non-`"unknown"` channel among ±N neighbors in the same
+   * turn. Words with no non-`"unknown"` neighbors stay `"unknown"`. Confident
+   * per-word VAD decisions are never modified.
+   *
+   * Local temporal heuristic — ignores `speaker_label`, so it works even when
+   * AAI's diarization re-uses the same label for two physically distinct
+   * voices. Each resolved word gets `channelResolved: true` so downstream
+   * renderers can distinguish inferred channels from directly-measured ones.
+   */
+  private resolveUnknownChannelsByWindow(turn: TurnEvent): void {
+    if (!this.attributionParams) return;
+    const window = this.attributionParams.resolutionWindowWords;
+    const words = turn.words;
+    let mutated = false;
+
+    for (let i = 0; i < words.length; i++) {
+      if (words[i].channel !== "unknown") continue;
+      const tally = new Map<string, number>();
+      const lo = Math.max(0, i - window);
+      const hi = Math.min(words.length - 1, i + window);
+      for (let j = lo; j <= hi; j++) {
+        if (j === i) continue;
+        const ch = words[j].channel;
+        if (!ch || ch === "unknown") continue;
+        tally.set(ch, (tally.get(ch) ?? 0) + 1);
+      }
+      if (tally.size === 0) continue;
+
+      // Pick the dominant neighbor channel. Ties → leave `"unknown"` (rare;
+      // would require an equal count of mic and system neighbors).
+      let top: string | undefined;
+      let topCount = 0;
+      let tied = false;
+      for (const [name, count] of tally) {
+        if (count > topCount) {
+          top = name;
+          topCount = count;
+          tied = false;
+        } else if (count === topCount) {
+          tied = true;
+        }
+      }
+      if (top && !tied) {
+        words[i].channel = top;
+        words[i].channelResolved = true;
+        mutated = true;
+      }
+    }
+
+    // Recompute the rollup only if any per-word channel changed.
+    if (mutated) turn.channel = rollUpTurnChannel(words);
+  }
+
+  /**
+   * Fill `"unknown"` words by looking up the speaker's session-wide channel
+   * evidence. For each `speaker_label`, sums active VAD frame RMS per channel
+   * across every word the speaker has uttered to date. A speaker is
+   * "resolvable" if their total evidence clears
+   * `speakerHistoryMinRmsEvidence` and their top channel exceeds the
+   * runner-up by `speakerHistoryDominanceRatio`.
+   *
+   * Only touches `"unknown"` words. Confident per-word VAD decisions are
+   * never modified. `speaker_label` is never modified.
+   */
+  private resolveUnknownChannelsBySpeakerHistory(turn: TurnEvent): void {
+    if (!this.timeline || !this.attributionParams || !this.speakerHistory)
+      return;
+    const minEvidence = this.attributionParams.speakerHistoryMinRmsEvidence;
+    const dominanceRatio = this.attributionParams.speakerHistoryDominanceRatio;
+
+    // 1. Accumulate evidence from this turn's words.
+    for (const w of turn.words) {
+      if (!w.speaker) continue;
+      const frames = this.timeline.framesInWindow(w.start, w.end);
+      let entry = this.speakerHistory.get(w.speaker);
+      if (!entry) {
+        entry = new Map();
+        this.speakerHistory.set(w.speaker, entry);
+      }
+      for (const f of frames) {
+        if (!f.active) continue;
+        entry.set(f.channel, (entry.get(f.channel) ?? 0) + f.rms);
+      }
+    }
+
+    // 2. Fill unknown words whose speakers have dominant evidence.
+    let mutated = false;
+    for (const w of turn.words) {
+      if (w.channel !== "unknown" || !w.speaker) continue;
+      const entry = this.speakerHistory.get(w.speaker);
+      if (!entry || entry.size === 0) continue;
+      let total = 0;
+      let topName: string | undefined;
+      let topScore = 0;
+      let runnerScore = 0;
+      for (const [name, score] of entry) {
+        total += score;
+        if (score > topScore) {
+          runnerScore = topScore;
+          topScore = score;
+          topName = name;
+        } else if (score > runnerScore) {
+          runnerScore = score;
+        }
+      }
+      if (total < minEvidence) continue;
+      if (runnerScore > 0 && topScore < dominanceRatio * runnerScore) continue;
+      if (topName) {
+        w.channel = topName;
+        w.channelResolved = true;
+        mutated = true;
+      }
+    }
+
+    if (mutated) turn.channel = rollUpTurnChannel(turn.words);
   }
 
   /**
@@ -440,6 +814,15 @@ Learn more at https://github.com/AssemblyAI/assemblyai-node-sdk/blob/main/docs/c
   }
 
   async close(waitForSessionTermination = true) {
+    if (this.flushTimer) {
+      clearInterval(this.flushTimer);
+      this.flushTimer = undefined;
+      // Best-effort: drain any final partial mix so the server gets the tail.
+      // Bypass the 50ms floor here since this is the last flush; if the tail
+      // is <50ms the server will reject that single message, but we'd lose
+      // the audio either way.
+      this.flushMix(true);
+    }
     if (this.socket) {
       if (this.socket.readyState === this.socket.OPEN) {
         if (waitForSessionTermination) {
diff --git a/src/types/streaming/dual-channel.ts b/src/types/streaming/dual-channel.ts
new file mode 100644
index 0000000..a63e0e6
--- /dev/null
+++ b/src/types/streaming/dual-channel.ts
@@ -0,0 +1,57 @@
+/**
+ * Physical input channel that a word/turn was attributed to.
+ * - A channel name declared in `StreamingTranscriberParams.channels` (e.g. `"mic"`, `"system"`).
+ * - `"unknown"`: no channel was clearly dominant during the word's time window (silent
+ *   or all channels evenly active under our threshold).
+ *
+ * This is independent of AssemblyAI's diarization `speaker_label` / `words[i].speaker`,
+ * which identifies voices by acoustic characteristics. A given speaker_label can map
+ * to any physical channel; the two dimensions can disagree.
+ */
+export type Channel = string | "unknown";
+
+/**
+ * Per-channel, per-frame VAD observation emitted by `StreamingTranscriber` when running
+ * in dual-channel mode. `ts` is stream-relative milliseconds, derived from the
+ * per-channel sample counter — the same reference frame as `StreamingWord.start` /
+ * `.end`, so per-word lookups need no conversion.
+ */
+export type VadFrame = {
+  ts: number;
+  channel: string;
+  active: boolean;
+  rms: number;
+};
+
+export type VadDetectorResult = {
+  active: boolean;
+  energy: number;
+};
+
+/**
+ * Pluggable per-channel voice-activity detector. The default `EnergyVad` is energy-based
+ * with an adaptive noise-floor threshold; callers can drop in a DNN-backed detector
+ * (e.g. Silero via `@ricky0123/vad-web`) for noisier environments.
+ *
+ * A separate `VadDetector` instance is held per channel; do not assume cross-channel
+ * state. Frames are fixed-size at the transcriber's target sample rate.
+ */
+export interface VadDetector {
+  process(frame: Float32Array): VadDetectorResult;
+  reset(): void;
+}
+
+/**
+ * Thrown when `DualChannelCapture` is constructed in a non-browser environment
+ * (no `globalThis.AudioContext`). The helper is intentionally surfaced from the
+ * main entrypoint so the import path is uniform across runtimes; the runtime
+ * guard moves to construction time.
+ */
+export class BrowserOnlyError extends Error {
+  constructor(
+    message = "DualChannelCapture requires a browser environment (AudioContext is undefined).",
+  ) {
+    super(message);
+    this.name = "BrowserOnlyError";
+  }
+}
diff --git a/src/types/streaming/index.ts b/src/types/streaming/index.ts
index 3f5f2e9..0eb8731 100644
--- a/src/types/streaming/index.ts
+++ b/src/types/streaming/index.ts
@@ -1,4 +1,65 @@
 import { AudioEncoding } from "..";
+import type { Channel, VadDetector, VadFrame } from "./dual-channel";
+
+export * from "./dual-channel";
+
+/**
+ * Per-channel attribution tuning for dual-channel mode. All fields optional;
+ * ignored when `StreamingTranscriberParams.channels` is not set.
+ */
+export type ChannelAttributionParams = {
+  /** Energy ratio above which a channel is declared dominant for a word. Default 4. */
+  dominanceRatio?: number;
+  /** Rolling VAD timeline window in ms. Default 30_000. */
+  timelineWindowMs?: number;
+  /**
+   * Factory for the per-channel VAD detector. Called once per declared channel
+   * at transcriber construction time. The channel name is passed so factories
+   * that wrap higher-level VAD libraries (which manage their own audio source)
+   * can map each `VadDetector` instance to its corresponding channel.
+   */
+  createVad?: (channelName: string) => VadDetector;
+  /** Mix flush interval in ms — how often per-channel buffers are summed and sent. Default 50. */
+  flushIntervalMs?: number;
+  /**
+   * Strategy used to fill words whose per-word VAD attribution resolved to
+   * `"unknown"`. Confident per-word VAD decisions (`"mic"` / `"system"`) are
+   * never modified by any strategy.
+   *
+   * - `"window"` (default): look at the dominant non-`"unknown"` channel
+   *   among ±`resolutionWindowWords` neighboring words in the same turn.
+   *   Ignores `speaker_label`, so it works even when AAI re-uses a label for
+   *   two physically distinct voices.
+   * - `"speaker-history"`: accumulate per-`speaker_label` per-channel active
+   *   VAD energy across the session, then fill `"unknown"` words with the
+   *   speaker's dominant channel when it clears
+   *   `speakerHistoryMinRmsEvidence` and beats runner-up by
+   *   `speakerHistoryDominanceRatio`. Robust for stable speaker labels but
+   *   does nothing when a speaker has split evidence.
+   * - `"none"`: disable resolution; `"unknown"` words remain `"unknown"` in
+   *   the output.
+   */
+  resolveUnknownChannelsMethod?: "none" | "window" | "speaker-history";
+  /**
+   * Half-window (in words) on each side of an `"unknown"` word for the
+   * `"window"` method. Default 2 — so the full window is up to 5 words
+   * (2 before + the unknown + 2 after).
+   */
+  resolutionWindowWords?: number;
+  /**
+   * Minimum cumulative active-RMS evidence (sum across all the speaker's
+   * frames to date) before a speaker can be resolved via the
+   * `"speaker-history"` method. Default 0.5 — roughly a few seconds of
+   * sustained speech.
+   */
+  speakerHistoryMinRmsEvidence?: number;
+  /**
+   * For the `"speaker-history"` method, the top channel's evidence must
+   * exceed the runner-up's by at least this factor for the speaker to be
+   * considered pinned to that channel. Default 3.
+   */
+  speakerHistoryDominanceRatio?: number;
+};
 
 export type LLMGatewayMessage = {
   role: string;
@@ -50,6 +111,33 @@ export type StreamingTranscriberParams = {
   webhookUrl?: string;
   webhookAuthHeaderName?: string;
   webhookAuthHeaderValue?: string;
+  /**
+   * Enable dual-channel (or N-channel) mode. Presence of `channels` switches the
+   * transcriber into channel-tagged mode: `sendAudio(audio, { channel })` is required,
+   * per-channel VAD runs on the raw PCM, the streams are mixed to mono before being
+   * sent to the server, and emitted `TurnEvent`s are enriched with `channel` and
+   * per-word `channel` attribution.
+   *
+   * Must contain exactly 2 entries with unique names. The names are echoed back in
+   * `TurnEvent.channel` / `words[i].channel`.
+   *
+   * **Acoustic-leak caveat.** Per-word channel attribution uses energy-based
+   * VAD on each channel. If your capture setup lets one channel's audio bleed
+   * into another at similar amplitude — typically system audio playing
+   * through speakers and being picked up by an open mic — attribution can
+   * misfire (mic-tagged words that were actually system). Transcription
+   * quality is unaffected; only the `channel` field is. To preserve
+   * attribution in speaker-leak setups, apply echo cancellation at capture
+   * before feeding audio to the SDK. In browsers, that's
+   * `getUserMedia({ audio: { echoCancellation: true } })`. On macOS native,
+   * `AVAudioEngine.setVoiceProcessingEnabled(true)` on the input node. If
+   * platform-level AEC isn't available, swap in a DNN VAD (e.g. Silero) via
+   * `channelAttribution.createVad`. See the dual-channel sample app's
+   * README for worked examples.
+   */
+  channels?: Array<{ name: string }>;
+  /** Tuning for dual-channel attribution. Ignored when `channels` is unset. */
+  channelAttribution?: ChannelAttributionParams;
 };
 
 export type StreamingEvents =
@@ -59,6 +147,7 @@ export type StreamingEvents =
   | "speechStarted"
   | "llmGatewayResponse"
   | "warning"
+  | "vad"
   | "error";
 
 export type StreamingListeners = {
@@ -68,6 +157,7 @@ export type StreamingListeners = {
   speechStarted?: (event: SpeechStartedEvent) => void;
   llmGatewayResponse?: (event: LLMGatewayResponseEvent) => void;
   warning?: (event: WarningEvent) => void;
+  vad?: (event: VadFrame) => void;
   error?: (error: Error) => void;
 };
 
@@ -186,6 +276,12 @@ export type TurnEvent = {
   language_code?: string;
   language_confidence?: number;
   speaker_label?: string;
+  /**
+   * Duration-weighted majority channel across `words[i].channel`. Populated only
+   * when the transcriber is configured with `channels`. Independent from
+   * `speaker_label`.
+   */
+  channel?: Channel;
 };
 
 export type StreamingWord = {
@@ -195,6 +291,20 @@ export type StreamingWord = {
   text: string;
   word_is_final: boolean;
   speaker?: string;
+  /**
+   * Physical input channel attributed by client-side VAD during this word's
+   * time window. Populated only when the transcriber is configured with
+   * `channels`. Independent from `speaker`.
+   */
+  channel?: Channel;
+  /**
+   * True if `channel` was filled in by `channelAttribution.resolveUnknownChannelsMethod`
+   * rather than by the per-word VAD. Only set on words whose per-word VAD
+   * attribution was `"unknown"` and whose resolution method produced a
+   * confident channel. Useful for debugging or rendering an indicator that a
+   * word's channel came from context, not direct VAD evidence.
+   */
+  channelResolved?: boolean;
 };
 
 export type TerminationEvent = {
diff --git a/tests/unit/dual-channel-browser-guard.test.ts b/tests/unit/dual-channel-browser-guard.test.ts
new file mode 100644
index 0000000..dafd796
--- /dev/null
+++ b/tests/unit/dual-channel-browser-guard.test.ts
@@ -0,0 +1,19 @@
+import {
+  BrowserOnlyError,
+  DualChannelCapture,
+} from "../../src/services/streaming/browser/dual-channel-capture";
+import { StreamingTranscriber } from "../../src/services/streaming/service";
+
+describe("DualChannelCapture in non-browser env", () => {
+  it("throws BrowserOnlyError when AudioContext is missing", () => {
+    // jest runs with testEnvironment: "node", so globalThis.AudioContext is undefined.
+    expect(
+      () =>
+        new DualChannelCapture({
+          micStream: undefined as unknown as MediaStream,
+          systemStream: undefined as unknown as MediaStream,
+          transcriber: undefined as unknown as StreamingTranscriber,
+        }),
+    ).toThrow(BrowserOnlyError);
+  });
+});
diff --git a/tests/unit/dual-channel-energy-vad.test.ts b/tests/unit/dual-channel-energy-vad.test.ts
new file mode 100644
index 0000000..5b38797
--- /dev/null
+++ b/tests/unit/dual-channel-energy-vad.test.ts
@@ -0,0 +1,53 @@
+import { EnergyVad } from "../../src/services/streaming/energy-vad";
+
+const FRAME = 320;
+
+function silence(): Float32Array {
+  return new Float32Array(FRAME);
+}
+
+function tone(amp: number): Float32Array {
+  const f = new Float32Array(FRAME);
+  for (let i = 0; i < FRAME; i++) f[i] = amp * Math.sin((2 * Math.PI * i) / 16);
+  return f;
+}
+
+describe("EnergyVad", () => {
+  it("reports silent frames as inactive after the noise floor adapts", () => {
+    const vad = new EnergyVad({ initialNoiseFloor: 1e-4, hangoverFrames: 0 });
+    // Warm up noise floor with silence.
+    for (let i = 0; i < 5; i++) vad.process(silence());
+    expect(vad.process(silence()).active).toBe(false);
+  });
+
+  it("triggers on a tone significantly above the noise floor", () => {
+    const vad = new EnergyVad({ initialNoiseFloor: 1e-4, hangoverFrames: 0 });
+    for (let i = 0; i < 5; i++) vad.process(silence());
+    expect(vad.process(tone(0.5)).active).toBe(true);
+  });
+
+  it("holds active state through hangover frames", () => {
+    const vad = new EnergyVad({ initialNoiseFloor: 1e-4, hangoverFrames: 3 });
+    for (let i = 0; i < 5; i++) vad.process(silence());
+    expect(vad.process(tone(0.5)).active).toBe(true);
+    expect(vad.process(silence()).active).toBe(true);
+    expect(vad.process(silence()).active).toBe(true);
+    expect(vad.process(silence()).active).toBe(true);
+    expect(vad.process(silence()).active).toBe(false);
+  });
+
+  it("reset() restores initial state", () => {
+    const vad = new EnergyVad({ initialNoiseFloor: 1e-4, hangoverFrames: 5 });
+    vad.process(tone(0.9));
+    expect(vad.process(silence()).active).toBe(true); // in hangover
+    vad.reset();
+    expect(vad.process(silence()).active).toBe(false);
+  });
+
+  it("returns RMS energy alongside the activity decision", () => {
+    const vad = new EnergyVad();
+    const result = vad.process(tone(0.5));
+    expect(result.energy).toBeGreaterThan(0);
+    expect(result.energy).toBeLessThan(1);
+  });
+});
diff --git a/tests/unit/dual-channel-label-mapper.test.ts b/tests/unit/dual-channel-label-mapper.test.ts
new file mode 100644
index 0000000..3f5504d
--- /dev/null
+++ b/tests/unit/dual-channel-label-mapper.test.ts
@@ -0,0 +1,227 @@
+import { StreamingWord, TurnEvent } from "../../src";
+import {
+  attributeTurn,
+  attributeWord,
+  rollUpTurnChannel,
+  VadTimeline,
+} from "../../src/services/streaming/label-mapper";
+import { VadFrame } from "../../src/types/streaming/dual-channel";
+
+function frame(
+  ts: number,
+  channel: string,
+  active: boolean,
+  rms = active ? 0.5 : 0,
+): VadFrame {
+  return { ts, channel, active, rms };
+}
+
+function buildTimeline(frames: VadFrame[]): VadTimeline {
+  const t = new VadTimeline(60_000);
+  for (const f of frames) t.pushFrame(f);
+  return t;
+}
+
+describe("attributeWord", () => {
+  const params = { dominanceRatio: 4 };
+
+  it("returns 'unknown' for a window with no VAD frames", () => {
+    const t = new VadTimeline(60_000);
+    const ch = attributeWord(
+      { start: 0, end: 200, confidence: 1, text: "hi", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("unknown");
+  });
+
+  it("returns 'mic' when only mic frames are active in the window", () => {
+    const t = buildTimeline([
+      frame(10, "mic", true),
+      frame(30, "mic", true),
+      frame(50, "mic", true),
+    ]);
+    const ch = attributeWord(
+      { start: 0, end: 100, confidence: 1, text: "hello", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("mic");
+  });
+
+  it("returns 'system' when only system frames are active", () => {
+    const t = buildTimeline([
+      frame(10, "system", true),
+      frame(30, "system", true),
+    ]);
+    const ch = attributeWord(
+      { start: 0, end: 100, confidence: 1, text: "ok", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("system");
+  });
+
+  it("returns the higher-scoring channel when ratio threshold not met", () => {
+    const t = buildTimeline([
+      frame(10, "mic", true, 0.5),
+      frame(20, "system", true, 0.4),
+    ]);
+    const ch = attributeWord(
+      { start: 0, end: 100, confidence: 1, text: "x", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("mic");
+  });
+
+  it("returns 'unknown' on an exact tie", () => {
+    const t = buildTimeline([
+      frame(10, "mic", true, 0.5),
+      frame(20, "system", true, 0.5),
+    ]);
+    const ch = attributeWord(
+      { start: 0, end: 100, confidence: 1, text: "x", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("unknown");
+  });
+
+  it("ignores inactive frames", () => {
+    const t = buildTimeline([
+      frame(10, "mic", false, 0.9),
+      frame(20, "system", true, 0.1),
+    ]);
+    const ch = attributeWord(
+      { start: 0, end: 100, confidence: 1, text: "x", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("system");
+  });
+
+  it("ignores frames outside the [start, end] window", () => {
+    const t = buildTimeline([
+      frame(10, "system", true),
+      frame(150, "mic", true),
+      frame(500, "system", true),
+    ]);
+    const ch = attributeWord(
+      { start: 100, end: 200, confidence: 1, text: "x", word_is_final: true },
+      t,
+      params,
+    );
+    expect(ch).toBe("mic");
+  });
+});
+
+describe("rollUpTurnChannel", () => {
+  function w(
+    channel: string | undefined,
+    start: number,
+    end: number,
+  ): StreamingWord {
+    return {
+      start,
+      end,
+      confidence: 1,
+      text: "x",
+      word_is_final: true,
+      channel,
+    };
+  }
+
+  it("returns 'unknown' for an empty word list", () => {
+    expect(rollUpTurnChannel([])).toBe("unknown");
+  });
+
+  it("returns 'unknown' when all words are unknown", () => {
+    expect(rollUpTurnChannel([w("unknown", 0, 100)])).toBe("unknown");
+  });
+
+  it("returns 'unknown' when all words are missing a channel", () => {
+    expect(rollUpTurnChannel([w(undefined, 0, 100)])).toBe("unknown");
+  });
+
+  it("returns the duration-dominant channel", () => {
+    expect(
+      rollUpTurnChannel([w("mic", 0, 1000), w("system", 1000, 1100)]),
+    ).toBe("mic");
+  });
+
+  it("returns 'unknown' on equal mic/system durations", () => {
+    expect(rollUpTurnChannel([w("mic", 0, 500), w("system", 500, 1000)])).toBe(
+      "unknown",
+    );
+  });
+
+  it("ignores unknown-channel words in the rollup vote", () => {
+    expect(
+      rollUpTurnChannel([w("unknown", 0, 10_000), w("mic", 10_000, 10_200)]),
+    ).toBe("mic");
+  });
+});
+
+describe("attributeTurn", () => {
+  it("mutates the turn in place: sets channel on each word and rolls up", () => {
+    const t = buildTimeline([
+      frame(10, "mic", true),
+      frame(150, "system", true),
+    ]);
+    const turn: TurnEvent = {
+      type: "Turn",
+      turn_order: 1,
+      turn_is_formatted: false,
+      end_of_turn: true,
+      transcript: "hi there",
+      end_of_turn_confidence: 0.9,
+      speaker_label: "A",
+      words: [
+        {
+          start: 0,
+          end: 100,
+          confidence: 1,
+          text: "hi",
+          word_is_final: true,
+          speaker: "A",
+        },
+        {
+          start: 100,
+          end: 200,
+          confidence: 1,
+          text: "there",
+          word_is_final: true,
+          speaker: "A",
+        },
+      ],
+    };
+    attributeTurn(turn, t, { dominanceRatio: 4 });
+    expect(turn.speaker_label).toBe("A");
+    expect(turn.words[0].speaker).toBe("A");
+    expect(turn.words[0].channel).toBe("mic");
+    expect(turn.words[1].channel).toBe("system");
+    expect(turn.channel).toBe("unknown"); // equal duration → tie → unknown
+  });
+});
+
+describe("VadTimeline window behavior", () => {
+  it("drops frames outside the rolling window", () => {
+    const t = new VadTimeline(100);
+    t.pushFrame(frame(0, "mic", true));
+    t.pushFrame(frame(50, "mic", true));
+    t.pushFrame(frame(150, "mic", true));
+    t.pushFrame(frame(200, "system", true));
+    const inWindow = t.framesInWindow(0, 1000);
+    expect(inWindow.map((f) => f.ts)).toEqual([150, 200]);
+  });
+
+  it("keeps everything inside the rolling window", () => {
+    const t = new VadTimeline(1000);
+    t.pushFrame(frame(0, "mic", true));
+    t.pushFrame(frame(500, "mic", true));
+    t.pushFrame(frame(900, "mic", true));
+    const inWindow = t.framesInWindow(0, 1000);
+    expect(inWindow.length).toBe(3);
+  });
+});
diff --git a/tests/unit/dual-channel-resampler.test.ts b/tests/unit/dual-channel-resampler.test.ts
new file mode 100644
index 0000000..a606d06
--- /dev/null
+++ b/tests/unit/dual-channel-resampler.test.ts
@@ -0,0 +1,61 @@
+import {
+  float32ToPcm16,
+  LinearResampler,
+} from "../../src/services/streaming/resampler";
+
+describe("LinearResampler", () => {
+  it("is identity when source == target", () => {
+    const r = new LinearResampler(16000, 16000);
+    const input = new Float32Array([0.1, 0.2, 0.3, 0.4]);
+    const out = r.process(input);
+    expect(out.length).toBe(4);
+    for (let i = 0; i < 4; i++) expect(out[i]).toBeCloseTo(input[i], 5);
+  });
+
+  it("approximately halves length when down-sampling 2:1", () => {
+    const r = new LinearResampler(48000, 24000);
+    const input = new Float32Array(960);
+    for (let i = 0; i < input.length; i++) input[i] = i / 960;
+    const out = r.process(input);
+    expect(out.length).toBeGreaterThan(479);
+    expect(out.length).toBeLessThan(482);
+  });
+
+  it("preserves continuity across chunk boundaries", () => {
+    const r = new LinearResampler(48000, 16000);
+    const chunkA = new Float32Array(480);
+    const chunkB = new Float32Array(480);
+    for (let i = 0; i < 480; i++) {
+      chunkA[i] = i;
+      chunkB[i] = 480 + i;
+    }
+    const a = r.process(chunkA);
+    const b = r.process(chunkB);
+    expect(b.length).toBeGreaterThan(0);
+    // The first output of chunkB should be near the last input of chunkA
+    // (continuity), not jump back to 0.
+    expect(b[0]).toBeGreaterThan(a[a.length - 1]);
+  });
+
+  it("rejects non-positive sample rates", () => {
+    expect(() => new LinearResampler(0, 16000)).toThrow();
+    expect(() => new LinearResampler(48000, -1)).toThrow();
+  });
+});
+
+describe("float32ToPcm16", () => {
+  it("clamps and packs to little-endian Int16", () => {
+    const buf = float32ToPcm16(new Float32Array([0, 1, -1, 2, -2]));
+    const view = new DataView(buf);
+    expect(view.getInt16(0, true)).toBe(0);
+    expect(view.getInt16(2, true)).toBe(0x7fff);
+    expect(view.getInt16(4, true)).toBe(-0x8000);
+    expect(view.getInt16(6, true)).toBe(0x7fff); // clamped from 2
+    expect(view.getInt16(8, true)).toBe(-0x8000); // clamped from -2
+  });
+
+  it("returns an empty buffer for empty input", () => {
+    const buf = float32ToPcm16(new Float32Array(0));
+    expect(buf.byteLength).toBe(0);
+  });
+});
diff --git a/tests/unit/streaming-dual-channel.test.ts b/tests/unit/streaming-dual-channel.test.ts
new file mode 100644
index 0000000..8293eb1
--- /dev/null
+++ b/tests/unit/streaming-dual-channel.test.ts
@@ -0,0 +1,864 @@
+jest.mock("ws", () => require("./mocks/ws"));
+
+import WS from "jest-websocket-mock";
+import fetchMock from "jest-fetch-mock";
+import { StreamingTranscriber, TurnEvent, VadFrame } from "../../src";
+
+fetchMock.enableMocks();
+
+const websocketBaseUrl = "wss://localhost:4242/v3/ws";
+
+const sessionBeginsMessage = {
+  type: "Begin",
+  id: "abc",
+  expires_at: 1,
+};
+
+const sessionTerminatedMessage = {
+  type: "Termination",
+};
+
+async function connect(rt: StreamingTranscriber, server: WS): Promise<void> {
+  const p = rt.connect();
+  await server.connected;
+  server.send(JSON.stringify(sessionBeginsMessage));
+  await p;
+}
+
+async function teardown(rt: StreamingTranscriber, server: WS): Promise<void> {
+  const p = rt.close();
+  server.send(JSON.stringify(sessionTerminatedMessage));
+  await p;
+  await server.closed;
+  WS.clean();
+}
+
+/** Build a 20ms-frame-aligned loud Int16 PCM buffer at the given amplitude. */
+function loudPcm(samples: number, amplitude = 20_000): ArrayBuffer {
+  const buf = new Int16Array(samples);
+  for (let i = 0; i < samples; i++) {
+    buf[i] = i % 2 === 0 ? amplitude : -amplitude;
+  }
+  return buf.buffer;
+}
+
+/** Build a silent Int16 PCM buffer (all zeros). */
+function silentPcm(samples: number): ArrayBuffer {
+  return new Int16Array(samples).buffer;
+}
+
+describe("StreamingTranscriber constructor validation (dual-channel)", () => {
+  it("throws when channels has the wrong arity", () => {
+    expect(
+      () =>
+        new StreamingTranscriber({
+          token: "t",
+          sampleRate: 16_000,
+          speechModel: "u3-rt-pro",
+          channels: [{ name: "only" }],
+        }),
+    ).toThrow(/exactly 2/);
+  });
+
+  it("throws when channel names are not unique", () => {
+    expect(
+      () =>
+        new StreamingTranscriber({
+          token: "t",
+          sampleRate: 16_000,
+          speechModel: "u3-rt-pro",
+          channels: [{ name: "dup" }, { name: "dup" }],
+        }),
+    ).toThrow(/unique/);
+  });
+});
+
+describe("StreamingTranscriber single-channel backward compat", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  it("sendAudio(buf) forwards directly to ws (no channel needed)", async () => {
+    const payload = silentPcm(160);
+    rt.sendAudio(payload);
+    await expect(server).toReceiveMessage(payload);
+  });
+
+  it("sendAudio(buf, { channel }) is accepted and ignored in single-channel mode", async () => {
+    const payload = silentPcm(160);
+    rt.sendAudio(payload, { channel: "anything" });
+    await expect(server).toReceiveMessage(payload);
+  });
+});
+
+describe("StreamingTranscriber dual-channel sendAudio validation", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  it("throws when channel is missing in dual-channel mode", () => {
+    expect(() => rt.sendAudio(silentPcm(16))).toThrow(
+      /dual-channel mode; sendAudio requires/,
+    );
+  });
+
+  it("throws on unknown channel name", () => {
+    expect(() => rt.sendAudio(silentPcm(16), { channel: "bogus" })).toThrow(
+      /Unknown channel "bogus"/,
+    );
+  });
+});
+
+describe("StreamingTranscriber dual-channel mixing and VAD", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+      channelAttribution: { flushIntervalMs: 50 },
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  it("flushes mixed mono PCM at flushIntervalMs cadence", async () => {
+    // 800 samples = 50ms at 16kHz on each channel — exactly the minimum the
+    // server accepts per audio message. Both channels send the same amount,
+    // so the next flush emits 800 Int16 samples = 1600 bytes of mono PCM.
+    rt.sendAudio(loudPcm(800), { channel: "mic" });
+    rt.sendAudio(silentPcm(800), { channel: "system" });
+    const msg = (await server.nextMessage) as ArrayBuffer;
+    expect(msg.byteLength).toBe(800 * 2);
+  });
+
+  it("emits a 'vad' event per 20ms frame as PCM is ingested", () => {
+    const vadFrames: VadFrame[] = [];
+    rt.on("vad", (f) => vadFrames.push(f));
+    // 320 samples = exactly one 20ms VAD frame at 16kHz.
+    rt.sendAudio(loudPcm(320), { channel: "mic" });
+    rt.sendAudio(silentPcm(320), { channel: "system" });
+    expect(vadFrames.length).toBe(2);
+    const channels = vadFrames.map((f) => f.channel).sort();
+    expect(channels).toEqual(["mic", "system"]);
+    const mic = vadFrames.find((f) => f.channel === "mic")!;
+    expect(mic.rms).toBeGreaterThan(0);
+    expect(mic.ts).toBeCloseTo(20, 1); // 320 / 16000 * 1000 = 20ms
+  });
+
+  it("attributes a Turn message based on per-channel VAD energy", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive ~200ms of loud mic audio + matching silence on system.
+    // Energy VAD with default thresholdRatio=3 takes a few frames to "lock on";
+    // 10 frames (200ms) is enough to push energy well above the adaptive floor.
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+
+    const turn: TurnEvent = {
+      type: "Turn",
+      turn_order: 1,
+      turn_is_formatted: false,
+      end_of_turn: true,
+      transcript: "hello",
+      end_of_turn_confidence: 0.9,
+      speaker_label: "A",
+      words: [
+        {
+          start: 20,
+          end: 200,
+          confidence: 1,
+          text: "hello",
+          word_is_final: true,
+          speaker: "A",
+        },
+      ],
+    };
+    server.send(JSON.stringify(turn));
+
+    expect(received).toHaveLength(1);
+    const got = received[0];
+    expect(got.speaker_label).toBe("A");
+    expect(got.words[0].speaker).toBe("A");
+    expect(got.words[0].channel).toBe("mic");
+    expect(got.channel).toBe("mic");
+  });
+
+  it("mixes per-channel PCM with /channelCount averaging", async () => {
+    // 800 samples each (50ms floor): mic constant +10000, system constant
+    // -10000. Average per sample = 0; mixed mono should be all zeros.
+    const micBuf = new Int16Array(800);
+    const sysBuf = new Int16Array(800);
+    for (let i = 0; i < 800; i++) {
+      micBuf[i] = 10_000;
+      sysBuf[i] = -10_000;
+    }
+    rt.sendAudio(micBuf.buffer, { channel: "mic" });
+    rt.sendAudio(sysBuf.buffer, { channel: "system" });
+    const msg = (await server.nextMessage) as ArrayBuffer;
+    const mixed = new Int16Array(msg);
+    expect(mixed.length).toBe(800);
+    for (let i = 0; i < mixed.length; i++) {
+      expect(mixed[i]).toBe(0);
+    }
+
+    // Now: both channels at +20000 → mix should be +20000 (avg, not sum).
+    const both = new Int16Array(800);
+    for (let i = 0; i < 800; i++) both[i] = 20_000;
+    rt.sendAudio(both.buffer, { channel: "mic" });
+    rt.sendAudio(both.buffer, { channel: "system" });
+    const msg2 = (await server.nextMessage) as ArrayBuffer;
+    const mixed2 = new Int16Array(msg2);
+    for (let i = 0; i < mixed2.length; i++) {
+      expect(mixed2[i]).toBe(20_000);
+    }
+  });
+
+  it("only flushes min(channel lengths); the longer channel retains its tail", async () => {
+    // Mic: 1600 samples, system: 800 samples (both ≥ the 50ms floor). First
+    // flush should emit 800 mixed samples; the remaining 800 mic samples wait
+    // for system to catch up.
+    rt.sendAudio(loudPcm(1600), { channel: "mic" });
+    rt.sendAudio(silentPcm(800), { channel: "system" });
+    const first = (await server.nextMessage) as ArrayBuffer;
+    expect(first.byteLength).toBe(800 * 2);
+
+    // Now feed 800 more on system → second flush of 800 mixed samples.
+    rt.sendAudio(silentPcm(800), { channel: "system" });
+    const second = (await server.nextMessage) as ArrayBuffer;
+    expect(second.byteLength).toBe(800 * 2);
+  });
+
+  it("aggregates samples across sendAudio calls into a single VAD frame", () => {
+    const vadFrames: VadFrame[] = [];
+    rt.on("vad", (f) => vadFrames.push(f));
+    // Feed mic in two halves of 160 samples — should still produce exactly one
+    // 20ms VAD frame once the second half lands.
+    rt.sendAudio(loudPcm(160), { channel: "mic" });
+    expect(vadFrames.filter((f) => f.channel === "mic")).toHaveLength(0);
+    rt.sendAudio(loudPcm(160), { channel: "mic" });
+    const mic = vadFrames.filter((f) => f.channel === "mic");
+    expect(mic).toHaveLength(1);
+    expect(mic[0].ts).toBeCloseTo(20, 1); // 320 / 16000 * 1000
+  });
+
+  it("withholds sub-50ms flushes until enough audio accumulates", async () => {
+    // 320 samples = 20ms — below the server's 50ms floor. The mixer should
+    // hold them in the per-channel buffers and NOT emit a message yet.
+    rt.sendAudio(loudPcm(320), { channel: "mic" });
+    rt.sendAudio(silentPcm(320), { channel: "system" });
+    // Top each channel up to exactly 50ms (320 + 480 = 800 samples). Now the
+    // floor is met and the next flush emits 800 samples — and only those 800.
+    rt.sendAudio(loudPcm(480), { channel: "mic" });
+    rt.sendAudio(silentPcm(480), { channel: "system" });
+
+    const msg = (await server.nextMessage) as ArrayBuffer;
+    expect(msg.byteLength).toBe(800 * 2);
+  });
+
+  it("produces multiple sequential flushes as new PCM arrives", async () => {
+    rt.sendAudio(loudPcm(800), { channel: "mic" });
+    rt.sendAudio(silentPcm(800), { channel: "system" });
+    const a = (await server.nextMessage) as ArrayBuffer;
+    expect(a.byteLength).toBe(800 * 2);
+
+    rt.sendAudio(loudPcm(800), { channel: "mic" });
+    rt.sendAudio(silentPcm(800), { channel: "system" });
+    const b = (await server.nextMessage) as ArrayBuffer;
+    expect(b.byteLength).toBe(800 * 2);
+  });
+
+  it("caps each emitted chunk at MAX_CHUNK_MS even with a large backlog", async () => {
+    // 8000 samples per channel = 500 ms backlog at 16 kHz. Without the cap
+    // this would emit a single 16,000-byte (>1000 ms equivalent on the wire)
+    // message; with the cap it drains in multiple ≤200 ms (3200-sample) sends.
+    rt.sendAudio(loudPcm(8000), { channel: "mic" });
+    rt.sendAudio(silentPcm(8000), { channel: "system" });
+
+    let totalSamples = 0;
+    const maxSamplesPerSend = Math.round(16_000 * (200 / 1000));
+    while (totalSamples < 8000) {
+      const msg = (await server.nextMessage) as ArrayBuffer;
+      const samples = msg.byteLength / 2;
+      expect(samples).toBeLessThanOrEqual(maxSamplesPerSend);
+      expect(samples).toBeGreaterThan(0);
+      totalSamples += samples;
+    }
+    expect(totalSamples).toBe(8000);
+  });
+
+  it("clears the flush timer when the socket is closed externally", async () => {
+    const spy = jest.spyOn(global, "clearInterval");
+    rt.sendAudio(silentPcm(160), { channel: "mic" });
+    rt.sendAudio(silentPcm(160), { channel: "system" });
+    // Close the client-side WebSocket without going through transcriber.close().
+    // The onclose handler should fire and clear the flush timer so subsequent
+    // ticks don't try to call send() on a dead socket.
+    type SocketHolder = { socket?: { close: () => void } };
+    (rt as unknown as SocketHolder).socket?.close();
+    // Yield a tick so the onclose event drains.
+    await new Promise((resolve) => setTimeout(resolve, 20));
+    expect(spy).toHaveBeenCalled();
+    spy.mockRestore();
+
+    // Replace rt/server so afterEach can run the standard teardown.
+    WS.clean();
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+      channelAttribution: { flushIntervalMs: 50 },
+    });
+    await connect(rt, server);
+  });
+
+  it("close() clears the flush timer", async () => {
+    const spy = jest.spyOn(global, "clearInterval");
+    rt.sendAudio(silentPcm(160), { channel: "mic" });
+    rt.sendAudio(silentPcm(160), { channel: "system" });
+
+    const closePromise = rt.close();
+    server.send(JSON.stringify(sessionTerminatedMessage));
+    await closePromise;
+    await server.closed;
+    WS.clean();
+    expect(spy).toHaveBeenCalled();
+    spy.mockRestore();
+
+    // Replace rt/server so afterEach can run the standard teardown without errors.
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+    });
+    await connect(rt, server);
+  });
+});
+
+describe("StreamingTranscriber resolveUnknownChannelsMethod = 'window'", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+      channelAttribution: {
+        resolveUnknownChannelsMethod: "window",
+        resolutionWindowWords: 2,
+      },
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  /**
+   * Build a TurnEvent whose words already have pre-set `channel` values, so
+   * tests can exercise the unknown-channel resolver without driving VAD audio
+   * through the transcriber. attributeTurn() will overwrite the channels with
+   * what it sees on the timeline — but for words whose timestamps don't
+   * overlap any timeline frame, attributeTurn returns "unknown", preserving
+   * the pre-set values via... actually no, it sets unknown.
+   *
+   * To get reliable per-word channel values we drive the timeline first.
+   */
+  function makeTurn(
+    words: Array<{
+      text: string;
+      start: number;
+      end: number;
+      speaker?: string;
+    }>,
+  ): TurnEvent {
+    return {
+      type: "Turn",
+      turn_order: 1,
+      turn_is_formatted: false,
+      end_of_turn: true,
+      transcript: words.map((w) => w.text).join(" "),
+      end_of_turn_confidence: 0.9,
+      speaker_label: words[0].speaker ?? "A",
+      words: words.map((w) => ({
+        start: w.start,
+        end: w.end,
+        confidence: 1,
+        text: w.text,
+        word_is_final: true,
+        speaker: w.speaker ?? "A",
+      })),
+    };
+  }
+
+  it("fills an unknown word surrounded by mic neighbors and marks it channelResolved", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive ~200ms of mic-loud audio so the first two words get a confident
+    // "mic" attribution from per-word VAD.
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+
+    // Word at [10000, 10100] has no VAD frames → unknown. Surrounding mic
+    // words should let the window resolver fill it as mic.
+    server.send(
+      JSON.stringify(
+        makeTurn([
+          { text: "hello", start: 20, end: 80 },
+          { text: "there", start: 100, end: 180 },
+          { text: "a", start: 10_000, end: 10_100 }, // no VAD evidence
+        ]),
+      ),
+    );
+
+    const turn = received[0];
+    expect(turn.words[0].channel).toBe("mic");
+    expect(turn.words[1].channel).toBe("mic");
+    expect(turn.words[2].channel).toBe("mic");
+    expect(turn.words[2].channelResolved).toBe(true);
+    expect(turn.words[0].channelResolved).toBeUndefined();
+    expect(turn.channel).toBe("mic");
+  });
+
+  it("leaves an unknown word alone if it has no non-unknown neighbors", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // No audio driven → all words attributeWord-resolve to "unknown".
+    server.send(
+      JSON.stringify(
+        makeTurn([
+          { text: "x", start: 0, end: 100 },
+          { text: "y", start: 100, end: 200 },
+          { text: "z", start: 200, end: 300 },
+        ]),
+      ),
+    );
+
+    const turn = received[0];
+    for (const w of turn.words) {
+      expect(w.channel).toBe("unknown");
+      expect(w.channelResolved).toBeUndefined();
+    }
+  });
+
+  it("leaves an unknown word alone on a neighbor tie (1 mic + 1 system)", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive both: 200ms loud on mic AND loud on system. First we need mic-only
+    // for the first word, then system-only for the third word, separated by
+    // an unknown middle. That requires two distinct time windows.
+    // Window 1 (0-200ms): mic loud, system silent → first word resolves to mic.
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+    // Window 2 (200-400ms): mic silent, system loud → third word resolves to system.
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(silentPcm(320), { channel: "mic" });
+      rt.sendAudio(loudPcm(320), { channel: "system" });
+    }
+
+    server.send(
+      JSON.stringify(
+        makeTurn([
+          { text: "first", start: 20, end: 180 },
+          { text: "middle", start: 5_000, end: 5_100 }, // unknown, no VAD
+          { text: "third", start: 220, end: 380 },
+        ]),
+      ),
+    );
+
+    const turn = received[0];
+    expect(turn.words[0].channel).toBe("mic");
+    expect(turn.words[2].channel).toBe("system");
+    // Tie between mic and system neighbors → middle stays unknown.
+    expect(turn.words[1].channel).toBe("unknown");
+    expect(turn.words[1].channelResolved).toBeUndefined();
+  });
+
+  it("never modifies confident per-word VAD decisions", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive mic-loud audio so per-word VAD confidently calls each word "mic".
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+
+    server.send(
+      JSON.stringify(
+        makeTurn([
+          { text: "alpha", start: 20, end: 80 },
+          { text: "beta", start: 100, end: 180 },
+        ]),
+      ),
+    );
+
+    const turn = received[0];
+    expect(turn.words[0].channel).toBe("mic");
+    expect(turn.words[1].channel).toBe("mic");
+    // No words were unknown → no resolution markers should be set.
+    expect(turn.words[0].channelResolved).toBeUndefined();
+    expect(turn.words[1].channelResolved).toBeUndefined();
+  });
+});
+
+describe("StreamingTranscriber resolveUnknownChannelsMethod = 'none'", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+      channelAttribution: { resolveUnknownChannelsMethod: "none" },
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  it("leaves unknown words untouched", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive 200ms of mic-loud audio so we have a baseline timeline.
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+
+    // Send a Turn with one word inside the mic-active window (would resolve to mic
+    // even without resolution since VAD attributes it) plus one outside any VAD
+    // window (would only be filled by resolution).
+    const turn: TurnEvent = {
+      type: "Turn",
+      turn_order: 1,
+      turn_is_formatted: false,
+      end_of_turn: true,
+      transcript: "hello x",
+      end_of_turn_confidence: 0.9,
+      speaker_label: "A",
+      words: [
+        {
+          start: 20,
+          end: 180,
+          confidence: 1,
+          text: "hello",
+          word_is_final: true,
+          speaker: "A",
+        },
+        {
+          start: 100_000,
+          end: 100_200,
+          confidence: 1,
+          text: "x",
+          word_is_final: true,
+          speaker: "A",
+        },
+      ],
+    };
+    server.send(JSON.stringify(turn));
+
+    const got = received[0];
+    // First word: per-word VAD attributed it (mic) — unchanged.
+    expect(got.words[0].channel).toBe("mic");
+    // Second word: no VAD evidence in window → unknown, and "none" leaves it.
+    expect(got.words[1].channel).toBe("unknown");
+    expect(got.words[1].channelResolved).toBeUndefined();
+  });
+});
+
+describe("StreamingTranscriber resolveUnknownChannelsMethod default", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    // No channelAttribution at all — default should fire: resolveUnknownChannelsMethod: "window".
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  it("resolves unknown words by default (no channelAttribution passed)", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+
+    server.send(
+      JSON.stringify({
+        type: "Turn",
+        turn_order: 1,
+        turn_is_formatted: false,
+        end_of_turn: true,
+        transcript: "hello a there",
+        end_of_turn_confidence: 0.9,
+        speaker_label: "A",
+        words: [
+          {
+            start: 20,
+            end: 80,
+            confidence: 1,
+            text: "hello",
+            word_is_final: true,
+            speaker: "A",
+          },
+          {
+            // No timeline coverage → resolves to "unknown" via per-word VAD.
+            start: 10_000,
+            end: 10_100,
+            confidence: 1,
+            text: "a",
+            word_is_final: true,
+            speaker: "A",
+          },
+          {
+            start: 100,
+            end: 180,
+            confidence: 1,
+            text: "there",
+            word_is_final: true,
+            speaker: "A",
+          },
+        ],
+      }),
+    );
+
+    const turn = received[0];
+    expect(turn.words[1].channel).toBe("mic");
+    expect(turn.words[1].channelResolved).toBe(true);
+  });
+});
+
+describe("StreamingTranscriber resolveUnknownChannelsMethod = 'speaker-history'", () => {
+  let server: WS;
+  let rt: StreamingTranscriber;
+
+  beforeEach(async () => {
+    server = new WS(websocketBaseUrl);
+    rt = new StreamingTranscriber({
+      websocketBaseUrl,
+      token: "t",
+      sampleRate: 16_000,
+      speechModel: "u3-rt-pro",
+      channels: [{ name: "mic" }, { name: "system" }],
+      channelAttribution: {
+        resolveUnknownChannelsMethod: "speaker-history",
+        // Low threshold so a single turn's worth of audio is enough.
+        speakerHistoryMinRmsEvidence: 0.01,
+        speakerHistoryDominanceRatio: 3,
+      },
+    });
+    await connect(rt, server);
+  });
+
+  afterEach(async () => {
+    await teardown(rt, server);
+  });
+
+  function makeTurn(
+    words: Array<{
+      text: string;
+      start: number;
+      end: number;
+      speaker?: string;
+    }>,
+  ): TurnEvent {
+    return {
+      type: "Turn",
+      turn_order: 1,
+      turn_is_formatted: false,
+      end_of_turn: true,
+      transcript: words.map((w) => w.text).join(" "),
+      end_of_turn_confidence: 0.9,
+      speaker_label: words[0].speaker ?? "A",
+      words: words.map((w) => ({
+        start: w.start,
+        end: w.end,
+        confidence: 1,
+        text: w.text,
+        word_is_final: true,
+        speaker: w.speaker ?? "A",
+      })),
+    };
+  }
+
+  it("fills an unknown word when the speaker has dominant session evidence", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive ~200ms of mic-loud audio so speaker A's first words accumulate
+    // strong mic-side evidence.
+    for (let i = 0; i < 10; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+
+    // First turn: two confident mic words for speaker A. After this, A has
+    // strong mic evidence in the speaker-history map.
+    server.send(
+      JSON.stringify(
+        makeTurn([
+          { text: "hello", start: 20, end: 80 },
+          { text: "there", start: 100, end: 180 },
+        ]),
+      ),
+    );
+    expect(received[0].words[0].channel).toBe("mic");
+
+    // Second turn: speaker A says one word outside any VAD coverage → unknown
+    // via per-word VAD, but speaker-history should fill it as mic.
+    server.send(
+      JSON.stringify(
+        makeTurn([{ text: "yeah", start: 100_000, end: 100_500 }]),
+      ),
+    );
+    const turn = received[1];
+    expect(turn.words[0].channel).toBe("mic");
+    expect(turn.words[0].channelResolved).toBe(true);
+  });
+
+  it("does not fill when the speaker has split evidence across channels", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Drive equal loud audio on both channels.
+    for (let i = 0; i < 20; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(loudPcm(320), { channel: "system" });
+    }
+
+    // First turn covers the dual-loud period; per-word VAD will tie → unknown.
+    server.send(
+      JSON.stringify(makeTurn([{ text: "both", start: 20, end: 380 }])),
+    );
+    expect(received[0].words[0].channel).toBe("unknown");
+
+    // Future word with no VAD evidence — speaker A's history is split, so no fill.
+    server.send(
+      JSON.stringify(makeTurn([{ text: "x", start: 100_000, end: 100_400 }])),
+    );
+    expect(received[1].words[0].channel).toBe("unknown");
+    expect(received[1].words[0].channelResolved).toBeUndefined();
+  });
+
+  it("does not modify confident per-word VAD decisions", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    // Build heavy SYSTEM evidence first.
+    for (let i = 0; i < 20; i++) {
+      rt.sendAudio(silentPcm(320), { channel: "mic" });
+      rt.sendAudio(loudPcm(320), { channel: "system" });
+    }
+    server.send(
+      JSON.stringify(makeTurn([{ text: "hi", start: 20, end: 380 }])),
+    );
+    expect(received[0].words[0].channel).toBe("system");
+
+    // Now feed mic-loud audio.
+    for (let i = 0; i < 5; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+    // Per-word VAD will confidently say "mic" for the new turn. Speaker
+    // history might still favor system (5 mic frames << 20 system frames),
+    // but speaker-history must not override confident VAD decisions.
+    server.send(
+      JSON.stringify(makeTurn([{ text: "actually", start: 460, end: 560 }])),
+    );
+    expect(received[1].words[0].channel).toBe("mic");
+    expect(received[1].words[0].channelResolved).toBeUndefined();
+  });
+
+  it("never modifies speaker_label", async () => {
+    const received: TurnEvent[] = [];
+    rt.on("turn", (t) => received.push(t));
+
+    for (let i = 0; i < 20; i++) {
+      rt.sendAudio(loudPcm(320), { channel: "mic" });
+      rt.sendAudio(silentPcm(320), { channel: "system" });
+    }
+    server.send(
+      JSON.stringify(makeTurn([{ text: "hi", start: 20, end: 380 }])),
+    );
+    expect(received[0].speaker_label).toBe("A");
+    expect(received[0].words[0].speaker).toBe("A");
+  });
+});