TanStack · harshlocham · Jun 12, 2026 · Jul 3, 2026 · Jul 4, 2026
diff --git a/testing/e2e/global-setup.ts b/testing/e2e/global-setup.ts
@@ -43,6 +43,18 @@ export default async function globalSetup() {
   mock.mount('/v1/text-to-speech', elevenLabsTTSMount())
   mock.mount('/v1/speech-to-text', elevenLabsSTTMount())
 
+  // Gemini TTS hits the standard Gemini generateContent endpoint
+  // (POST /v1beta/models/{model}:generateContent) with
+  // responseModalities: ['AUDIO']. aimock's native Gemini audio helper derives
+  // the mime type from the fixture's `format`/`contentType`, so it can't emit
+  // the raw `audio/L16;codec=pcm;rate=24000` PCM that real Gemini TTS returns.
+  // Mount the TTS model's generateContent path directly so we can hand back
+  // PCM and exercise the adapter's PCM→WAV normalization. The path is specific
+  // to the TTS model, so it doesn't intercept Gemini chat/summarize requests.
+  mock.mount(
+    '/v1beta/models/gemini-3.1-flash-tts-preview:generateContent',
+    geminiTTSMount(),
+  )
   // Gemini Veo video generation. aimock 1.29 mocks Gemini's `:predict`
   // (Imagen) endpoint but not the long-running `:predictLongRunning` +
   // operations-polling pair Veo uses, so mount both here. Non-Veo paths
@@ -85,7 +97,7 @@ export default async function globalSetup() {
 
   await mock.start()
   console.log(`[aimock] started on port 4010`)
-  ;(globalThis as any).__aimock = mock
+  ;(globalThis as any).__aimock = mock
 }
 
 function registerMediaFixtures(mock: LLMock) {
@@ -137,6 +149,14 @@ const FAKE_MP3_BYTES = Buffer.from([
   0xff, 0xfb, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 ])
 
+/**
+ * Stand-in for the raw 16-bit little-endian PCM that Gemini TTS returns as
+ * `audio/L16;codec=pcm;rate=24000` inlineData and that the adapter wraps in a
+ * RIFF/WAV header before handing it to the browser. `Buffer.alloc` zero-fills,
+ * so this is 32 bytes of silence — the spec only asserts `<audio>` is visible.
+ */
+const FAKE_PCM_BYTES = Buffer.alloc(32)
+
 function grokTTSMount(): Mountable {
   return {
     async handleRequest(
@@ -157,6 +177,52 @@ function grokTTSMount(): Mountable {
   }
 }
 
+function geminiTTSMount(): Mountable {
+  return {
+    async handleRequest(
+      req: http.IncomingMessage,
+      res: http.ServerResponse,
+      // Remainder after aimock strips the mount prefix: "/" on an exact match.
+      pathname: string,
+    ): Promise<boolean> {
+      if (pathname !== '/' || req.method !== 'POST') return false
+      await drainBody(req)
+      res.statusCode = 200
+      res.setHeader('Content-Type', 'application/json')
+      // Reply in Gemini's generateContent audio shape: one inlineData part at
+      // `candidates[0].content.parts[0]`. Declaring the raw-PCM mime type is
+      // what steers the adapter into its PCM→WAV wrapping path.
+      res.end(
+        JSON.stringify({
+          candidates: [
+            {
+              content: {
+                role: 'model',
+                parts: [
+                  {
+                    inlineData: {
+                      mimeType: 'audio/L16;codec=pcm;rate=24000',
+                      data: FAKE_PCM_BYTES.toString('base64'),
+                    },
+                  },
+                ],
+              },
+              finishReason: 'STOP',
+              index: 0,
+            },
+          ],
+          usageMetadata: {
+            promptTokenCount: 5,
+            candidatesTokenCount: 15,
+            totalTokenCount: 20,
+          },
+        }),
+      )
+      return true
+    },
+  }
+}
+
 function grokSTTMount(): Mountable {
   return {
     async handleRequest(
@@ -741,7 +807,7 @@ function readBody(req: http.IncomingMessage): Promise<string> {
 
 function drainBody(req: http.IncomingMessage): Promise<void> {
   return new Promise((resolve, reject) => {
-    req.on('data', () => {})
+    req.on('data', () => {})
     req.on('end', () => resolve())
     req.on('error', reject)
   })

diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts
@@ -231,7 +231,7 @@ export const matrix: Record<Feature, Set<Provider>> = {
   'image-to-image': new Set(['openai']),
   'audio-gen': new Set(['gemini', 'elevenlabs']),
   'sound-effects': new Set(['elevenlabs']),
-  tts: new Set(['openai', 'grok', 'elevenlabs']),
+  tts: new Set(['openai', 'gemini', 'grok', 'elevenlabs']),
   transcription: new Set(['openai', 'grok', 'groq', 'elevenlabs']),
   'transcription-diarization': new Set(['openai']),
   // Gemini Veo runs through a custom aimock mount (see geminiVeoMount in

diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts
@@ -7,6 +7,7 @@ import {
 import {
   createGeminiAudio,
   createGeminiImage,
+  createGeminiSpeech,
   createGeminiVideo,
 } from '@tanstack/ai-gemini'
 import {
@@ -94,6 +95,10 @@ export function createTTSAdapter(
         baseURL: openaiUrl(aimockPort),
         defaultHeaders: headers,
       }),
+    gemini: () =>
+      createGeminiSpeech('gemini-3.1-flash-tts-preview', DUMMY_KEY, {
+        httpOptions: { baseUrl: llmockBase(aimockPort), headers },
+      }),
     grok: () =>
       createGrokSpeech('grok-tts', DUMMY_KEY, {
         baseURL: openaiUrl(aimockPort),