Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 68 additions & 2 deletions testing/e2e/global-setup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ export default async function globalSetup() {
mock.mount('/v1/text-to-speech', elevenLabsTTSMount())
mock.mount('/v1/speech-to-text', elevenLabsSTTMount())

// Gemini TTS hits the standard Gemini generateContent endpoint
// (POST /v1beta/models/{model}:generateContent) with
// responseModalities: ['AUDIO']. aimock's native Gemini audio helper derives
// the mime type from the fixture's `format`/`contentType`, so it can't emit
// the raw `audio/L16;codec=pcm;rate=24000` PCM that real Gemini TTS returns.
// Mount the TTS model's generateContent path directly so we can hand back
// PCM and exercise the adapter's PCM→WAV normalization. The path is specific
// to the TTS model, so it doesn't intercept Gemini chat/summarize requests.
mock.mount(
'/v1beta/models/gemini-3.1-flash-tts-preview:generateContent',
geminiTTSMount(),
)
// Gemini Veo video generation. aimock 1.29 mocks Gemini's `:predict`
// (Imagen) endpoint but not the long-running `:predictLongRunning` +
// operations-polling pair Veo uses, so mount both here. Non-Veo paths
Expand Down Expand Up @@ -85,7 +97,7 @@ export default async function globalSetup() {

await mock.start()
console.log(`[aimock] started on port 4010`)
;(globalThis as any).__aimock = mock
; (globalThis as any).__aimock = mock
}

function registerMediaFixtures(mock: LLMock) {
Expand Down Expand Up @@ -137,6 +149,14 @@ const FAKE_MP3_BYTES = Buffer.from([
0xff, 0xfb, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
])

/**
 * Stand-in payload for Gemini TTS audio: raw 16-bit little-endian PCM.
 *
 * Gemini TTS delivers its audio as `audio/L16;codec=pcm;rate=24000`
 * inlineData, and the adapter prepends a RIFF/WAV header before the browser
 * ever sees it. The buffer is zero-filled (i.e. silence) and its contents are
 * arbitrary — the spec only asserts the `<audio>` element becomes visible.
 *
 * Sized as 16 samples × 2 bytes per 16-bit sample = 32 bytes.
 */
const FAKE_PCM_BYTES = Buffer.alloc(16 * Int16Array.BYTES_PER_ELEMENT)

function grokTTSMount(): Mountable {
return {
async handleRequest(
Expand All @@ -157,6 +177,52 @@ function grokTTSMount(): Mountable {
}
}

/**
 * Builds the mount that answers Gemini TTS `generateContent` calls with a
 * canned PCM audio response.
 *
 * The handler only claims `POST` requests whose (prefix-stripped) pathname is
 * exactly `/`; anything else is declined by resolving to `false`.
 *
 * @returns A `Mountable` whose `handleRequest` resolves to `true` once it has
 *   written the response, or `false` when the request is not for this mount.
 */
function geminiTTSMount(): Mountable {
  return {
    async handleRequest(
      req: http.IncomingMessage,
      res: http.ServerResponse,
      // aimock strips the mount prefix — pathname will be "/" for an exact match.
      pathname: string,
    ): Promise<boolean> {
      const isExactPost = pathname === '/' && req.method === 'POST'
      if (!isExactPost) {
        return false
      }

      // The request body is irrelevant to the canned reply; consume it before
      // responding.
      await drainBody(req)

      res.statusCode = 200
      res.setHeader('Content-Type', 'application/json')

      // The PCM mime type forces the adapter down its PCM→WAV wrapping path.
      const audioPart = {
        inlineData: {
          mimeType: 'audio/L16;codec=pcm;rate=24000',
          data: FAKE_PCM_BYTES.toString('base64'),
        },
      }

      // Mirror the Gemini generateContent audio response shape: the audio is
      // the single `candidates[0].content.parts[0].inlineData` entry.
      const candidate = {
        content: {
          role: 'model',
          parts: [audioPart],
        },
        finishReason: 'STOP',
        index: 0,
      }

      const payload = {
        candidates: [candidate],
        usageMetadata: {
          promptTokenCount: 5,
          candidatesTokenCount: 15,
          totalTokenCount: 20,
        },
      }

      res.end(JSON.stringify(payload))
      return true
    },
  }
}

function grokSTTMount(): Mountable {
return {
async handleRequest(
Expand Down Expand Up @@ -741,7 +807,7 @@ function readBody(req: http.IncomingMessage): Promise<string> {

function drainBody(req: http.IncomingMessage): Promise<void> {
return new Promise((resolve, reject) => {
req.on('data', () => {})
req.on('data', () => { })
req.on('end', () => resolve())
req.on('error', reject)
})
Expand Down
7 changes: 2 additions & 5 deletions testing/e2e/src/lib/feature-support.ts
Original file line number Diff line number Diff line change
Expand Up @@ -231,13 +231,13 @@ export const matrix: Record<Feature, Set<Provider>> = {
'image-to-image': new Set(['openai']),
'audio-gen': new Set(['gemini', 'elevenlabs']),
'sound-effects': new Set(['elevenlabs']),
tts: new Set(['openai', 'grok', 'elevenlabs']),
tts: new Set(['openai', 'gemini', 'grok', 'elevenlabs']),
transcription: new Set(['openai', 'grok', 'groq', 'elevenlabs']),
'transcription-diarization': new Set(['openai']),
'video-gen': new Set(['openai']),
// Gemini Veo runs through a custom aimock mount (see geminiVeoMount in
// global-setup.ts) — aimock 1.29 doesn't model the long-running
// `:predictLongRunning` + operations-polling pair natively.
'video-gen': new Set(['openai', 'gemini']),
// image-to-video (image parts in the generateVideo prompt). aimock 1.29's
// `/v1/videos` handler parses Sora's multipart upload (the SDK switches to
// multipart when `input_reference` carries a File) and matches on the
Expand All @@ -246,9 +246,6 @@ export const matrix: Record<Feature, Set<Provider>> = {
// routing remain unit-test-only (the spec's journal assertion is tied to
// aimock's /v1/videos pipeline, which custom mounts bypass).
'image-to-video': new Set(['openai']),
// Only Gemini currently surfaces a first-class stateful conversation API via
// the adapter (geminiTextInteractions, behind @tanstack/ai-gemini/experimental).
'stateful-interactions': new Set(['gemini']),
}

export function isSupported(provider: Provider, feature: Feature): boolean {
Expand Down
5 changes: 5 additions & 0 deletions testing/e2e/src/lib/media-providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
import {
createGeminiAudio,
createGeminiImage,
createGeminiSpeech,
createGeminiVideo,
} from '@tanstack/ai-gemini'
import {
Expand Down Expand Up @@ -94,6 +95,10 @@ export function createTTSAdapter(
baseURL: openaiUrl(aimockPort),
defaultHeaders: headers,
}),
gemini: () =>
createGeminiSpeech('gemini-3.1-flash-tts-preview', DUMMY_KEY, {
httpOptions: { baseUrl: llmockBase(aimockPort), headers },
}),
grok: () =>
createGrokSpeech('grok-tts', DUMMY_KEY, {
baseURL: openaiUrl(aimockPort),
Expand Down