From deb1bc7183a6fdf8bfa14e44de97f8fa2796e31d Mon Sep 17 00:00:00 2001 From: Tom Beckenham <34339192+tombeckenham@users.noreply.github.com> Date: Thu, 2 Jul 2026 20:30:33 +1000 Subject: [PATCH 1/4] feat(ai-gemini): support Gemini Omni Flash video generation via the Interactions API Add gemini-omni-flash-preview to the Gemini video adapter. Omni only serves the Interactions API (generateContent rejects it with 400), so the adapter now routes by model: Veo models keep the :predictLongRunning operations flow, while Omni creates a background interaction with response_modalities: ['video'], polls it by id, and returns the inline base64 MP4 as a data: URL (Files-API URI delivery passes through). Usage maps from output_tokens_by_modality, size maps onto response_format.aspect_ratio, and modelOptions.previous_interaction_id chains conversational video edits. - model-meta: GEMINI_OMNI_FLASH_PREVIEW ($0.10/sec video+audio output) + GEMINI_INTERACTIONS_VIDEO_MODELS - provider options: GeminiOmniVideoProviderOptions derived from the SDK's CreateModelInteractionParamsNonStreaming; per-model input modalities (Omni accepts image+video parts) and fixed 10s duration - @google/genai floor bumped to ^2.10.0 for the interactions surface - 17 new unit tests; new interactions-video E2E feature backed by a dedicated aimock mount (native interactions text handling untouched) - docs/media/video-generation.md + media-generation skill updates Verified live against the Gemini API: background job completed in ~45s and returned a valid MP4 with video-modality usage; the SDK's typed interactions.create works with Step-list input, so no raw REST fallback is needed. Closes #871 Co-Authored-By: Claude Fable 5 --- .changeset/gemini-omni-flash-video.md | 5 + docs/config.json | 2 +- docs/media/video-generation.md | 78 +++- packages/ai-gemini/package.json | 2 +- packages/ai-gemini/src/adapters/video.ts | 273 ++++++++++++- packages/ai-gemini/src/index.ts | 8 +- packages/ai-gemini/src/model-meta.ts | 47 ++- .../src/video/video-provider-options.ts | 102 ++++- .../ai-gemini/tests/video-adapter.test.ts | 368 ++++++++++++++++++ .../skills/ai-core/media-generation/SKILL.md | 27 ++ pnpm-lock.yaml | 2 +- testing/e2e/global-setup.ts | 92 +++++ testing/e2e/src/components/VideoGenUI.tsx | 9 +- testing/e2e/src/lib/feature-support.ts | 5 + testing/e2e/src/lib/features.ts | 4 + testing/e2e/src/lib/media-providers.ts | 13 + testing/e2e/src/lib/server-functions.ts | 2 + testing/e2e/src/lib/types.ts | 2 + testing/e2e/src/routes/$provider/$feature.tsx | 11 + testing/e2e/src/routes/api.video.stream.ts | 12 +- testing/e2e/src/routes/api.video.ts | 12 +- testing/e2e/tests/interactions-video.spec.ts | 76 ++++ 22 files changed, 1107 insertions(+), 45 deletions(-) create mode 100644 .changeset/gemini-omni-flash-video.md create mode 100644 testing/e2e/tests/interactions-video.spec.ts diff --git a/.changeset/gemini-omni-flash-video.md b/.changeset/gemini-omni-flash-video.md new file mode 100644 index 000000000..dbd040bd3 --- /dev/null +++ b/.changeset/gemini-omni-flash-video.md @@ -0,0 +1,5 @@ +--- +'@tanstack/ai-gemini': minor +--- + +Add Gemini Omni Flash (`gemini-omni-flash-preview`) video generation via the Interactions API. Omni only serves the Interactions API (`generateContent` rejects it), so the video adapter now routes by model: Veo models keep the `:predictLongRunning` operations flow, while `geminiVideo('gemini-omni-flash-preview')` creates a background interaction with `response_modalities: ['video']`, polls it by id, and returns the inline base64 MP4 as a `data:` URL (Files-API URI delivery passes through). Usage is mapped from the interaction's `output_tokens_by_modality`. Image and video prompt parts are sent as interaction content blocks, and `modelOptions.previous_interaction_id` chains a new prompt onto a prior Omni generation for conversational video editing. The top-level `size` option maps onto `response_format.aspect_ratio` (`'16:9' | '9:16'`); clips are a fixed 10 seconds today. Raises the `@google/genai` floor to `^2.10.0` for the Interactions API surface. diff --git a/docs/config.json b/docs/config.json index 0e8982869..7a52e405c 100644 --- a/docs/config.json +++ b/docs/config.json @@ -282,7 +282,7 @@ "label": "Video Generation", "to": "media/video-generation", "addedAt": "2026-04-15", - "updatedAt": "2026-07-01" + "updatedAt": "2026-07-02" }, { "label": "Generation Hooks", diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index 408bae527..2b4645986 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -2,12 +2,14 @@ title: Video Generation id: video-generation order: 6 -description: "Generate video from text prompts with OpenAI Sora, Google Veo, xAI Grok Imagine, or fal.ai using TanStack AI's experimental generateVideo() jobs/polling API." +description: "Generate video from text prompts with OpenAI Sora, Google Veo, Gemini Omni Flash, xAI Grok Imagine, or fal.ai using TanStack AI's experimental generateVideo() jobs/polling API." keywords: - tanstack ai - video generation - sora - veo + - omni flash + - interactions api - gemini - grok imagine - fal @@ -40,7 +42,7 @@ TanStack AI provides experimental support for video generation through dedicated Currently supported: - **OpenAI**: Sora-2 and Sora-2-Pro models (when available) -- **Google Gemini**: Veo 3.1, Veo 3, and Veo 2 models (via the long-running operations API) +- **Google Gemini**: Veo 3.1, Veo 3, and Veo 2 models (via the long-running operations API), and Gemini Omni Flash (via the Interactions API) - **Grok (xAI)**: grok-imagine-video (text-to-video + image-to-video) and grok-imagine-video-1.5 (image-to-video only) models - **fal.ai**: MiniMax, Luma, Kling, Hunyuan, and other hosted video models @@ -569,6 +571,78 @@ Adapters that haven't declared a per-model duration map keep the plain > Files API and requires your API key to download (send it as an > `x-goog-api-key` header or `key` query parameter). +### Gemini Omni Flash (Interactions API) Model Options + +Gemini Omni Flash (`gemini-omni-flash-preview`) is Google's multimodal +video-generation model with conversational editing. It only serves the +[Interactions API](https://ai.google.dev/gemini-api/docs/omni) — the same +`geminiVideo()` adapter routes it automatically: `generateVideo` creates a +background interaction, `getVideoJobStatus` polls it by id, and the +finished clip comes back **inline as a `data:video/mp4;base64,…` URL** +(when Google delivers by reference instead, the Files API URI passes +through and needs your API key to download, like Veo). + +Clips are 720p at 24 FPS and a fixed **10 seconds** today (`duration` is +typed as `10`; `snapDuration(n)` always returns `10`). The `size` option +maps onto the interaction's output aspect ratio: + +```typescript ignore +import { generateVideo, getVideoJobStatus } from '@tanstack/ai' +import { geminiVideo } from '@tanstack/ai-gemini' + +const adapter = geminiVideo('gemini-omni-flash-preview') + +const { jobId } = await generateVideo({ + adapter, + prompt: 'A woman playing violin outdoors at golden hour', + size: '9:16', // aspect ratio: '16:9' (default) or '9:16' +}) + +const status = await getVideoJobStatus({ adapter, jobId }) +// status.url → 'data:video/mp4;base64,…' once completed +``` + +Image and video prompt parts are sent to the interaction as content blocks +in order (Omni doesn't use Veo's `metadata.role` routing), so you can +condition the generation on stills or short reference clips. `data` sources +are sent inline as base64; `url` sources pass through as-is — the adapter +never downloads them, so use Gemini Files API URIs (upload large media via +the Files API first). + +#### Conversational video editing + +Omni's headline capability is iterative refinement: pass the interaction id +of a prior generation (its `jobId`) as +`modelOptions.previous_interaction_id` and describe the change — the model +edits the video while preserving everything you didn't mention: + +```typescript ignore +import { generateVideo } from '@tanstack/ai' +import { geminiVideo } from '@tanstack/ai-gemini' + +const adapter = geminiVideo('gemini-omni-flash-preview') + +// Turn 1: generate +const first = await generateVideo({ + adapter, + prompt: 'A woman playing violin outdoors at golden hour', +}) + +// …poll first.jobId to completion, then… + +// Turn 2: edit the result conversationally +const second = await generateVideo({ + adapter, + prompt: 'Make the violin invisible', + modelOptions: { previous_interaction_id: first.jobId }, +}) +``` + +`modelOptions` also passes through the Interactions API's request fields +(e.g. `generation_config.video_config.task` to pin +`'text_to_video' | 'image_to_video' | 'reference_to_video' | 'edit'` +instead of letting the model infer the task mode). + ### Grok (xAI Imagine) Model Options Based on the [xAI video generation API](https://docs.x.ai/docs/guides/video-generations). Two models are available: `grok-imagine-video` (v1.0) supports **text-to-video and image-to-video**, while `grok-imagine-video-1.5` is **image-to-video only** (a text-only prompt is rejected by the API; the adapter throws a clear error pointing you at `grok-imagine-video`). Both are aspect-ratio sized — the generic `size` option takes an `aspectRatio_resolution` template (like the Grok Imagine image models), and clips can be 1–15 seconds long. diff --git a/packages/ai-gemini/package.json b/packages/ai-gemini/package.json index f1b18bdd7..456ac00c5 100644 --- a/packages/ai-gemini/package.json +++ b/packages/ai-gemini/package.json @@ -64,7 +64,7 @@ "text-to-speech" ], "dependencies": { - "@google/genai": "^2.8.0", + "@google/genai": "^2.10.0", "@tanstack/ai-utils": "workspace:*", "partial-json": "^0.1.7" }, diff --git a/packages/ai-gemini/src/adapters/video.ts b/packages/ai-gemini/src/adapters/video.ts index b6935e503..75c25a69f 100644 --- a/packages/ai-gemini/src/adapters/video.ts +++ b/packages/ai-gemini/src/adapters/video.ts @@ -6,13 +6,18 @@ import { resolveMediaPrompt } from '@tanstack/ai' import { BaseVideoAdapter, snapToDurationOption } from '@tanstack/ai/adapters' import { arrayBufferToBase64 } from '@tanstack/ai-utils' import { createGeminiClient, getGeminiApiKeyFromEnv } from '../utils' -import { getGeminiVideoDurationOptions } from '../video/video-provider-options' +import { + getGeminiVideoDurationOptions, + isInteractionsVideoModel, +} from '../video/video-provider-options' import type { DurationOptions } from '@tanstack/ai/adapters' import type { ImagePart, MediaInputMetadata, + TokenUsage, VideoGenerationOptions, VideoJobResult, + VideoPart, VideoStatusResult, VideoUrlResult, } from '@tanstack/ai' @@ -20,9 +25,11 @@ import type { GenerateVideosConfig, GoogleGenAI, Image, + Interactions, VideoGenerationReferenceImage, } from '@google/genai' import type { + GeminiOmniVideoProviderOptions, GeminiVideoModel, GeminiVideoModelDurationByName, GeminiVideoModelInputModalitiesByName, @@ -33,6 +40,9 @@ import type { } from '../video/video-provider-options' import type { GeminiClientConfig } from '../utils' +type Interaction = Interactions.Interaction +type InteractionContent = Interactions.Content + /** * Configuration for Gemini video adapter. * @@ -99,31 +109,114 @@ async function imagePartToVeoImage( } /** - * Gemini Veo Video Generation Adapter + * Convert an image or video prompt part into an Interactions API content + * block. Data sources become inline base64 `data`; URL sources pass through + * as `uri` (Files API URIs — mirrors the Interactions text adapter). + */ +function mediaPartToInteractionsContent( + part: ImagePart | VideoPart, +): InteractionContent { + const mimeType = part.source.mimeType + if (part.type === 'image') { + return part.source.type === 'data' + ? { type: 'image', data: part.source.value, mime_type: mimeType } + : { type: 'image', uri: part.source.value, mime_type: mimeType } + } + return part.source.type === 'data' + ? { type: 'video', data: part.source.value, mime_type: mimeType } + : { type: 'video', uri: part.source.value, mime_type: mimeType } +} + +/** + * Pull the generated video out of a completed interaction. Prefers the + * SDK's `output_video` sugar, then walks `steps` back-to-front for the last + * `model_output` step carrying a video content block (the wire shape the + * raw REST response uses). + */ +function extractInteractionVideo( + interaction: Interaction, +): { data?: string; uri?: string; mimeType: string } | undefined { + const direct = interaction.output_video + if (direct && (direct.data || direct.uri)) { + return { + data: direct.data, + uri: direct.uri, + mimeType: direct.mime_type || 'video/mp4', + } + } + const steps = interaction.steps ?? [] + for (let i = steps.length - 1; i >= 0; i--) { + const step = steps[i] + if (step?.type !== 'model_output') continue + for (const block of step.content ?? []) { + if (block.type === 'video' && (block.data || block.uri)) { + return { + data: block.data, + uri: block.uri, + mimeType: block.mime_type || 'video/mp4', + } + } + } + } + return undefined +} + +/** + * Map Interactions usage onto the canonical TokenUsage shape. Omni reports + * video output via `output_tokens_by_modality`; fall back to the video + * modality entry when the total is absent. + */ +function interactionUsageToTokenUsage( + usage: Interaction['usage'], +): TokenUsage | undefined { + if (!usage) return undefined + const videoTokens = usage.output_tokens_by_modality?.find( + (entry) => entry.modality === 'video', + )?.tokens + const promptTokens = usage.total_input_tokens ?? 0 + const completionTokens = usage.total_output_tokens ?? videoTokens ?? 0 + return { + promptTokens, + completionTokens, + totalTokens: usage.total_tokens ?? promptTokens + completionTokens, + } +} + +/** + * Gemini Video Generation Adapter (Veo + Gemini Omni Flash) * - * Tree-shakeable adapter for Google Veo video generation. Veo runs as a - * long-running operation: `createVideoJob` starts the operation via the - * `:predictLongRunning` endpoint, `getVideoStatus` polls it, and - * `getVideoUrl` extracts the generated video's URI once it completes. + * Tree-shakeable adapter for Google video generation, routing by model: * - * Image prompt parts are routed by `metadata.role`: + * **Veo models** run as a long-running operation: `createVideoJob` starts + * the operation via the `:predictLongRunning` endpoint, `getVideoStatus` + * polls it, and `getVideoUrl` extracts the generated video's URI once it + * completes. Image prompt parts are routed by `metadata.role`: * - `'start_frame'` (or the first un-roled image) → the input image the * video starts from * - `'end_frame'` → `lastFrame` (the frame the video ends on) * - `'reference'` / `'character'` → `referenceImages` (asset references, * Veo 3.1) * - * Note: the returned video URI is served by the Gemini Files API and + * Note: the returned Veo video URI is served by the Gemini Files API and * requires the API key (`x-goog-api-key` header or `?key=` query * parameter) to download. * + * **Gemini Omni Flash** (`gemini-omni-flash-preview`) only serves the + * Interactions API: `createVideoJob` creates a background interaction with + * `response_modalities: ['video']`, `getVideoStatus` polls it by id, and + * `getVideoUrl` returns the inline base64 MP4 as a `data:` URL (or the + * Files API URI when the server delivers by reference). Image and video + * prompt parts are sent as interaction content blocks in order; pass + * `modelOptions.previous_interaction_id` to conversationally edit a prior + * Omni generation. + * * @experimental Video generation is an experimental feature and may change. */ export class GeminiVideoAdapter< TModel extends GeminiVideoModel, > extends BaseVideoAdapter< TModel, - GeminiVideoProviderOptions, + GeminiVideoModelProviderOptionsByName[TModel], GeminiVideoModelProviderOptionsByName, GeminiVideoModelSizeByName, GeminiVideoModelInputModalitiesByName, @@ -140,18 +233,25 @@ export class GeminiVideoAdapter< async createVideoJob( options: VideoGenerationOptions< - GeminiVideoProviderOptions, + GeminiVideoModelProviderOptionsByName[TModel], GeminiVideoSize, GeminiVideoModelDurationByName[TModel] >, ): Promise { - const { prompt, size, duration, modelOptions, logger } = options + const { prompt, size, duration, logger } = options logger.request( `activity=video.create provider=${this.name} model=${this.model} size=${size ?? 'default'} duration=${duration ?? 'default'}`, { provider: this.name, model: this.model }, ) + if (isInteractionsVideoModel(this.model)) { + return await this.createInteractionsVideoJob(options) + } + const modelOptions = options.modelOptions as + | GeminiVideoProviderOptions + | undefined + try { const resolved = resolveMediaPrompt(prompt) @@ -201,6 +301,75 @@ export class GeminiVideoAdapter< } } + /** + * Gemini Omni Flash job creation via the Interactions API. Creates a + * background interaction requesting video output; the interaction id is + * the job id polled by `getVideoStatus` / `getVideoUrl`. + */ + private async createInteractionsVideoJob( + options: VideoGenerationOptions< + GeminiVideoModelProviderOptionsByName[TModel], + GeminiVideoSize, + GeminiVideoModelDurationByName[TModel] + >, + ): Promise { + const { prompt, size, logger } = options + const modelOptions = options.modelOptions as + | GeminiOmniVideoProviderOptions + | undefined + + try { + const resolved = resolveMediaPrompt(prompt) + + if (resolved.audios.length > 0) { + throw new Error( + `${this.name}.createVideoJob does not support audio prompt parts (model: ${this.model}).`, + ) + } + + const content: Array = [ + ...resolved.images.map(mediaPartToInteractionsContent), + ...resolved.videos.map(mediaPartToInteractionsContent), + ] + if (resolved.text) { + content.push({ type: 'text', text: resolved.text }) + } + if (content.length === 0) { + throw new Error( + `${this.name}.createVideoJob: the prompt produced no content to send (model: ${this.model}).`, + ) + } + + const interaction = await this.client.interactions.create({ + ...modelOptions, + model: this.model, + input: [{ type: 'user_input', content }], + response_modalities: ['video'], + background: true, + // Omni's clip length is fixed (10s) and not a request field, so the + // typed `duration` option is compile-time-only here. Aspect ratio is + // the one output knob the API exposes today. + ...(size !== undefined && { + response_format: { type: 'video' as const, aspect_ratio: size }, + }), + }) + + if (!interaction.id) { + throw new Error( + 'Gemini Omni did not return an interaction id for the video generation job.', + ) + } + + return { jobId: interaction.id, model: this.model } + } catch (error) { + logger.errors(`${this.name}.createVideoJob fatal`, { + error, + source: `${this.name}.createVideoJob`, + }) + throw error + } + } + /** * Route image prompt parts onto Veo's request fields by `metadata.role`. */ @@ -257,6 +426,9 @@ export class GeminiVideoAdapter< } async getVideoStatus(jobId: string): Promise { + if (isInteractionsVideoModel(this.model)) { + return await this.getInteractionsVideoStatus(jobId) + } const operation = await this.getOperation(jobId) if (!operation.done) { @@ -289,7 +461,43 @@ export class GeminiVideoAdapter< return { jobId, status: 'completed' } } + /** + * Poll an Omni background interaction. `in_progress` maps to + * 'processing'; a `completed` interaction with no video content (e.g. + * filtered output) is surfaced as a failure so `getVideoUrl` doesn't + * throw on an empty response. + */ + private async getInteractionsVideoStatus( + jobId: string, + ): Promise { + const interaction = await this.getInteraction(jobId) + const status = interaction.status + + if (status === 'in_progress' || status === 'requires_action') { + return { jobId, status: 'processing' } + } + if (status === 'completed') { + if (!extractInteractionVideo(interaction)) { + return { + jobId, + status: 'failed', + error: + 'Gemini Omni completed the interaction without returning a video (the output may have been filtered).', + } + } + return { jobId, status: 'completed' } + } + return { + jobId, + status: 'failed', + error: `Gemini Omni video generation ended with status "${status}".`, + } + } + async getVideoUrl(jobId: string): Promise { + if (isInteractionsVideoModel(this.model)) { + return await this.getInteractionsVideoUrl(jobId) + } const operation = await this.getOperation(jobId) if (!operation.done) { @@ -317,6 +525,42 @@ export class GeminiVideoAdapter< return { jobId, url: uri } } + /** + * Extract the finished Omni video. Inline base64 output (the API default) + * becomes a `data:` URL — matching the OpenAI Sora adapter's inline + * delivery — and URI delivery passes through (Files API URIs need the API + * key to download, like Veo). Usage carries the video-modality output + * tokens (Omni bills per second of video, reported as tokens). + */ + private async getInteractionsVideoUrl( + jobId: string, + ): Promise { + const interaction = await this.getInteraction(jobId) + const status = interaction.status + + if (status === 'in_progress' || status === 'requires_action') { + throw new Error( + `Video is not ready yet. Check status first. Job ID: ${jobId}`, + ) + } + if (status !== 'completed') { + throw new Error( + `Video generation failed: Gemini Omni interaction ended with status "${status}". Job ID: ${jobId}`, + ) + } + + const video = extractInteractionVideo(interaction) + if (!video) { + throw new Error( + `Video not found in interaction response (the output may have been filtered). Job ID: ${jobId}`, + ) + } + + const usage = interactionUsageToTokenUsage(interaction.usage) + const url = video.uri ?? `data:${video.mimeType};base64,${video.data}` + return { jobId, url, ...(usage && { usage }) } + } + override availableDurations(): DurationOptions< GeminiVideoModelDurationByName[TModel] > { @@ -340,6 +584,13 @@ export class GeminiVideoAdapter< operation.name = jobId return await this.client.operations.getVideosOperation({ operation }) } + + /** + * Fetch an Omni background interaction by id. + */ + private async getInteraction(jobId: string): Promise { + return await this.client.interactions.get(jobId) + } } /** diff --git a/packages/ai-gemini/src/index.ts b/packages/ai-gemini/src/index.ts index 462de4067..d8733709e 100644 --- a/packages/ai-gemini/src/index.ts +++ b/packages/ai-gemini/src/index.ts @@ -61,9 +61,9 @@ export { type GeminiAudioProviderOptions, } from './adapters/audio' -// Video / Veo generation adapter (experimental) +// Video generation adapter — Veo + Gemini Omni Flash (experimental) /** - * @experimental Veo video generation is an experimental feature and may change. + * @experimental Video generation is an experimental feature and may change. */ export { GeminiVideoAdapter, @@ -74,8 +74,11 @@ export { export { GEMINI_VIDEO_DURATIONS, getGeminiVideoDurationOptions, + isInteractionsVideoModel, } from './video/video-provider-options' export type { + GeminiInteractionsVideoModel, + GeminiOmniVideoProviderOptions, GeminiVideoModel, GeminiVideoModelDurationByName, GeminiVideoModelInputModalitiesByName, @@ -96,6 +99,7 @@ export { GEMINI_TTS_MODELS as GeminiTTSModels } from './model-meta' export { GEMINI_TTS_VOICES as GeminiTTSVoices } from './model-meta' export { GEMINI_AUDIO_MODELS as GeminiAudioModels } from './model-meta' export { GEMINI_VIDEO_MODELS as GeminiVideoModels } from './model-meta' +export { GEMINI_INTERACTIONS_VIDEO_MODELS as GeminiInteractionsVideoModels } from './model-meta' export type { GeminiModels as GeminiTextModel } from './model-meta' export type { GeminiImageModels as GeminiImageModel } from './model-meta' export type { GeminiTTSVoice } from './model-meta' diff --git a/packages/ai-gemini/src/model-meta.ts b/packages/ai-gemini/src/model-meta.ts index 67a7fc574..7174d38a2 100644 --- a/packages/ai-gemini/src/model-meta.ts +++ b/packages/ai-gemini/src/model-meta.ts @@ -712,6 +712,37 @@ const VEO_3_1_LITE_PREVIEW = { GeminiCachedContentOptions > +/** + * Gemini Omni Flash — multimodal video generation with conversational + * editing. Serves only the Interactions API (`generateContent` rejects it), + * so it routes through the interactions-based path of the video adapter, + * not Veo's `:predictLongRunning` flow. Pricing is per second of generated + * video ($0.10/sec). 720p / 24 FPS, 10-second clips. + * @experimental Omni video generation is an experimental feature and may change. + */ +const GEMINI_OMNI_FLASH_PREVIEW = { + name: 'gemini-omni-flash-preview', + max_input_tokens: 1_048_576, + max_output_tokens: 1, + supports: { + input: ['text', 'image', 'video'], + output: ['video', 'audio'], + }, + pricing: { + input: { + normal: 0, + }, + output: { + normal: 0.1, + }, + }, +} as const satisfies ModelMeta< + GeminiToolConfigOptions & + GeminiSafetyOptions & + GeminiCommonConfigOptions & + GeminiCachedContentOptions +> + const GEMINI_3_5_FLASH = { name: 'gemini-3.5-flash', max_input_tokens: 1_048_576, @@ -845,13 +876,25 @@ export const GEMINI_TTS_VOICES = [ export type GeminiTTSVoice = (typeof GEMINI_TTS_VOICES)[number] /** - * Veo video generation models. - * @experimental Veo video generation is an experimental feature and may change. + * Video generation models. Veo models run on the long-running + * `:predictLongRunning` flow; Gemini Omni Flash runs on the Interactions + * API — the video adapter routes by model. + * @experimental Video generation is an experimental feature and may change. */ export const GEMINI_VIDEO_MODELS = [ VEO_3_1_PREVIEW.name, VEO_3_1_FAST_PREVIEW.name, VEO_3_1_LITE_PREVIEW.name, + GEMINI_OMNI_FLASH_PREVIEW.name, +] as const + +/** + * Video models served by the Interactions API rather than Veo's + * `:predictLongRunning` operations flow. + * @experimental Omni video generation is an experimental feature and may change. + */ +export const GEMINI_INTERACTIONS_VIDEO_MODELS = [ + GEMINI_OMNI_FLASH_PREVIEW.name, ] as const // Manual type map for per-model provider options diff --git a/packages/ai-gemini/src/video/video-provider-options.ts b/packages/ai-gemini/src/video/video-provider-options.ts index 1daee974b..a99ae4d6c 100644 --- a/packages/ai-gemini/src/video/video-provider-options.ts +++ b/packages/ai-gemini/src/video/video-provider-options.ts @@ -1,25 +1,50 @@ /** - * Gemini Veo Video Generation Provider Options + * Gemini Video Generation Provider Options * - * Based on https://ai.google.dev/gemini-api/docs/video + * Covers two request paths behind the one video adapter: + * - Veo models — long-running operations via `:predictLongRunning` + * (https://ai.google.dev/gemini-api/docs/video) + * - Gemini Omni Flash — background jobs via the Interactions API + * (https://ai.google.dev/gemini-api/docs/omni) * * @experimental Video generation is an experimental feature and may change. */ +import { GEMINI_INTERACTIONS_VIDEO_MODELS } from '../model-meta' import type { DurationOptions } from '@tanstack/ai/adapters' -import type { GenerateVideosConfig } from '@google/genai' +import type { GenerateVideosConfig, Interactions } from '@google/genai' import type { GEMINI_VIDEO_MODELS } from '../model-meta' /** - * Model type for Gemini Veo video generation. + * Model type for Gemini video generation (Veo + Omni Flash). * @experimental Video generation is an experimental feature and may change. */ export type GeminiVideoModel = (typeof GEMINI_VIDEO_MODELS)[number] /** - * Supported aspect ratios for Veo video generation. This is the `size` value - * for the Gemini video adapter — Veo expresses output shape as an aspect - * ratio (plus an optional `resolution` in `modelOptions`), not pixel - * dimensions. + * Video models served by the Interactions API (Gemini Omni Flash) rather + * than Veo's `:predictLongRunning` operations flow. + * @experimental Omni video generation is an experimental feature and may change. + */ +export type GeminiInteractionsVideoModel = + (typeof GEMINI_INTERACTIONS_VIDEO_MODELS)[number] + +/** + * Runtime guard for the Interactions-served video models. + * @experimental Omni video generation is an experimental feature and may change. + */ +export function isInteractionsVideoModel( + model: GeminiVideoModel, +): model is GeminiInteractionsVideoModel { + return (GEMINI_INTERACTIONS_VIDEO_MODELS as ReadonlyArray).includes( + model, + ) +} + +/** + * Supported aspect ratios for Gemini video generation. This is the `size` + * value for the Gemini video adapter — both Veo and Omni Flash express + * output shape as an aspect ratio (plus an optional `resolution` in Veo's + * `modelOptions`), not pixel dimensions. * * @experimental Video generation is an experimental feature and may change. */ @@ -49,13 +74,50 @@ export type GeminiVideoProviderOptions = Omit< | 'abortSignal' > +/** + * Provider-specific options for Gemini Omni Flash video generation on the + * Interactions API. + * + * Derived from the SDK's `Interactions.CreateModelInteractionParamsNonStreaming`, + * minus the fields the adapter manages itself: + * - `model` / `input` — set from the adapter's model and the `prompt` + * - `stream` / `background` — the adapter always creates a background job + * and polls it through the `generateVideo` jobs API + * - `response_modalities` / `response_format` — the adapter requests video + * output and maps the top-level `size` option onto + * `response_format.aspect_ratio` + * - `tools` / `response_mime_type` — not applicable to video generation + * + * Notable passthroughs: + * - `previous_interaction_id` — conversational video editing: chain a new + * prompt onto a prior Omni interaction to refine its video + * - `generation_config.video_config.task` — pin the task mode + * (`'text_to_video' | 'image_to_video' | 'reference_to_video' | 'edit'`) + * instead of letting the model infer it + * + * @experimental Omni video generation is an experimental feature and may change. + */ +export type GeminiOmniVideoProviderOptions = Omit< + Interactions.CreateModelInteractionParamsNonStreaming, + | 'model' + | 'input' + | 'stream' + | 'background' + | 'response_modalities' + | 'response_format' + | 'response_mime_type' + | 'tools' +> + /** * Model-specific provider options mapping. * * @experimental Video generation is an experimental feature and may change. */ export type GeminiVideoModelProviderOptionsByName = { - [TModel in GeminiVideoModel]: GeminiVideoProviderOptions + [TModel in GeminiVideoModel]: TModel extends GeminiInteractionsVideoModel + ? GeminiOmniVideoProviderOptions + : GeminiVideoProviderOptions } /** @@ -70,17 +132,21 @@ export type GeminiVideoModelSizeByName = { /** * Per-model prompt input modalities. Every Veo model accepts image * conditioning inputs (first frame, last frame, reference images) alongside - * the text prompt. + * the text prompt. Omni Flash additionally accepts video inputs (short + * reference clips / videos to edit). * * @experimental Video generation is an experimental feature and may change. */ export type GeminiVideoModelInputModalitiesByName = { - [TModel in GeminiVideoModel]: readonly ['image'] + [TModel in GeminiVideoModel]: TModel extends GeminiInteractionsVideoModel + ? readonly ['image', 'video'] + : readonly ['image'] } /** - * Per-model duration unions (seconds, as numbers — the API's - * `parameters.durationSeconds` field is numeric). + * Per-model duration unions (seconds, as numbers — Veo's + * `parameters.durationSeconds` field is numeric; Omni Flash clips are a + * fixed 10 seconds today, with longer durations "coming soon" per Google). * * @experimental Video generation is an experimental feature and may change. */ @@ -88,15 +154,18 @@ export type GeminiVideoModelDurationByName = { 'veo-3.1-generate-preview': 4 | 6 | 8 'veo-3.1-fast-generate-preview': 4 | 6 | 8 'veo-3.1-lite-generate-preview': 4 | 6 | 8 + 'gemini-omni-flash-preview': 10 } /** * Runtime duration table backing `availableDurations()` / `snapDuration()`. * - * Curated from the official Veo docs - * (https://ai.google.dev/gemini-api/docs/video) — the Gemini OpenAPI spec + * Curated from the official docs + * (https://ai.google.dev/gemini-api/docs/video, + * https://ai.google.dev/gemini-api/docs/omni) — the Gemini OpenAPI spec * types the `:predictLongRunning` request's `parameters` as unconstrained, * so it carries no per-model duration information to derive these from. + * Omni Flash has no duration request field at all; clips are 10 seconds. * * @experimental Video generation is an experimental feature and may change. */ @@ -108,10 +177,11 @@ export const GEMINI_VIDEO_DURATIONS: { 'veo-3.1-generate-preview': { kind: 'discrete', values: [4, 6, 8] }, 'veo-3.1-fast-generate-preview': { kind: 'discrete', values: [4, 6, 8] }, 'veo-3.1-lite-generate-preview': { kind: 'discrete', values: [4, 6, 8] }, + 'gemini-omni-flash-preview': { kind: 'discrete', values: [10] }, } /** - * Look up the duration options for a Veo model. + * Look up the duration options for a Gemini video model. * * @experimental Video generation is an experimental feature and may change. */ diff --git a/packages/ai-gemini/tests/video-adapter.test.ts b/packages/ai-gemini/tests/video-adapter.test.ts index 5763d6737..41889ecc5 100644 --- a/packages/ai-gemini/tests/video-adapter.test.ts +++ b/packages/ai-gemini/tests/video-adapter.test.ts @@ -501,3 +501,371 @@ describe('Gemini Video Adapter', () => { }) }) }) + +// =========================== +// Gemini Omni Flash (Interactions API) +// =========================== + +interface InteractionsClientStub { + interactions: { + create: ReturnType + get: ReturnType + } +} + +const completedOmniInteraction = { + id: 'v1_omni-job-123', + status: 'completed', + usage: { + total_input_tokens: 12, + total_output_tokens: 57920, + total_tokens: 57932, + output_tokens_by_modality: [{ modality: 'video', tokens: 57920 }], + }, + steps: [ + { type: 'user_input', content: [{ type: 'text', text: 'a sunset' }] }, + { type: 'thought', signature: 'sig' }, + { + type: 'model_output', + content: [ + { type: 'video', mime_type: 'video/mp4', data: 'AAAAIGZ0eXA=' }, + ], + }, + ], +} + +function createInteractionsClientStub( + overrides: { + createResult?: Record + getResult?: Record + } = {}, +): InteractionsClientStub { + return { + interactions: { + create: vi.fn().mockResolvedValue( + overrides.createResult ?? { + id: 'v1_omni-job-123', + status: 'in_progress', + object: 'interaction', + }, + ), + get: vi + .fn() + .mockResolvedValue(overrides.getResult ?? completedOmniInteraction), + }, + } +} + +class StubbedGeminiOmniVideoAdapter extends GeminiVideoAdapter<'gemini-omni-flash-preview'> { + constructor(stub: InteractionsClientStub) { + super({ apiKey: 'test-key' }, 'gemini-omni-flash-preview') + this.client = stub as unknown as GoogleGenAI + } +} + +describe('Gemini Omni Flash Video Adapter (Interactions API)', () => { + describe('durations', () => { + it('reports the fixed 10-second clip length', () => { + const adapter = createGeminiVideo('gemini-omni-flash-preview', 'test-key') + expect(adapter.availableDurations()).toEqual({ + kind: 'discrete', + values: [10], + }) + expect(adapter.snapDuration(3)).toBe(10) + expect(adapter.snapDuration(60)).toBe(10) + }) + + it('types duration as the fixed 10-second literal at compile time', () => { + const omni = createGeminiVideo('gemini-omni-flash-preview', 'test-key') + expectTypeOf(omni.snapDuration).returns.toEqualTypeOf<10 | undefined>() + type OmniOptions = Parameters[0] + expectTypeOf().toEqualTypeOf<10 | undefined>() + }) + }) + + describe('createVideoJob', () => { + it('creates a background interaction requesting video output', async () => { + const stub = createInteractionsClientStub() + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + const result = await adapter.createVideoJob({ + model: 'gemini-omni-flash-preview', + prompt: 'a sunset over the ocean', + size: '9:16', + logger: testLogger, + }) + + expect(result).toEqual({ + jobId: 'v1_omni-job-123', + model: 'gemini-omni-flash-preview', + }) + expect(stub.interactions.create).toHaveBeenCalledWith({ + model: 'gemini-omni-flash-preview', + input: [ + { + type: 'user_input', + content: [{ type: 'text', text: 'a sunset over the ocean' }], + }, + ], + response_modalities: ['video'], + background: true, + response_format: { type: 'video', aspect_ratio: '9:16' }, + }) + }) + + it('omits response_format when no size is given and passes modelOptions through', async () => { + const stub = createInteractionsClientStub() + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await adapter.createVideoJob({ + model: 'gemini-omni-flash-preview', + prompt: 'make the violin invisible', + modelOptions: { previous_interaction_id: 'v1_prior-turn' }, + logger: testLogger, + }) + + expect(stub.interactions.create).toHaveBeenCalledWith({ + model: 'gemini-omni-flash-preview', + previous_interaction_id: 'v1_prior-turn', + input: [ + { + type: 'user_input', + content: [{ type: 'text', text: 'make the violin invisible' }], + }, + ], + response_modalities: ['video'], + background: true, + }) + }) + + it('sends image and video prompt parts as content blocks before the text', async () => { + const stub = createInteractionsClientStub() + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await adapter.createVideoJob({ + model: 'gemini-omni-flash-preview', + prompt: [ + { + type: 'image', + source: { type: 'data', value: 'aGVsbG8=', mimeType: 'image/png' }, + }, + { + type: 'video', + source: { + type: 'url', + value: + 'https://generativelanguage.googleapis.com/v1beta/files/abc', + mimeType: 'video/mp4', + }, + }, + { type: 'text', content: 'animate this' }, + ], + logger: testLogger, + }) + + expect(stub.interactions.create).toHaveBeenCalledWith( + expect.objectContaining({ + input: [ + { + type: 'user_input', + content: [ + { type: 'image', data: 'aGVsbG8=', mime_type: 'image/png' }, + { + type: 'video', + uri: 'https://generativelanguage.googleapis.com/v1beta/files/abc', + mime_type: 'video/mp4', + }, + { type: 'text', text: 'animate this' }, + ], + }, + ], + }), + ) + }) + + it('throws on audio prompt parts', async () => { + const stub = createInteractionsClientStub() + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await expect( + adapter.createVideoJob({ + model: 'gemini-omni-flash-preview', + prompt: [ + { type: 'text', content: 'sync to this' }, + { + type: 'audio', + source: { + type: 'data', + value: 'aGVsbG8=', + mimeType: 'audio/wav', + }, + }, + ], + logger: testLogger, + }), + ).rejects.toThrow(/audio prompt parts/) + expect(stub.interactions.create).not.toHaveBeenCalled() + }) + + it('throws when the interaction comes back without an id', async () => { + const stub = createInteractionsClientStub({ + createResult: { status: 'in_progress' }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await expect( + adapter.createVideoJob({ + model: 'gemini-omni-flash-preview', + prompt: 'a sunset', + logger: testLogger, + }), + ).rejects.toThrow(/interaction id/) + }) + }) + + describe('getVideoStatus', () => { + const jobId = 'v1_omni-job-123' + + it('maps in_progress to processing', async () => { + const stub = createInteractionsClientStub({ + getResult: { id: jobId, status: 'in_progress' }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + expect(await adapter.getVideoStatus(jobId)).toEqual({ + jobId, + status: 'processing', + }) + expect(stub.interactions.get).toHaveBeenCalledWith(jobId) + }) + + it('maps a completed interaction with a video to completed', async () => { + const stub = createInteractionsClientStub() + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + expect(await adapter.getVideoStatus(jobId)).toEqual({ + jobId, + status: 'completed', + }) + }) + + it('maps a completed interaction without video output to failed', async () => { + const stub = createInteractionsClientStub({ + getResult: { + id: jobId, + status: 'completed', + steps: [ + { + type: 'model_output', + content: [{ type: 'text', text: 'cannot do that' }], + }, + ], + }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + const status = await adapter.getVideoStatus(jobId) + expect(status.status).toBe('failed') + expect(status.error).toMatch(/without returning a video/) + }) + + it('maps terminal non-success statuses to failed', async () => { + for (const failure of ['failed', 'cancelled', 'incomplete']) { + const stub = createInteractionsClientStub({ + getResult: { id: jobId, status: failure }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + const status = await adapter.getVideoStatus(jobId) + expect(status.status).toBe('failed') + expect(status.error).toContain(failure) + } + }) + }) + + describe('getVideoUrl', () => { + const jobId = 'v1_omni-job-123' + + it('returns the inline base64 video as a data: URL with usage', async () => { + const stub = createInteractionsClientStub() + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + expect(await adapter.getVideoUrl(jobId)).toEqual({ + jobId, + url: 'data:video/mp4;base64,AAAAIGZ0eXA=', + usage: { + promptTokens: 12, + completionTokens: 57920, + totalTokens: 57932, + }, + }) + }) + + it('falls back to the video-modality token count when totals are missing', async () => { + const stub = createInteractionsClientStub({ + getResult: { + ...completedOmniInteraction, + usage: { + output_tokens_by_modality: [{ modality: 'video', tokens: 57920 }], + }, + }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + const result = await adapter.getVideoUrl(jobId) + expect(result.usage).toEqual({ + promptTokens: 0, + completionTokens: 57920, + totalTokens: 57920, + }) + }) + + it('passes a URI delivery through as the URL', async () => { + const stub = createInteractionsClientStub({ + getResult: { + id: jobId, + status: 'completed', + output_video: { + type: 'video', + uri: 'https://generativelanguage.googleapis.com/v1beta/files/xyz:download', + }, + }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + const result = await adapter.getVideoUrl(jobId) + expect(result.url).toBe( + 'https://generativelanguage.googleapis.com/v1beta/files/xyz:download', + ) + }) + + it('throws when the interaction is still in progress', async () => { + const stub = createInteractionsClientStub({ + getResult: { id: jobId, status: 'in_progress' }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await expect(adapter.getVideoUrl(jobId)).rejects.toThrow(/not ready/) + }) + + it('throws with the terminal status on failure', async () => { + const stub = createInteractionsClientStub({ + getResult: { id: jobId, status: 'failed' }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await expect(adapter.getVideoUrl(jobId)).rejects.toThrow(/"failed"/) + }) + + it('throws when a completed interaction has no video content', async () => { + const stub = createInteractionsClientStub({ + getResult: { id: jobId, status: 'completed', steps: [] }, + }) + const adapter = new StubbedGeminiOmniVideoAdapter(stub) + + await expect(adapter.getVideoUrl(jobId)).rejects.toThrow( + /Video not found/, + ) + }) + }) +}) diff --git a/packages/ai/skills/ai-core/media-generation/SKILL.md b/packages/ai/skills/ai-core/media-generation/SKILL.md index 0c63f347e..8da02856c 100644 --- a/packages/ai/skills/ai-core/media-generation/SKILL.md +++ b/packages/ai/skills/ai-core/media-generation/SKILL.md @@ -467,6 +467,33 @@ const { jobId } = await generateVideo({ // (x-goog-api-key header or ?key= query parameter). ``` +Gemini Omni Flash (`geminiVideo('gemini-omni-flash-preview')`) is served by +the Interactions API instead of Veo's operations flow — same adapter, routed +by model. Clips are a fixed 10s at 720p (`duration` is typed `10`), `size` +is the aspect ratio (`'16:9' | '9:16'`), and the finished video arrives +**inline** as a `data:video/mp4;base64,…` URL (no key needed to use it). +Image/video prompt parts are sent as interaction content in order (no +`metadata.role` routing); `data` sources go inline, `url` sources pass +through as-is (never downloaded — use Gemini Files API URIs for remote +media). For conversational editing, pass a prior generation's `jobId` as +`modelOptions.previous_interaction_id` with a prompt describing the change: + +```typescript +import { geminiVideo } from '@tanstack/ai-gemini' + +const omni = geminiVideo('gemini-omni-flash-preview') +const first = await generateVideo({ + adapter: omni, + prompt: 'A violinist outdoors', +}) +// …poll first.jobId to completion, then edit it: +const edited = await generateVideo({ + adapter: omni, + prompt: 'Make the violin invisible', + modelOptions: { previous_interaction_id: first.jobId }, +}) +``` + Other video adapters: `openaiVideo('sora-2')` (pixel sizes like `'1280x720'`, durations 4/8/12s, single `input_reference` image prompt part), `grokVideo(...)` (`grok-imagine-video` does text-to-video + image-to-video; `grok-imagine-video-1.5` is diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c2abbdf71..f9627db9a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1709,7 +1709,7 @@ importers: packages/ai-gemini: dependencies: '@google/genai': - specifier: ^2.8.0 + specifier: ^2.10.0 version: 2.10.0(@modelcontextprotocol/sdk@1.29.0(zod@4.3.6)) '@tanstack/ai-utils': specifier: workspace:* diff --git a/testing/e2e/global-setup.ts b/testing/e2e/global-setup.ts index 42011f43f..2dbbace72 100644 --- a/testing/e2e/global-setup.ts +++ b/testing/e2e/global-setup.ts @@ -50,6 +50,15 @@ export default async function globalSetup() { // aimock's native Gemini handlers. mock.mount('/v1beta/models', geminiVeoMount()) + // Gemini Omni Flash video generation (Interactions API). aimock handles + // synchronous text interactions natively, but not background video jobs + // (POST /v1beta/interactions with background:true → poll + // GET /v1beta/interactions/{id} → inline base64 mp4). The adapter under + // test points its baseUrl at this dedicated prefix so aimock's native + // interactions handling stays untouched for the stateful-interactions + // text tests. + mock.mount('/omni-video', geminiOmniVideoMount()) + // Anthropic server_tool_use bug reproduction (issue #604). aimock can't // natively synthesize `server_tool_use` / `web_fetch_tool_result` content // blocks, so this mount hand-crafts the raw SSE Claude would emit when a @@ -345,6 +354,89 @@ function geminiVeoMount(): Mountable { } } +/** + * Mounts Gemini Omni Flash's Interactions-API video generation flow under a + * dedicated `/omni-video` prefix (the adapter under test sets its baseUrl to + * it, so requests land on `/omni-video/v1beta/interactions`): + * + * - `POST /v1beta/interactions` — creates the background job and returns an + * `in_progress` interaction with an id. + * - `GET /v1beta/interactions/{id}` — polls the job. The mock completes + * immediately with the raw wire shape: a `model_output` step carrying an + * inline base64 `video` content block plus `output_tokens_by_modality` + * usage, which the adapter maps to a `data:video/mp4;base64,…` URL. + */ +function geminiOmniVideoMount(): Mountable { + const JOB_ID = 'v1_omni-video-e2e' + // Minimal MP4-ish base64 payload — the spec only asserts the