diff --git a/.changeset/gemini-omni-flash-video.md b/.changeset/gemini-omni-flash-video.md new file mode 100644 index 000000000..74d04a3da --- /dev/null +++ b/.changeset/gemini-omni-flash-video.md @@ -0,0 +1,5 @@ +--- +'@tanstack/ai-gemini': minor +--- + +Add Gemini Omni Flash (`gemini-omni-flash-preview`) video generation via the Interactions API. Omni is served only through the Interactions API (`generateContent` rejects it), so the video adapter now routes by model: Veo models keep the `:predictLongRunning` operations flow, while `geminiVideo('gemini-omni-flash-preview')` creates a background interaction with `response_modalities: ['video']`, polls it by id, and returns the inline base64 MP4 as a `data:` URL (Files-API URI delivery passes through). Usage is mapped from the interaction's `output_tokens_by_modality`. Image and video prompt parts are sent as interaction content blocks, and `modelOptions.previous_interaction_id` chains a new prompt onto a prior Omni generation for conversational video editing. The top-level `size` option maps onto `response_format.aspect_ratio` (`'16:9' | '9:16'`) and `duration` onto `response_format.duration` — any value in the 3–10 second range (fractional seconds included, verified against the live API), defaulting to a 10-second clip when omitted. Raises the `@google/genai` floor to `^2.10.0` for the Interactions API surface. diff --git a/.changeset/video-adapter-duration-constraint.md b/.changeset/video-adapter-duration-constraint.md new file mode 100644 index 000000000..093e86715 --- /dev/null +++ b/.changeset/video-adapter-duration-constraint.md @@ -0,0 +1,5 @@ +--- +'@tanstack/ai': patch +--- + +Fix `generateVideo` / `getVideoJobStatus` rejecting, at the type level, video adapters that declare a narrowed per-model duration union (e.g. Gemini's `4 | 6 | 8` for Veo or `10` for Omni Flash). 
The activity's `TAdapter extends VideoAdapter` constraints left the input-modality and duration generics at their defaults, so `duration?: number` failed contravariance against the adapter's literal union. All video-activity constraints and helper conditionals now span all six `VideoAdapter` generics. diff --git a/docs/config.json b/docs/config.json index 0e8982869..7a52e405c 100644 --- a/docs/config.json +++ b/docs/config.json @@ -282,7 +282,7 @@ "label": "Video Generation", "to": "media/video-generation", "addedAt": "2026-04-15", - "updatedAt": "2026-07-01" + "updatedAt": "2026-07-02" }, { "label": "Generation Hooks", diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md index 408bae527..2386de6fa 100644 --- a/docs/media/video-generation.md +++ b/docs/media/video-generation.md @@ -2,12 +2,14 @@ title: Video Generation id: video-generation order: 6 -description: "Generate video from text prompts with OpenAI Sora, Google Veo, xAI Grok Imagine, or fal.ai using TanStack AI's experimental generateVideo() jobs/polling API." +description: "Generate video from text prompts with OpenAI Sora, Google Veo, Gemini Omni Flash, xAI Grok Imagine, or fal.ai using TanStack AI's experimental generateVideo() jobs/polling API." 
keywords: - tanstack ai - video generation - sora - veo + - omni flash + - interactions api - gemini - grok imagine - fal @@ -40,7 +42,7 @@ TanStack AI provides experimental support for video generation through dedicated Currently supported: - **OpenAI**: Sora-2 and Sora-2-Pro models (when available) -- **Google Gemini**: Veo 3.1, Veo 3, and Veo 2 models (via the long-running operations API) +- **Google Gemini**: Veo 3.1 models (via the long-running operations API), and Gemini Omni Flash (via the Interactions API) - **Grok (xAI)**: grok-imagine-video (text-to-video + image-to-video) and grok-imagine-video-1.5 (image-to-video only) models - **fal.ai**: MiniMax, Luma, Kling, Hunyuan, and other hosted video models @@ -569,6 +571,85 @@ Adapters that haven't declared a per-model duration map keep the plain > Files API and requires your API key to download (send it as an > `x-goog-api-key` header or `key` query parameter). +### Gemini Omni Flash (Interactions API) Model Options + +Gemini Omni Flash (`gemini-omni-flash-preview`) is Google's multimodal +video-generation model with conversational editing. It is served only through the +[Interactions API](https://ai.google.dev/gemini-api/docs/omni) — the same +`geminiVideo()` adapter routes it automatically: `generateVideo` creates a +background interaction, `getVideoJobStatus` polls it by id, and the +finished clip comes back **inline as a `data:video/mp4;base64,…` URL** +(when Google delivers by reference instead, the Files API URI passes +through and needs your API key to download, like Veo). + +Clips are 720p at 24 FPS, and `duration` accepts any value in the **3–10 +second** range (fractional seconds included), defaulting to 10 seconds when +omitted. `availableDurations()` reports +`{ kind: 'range', min: 3, max: 10, unit: 'seconds' }`; out-of-range +`duration` values are rejected at job creation, and `snapDuration(n)` snaps +raw seconds into the range (clamping to its bounds and rounding to whole +seconds). 
The `size` option maps onto the interaction's output aspect +ratio: + +```typescript ignore +import { generateVideo, getVideoJobStatus } from '@tanstack/ai' +import { geminiVideo } from '@tanstack/ai-gemini' + +const adapter = geminiVideo('gemini-omni-flash-preview') + +const { jobId } = await generateVideo({ + adapter, + prompt: 'A woman playing violin outdoors at golden hour', + size: '9:16', // aspect ratio: '16:9' (default) or '9:16' + duration: 6, // 3-10 seconds; omit for the 10s default +}) + +const status = await getVideoJobStatus({ adapter, jobId }) +// status.url → 'data:video/mp4;base64,…' once completed +``` + +Image and video prompt parts are sent to the interaction as content blocks +— grouped as images, then videos, then the text prompt (Omni doesn't use +Veo's `metadata.role` routing) — so you can condition the generation on +stills or short reference clips. `data` sources +are sent inline as base64; `url` sources pass through as-is — the adapter +never downloads them, so use Gemini Files API URIs (upload large media via +the Files API first). 
+ +#### Conversational video editing + +Omni's headline capability is iterative refinement: pass the interaction id +of a prior generation (its `jobId`) as +`modelOptions.previous_interaction_id` and describe the change — the model +edits the video while preserving everything you didn't mention: + +```typescript ignore +import { generateVideo } from '@tanstack/ai' +import { geminiVideo } from '@tanstack/ai-gemini' + +const adapter = geminiVideo('gemini-omni-flash-preview') + +// Turn 1: generate +const first = await generateVideo({ + adapter, + prompt: 'A woman playing violin outdoors at golden hour', +}) + +// …poll first.jobId to completion, then… + +// Turn 2: edit the result conversationally +const second = await generateVideo({ + adapter, + prompt: 'Make the violin invisible', + modelOptions: { previous_interaction_id: first.jobId }, +}) +``` + +`modelOptions` also passes through the Interactions API's request fields +(e.g. `generation_config.video_config.task` to pin +`'text_to_video' | 'image_to_video' | 'reference_to_video' | 'edit'` +instead of letting the model infer the task mode). + ### Grok (xAI Imagine) Model Options Based on the [xAI video generation API](https://docs.x.ai/docs/guides/video-generations). Two models are available: `grok-imagine-video` (v1.0) supports **text-to-video and image-to-video**, while `grok-imagine-video-1.5` is **image-to-video only** (a text-only prompt is rejected by the API; the adapter throws a clear error pointing you at `grok-imagine-video`). Both are aspect-ratio sized — the generic `size` option takes an `aspectRatio_resolution` template (like the Grok Imagine image models), and clips can be 1–15 seconds long. 
diff --git a/examples/ts-react-media/src/components/ImageGenerator.tsx b/examples/ts-react-media/src/components/ImageGenerator.tsx index 9b4d5fd29..09e2eb7d4 100644 --- a/examples/ts-react-media/src/components/ImageGenerator.tsx +++ b/examples/ts-react-media/src/components/ImageGenerator.tsx @@ -6,8 +6,8 @@ import type { MediaPrompt } from '@tanstack/ai/client' import { generateImageFn } from '@/lib/server-functions' import { getRandomImagePrompt } from '@/lib/prompts' import { IMAGE_MODELS } from '@/lib/models' -import { readImageFile, toImagePart } from '@/lib/media' -import type { AttachedImage } from '@/lib/media' +import { readMediaFile, toImagePart } from '@/lib/media' +import type { AttachedMedia } from '@/lib/media' interface ImageGeneratorProps { onImageGenerated?: (imageUrl: string) => void @@ -36,7 +36,7 @@ export default function ImageGenerator({ const [selectedModel, setSelectedModel] = useState('all') const [isLoading, setIsLoading] = useState(false) const [results, setResults] = useState>({}) - const [images, setImages] = useState>([]) + const [images, setImages] = useState>([]) const fileInputRef = useRef(null) const currentModel = IMAGE_MODELS.find((m) => m.id === selectedModel) @@ -56,7 +56,7 @@ export default function ImageGenerator({ const files = Array.from(e.target.files ?? 
[]) if (fileInputRef.current) fileInputRef.current.value = '' if (files.length === 0) return - const attached = await Promise.all(files.map((file) => readImageFile(file))) + const attached = await Promise.all(files.map((file) => readMediaFile(file))) setImages((prev) => [...prev, ...attached]) } diff --git a/examples/ts-react-media/src/components/VideoGenerator.tsx b/examples/ts-react-media/src/components/VideoGenerator.tsx index f31a8078e..f59063fd7 100644 --- a/examples/ts-react-media/src/components/VideoGenerator.tsx +++ b/examples/ts-react-media/src/components/VideoGenerator.tsx @@ -1,6 +1,8 @@ import { useEffect, useRef, useState } from 'react' -import { Film, Loader2, Shuffle, Upload, X } from 'lucide-react' +import { Film, Loader2, Shuffle, Upload, Wand2, X } from 'lucide-react' import type { VideoMode } from '@/lib/models' +import type { AttachedMedia } from '@/lib/media' +import type { MediaPromptPart } from '@tanstack/ai/client' import { createVideoJobFn, @@ -9,7 +11,7 @@ import { } from '@/lib/server-functions' import { VIDEO_MODELS } from '@/lib/models' import { getRandomVideoPrompt } from '@/lib/prompts' -import { imageUrlToPart, readImageFile } from '@/lib/media' +import { imageUrlToPart, readMediaFile, toVideoPart } from '@/lib/media' type JobState = | { status: 'idle' } @@ -21,7 +23,13 @@ type JobState = model: string progress?: number | undefined } - | { status: 'completed'; url: string; unitsBilled?: number; cost?: number } + | { + status: 'completed' + url: string + jobId: string + unitsBilled?: number + cost?: number + } | { status: 'error'; message: string } interface VideoGeneratorProps { @@ -37,13 +45,25 @@ export default function VideoGenerator({ const [imagePreview, setImagePreview] = useState( initialImageUrl ?? 
null, ) + const [attachedVideo, setAttachedVideo] = useState(null) + const [editPrompts, setEditPrompts] = useState>({}) const [jobStates, setJobStates] = useState>({}) const fileInputRef = useRef(null) + const videoInputRef = useRef(null) const pollingRefs = useRef>(new Map()) const filteredModels = VIDEO_MODELS.filter((m) => m.mode === mode) const falModels = filteredModels.filter((m) => m.provider === 'fal') const xaiModels = filteredModels.filter((m) => m.provider === 'xai') + const geminiModels = filteredModels.filter((m) => m.provider === 'gemini') + + // Gemini Omni Flash additionally accepts video prompt parts (a reference + // clip or a video to edit). Offer the upload whenever an Omni model is in + // the running — other providers never receive the video part. + const omniInRun = + selectedModel === 'all' + ? geminiModels.length > 0 + : selectedModel.startsWith('gemini-omni-flash-preview') useEffect(() => { if (initialImageUrl) { @@ -68,7 +88,7 @@ export default function VideoGenerator({ const file = e.target.files?.[0] if (fileInputRef.current) fileInputRef.current.value = '' if (!file) return - const attached = await readImageFile(file) + const attached = await readMediaFile(file) setImagePreview(attached.dataUrl) } @@ -77,6 +97,18 @@ export default function VideoGenerator({ if (fileInputRef.current) fileInputRef.current.value = '' } + const handleVideoSelect = async (e: React.ChangeEvent) => { + const file = e.target.files?.[0] + if (videoInputRef.current) videoInputRef.current.value = '' + if (!file) return + setAttachedVideo(await readMediaFile(file)) + } + + const clearVideo = () => { + setAttachedVideo(null) + if (videoInputRef.current) videoInputRef.current.value = '' + } + const pollStatus = async (jobId: string, model: string) => { try { const status = await getVideoStatusFn({ data: { jobId, model } }) @@ -98,6 +130,7 @@ export default function VideoGenerator({ [model]: { status: 'completed', url: url, + jobId, unitsBilled: 
urlResult.usage?.unitsBilled, cost: urlResult.usage?.cost, }, @@ -112,6 +145,19 @@ export default function VideoGenerator({ progress: status.progress, }, })) + } else if (status.status === 'failed') { + const interval = pollingRefs.current.get(model) + if (interval) { + clearInterval(interval) + pollingRefs.current.delete(model) + } + setJobStates((prev) => ({ + ...prev, + [model]: { + status: 'error', + message: status.error ?? 'Video generation failed', + }, + })) } else { setJobStates((prev) => ({ ...prev, @@ -134,6 +180,16 @@ export default function VideoGenerator({ } } + // Poll keyed by the UI model id, not result.model: the direct-xAI + // entries share one adapter model ('grok-imagine-video-1.5'), + // so result.model wouldn't identify the card (or the adapter) uniquely. + const beginPolling = (modelId: string, jobId: string) => { + const interval = setInterval(() => { + pollStatus(jobId, modelId) + }, 4000) + pollingRefs.current.set(modelId, interval) + } + const startJobForModel = async (modelId: string) => { setJobStates((prev) => ({ ...prev, @@ -141,16 +197,21 @@ export default function VideoGenerator({ })) try { + const model = VIDEO_MODELS.find((m) => m.id === modelId) + const parts: Array = [{ type: 'text', content: prompt }] // Image-to-video sends the start frame as a prompt part — the fal // adapter routes `role: 'start_frame'` to the endpoint's start-image - // field (e.g. `image_url` on Kling i2v). - const builtPrompt = - mode === 'image-to-video' && imagePreview - ? [ - { type: 'text' as const, content: prompt }, - imageUrlToPart(imagePreview, { role: 'start_frame' }), - ] - : prompt + // field (e.g. `image_url` on Kling i2v); Omni takes it as an + // interaction content block. + if (mode === 'image-to-video' && imagePreview) { + parts.push(imageUrlToPart(imagePreview, { role: 'start_frame' })) + } + // Video prompt parts (reference clip / video to edit) are an Omni + // capability only — never send them to the other providers. 
+ if (attachedVideo && model?.provider === 'gemini') { + parts.push(toVideoPart(attachedVideo)) + } + const builtPrompt = parts.length === 1 ? prompt : parts const result = await createVideoJobFn({ data: { prompt: builtPrompt, @@ -167,13 +228,7 @@ export default function VideoGenerator({ }, })) - // Poll keyed by the UI model id, not result.model: the direct-xAI - // entries share one adapter model ('grok-imagine-video-1.5'), - // so result.model wouldn't identify the card (or the adapter) uniquely. - const interval = setInterval(() => { - pollStatus(result.jobId, modelId) - }, 4000) - pollingRefs.current.set(modelId, interval) + beginPolling(modelId, result.jobId) } catch (err) { setJobStates((prev) => ({ ...prev, @@ -186,6 +241,51 @@ export default function VideoGenerator({ } } + /** + * Gemini Omni Flash conversational editing: chain a new prompt onto a + * completed generation via its interaction id (the jobId). The model + * applies the change while preserving everything else in the video. + */ + const handleEditVideo = async (modelId: string, previousJobId: string) => { + const editPrompt = editPrompts[modelId]?.trim() + if (!editPrompt) return + + setJobStates((prev) => ({ + ...prev, + [modelId]: { status: 'submitting' }, + })) + + try { + const result = await createVideoJobFn({ + data: { + prompt: editPrompt, + model: modelId, + previousInteractionId: previousJobId, + }, + }) + + setJobStates((prev) => ({ + ...prev, + [modelId]: { + status: 'pending', + jobId: result.jobId, + model: result.model, + }, + })) + setEditPrompts((prev) => ({ ...prev, [modelId]: '' })) + + beginPolling(modelId, result.jobId) + } catch (err) { + setJobStates((prev) => ({ + ...prev, + [modelId]: { + status: 'error', + message: err instanceof Error ? 
err.message : 'Failed to edit video', + }, + })) + } + } + const handleGenerate = async () => { if (!prompt.trim()) return if (mode === 'image-to-video' && !imagePreview) return @@ -269,6 +369,13 @@ export default function VideoGenerator({ ))} + + {geminiModels.map((model) => ( + + ))} + @@ -311,6 +418,49 @@ export default function VideoGenerator({ )} + {omniInRun && ( +
+ + {attachedVideo ? ( +
+
+ ) : ( + + )} + +
+ )} +
@@ -437,6 +587,37 @@ export default function VideoGenerator({

) )} + {model?.provider === 'gemini' && ( +
+ + setEditPrompts((prev) => ({ + ...prev, + [modelId]: e.target.value, + })) + } + onKeyDown={(e) => { + if (e.key === 'Enter') + handleEditVideo(modelId, state.jobId) + }} + placeholder="Describe an edit — e.g. 'make it nighttime'..." + disabled={isGenerating} + className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded-lg text-white text-sm placeholder-gray-500 focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent disabled:opacity-50" + /> + +
+ )} )}
diff --git a/examples/ts-react-media/src/lib/media.ts b/examples/ts-react-media/src/lib/media.ts index 40d82c039..65bec82b9 100644 --- a/examples/ts-react-media/src/lib/media.ts +++ b/examples/ts-react-media/src/lib/media.ts @@ -1,22 +1,23 @@ import type { MediaInputMetadata, MediaPromptPart } from '@tanstack/ai/client' /** - * An image the user attached as conditioning input. `dataUrl` is the full - * `data:;base64,...` string used directly for the thumbnail preview; - * `base64` is the same payload with the prefix stripped for the prompt part. + * A media file (image or video) the user attached as conditioning input. + * `dataUrl` is the full `data:;base64,...` string used directly for + * the thumbnail preview; `base64` is the same payload with the prefix + * stripped for the prompt part. */ -export interface AttachedImage { +export interface AttachedMedia { id: string name: string mimeType: string - /** Full data URL, used for the preview. */ + /** Full data URL, used for the /