diff --git a/apps/server/package.json b/apps/server/package.json index ef80469dbd..fd68ba0e67 100644 --- a/apps/server/package.json +++ b/apps/server/package.json @@ -26,9 +26,11 @@ "@anthropic-ai/claude-agent-sdk": "^0.2.77", "@effect/platform-node": "catalog:", "@effect/sql-sqlite-bun": "catalog:", + "@huggingface/transformers": "^4.0.1", "@pierre/diffs": "^1.1.0-beta.16", "effect": "catalog:", "node-pty": "^1.1.0", + "onnxruntime-node": "^1.24.3", "open": "^10.1.0", "ws": "^8.18.0" }, diff --git a/apps/server/src/keybindings.ts b/apps/server/src/keybindings.ts index 8c65de4422..e773f6a2e4 100644 --- a/apps/server/src/keybindings.ts +++ b/apps/server/src/keybindings.ts @@ -77,6 +77,7 @@ export const DEFAULT_KEYBINDINGS: ReadonlyArray = [ { key: "mod+shift+o", command: "chat.new", when: "!terminalFocus" }, { key: "mod+shift+n", command: "chat.newLocal", when: "!terminalFocus" }, { key: "mod+o", command: "editor.openFavorite" }, + { key: "mod+shift+v", command: "voice.toggle", when: "!terminalFocus" }, { key: "mod+shift+[", command: "thread.previous" }, { key: "mod+shift+]", command: "thread.next" }, ...THREAD_JUMP_KEYBINDING_COMMANDS.map((command, index) => ({ diff --git a/apps/server/src/serverLayers.ts b/apps/server/src/serverLayers.ts index 009d9f4517..80aa4885c0 100644 --- a/apps/server/src/serverLayers.ts +++ b/apps/server/src/serverLayers.ts @@ -36,6 +36,7 @@ import { RoutingTextGenerationLive } from "./git/Layers/RoutingTextGeneration"; import { PtyAdapter } from "./terminal/Services/PTY"; import { JiraTokenServiceLive } from "./jira/Layers/JiraTokenService"; import { JiraApiClientLive } from "./jira/Layers/JiraApiClient"; +import { LocalWhisperTranscriptionLive } from "./transcription/Layers/LocalWhisperTranscription"; type RuntimePtyAdapterLoader = { layer: Layer.Layer; @@ -148,5 +149,6 @@ export function makeServerRuntimeServicesLayer() { terminalLayer, KeybindingsLive, jiraLayer, + LocalWhisperTranscriptionLive, 
).pipe(Layer.provideMerge(NodeServices.layer)); } diff --git a/apps/server/src/transcription/Errors.ts b/apps/server/src/transcription/Errors.ts new file mode 100644 index 0000000000..2fa76ec48f --- /dev/null +++ b/apps/server/src/transcription/Errors.ts @@ -0,0 +1,18 @@ +import { Schema } from "effect"; + +export class TranscriptionError extends Schema.TaggedErrorClass()( + "TranscriptionError", + { + reason: Schema.Literals([ + "model-not-installed", + "model-loading", + "inference-failed", + "invalid-audio", + "cleanup-failed", + "worker-crashed", + "install-failed", + "delete-failed", + ]), + message: Schema.String, + }, +) {} diff --git a/apps/server/src/transcription/Layers/LocalWhisperTranscription.ts b/apps/server/src/transcription/Layers/LocalWhisperTranscription.ts new file mode 100644 index 0000000000..85244de69a --- /dev/null +++ b/apps/server/src/transcription/Layers/LocalWhisperTranscription.ts @@ -0,0 +1,269 @@ +import { fork, spawn, type ChildProcess } from "node:child_process"; +import * as path from "node:path"; +import * as fs from "node:fs"; +import { Effect, Layer } from "effect"; +import type { + TranscribeInput, + TranscribeResult, + TranscriptionCleanupInput, + TranscriptionCleanupResult, +} from "@marcode/contracts"; +import { TranscriptionService } from "../Services/TranscriptionService"; +import { TranscriptionError } from "../Errors"; +import { unstable_v2_prompt, type SDKResultMessage } from "@anthropic-ai/claude-agent-sdk"; +import { ServerSettingsService } from "../../serverSettings"; + +const CLEANUP_PROMPT_PREFIX = + "Clean up this voice-dictated text. Remove filler words (um, uh, like, so, you know), fix grammar and punctuation, and preserve technical terms, code references, and variable names exactly as spoken. Keep the same language as the input. 
Return ONLY the cleaned text, nothing else — no preamble, no explanation.\n\nText to clean up:\n"; + +async function cleanupWithClaude( + rawText: string, + claudeBinaryPath: string, + language?: string, +): Promise { + const prompt = `${CLEANUP_PROMPT_PREFIX}${language ? `[Language: ${language}] ` : ""}${rawText}`; + + const result: SDKResultMessage = await unstable_v2_prompt(prompt, { + model: "claude-haiku-4-5", + pathToClaudeCodeExecutable: claudeBinaryPath, + env: process.env, + disallowedTools: ["*"], + }); + + if (result.subtype === "success" && result.result) { + return { cleanedText: result.result.trim() }; + } + + console.error("[voice] cleanup failed:", result.subtype === "error" ? result : "unknown error"); + return { cleanedText: rawText }; +} + +const KNOWN_MODELS = [ + "Xenova/whisper-tiny", + "Xenova/whisper-base", + "Xenova/whisper-small", + "Xenova/whisper-medium", +]; + +function getModelsCacheDir(): string { + const homeDir = process.env.HOME || process.env.USERPROFILE || "/tmp"; + return path.join(homeDir, ".marcode", "whisper-models"); +} + +function isModelInstalledOnDisk(modelId: string): boolean { + const modelDir = path.join(getModelsCacheDir(), modelId); + try { + if (!fs.existsSync(modelDir)) return false; + const entries = fs.readdirSync(modelDir, { recursive: true }); + return entries.some( + (f) => typeof f === "string" && (f.endsWith(".onnx") || f.endsWith(".json")), + ); + } catch { + return false; + } +} + +function getInstalledModelsFromDisk(): ReadonlyArray { + return KNOWN_MODELS.filter((id) => isModelInstalledOnDisk(id)); +} + +function resolveChildScriptPath(): string { + const basename = "whisperChildProcess"; + const thisDir = import.meta.dirname; + + const searchDirs = [ + path.join(thisDir, ".."), + path.join(thisDir, "..", "src", "transcription"), + path.join(thisDir), + path.join(thisDir, "src", "transcription"), + ]; + + const extensions = [".ts", ".js", ".mjs"]; + + for (const dir of searchDirs) { + for (const ext of 
extensions) { + const candidate = path.join(dir, basename + ext); + if (fs.existsSync(candidate)) return candidate; + } + } + + throw new Error(`Could not find ${basename} script. Searched from ${thisDir}`); +} + +let child: ChildProcess | null = null; +let childReady = false; +let readyPromise: Promise | null = null; + +function ensureChild(): ChildProcess { + if (child && !child.killed) return child; + + const scriptPath = resolveChildScriptPath(); + console.log("[voice] spawning child process:", scriptPath); + const isTsFile = scriptPath.endsWith(".ts"); + + if (isTsFile) { + child = spawn("bun", ["run", scriptPath], { + stdio: ["pipe", "pipe", "pipe", "ipc"], + }); + } else { + child = fork(scriptPath, [], { + stdio: ["pipe", "pipe", "pipe", "ipc"], + }); + } + + child.stderr?.on("data", (data: Buffer) => { + console.error("[whisper-child]", data.toString().trim()); + }); + + childReady = false; + readyPromise = new Promise((resolve) => { + const onReady = (msg: { type: string }) => { + if (msg.type === "ready") { + childReady = true; + child?.off("message", onReady); + resolve(); + } + }; + child!.on("message", onReady); + }); + + child.on("exit", (code) => { + if (code !== 0 && code !== null) { + console.error("[whisper-child] exited with code", code); + } + child = null; + childReady = false; + readyPromise = null; + }); + + child.on("error", (err) => { + console.error("[whisper-child] error:", err.message); + child = null; + childReady = false; + readyPromise = null; + }); + + return child; +} + +async function waitForReady(): Promise { + ensureChild(); + if (childReady) return; + if (readyPromise) await readyPromise; +} + +let messageIdCounter = 0; + +function sendToChild( + message: Record, + expectedTypes: string[], + onInterim?: (msg: Record) => void, +): Promise { + return new Promise((resolve, reject) => { + const id = String(++messageIdCounter); + const proc = ensureChild(); + + const handler = (response: { type: string; id?: string; message?: string 
}) => { + if (response.id !== id) return; + + if (expectedTypes.includes(response.type)) { + proc.off("message", handler); + resolve(response as T); + } else if (response.type === "error") { + proc.off("message", handler); + reject(new Error(response.message || "Child process error")); + } else if (onInterim) { + onInterim(response as Record); + } + }; + + proc.on("message", handler); + const outgoing = { ...message, id }; + console.log("[voice] sending to child:", message.type, "id:", id); + proc.send(outgoing); + }); +} + +const PROGRESS_THROTTLE_MS = 500; + +export const LocalWhisperTranscriptionLive = Layer.succeed( + TranscriptionService, + TranscriptionService.of({ + transcribe: (input: TranscribeInput, modelId: string) => + Effect.tryPromise({ + try: async () => { + await waitForReady(); + const result = await sendToChild<{ text: string }>( + { + type: "transcribe", + model: modelId, + audio: input.audio, + language: input.language ?? null, + }, + ["result"], + ); + return { text: result.text } satisfies TranscribeResult; + }, + catch: (err) => + new TranscriptionError({ + reason: "inference-failed", + message: err instanceof Error ? err.message : "Transcription failed", + }), + }), + + cleanup: (input: TranscriptionCleanupInput, claudeBinaryPath: string) => + Effect.tryPromise({ + try: () => cleanupWithClaude(input.rawText, claudeBinaryPath, input.language), + catch: (err) => + new TranscriptionError({ + reason: "cleanup-failed", + message: err instanceof Error ? 
err.message : "Cleanup failed", + }), + }).pipe(Effect.catch(() => Effect.succeed({ cleanedText: input.rawText }))), + + installModel: ( + modelId: string, + onProgress: (progress: { progress: number; file: string }) => void, + ) => + Effect.tryPromise({ + try: async () => { + await waitForReady(); + let lastEmitTime = 0; + const onnxFileProgress = new Map(); + + await sendToChild({ type: "install", model: modelId }, ["install-complete"], (msg) => { + if (msg.type === "install-progress") { + const file = msg.file as string; + const progress = msg.progress as number; + if (!file.endsWith(".onnx")) return; + + onnxFileProgress.set(file, progress); + + const now = Date.now(); + if (now - lastEmitTime < PROGRESS_THROTTLE_MS) return; + lastEmitTime = now; + + const values = [...onnxFileProgress.values()]; + const overall = values.reduce((a, b) => a + b, 0) / Math.max(values.length, 1); + onProgress({ progress: Math.round(overall), file }); + } + }); + }, + catch: (err) => + new TranscriptionError({ + reason: "install-failed", + message: err instanceof Error ? 
err.message : "Install failed", + }), + }), + + deleteModel: (modelId: string) => + Effect.sync(() => { + const modelDir = path.join(getModelsCacheDir(), modelId); + if (fs.existsSync(modelDir)) { + fs.rmSync(modelDir, { recursive: true, force: true }); + } + }), + + getInstalledModels: Effect.sync(() => getInstalledModelsFromDisk()), + }), +); diff --git a/apps/server/src/transcription/Services/TranscriptionService.ts b/apps/server/src/transcription/Services/TranscriptionService.ts new file mode 100644 index 0000000000..6a05d6073e --- /dev/null +++ b/apps/server/src/transcription/Services/TranscriptionService.ts @@ -0,0 +1,30 @@ +import { Effect, ServiceMap } from "effect"; +import type { + TranscribeInput, + TranscribeResult, + TranscriptionCleanupInput, + TranscriptionCleanupResult, +} from "@marcode/contracts"; +import type { TranscriptionError } from "../Errors"; + +export interface TranscriptionServiceShape { + readonly transcribe: ( + input: TranscribeInput, + modelId: string, + ) => Effect.Effect; + readonly cleanup: ( + input: TranscriptionCleanupInput, + claudeBinaryPath: string, + ) => Effect.Effect; + readonly installModel: ( + modelId: string, + onProgress: (progress: { progress: number; file: string }) => void, + ) => Effect.Effect; + readonly deleteModel: (modelId: string) => Effect.Effect; + readonly getInstalledModels: Effect.Effect>; +} + +export class TranscriptionService extends ServiceMap.Service< + TranscriptionService, + TranscriptionServiceShape +>()("marcode/transcription/TranscriptionService") {} diff --git a/apps/server/src/transcription/whisperChildProcess.ts b/apps/server/src/transcription/whisperChildProcess.ts new file mode 100644 index 0000000000..8ba6504afe --- /dev/null +++ b/apps/server/src/transcription/whisperChildProcess.ts @@ -0,0 +1,131 @@ +import { env, pipeline, type AutomaticSpeechRecognitionPipeline } from "@huggingface/transformers"; +import * as fs from "node:fs"; +import * as path from "node:path"; + +if 
(env.backends?.onnx?.wasm) { + env.backends.onnx.wasm.proxy = false; +} + +let transcriber: AutomaticSpeechRecognitionPipeline | null = null; +let currentModel: string | null = null; + +function getCacheDir(): string { + const homeDir = process.env.HOME || process.env.USERPROFILE || "/tmp"; + const dir = path.join(homeDir, ".marcode", "whisper-models"); + fs.mkdirSync(dir, { recursive: true }); + return dir; +} + +function send(msg: Record): void { + process.send?.(msg); +} + +async function handleMessage(message: { type: string; id?: string; [key: string]: unknown }) { + const id = message.id; + try { + switch (message.type) { + case "transcribe": { + const modelId = message.model as string; + const audioBase64 = message.audio as string; + const language = message.language as string | null; + const cacheDir = getCacheDir(); + + console.error( + `[whisper-child] transcribe request: model=${modelId}, audio=${audioBase64.length} chars, language=${language ?? "auto"}`, + ); + + if (!transcriber || currentModel !== modelId) { + console.error(`[whisper-child] loading model ${modelId}...`); + transcriber = await pipeline("automatic-speech-recognition", modelId, { + cache_dir: cacheDir, + }); + currentModel = modelId; + console.error(`[whisper-child] model loaded`); + } + + const audioBytes = Buffer.from(audioBase64, "base64"); + const pcmData = decodeWavToPcm(audioBytes); + console.error( + `[whisper-child] decoded WAV: ${pcmData.length} samples (${(pcmData.length / 16000).toFixed(1)}s)`, + ); + + const options: Record = { + task: "transcribe", + language: language || "en", + }; + console.error(`[whisper-child] running inference with language=${options.language}...`); + const result = await transcriber(pcmData, options); + console.error(`[whisper-child] inference done, raw result type: ${typeof result}`); + + const text = + typeof result === "object" && "text" in result + ? 
(result as { text: string }).text + : String(result); + + console.error(`[whisper-child] text: "${text.trim()}"`); + send({ type: "result", id, text: text.trim() }); + break; + } + + case "install": { + const modelId = message.model as string; + const cacheDir = getCacheDir(); + env.cacheDir = cacheDir; + + await pipeline("automatic-speech-recognition", modelId, { + cache_dir: cacheDir, + progress_callback: (p: { status: string; progress?: number; file?: string }) => { + if (p.status === "progress" && p.progress !== undefined) { + send({ + type: "install-progress", + id, + progress: p.progress, + file: p.file || "", + }); + } + }, + }); + send({ type: "install-complete", id }); + break; + } + + default: + send({ type: "error", id, message: `Unknown message type: ${message.type}` }); + } + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err); + send({ type: "error", id, message: errorMessage }); + } +} + +function decodeWavToPcm(wavBuffer: Buffer): Float32Array { + const dataView = new DataView(wavBuffer.buffer, wavBuffer.byteOffset, wavBuffer.byteLength); + let offset = 12; + while (offset < dataView.byteLength - 8) { + const chunkId = String.fromCharCode( + dataView.getUint8(offset), + dataView.getUint8(offset + 1), + dataView.getUint8(offset + 2), + dataView.getUint8(offset + 3), + ); + const chunkSize = dataView.getUint32(offset + 4, true); + if (chunkId === "data") { + offset += 8; + const sampleCount = chunkSize / 2; + const pcm = new Float32Array(sampleCount); + for (let i = 0; i < sampleCount; i++) { + const sample = dataView.getInt16(offset + i * 2, true); + pcm[i] = sample / 32768; + } + return pcm; + } + offset += 8 + chunkSize; + } + throw new Error("Invalid WAV: no data chunk found"); +} + +process.on("message", (message) => { + void handleMessage(message as { type: string; id?: string; [key: string]: unknown }); +}); + +send({ type: "ready" }); diff --git a/apps/server/src/transcription/whisperWorkerThread.ts 
b/apps/server/src/transcription/whisperWorkerThread.ts new file mode 100644 index 0000000000..3abe85ccb5 --- /dev/null +++ b/apps/server/src/transcription/whisperWorkerThread.ts @@ -0,0 +1,130 @@ +import { parentPort } from "node:worker_threads"; +import { env, pipeline, type AutomaticSpeechRecognitionPipeline } from "@huggingface/transformers"; +import * as fs from "node:fs"; +import * as path from "node:path"; + +if (env.backends?.onnx?.wasm) { + env.backends.onnx.wasm.proxy = false; +} + +let transcriber: AutomaticSpeechRecognitionPipeline | null = null; +let currentModel: string | null = null; +let cacheDir: string | null = null; + +function getCacheDir(): string { + if (cacheDir) return cacheDir; + const homeDir = process.env.HOME || process.env.USERPROFILE || "/tmp"; + cacheDir = path.join(homeDir, ".marcode", "whisper-models"); + fs.mkdirSync(cacheDir, { recursive: true }); + return cacheDir; +} + +function getModelCacheDir(modelId: string): string { + return path.join(getCacheDir(), modelId.replace("/", "__")); +} + +function isModelInstalled(modelId: string): boolean { + const modelDir = getModelCacheDir(modelId); + if (!fs.existsSync(modelDir)) return false; + const files = fs.readdirSync(modelDir, { recursive: true }) as string[]; + return files.some((f) => typeof f === "string" && f.endsWith(".onnx")); +} + +async function handleMessage(message: { type: string; [key: string]: unknown }) { + try { + switch (message.type) { + case "check-installed": { + const models = message.models as string[]; + const statuses = models.map((modelId) => ({ + modelId, + installed: isModelInstalled(modelId), + })); + parentPort?.postMessage({ type: "install-status", statuses }); + break; + } + + case "install": { + const modelId = message.model as string; + env.cacheDir = getCacheDir(); + env.localModelPath = getCacheDir(); + + await pipeline("automatic-speech-recognition", modelId, { + cache_dir: getCacheDir(), + progress_callback: (progress: { status: string; progress?: 
number; file?: string }) => { + if (progress.status === "progress" && progress.progress !== undefined) { + parentPort?.postMessage({ + type: "install-progress", + progress: progress.progress, + file: progress.file || "", + }); + } + }, + }); + parentPort?.postMessage({ type: "install-complete" }); + break; + } + + case "load": { + const modelId = message.model as string; + if (transcriber && currentModel === modelId) { + parentPort?.postMessage({ type: "ready" }); + return; + } + + transcriber = await pipeline("automatic-speech-recognition", modelId, { + cache_dir: getCacheDir(), + }); + currentModel = modelId; + parentPort?.postMessage({ type: "ready" }); + break; + } + + case "transcribe": { + if (!transcriber) { + parentPort?.postMessage({ type: "error", message: "Model not loaded" }); + return; + } + + const audioBuffer = message.audioBuffer as SharedArrayBuffer; + const length = message.length as number; + const language = message.language as string | null; + + const audioData = new Float32Array(audioBuffer, 0, length); + + const options: Record = { task: "transcribe" }; + if (language) { + options.language = language; + } + const result = await transcriber(audioData, options); + + const text = + typeof result === "object" && "text" in result + ? (result as { text: string }).text + : String(result); + parentPort?.postMessage({ type: "result", text: text.trim() }); + break; + } + + case "delete": { + const modelId = message.model as string; + const modelDir = getModelCacheDir(modelId); + if (fs.existsSync(modelDir)) { + fs.rmSync(modelDir, { recursive: true, force: true }); + } + if (currentModel === modelId) { + transcriber = null; + currentModel = null; + } + parentPort?.postMessage({ type: "deleted" }); + break; + } + } + } catch (err) { + const errorMessage = err instanceof Error ? 
err.message : String(err); + parentPort?.postMessage({ type: "error", message: errorMessage }); + } +} + +parentPort?.on("message", (message) => { + void handleMessage(message); +}); diff --git a/apps/server/src/wsServer.test.ts b/apps/server/src/wsServer.test.ts index 30662729a3..4c773b4537 100644 --- a/apps/server/src/wsServer.test.ts +++ b/apps/server/src/wsServer.test.ts @@ -871,6 +871,7 @@ describe("WebSocket Server", () => { providers: defaultProviderStatuses, availableEditors: expect.any(Array), settings: defaultServerSettings, + whisper: { installedModels: expect.any(Array) }, }); expectAvailableEditors((response.result as { availableEditors: unknown }).availableEditors); }); @@ -897,6 +898,7 @@ describe("WebSocket Server", () => { providers: defaultProviderStatuses, availableEditors: expect.any(Array), settings: defaultServerSettings, + whisper: { installedModels: expect.any(Array) }, }); expectAvailableEditors((response.result as { availableEditors: unknown }).availableEditors); @@ -934,6 +936,7 @@ describe("WebSocket Server", () => { providers: defaultProviderStatuses, availableEditors: expect.any(Array), settings: defaultServerSettings, + whisper: { installedModels: expect.any(Array) }, }); expectAvailableEditors((response.result as { availableEditors: unknown }).availableEditors); expect(fs.readFileSync(keybindingsPath, "utf8")).toBe("{ not-json"); @@ -1084,6 +1087,7 @@ describe("WebSocket Server", () => { providers: defaultProviderStatuses, availableEditors: expect.any(Array), settings: defaultServerSettings, + whisper: { installedModels: expect.any(Array) }, }); expectAvailableEditors((response.result as { availableEditors: unknown }).availableEditors); }); @@ -1133,6 +1137,7 @@ describe("WebSocket Server", () => { providers: defaultProviderStatuses, availableEditors: expect.any(Array), settings: defaultServerSettings, + whisper: { installedModels: expect.any(Array) }, }); expectAvailableEditors( (configResponse.result as { availableEditors: unknown 
}).availableEditors, diff --git a/apps/server/src/wsServer.ts b/apps/server/src/wsServer.ts index 53366b38c5..5e6b92f626 100644 --- a/apps/server/src/wsServer.ts +++ b/apps/server/src/wsServer.ts @@ -66,6 +66,8 @@ import { GitCore } from "./git/Services/GitCore.ts"; import { tryHandleProjectFaviconRequest } from "./projectFaviconRoute"; import { JiraApiClient } from "./jira/Services/JiraApiClient"; import { JiraTokenService } from "./jira/Services/JiraTokenService"; +import { TranscriptionService } from "./transcription/Services/TranscriptionService"; + import { tryHandleJiraAuthRequest, tryHandleJiraCallbackRequest } from "./jira/oauthRoutes"; import { ATTACHMENTS_ROUTE_PREFIX, @@ -232,6 +234,7 @@ export type ServerRuntimeServices = | ServerSettingsService | JiraApiClient | JiraTokenService + | TranscriptionService | Open; export class ServerLifecycleError extends Schema.TaggedErrorClass()( @@ -1034,6 +1037,8 @@ export const createServer = Effect.fn(function* (): Effect.fn.Return< const keybindingsConfig = yield* keybindingsManager.loadConfigState; const settings = yield* serverSettingsManager.getSettings; const providers = yield* Ref.get(providersRef); + const transcriptionService = yield* TranscriptionService; + const installedModels = yield* transcriptionService.getInstalledModels; return { cwd, keybindingsConfigPath, @@ -1042,6 +1047,7 @@ export const createServer = Effect.fn(function* (): Effect.fn.Return< providers, availableEditors, settings, + whisper: { installedModels }, }; } @@ -1112,6 +1118,75 @@ export const createServer = Effect.fn(function* (): Effect.fn.Return< return yield* jiraClient.getAttachment(body); } + case WS_METHODS.transcriptionTranscribe: { + const body = stripRequestTag(request.body); + const transcriptionService = yield* TranscriptionService; + const currentSettings = yield* serverSettingsManager.getSettings; + const modelId = currentSettings.whisperSelectedModel ?? 
"Xenova/whisper-small"; + logger.info( + `[voice] transcribe request received, model=${modelId}, audio length=${body.audio.length}, language=${body.language ?? "auto"}`, + ); + const result = yield* transcriptionService.transcribe(body, modelId); + logger.info(`[voice] transcribe result: ${JSON.stringify(result)}`); + return result; + } + + case WS_METHODS.transcriptionCleanup: { + const body = stripRequestTag(request.body); + const transcriptionService = yield* TranscriptionService; + return yield* transcriptionService.cleanup(body); + } + + case WS_METHODS.whisperInstallModel: { + const body = stripRequestTag(request.body); + const transcriptionService = yield* TranscriptionService; + Effect.gen(function* () { + yield* transcriptionService.installModel(body.modelId, (progress) => { + pushBus + .publishAll(WS_CHANNELS.whisperDownloadProgress, { + modelId: body.modelId, + progress: progress.progress, + file: progress.file, + status: "downloading" as const, + }) + .pipe(Effect.runSync); + }); + yield* pushBus.publishAll(WS_CHANNELS.whisperDownloadProgress, { + modelId: body.modelId, + progress: 100, + file: "", + status: "complete" as const, + }); + yield* pushBus.publishAll(WS_CHANNELS.serverConfigUpdated, { + issues: [], + settings: yield* serverSettingsManager.getSettings, + }); + }).pipe( + Effect.catch(() => + pushBus.publishAll(WS_CHANNELS.whisperDownloadProgress, { + modelId: body.modelId, + progress: 0, + file: "", + status: "error" as const, + error: "Download failed", + }), + ), + Effect.runPromise, + ); + return {}; + } + + case WS_METHODS.whisperDeleteModel: { + const body = stripRequestTag(request.body); + const transcriptionService = yield* TranscriptionService; + yield* transcriptionService.deleteModel(body.modelId); + yield* pushBus.publishAll(WS_CHANNELS.serverConfigUpdated, { + issues: [], + settings: yield* serverSettingsManager.getSettings, + }); + return {}; + } + default: { const _exhaustiveCheck: never = request.body; return yield* new 
RouteRequestError({ diff --git a/apps/web/src/components/ChatView.browser.tsx b/apps/web/src/components/ChatView.browser.tsx index adcb013ec0..40e149268e 100644 --- a/apps/web/src/components/ChatView.browser.tsx +++ b/apps/web/src/components/ChatView.browser.tsx @@ -126,6 +126,7 @@ function createBaseServerConfig(): ServerConfig { ...DEFAULT_SERVER_SETTINGS, ...DEFAULT_CLIENT_SETTINGS, }, + whisper: { installedModels: [] }, }; } diff --git a/apps/web/src/components/ChatView.tsx b/apps/web/src/components/ChatView.tsx index f985a12aea..3f618b9d43 100644 --- a/apps/web/src/components/ChatView.tsx +++ b/apps/web/src/components/ChatView.tsx @@ -136,6 +136,8 @@ import { resolveSelectableProvider, } from "../providerModels"; import { useSettings } from "../hooks/useSettings"; +import { useVoiceRecording } from "../hooks/useVoiceRecording"; +import { useWhisperModelStatus } from "../hooks/useWhisperModelStatus"; import { resolveAppModelSelection } from "../modelSelection"; import { isTerminalFocused } from "../lib/terminalFocus"; import { @@ -179,6 +181,7 @@ import { PullRequestThreadDialog } from "./PullRequestThreadDialog"; import { MessagesTimeline } from "./chat/MessagesTimeline"; import { ChatHeader } from "./chat/ChatHeader"; import { ContextWindowMeter } from "./chat/ContextWindowMeter"; +import { VoiceMicButton } from "./chat/VoiceMicButton"; import { buildExpandedImagePreview, ExpandedImagePreview } from "./chat/ExpandedImagePreview"; import { AVAILABLE_PROVIDER_OPTIONS, ProviderModelPicker } from "./chat/ProviderModelPicker"; import { ComposerCommandItem, ComposerCommandMenu } from "./chat/ComposerCommandMenu"; @@ -331,6 +334,7 @@ export default function ChatView({ threadId }: ChatViewProps) { const setStoreThreadError = useStore((store) => store.setError); const setStoreThreadBranch = useStore((store) => store.setThreadBranch); const settings = useSettings(); + const { modelReady } = useWhisperModelStatus(); const setStickyComposerModelSelection = 
useComposerDraftStore( (store) => store.setStickyModelSelection, ); @@ -649,6 +653,51 @@ export default function ChatView({ threadId }: ChatViewProps) { ), ); + const onVoiceTranscript = useCallback( + (transcript: string) => { + if (!activeThread) return; + const current = promptRef.current; + const sep = current.length > 0 && !current.endsWith(" ") ? " " : ""; + const newPrompt = current + sep + transcript; + promptRef.current = newPrompt; + setPrompt(newPrompt); + requestAnimationFrame(() => { + composerEditorRef.current?.focusAtEnd(); + }); + }, + [activeThread, setPrompt], + ); + + const voiceRecording = useVoiceRecording({ + onTranscript: onVoiceTranscript, + onError: (err) => { + if (err.type === "permission-denied") { + toastManager.add({ + type: "error", + title: "Microphone access denied", + description: "Enable microphone in your browser settings.", + }); + } else if (err.type === "no-microphone") { + toastManager.add({ type: "error", title: "No microphone detected" }); + } else if (err.type === "transcription-failed") { + toastManager.add({ + type: "error", + title: "Transcription failed", + description: err.message, + }); + } else if (err.type === "cleanup-failed") { + toastManager.add({ + type: "warning", + title: "Cleanup skipped", + description: "Using raw transcription.", + }); + } + }, + ready: settings.voiceEnabled && modelReady, + language: settings.voiceLanguage, + llmCleanup: settings.voiceLlmCleanup, + }); + const openPullRequestDialog = useCallback( (reference?: string) => { if (!canCheckoutPullRequestIntoThread) { @@ -2480,6 +2529,13 @@ export default function ChatView({ threadId }: ChatViewProps) { return; } + if (command === "voice.toggle") { + event.preventDefault(); + event.stopPropagation(); + voiceRecording.toggleRecording(); + return; + } + const scriptId = projectScriptIdFromCommand(command); if (!scriptId || !activeProject) return; const script = activeProject.scripts.find((entry) => entry.id === scriptId); @@ -2503,6 +2559,7 @@ 
export default function ChatView({ threadId }: ChatViewProps) { keybindings, onToggleDiff, toggleTerminalVisibility, + voiceRecording, ]); const addComposerImages = useCallback( @@ -4442,6 +4499,16 @@ export default function ChatView({ threadId }: ChatViewProps) { {activeContextWindow ? ( ) : null} + {activePendingProgress ? (
{activePendingProgress.questionIndex > 0 ? ( diff --git a/apps/web/src/components/KeybindingsToast.browser.tsx b/apps/web/src/components/KeybindingsToast.browser.tsx index 5ce4bce476..69877fc76e 100644 --- a/apps/web/src/components/KeybindingsToast.browser.tsx +++ b/apps/web/src/components/KeybindingsToast.browser.tsx @@ -64,7 +64,9 @@ function createBaseServerConfig(): ServerConfig { codex: { enabled: true, binaryPath: "", homePath: "", customModels: [] }, claudeAgent: { enabled: true, binaryPath: "", customModels: [] }, }, + whisperSelectedModel: null, }, + whisper: { installedModels: [] }, }; } diff --git a/apps/web/src/components/chat/VoiceMicButton.tsx b/apps/web/src/components/chat/VoiceMicButton.tsx new file mode 100644 index 0000000000..10a614d1eb --- /dev/null +++ b/apps/web/src/components/chat/VoiceMicButton.tsx @@ -0,0 +1,151 @@ +import { memo, useEffect, useRef } from "react"; +import { MicIcon } from "lucide-react"; +import { Tooltip, TooltipPopup, TooltipTrigger } from "../ui/tooltip"; + +interface VoiceMicButtonProps { + status: "idle" | "recording" | "transcribing" | "cleaning-up"; + isSupported: boolean; + analyserNode: AnalyserNode | null; + onToggle: () => void; + shortcutLabel: string | null; + disabled: boolean; + voiceEnabled: boolean; + modelReady: boolean; +} + +export const VoiceMicButton = memo(function VoiceMicButton(props: VoiceMicButtonProps) { + const { + status, + isSupported, + analyserNode, + onToggle, + shortcutLabel, + disabled, + voiceEnabled, + modelReady, + } = props; + const barsRef = useRef(null); + const rafRef = useRef(0); + + useEffect(() => { + if (status !== "recording" || !analyserNode || !barsRef.current) return; + + const dataArray = new Uint8Array(analyserNode.frequencyBinCount); + const bars = barsRef.current.children; + + const animate = () => { + analyserNode.getByteFrequencyData(dataArray); + const bucketSize = Math.floor(dataArray.length / bars.length); + for (let i = 0; i < bars.length; i++) { + let sum = 0; + 
for (let j = 0; j < bucketSize; j++) { + sum += dataArray[i * bucketSize + j]!; + } + const avg = sum / bucketSize / 255; + const height = Math.max(0.2, avg); + (bars[i] as HTMLElement).style.transform = `scaleY(${height})`; + } + rafRef.current = requestAnimationFrame(animate); + }; + + rafRef.current = requestAnimationFrame(animate); + return () => cancelAnimationFrame(rafRef.current); + }, [status, analyserNode]); + + if (!voiceEnabled || !isSupported) return null; + + const isProcessing = status === "transcribing" || status === "cleaning-up"; + + if (!modelReady) { + return ( + + + + + } + /> + Install a voice model in Settings to use voice input + + ); + } + + if (status === "recording") { + return ( + + ); + } + + if (isProcessing) { + return ( + + ); + } + + return ( + + + } + > + + + + Voice input{shortcutLabel ? ` (${shortcutLabel})` : ""} + + + ); +}); diff --git a/apps/web/src/components/settings/SettingsPanels.tsx b/apps/web/src/components/settings/SettingsPanels.tsx index c46f581323..3f3fb63d4f 100644 --- a/apps/web/src/components/settings/SettingsPanels.tsx +++ b/apps/web/src/components/settings/SettingsPanels.tsx @@ -60,6 +60,7 @@ import { toastManager } from "../ui/toast"; import { Tooltip, TooltipPopup, TooltipTrigger } from "../ui/tooltip"; import { ProjectFavicon } from "../ProjectFavicon"; import { JiraSettingsSection } from "./JiraSettingsSection"; +import { VoiceSettingsSection } from "./VoiceSettingsSection"; import { ThemePicker } from "./ThemePicker"; const TIMESTAMP_FORMAT_LABELS = { @@ -1026,6 +1027,12 @@ export function GeneralSettingsPanel() {
+ +
+ +
+
+ = []; + +export function VoiceSettingsSection() { + const queryClient = useQueryClient(); + const settings = useSettings(); + const { updateSettings } = useUpdateSettings(); + const serverConfigQuery = useQuery(serverConfigQueryOptions()); + + const voiceEnabled = settings.voiceEnabled; + const voiceLanguage = settings.voiceLanguage; + const voiceLlmCleanup = settings.voiceLlmCleanup; + const whisperSelectedModel = settings.whisperSelectedModel ?? null; + + const installedModels = + serverConfigQuery.data?.whisper?.installedModels ?? EMPTY_INSTALLED_MODELS; + + const [downloadProgress, setDownloadProgress] = useState>({}); + const [downloadingModelId, setDownloadingModelId] = useState(null); + const [deletingModelId, setDeletingModelId] = useState(null); + + useEffect(() => { + const api = readNativeApi(); + if (!api?.whisper.onDownloadProgress) return; + const unsubscribe = api.whisper.onDownloadProgress((event: WhisperDownloadProgressPayload) => { + if (event.status === "complete") { + setDownloadProgress((prev) => { + const next = { ...prev }; + delete next[event.modelId]; + return next; + }); + setDownloadingModelId((current) => (current === event.modelId ? null : current)); + void queryClient.invalidateQueries({ queryKey: serverQueryKeys.config() }); + updateSettings({ whisperSelectedModel: event.modelId }); + toastManager.add({ + title: "Model installed", + description: `${WHISPER_MODELS.find((m) => m.id === event.modelId)?.label ?? event.modelId} is ready to use.`, + type: "success", + }); + } else if (event.status === "error") { + setDownloadProgress((prev) => { + const next = { ...prev }; + delete next[event.modelId]; + return next; + }); + setDownloadingModelId((current) => (current === event.modelId ? null : current)); + toastManager.add({ + title: "Download failed", + description: event.error ?? 
"An error occurred while downloading the model.", + type: "error", + }); + } else { + setDownloadProgress((prev) => ({ + ...prev, + [event.modelId]: event.progress, + })); + } + }); + return unsubscribe; + }, [queryClient, updateSettings]); + + const handleInstall = useCallback(async (modelId: string) => { + setDownloadingModelId(modelId); + setDownloadProgress((prev) => ({ ...prev, [modelId]: 0 })); + try { + await ensureNativeApi().whisper.installModel({ modelId }); + } catch { + setDownloadingModelId(null); + setDownloadProgress((prev) => { + const next = { ...prev }; + delete next[modelId]; + return next; + }); + toastManager.add({ + title: "Install failed", + description: "Could not start model download.", + type: "error", + }); + } + }, []); + + const handleDelete = useCallback( + async (modelId: string) => { + setDeletingModelId(modelId); + try { + await ensureNativeApi().whisper.deleteModel({ modelId }); + await queryClient.invalidateQueries({ queryKey: serverQueryKeys.config() }); + if (whisperSelectedModel === modelId) { + updateSettings({ whisperSelectedModel: null }); + } + toastManager.add({ + title: "Model removed", + description: `${WHISPER_MODELS.find((m) => m.id === modelId)?.label ?? modelId} has been removed.`, + type: "success", + }); + } catch { + toastManager.add({ + title: "Remove failed", + description: "Could not remove the model.", + type: "error", + }); + } finally { + setDeletingModelId(null); + } + }, + [queryClient, whisperSelectedModel, updateSettings], + ); + + return ( +
+
+
+

Enable voice input

+

+ Use your microphone to dictate messages. Requires a local Whisper model. +

+
+ updateSettings({ voiceEnabled: checked })} + /> +
+ + {voiceEnabled && ( + <> +
+
+

Voice model

+

+ Select and install a local Whisper model for speech recognition. Larger models are + more accurate but slower. +

+
+
+ {WHISPER_MODELS.map((model) => { + const isInstalled = installedModels.includes(model.id); + const isSelected = whisperSelectedModel === model.id; + const isDownloading = downloadingModelId === model.id; + const isDeleting = deletingModelId === model.id; + const progress = downloadProgress[model.id]; + + return ( +
+
+
+ {model.label} + {model.size} + {isInstalled && isSelected && ( + + Active + + )} +
+
+ {model.accuracy} accuracy / {model.speed} +
+ {isDownloading && progress !== undefined && ( +
+
+
+ )} +
+
+ {isDownloading ? ( + + ) : isInstalled ? ( + <> + {!isSelected && ( + + )} + + + ) : ( + + )} +
+
+ ); + })} +
+
+ +
+
+

Language

+

+ Hint the expected spoken language for better accuracy. +

+
+ +
+ +
+
+

LLM cleanup

+

+ Use the active LLM to clean up transcription (fix grammar, remove filler words). +

+
+ updateSettings({ voiceLlmCleanup: checked })} + /> +
+ + )} +
+ ); +} diff --git a/apps/web/src/hooks/useVoiceRecording.ts b/apps/web/src/hooks/useVoiceRecording.ts new file mode 100644 index 0000000000..98e294e5fc --- /dev/null +++ b/apps/web/src/hooks/useVoiceRecording.ts @@ -0,0 +1,227 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { readNativeApi } from "~/nativeApi"; +import { audioBlobToWavBase64 } from "../lib/audioEncoder"; + +type VoiceRecordingStatus = "idle" | "recording" | "transcribing" | "cleaning-up"; + +type VoiceRecordingError = + | { type: "permission-denied" } + | { type: "no-microphone" } + | { type: "not-supported" } + | { type: "transcription-failed"; message: string } + | { type: "cleanup-failed"; message: string }; + +interface UseVoiceRecordingOptions { + onTranscript: (text: string) => void; + onError?: (error: VoiceRecordingError) => void; + ready: boolean; + language: string; + llmCleanup: boolean; +} + +interface UseVoiceRecordingReturn { + status: VoiceRecordingStatus; + isRecording: boolean; + isProcessing: boolean; + isSupported: boolean; + analyserNode: AnalyserNode | null; + toggleRecording: () => void; + stopRecording: () => void; + error: VoiceRecordingError | null; +} + +const isSupported = + typeof navigator !== "undefined" && + typeof navigator.mediaDevices !== "undefined" && + typeof MediaRecorder !== "undefined"; + +export function useVoiceRecording(options: UseVoiceRecordingOptions): UseVoiceRecordingReturn { + const { onTranscript, onError, ready, language, llmCleanup } = options; + const [status, setStatus] = useState("idle"); + const [analyserNode, setAnalyserNode] = useState(null); + const [error, setError] = useState(null); + + const mediaRecorderRef = useRef(null); + const audioContextRef = useRef(null); + const chunksRef = useRef([]); + const streamRef = useRef(null); + + const onTranscriptRef = useRef(onTranscript); + onTranscriptRef.current = onTranscript; + const onErrorRef = useRef(onError); + onErrorRef.current = onError; + const languageRef 
= useRef(language); + languageRef.current = language; + const llmCleanupRef = useRef(llmCleanup); + llmCleanupRef.current = llmCleanup; + + const cleanup = useCallback(() => { + if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { + mediaRecorderRef.current.stop(); + } + mediaRecorderRef.current = null; + + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + streamRef.current = null; + } + + if (audioContextRef.current) { + void audioContextRef.current.close(); + audioContextRef.current = null; + } + + setAnalyserNode(null); + chunksRef.current = []; + }, []); + + useEffect(() => { + return cleanup; + }, [cleanup]); + + const processRecording = useCallback(async (audioBlob: Blob) => { + try { + setStatus("transcribing"); + console.log("[voice] processing recording, blob size:", audioBlob.size); + const wavBase64 = await audioBlobToWavBase64(audioBlob); + console.log("[voice] WAV base64 length:", wavBase64.length); + const api = readNativeApi(); + if (!api) { + throw new Error("API not available"); + } + + const lang = languageRef.current; + console.log("[voice] sending transcribe request, language:", lang); + const result = await api.transcription.transcribe({ + audio: wavBase64, + language: lang, + }); + console.log("[voice] transcription result:", JSON.stringify(result)); + + let finalText = result.text; + + if (llmCleanupRef.current && finalText.trim().length > 0) { + setStatus("cleaning-up"); + try { + const cleanupResult = await api.transcription.cleanup({ + rawText: finalText, + language: lang, + }); + finalText = cleanupResult.cleanedText; + } catch { + const cleanupError: VoiceRecordingError = { + type: "cleanup-failed", + message: "Cleanup skipped", + }; + onErrorRef.current?.(cleanupError); + } + } + + console.log("[voice] final text:", JSON.stringify(finalText)); + if (finalText.trim().length > 0) { + onTranscriptRef.current(finalText.trim()); + } else { + console.warn("[voice] empty 
transcription result"); + } + } catch (err) { + console.error("[voice] transcription error:", err); + const transcriptionError: VoiceRecordingError = { + type: "transcription-failed", + message: err instanceof Error ? err.message : "Transcription failed", + }; + setError(transcriptionError); + onErrorRef.current?.(transcriptionError); + } finally { + setStatus("idle"); + } + }, []); + + const startRecording = useCallback(async () => { + setError(null); + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + streamRef.current = stream; + + const audioCtx = new AudioContext(); + audioContextRef.current = audioCtx; + const source = audioCtx.createMediaStreamSource(stream); + const analyser = audioCtx.createAnalyser(); + analyser.fftSize = 256; + source.connect(analyser); + setAnalyserNode(analyser); + + const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus") + ? "audio/webm;codecs=opus" + : "audio/webm"; + + const recorder = new MediaRecorder(stream, { mimeType }); + mediaRecorderRef.current = recorder; + chunksRef.current = []; + + recorder.ondataavailable = (e) => { + if (e.data.size > 0) { + chunksRef.current.push(e.data); + } + }; + + recorder.onstop = () => { + const blob = new Blob(chunksRef.current, { type: mimeType }); + cleanup(); + if (blob.size > 0) { + void processRecording(blob); + } else { + setStatus("idle"); + } + }; + + recorder.start(100); + setStatus("recording"); + } catch (err) { + cleanup(); + const errorName = err instanceof DOMException ? 
err.name : ""; + if (errorName === "NotAllowedError") { + const permError: VoiceRecordingError = { type: "permission-denied" }; + setError(permError); + onErrorRef.current?.(permError); + } else if (errorName === "NotFoundError") { + const micError: VoiceRecordingError = { type: "no-microphone" }; + setError(micError); + onErrorRef.current?.(micError); + } else { + const genericError: VoiceRecordingError = { + type: "transcription-failed", + message: err instanceof Error ? err.message : "Failed to start recording", + }; + setError(genericError); + onErrorRef.current?.(genericError); + } + } + }, [cleanup, processRecording]); + + const stopRecording = useCallback(() => { + if (mediaRecorderRef.current && mediaRecorderRef.current.state === "recording") { + mediaRecorderRef.current.stop(); + } + }, []); + + const toggleRecording = useCallback(() => { + if (!ready) return; + if (status === "recording") { + stopRecording(); + } else if (status === "idle") { + void startRecording(); + } + }, [ready, status, startRecording, stopRecording]); + + return { + status, + isRecording: status === "recording", + isProcessing: status === "transcribing" || status === "cleaning-up", + isSupported, + analyserNode, + toggleRecording, + stopRecording, + error, + }; +} diff --git a/apps/web/src/hooks/useWhisperModelStatus.ts b/apps/web/src/hooks/useWhisperModelStatus.ts new file mode 100644 index 0000000000..a99e7fc1a7 --- /dev/null +++ b/apps/web/src/hooks/useWhisperModelStatus.ts @@ -0,0 +1,21 @@ +import { useQuery } from "@tanstack/react-query"; +import { serverConfigQueryOptions } from "../lib/serverReactQuery"; +import { useSettings } from "./useSettings"; + +const EMPTY_INSTALLED_MODELS: ReadonlyArray = []; + +export function useWhisperModelStatus() { + const serverConfigQuery = useQuery(serverConfigQueryOptions()); + const settings = useSettings(); + + const installedModels = + serverConfigQuery.data?.whisper?.installedModels ?? 
EMPTY_INSTALLED_MODELS; + const selectedModel = settings.whisperSelectedModel ?? null; + const modelReady = selectedModel !== null && installedModels.includes(selectedModel); + + return { + installedModels, + selectedModel, + modelReady, + }; +} diff --git a/apps/web/src/index.css b/apps/web/src/index.css index 3d4e63e2ef..b783924b32 100644 --- a/apps/web/src/index.css +++ b/apps/web/src/index.css @@ -516,3 +516,13 @@ label:has(> select#reasoning-effort) select { -webkit-background-clip: text; animation: ultrathink-rainbow 10s linear infinite; } + +@keyframes voice-bar { + 0%, + 100% { + transform: scaleY(0.3); + } + 50% { + transform: scaleY(1); + } +} diff --git a/apps/web/src/keybindings.ts b/apps/web/src/keybindings.ts index 07debfae56..561dbb1e91 100644 --- a/apps/web/src/keybindings.ts +++ b/apps/web/src/keybindings.ts @@ -346,6 +346,14 @@ export function isOpenFavoriteEditorShortcut( return matchesCommandShortcut(event, keybindings, "editor.openFavorite", options); } +export function isVoiceToggleShortcut( + event: ShortcutEventLike, + keybindings: ResolvedKeybindingsConfig, + options?: ShortcutMatchOptions, +): boolean { + return matchesCommandShortcut(event, keybindings, "voice.toggle", options); +} + export function isTerminalClearShortcut( event: ShortcutEventLike, platform = navigator.platform, diff --git a/apps/web/src/lib/audioEncoder.ts b/apps/web/src/lib/audioEncoder.ts new file mode 100644 index 0000000000..f62aa63477 --- /dev/null +++ b/apps/web/src/lib/audioEncoder.ts @@ -0,0 +1,73 @@ +export async function audioBlobToWavBase64(blob: Blob): Promise { + const arrayBuffer = await blob.arrayBuffer(); + const audioCtx = new AudioContext(); + const decoded = await audioCtx.decodeAudioData(arrayBuffer); + await audioCtx.close(); + + const targetSampleRate = 16000; + const offlineCtx = new OfflineAudioContext( + 1, + Math.ceil(decoded.duration * targetSampleRate), + targetSampleRate, + ); + const source = offlineCtx.createBufferSource(); + source.buffer 
= decoded; + source.connect(offlineCtx.destination); + source.start(0); + const resampled = await offlineCtx.startRendering(); + + const pcmData = resampled.getChannelData(0); + const wavBuffer = encodeWav(pcmData, targetSampleRate); + const base64 = bufferToBase64(wavBuffer); + return base64; +} + +function encodeWav(samples: Float32Array, sampleRate: number): ArrayBuffer { + const bitsPerSample = 16; + const numChannels = 1; + const byteRate = (sampleRate * numChannels * bitsPerSample) / 8; + const blockAlign = (numChannels * bitsPerSample) / 8; + const dataSize = samples.length * blockAlign; + const buffer = new ArrayBuffer(44 + dataSize); + const view = new DataView(buffer); + + writeString(view, 0, "RIFF"); + view.setUint32(4, 36 + dataSize, true); + writeString(view, 8, "WAVE"); + + writeString(view, 12, "fmt "); + view.setUint32(16, 16, true); + view.setUint16(20, 1, true); + view.setUint16(22, numChannels, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, bitsPerSample, true); + + writeString(view, 36, "data"); + view.setUint32(40, dataSize, true); + + let offset = 44; + for (let i = 0; i < samples.length; i++) { + const s = Math.max(-1, Math.min(1, samples[i]!)); + view.setInt16(offset, s < 0 ? 
s * 0x8000 : s * 0x7fff, true); + offset += 2; + } + + return buffer; +} + +function writeString(view: DataView, offset: number, str: string): void { + for (let i = 0; i < str.length; i++) { + view.setUint8(offset + i, str.charCodeAt(i)); + } +} + +function bufferToBase64(buffer: ArrayBuffer): string { + const bytes = new Uint8Array(buffer); + let binary = ""; + for (let i = 0; i < bytes.length; i++) { + binary += String.fromCharCode(bytes[i]!); + } + return btoa(binary); +} diff --git a/apps/web/src/wsNativeApi.ts b/apps/web/src/wsNativeApi.ts index e1f5a55d70..5c7ea12ea5 100644 --- a/apps/web/src/wsNativeApi.ts +++ b/apps/web/src/wsNativeApi.ts @@ -1,6 +1,7 @@ import { type GitActionProgressEvent, type JiraConnectionStatus, + type WhisperDownloadProgressPayload, ORCHESTRATION_WS_CHANNELS, ORCHESTRATION_WS_METHODS, type ContextMenuItem, @@ -21,6 +22,9 @@ const serverConfigUpdatedListeners = new Set<(payload: ServerConfigUpdatedPayloa const providersUpdatedListeners = new Set<(payload: ServerProviderUpdatedPayload) => void>(); const gitActionProgressListeners = new Set<(payload: GitActionProgressEvent) => void>(); const jiraConnectionStatusListeners = new Set<(payload: JiraConnectionStatus) => void>(); +const whisperDownloadProgressListeners = new Set< + (payload: WhisperDownloadProgressPayload) => void +>(); /** * Subscribe to the server welcome message. 
If a welcome was already received @@ -143,6 +147,16 @@ export function createWsNativeApi(): NativeApi { } } }); + transport.subscribe(WS_CHANNELS.whisperDownloadProgress, (message) => { + const payload = message.data; + for (const listener of whisperDownloadProgressListeners) { + try { + listener(payload); + } catch { + // Swallow listener errors + } + } + }); const api: NativeApi = { dialogs: { @@ -258,6 +272,22 @@ export function createWsNativeApi(): NativeApi { }; }, }, + transcription: { + transcribe: (input) => + transport.request(WS_METHODS.transcriptionTranscribe, input, { timeoutMs: null }), + cleanup: (input) => transport.request(WS_METHODS.transcriptionCleanup, input), + }, + whisper: { + installModel: (input) => + transport.request(WS_METHODS.whisperInstallModel, input, { timeoutMs: null }), + deleteModel: (input) => transport.request(WS_METHODS.whisperDeleteModel, input), + onDownloadProgress: (callback) => { + whisperDownloadProgressListeners.add(callback); + return () => { + whisperDownloadProgressListeners.delete(callback); + }; + }, + }, }; instance = { api, transport }; diff --git a/bun.lock b/bun.lock index 2e147aa99f..4b2b5ba22c 100644 --- a/bun.lock +++ b/bun.lock @@ -62,9 +62,11 @@ "@anthropic-ai/claude-agent-sdk": "^0.2.77", "@effect/platform-node": "catalog:", "@effect/sql-sqlite-bun": "catalog:", + "@huggingface/transformers": "^4.0.1", "@pierre/diffs": "^1.1.0-beta.16", "effect": "catalog:", "node-pty": "^1.1.0", + "onnxruntime-node": "^1.24.3", "open": "^10.1.0", "ws": "^8.18.0", }, @@ -350,6 +352,12 @@ "@hapi/topo": ["@hapi/topo@6.0.2", "", { "dependencies": { "@hapi/hoek": "^11.0.2" } }, "sha512-KR3rD5inZbGMrHmgPxsJ9dbi6zEK+C3ZwUwTa+eMwWLz7oijWUTWD2pMSNNYJAU6Qq+65NkxXjqHr/7LM2Xkqg=="], + "@huggingface/jinja": ["@huggingface/jinja@0.5.6", "", {}, "sha512-MyMWyLnjqo+KRJYSH7oWNbsOn5onuIvfXYPcc0WOGxU0eHUV7oAYUoQTl2BMdu7ml+ea/bu11UM+EshbeHwtIA=="], + + "@huggingface/tokenizers": ["@huggingface/tokenizers@0.1.3", "", {}, 
"sha512-8rF/RRT10u+kn7YuUbUg0OF30K8rjTc78aHpxT+qJ1uWSqxT1MHi8+9ltwYfkFYJzT/oS+qw3JVfHtNMGAdqyA=="], + + "@huggingface/transformers": ["@huggingface/transformers@4.0.1", "", { "dependencies": { "@huggingface/jinja": "^0.5.6", "@huggingface/tokenizers": "^0.1.3", "onnxruntime-node": "1.24.3", "onnxruntime-web": "1.25.0-dev.20260327-722743c0e2", "sharp": "^0.34.5" } }, "sha512-tAQYEy+cnW0ku/NxBSjFXCymi+DZa1/JkoGf4McxjzO36CZZIL/J4TF6X7i/tzs75yTjshUDgsvSz03s2xym2A=="], + "@img/colour": ["@img/colour@1.1.0", "", {}, "sha512-Td76q7j57o/tLVdgS746cYARfSyxk8iEfRxewL9h4OMzYhbW4TAcppl0mT4eyqXddh6L/jwoM75mo7ixa/pCeQ=="], "@img/sharp-darwin-arm64": ["@img/sharp-darwin-arm64@0.34.5", "", { "optionalDependencies": { "@img/sharp-libvips-darwin-arm64": "1.2.4" }, "os": "darwin", "cpu": "arm64" }, "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w=="], @@ -606,6 +614,26 @@ "@preact/signals-core": ["@preact/signals-core@1.14.0", "", {}, "sha512-AowtCcCU/33lFlh1zRFf/u+12rfrhtNakj7UpaGEsmMwUKpKWMVvcktOGcwBBNiB4lWrZWc01LhiyyzVklJyaQ=="], + "@protobufjs/aspromise": ["@protobufjs/aspromise@1.1.2", "", {}, "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ=="], + + "@protobufjs/base64": ["@protobufjs/base64@1.1.2", "", {}, "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg=="], + + "@protobufjs/codegen": ["@protobufjs/codegen@2.0.4", "", {}, "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg=="], + + "@protobufjs/eventemitter": ["@protobufjs/eventemitter@1.1.0", "", {}, "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q=="], + + "@protobufjs/fetch": ["@protobufjs/fetch@1.1.0", "", { "dependencies": { "@protobufjs/aspromise": "^1.1.1", "@protobufjs/inquire": "^1.1.0" } }, "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ=="], + + 
"@protobufjs/float": ["@protobufjs/float@1.0.2", "", {}, "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ=="], + + "@protobufjs/inquire": ["@protobufjs/inquire@1.1.0", "", {}, "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q=="], + + "@protobufjs/path": ["@protobufjs/path@1.1.2", "", {}, "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA=="], + + "@protobufjs/pool": ["@protobufjs/pool@1.1.0", "", {}, "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw=="], + + "@protobufjs/utf8": ["@protobufjs/utf8@1.1.0", "", {}, "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="], + "@quansync/fs": ["@quansync/fs@1.0.0", "", { "dependencies": { "quansync": "^1.0.0" } }, "sha512-4TJ3DFtlf1L5LDMaM6CanJ/0lckGNtJcMjQ1NAV6zDmA0tEHKZtxNKin8EgPaVX1YzljbxckyT2tJrpQKAtngQ=="], "@radix-ui/react-compose-refs": ["@radix-ui/react-compose-refs@1.1.2", "", { "peerDependencies": { "@types/react": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react"] }, "sha512-z4eqJvfiNnFMHIIvXP3CY57y2WJs5g2v3X0zm9mEJkrkNv4rDxu+sg9Jh8EkXyeqBkB7SOcboo9dMVqhyrACIg=="], @@ -814,6 +842,8 @@ "acorn": ["acorn@8.16.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw=="], + "adm-zip": ["adm-zip@0.5.17", "", {}, "sha512-+Ut8d9LLqwEvHHJl1+PIHqoyDxFgVN847JTVM3Izi3xHDWPE4UtzzXysMZQs64DMcrJfBeS/uoEP4AD3HQHnQQ=="], + "ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], "ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], @@ -1010,6 +1040,8 @@ 
"find-my-way-ts": ["find-my-way-ts@0.1.6", "", {}, "sha512-a85L9ZoXtNAey3Y6Z+eBWW658kO/MwR7zIafkIUPUMf3isZG0NCs2pjW2wtjxAKuJPxMAsHUIP4ZPGv0o5gyTA=="], + "flatbuffers": ["flatbuffers@25.9.23", "", {}, "sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ=="], + "follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="], "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], @@ -1046,6 +1078,8 @@ "graphql": ["graphql@16.13.1", "", {}, "sha512-gGgrVCoDKlIZ8fIqXBBb0pPKqDgki0Z/FSKNiQzSGj2uEYHr1tq5wmBegGwJx6QB5S5cM0khSBpi/JFHMCvsmQ=="], + "guid-typescript": ["guid-typescript@1.0.9", "", {}, "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ=="], + "has-property-descriptors": ["has-property-descriptors@1.0.2", "", { "dependencies": { "es-define-property": "^1.0.0" } }, "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg=="], "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], @@ -1174,6 +1208,8 @@ "lodash.isequal": ["lodash.isequal@4.5.0", "", {}, "sha512-pDo3lu8Jhfjqls6GkMgpahsF9kCyayhgykjyLMNFTKWrpVdAQtYyB4muAMWozBB4ig/dtWAmsMxLEI8wuz+DYQ=="], + "long": ["long@5.3.2", "", {}, "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA=="], + "longest-streak": ["longest-streak@3.1.0", "", {}, "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g=="], "lowercase-keys": ["lowercase-keys@2.0.0", "", {}, 
"sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA=="], @@ -1330,6 +1366,12 @@ "oniguruma-to-es": ["oniguruma-to-es@4.3.5", "", { "dependencies": { "oniguruma-parser": "^0.12.1", "regex": "^6.1.0", "regex-recursion": "^6.0.2" } }, "sha512-Zjygswjpsewa0NLTsiizVuMQZbp0MDyM6lIt66OxsF21npUDlzpHi1Mgb/qhQdkb+dWFTzJmFbEWdvZgRho8eQ=="], + "onnxruntime-common": ["onnxruntime-common@1.24.3", "", {}, "sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA=="], + + "onnxruntime-node": ["onnxruntime-node@1.24.3", "", { "dependencies": { "adm-zip": "^0.5.16", "global-agent": "^3.0.0", "onnxruntime-common": "1.24.3" }, "os": [ "linux", "win32", "darwin", ] }, "sha512-JH7+czbc8ALA819vlTgcV+Q214/+VjGeBHDjX81+ZCD0PCVCIFGFNtT0V4sXG/1JXypKPgScQcB3ij/hk3YnTg=="], + + "onnxruntime-web": ["onnxruntime-web@1.25.0-dev.20260327-722743c0e2", "", { "dependencies": { "flatbuffers": "^25.1.24", "guid-typescript": "^1.0.9", "long": "^5.2.3", "onnxruntime-common": "1.24.0-dev.20251116-b39e144322", "platform": "^1.3.6", "protobufjs": "^7.2.4" } }, "sha512-8PXdZy4Ekhg10CLg+cFFt39b4tFDGMRJB6lGjnQL6eA+2boUQYDymZ0gtxiS+H6oIWoCjQp/ziyirvFbaFKfiw=="], + "open": ["open@10.2.0", "", { "dependencies": { "default-browser": "^5.2.1", "define-lazy-prop": "^3.0.0", "is-inside-container": "^1.0.0", "wsl-utils": "^0.1.0" } }, "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA=="], "outvariant": ["outvariant@1.4.3", "", {}, "sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA=="], @@ -1352,6 +1394,8 @@ "picomatch": ["picomatch@4.0.3", "", {}, "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q=="], + "platform": ["platform@1.3.6", "", {}, "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="], + "playwright": ["playwright@1.58.2", "", { "dependencies": { 
"playwright-core": "1.58.2" }, "optionalDependencies": { "fsevents": "2.3.2" }, "bin": { "playwright": "cli.js" } }, "sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A=="], "playwright-core": ["playwright-core@1.58.2", "", { "bin": { "playwright-core": "cli.js" } }, "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg=="], @@ -1368,6 +1412,8 @@ "property-information": ["property-information@7.1.0", "", {}, "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ=="], + "protobufjs": ["protobufjs@7.5.4", "", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.4", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.0", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg=="], + "proxy-from-env": ["proxy-from-env@1.1.0", "", {}, "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg=="], "pump": ["pump@3.0.4", "", { "dependencies": { "end-of-stream": "^1.1.0", "once": "^1.3.1" } }, "sha512-VS7sjc6KR7e1ukRFhQSY5LM2uBWAUPiOPa/A3mkKmiMwSmRFUITt0xuj+/lesgnCv+dPIEYlkzrcyXgquIHMcA=="], @@ -1706,6 +1752,8 @@ "next/postcss": ["postcss@8.4.31", "", { "dependencies": { "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" } }, "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ=="], + "onnxruntime-web/onnxruntime-common": ["onnxruntime-common@1.24.0-dev.20251116-b39e144322", "", {}, "sha512-BOoomdHYmNRL5r4iQ4bMvsl2t0/hzVQ3OM3PHD0gxeXu1PmggqBv3puZicEUVOA3AtHHYmqZtjMj9FOfGrATTw=="], + "parse-entities/@types/unist": ["@types/unist@2.0.11", "", 
{}, "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA=="], "readdirp/picomatch": ["picomatch@2.3.1", "", {}, "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA=="], diff --git a/packages/contracts/src/index.ts b/packages/contracts/src/index.ts index 03b65be94c..796e25dbb1 100644 --- a/packages/contracts/src/index.ts +++ b/packages/contracts/src/index.ts @@ -13,3 +13,4 @@ export * from "./orchestration"; export * from "./editor"; export * from "./project"; export * from "./jira"; +export * from "./transcription"; diff --git a/packages/contracts/src/ipc.ts b/packages/contracts/src/ipc.ts index a4c8920cf1..6c08b74a03 100644 --- a/packages/contracts/src/ipc.ts +++ b/packages/contracts/src/ipc.ts @@ -66,6 +66,15 @@ import type { JiraListSprintsResult, JiraSite, } from "./jira"; +import type { + TranscribeInput, + TranscribeResult, + TranscriptionCleanupInput, + TranscriptionCleanupResult, + WhisperInstallModelInput, + WhisperDeleteModelInput, + WhisperDownloadProgressPayload, +} from "./transcription"; import { EditorId } from "./editor"; import { ServerSettings, ServerSettingsPatch } from "./settings"; @@ -218,4 +227,13 @@ export interface NativeApi { getAttachment: (input: JiraGetAttachmentInput) => Promise; onConnectionStatusChanged: (callback: (status: JiraConnectionStatus) => void) => () => void; }; + transcription: { + transcribe: (input: TranscribeInput) => Promise; + cleanup: (input: TranscriptionCleanupInput) => Promise; + }; + whisper: { + installModel: (input: WhisperInstallModelInput) => Promise; + deleteModel: (input: WhisperDeleteModelInput) => Promise; + onDownloadProgress: (callback: (event: WhisperDownloadProgressPayload) => void) => () => void; + }; } diff --git a/packages/contracts/src/keybindings.ts b/packages/contracts/src/keybindings.ts index 067cba8804..7a634f8e1d 100644 --- a/packages/contracts/src/keybindings.ts +++ b/packages/contracts/src/keybindings.ts @@ 
-36,6 +36,7 @@ const STATIC_KEYBINDING_COMMANDS = [ "chat.new", "chat.newLocal", "editor.openFavorite", + "voice.toggle", ...THREAD_KEYBINDING_COMMANDS, ] as const; diff --git a/packages/contracts/src/server.ts b/packages/contracts/src/server.ts index 78d0879cd2..10a33a7088 100644 --- a/packages/contracts/src/server.ts +++ b/packages/contracts/src/server.ts @@ -66,6 +66,9 @@ export const ServerConfig = Schema.Struct({ providers: ServerProviders, availableEditors: Schema.Array(EditorId), settings: ServerSettings, + whisper: Schema.Struct({ + installedModels: Schema.Array(Schema.String), + }).pipe(Schema.withDecodingDefault(() => ({ installedModels: [] as ReadonlyArray<string> }))), }); export type ServerConfig = typeof ServerConfig.Type; diff --git a/packages/contracts/src/settings.ts b/packages/contracts/src/settings.ts index 508550813c..02f6370ac5 100644 --- a/packages/contracts/src/settings.ts +++ b/packages/contracts/src/settings.ts @@ -35,6 +35,9 @@ export const ClientSettingsSchema = Schema.Struct({ ), timestampFormat: TimestampFormat.pipe(Schema.withDecodingDefault(() => DEFAULT_TIMESTAMP_FORMAT)), showTodosInComposer: Schema.Boolean.pipe(Schema.withDecodingDefault(() => true)), + voiceEnabled: Schema.Boolean.pipe(Schema.withDecodingDefault(() => false)), + voiceLanguage: Schema.String.pipe(Schema.withDecodingDefault(() => "en")), + voiceLlmCleanup: Schema.Boolean.pipe(Schema.withDecodingDefault(() => false)), }); export type ClientSettings = typeof ClientSettingsSchema.Type; @@ -89,6 +92,9 @@ export const ServerSettings = Schema.Struct({ codex: CodexSettings.pipe(Schema.withDecodingDefault(() => ({}))), claudeAgent: ClaudeSettings.pipe(Schema.withDecodingDefault(() => ({}))), }).pipe(Schema.withDecodingDefault(() => ({}))), + whisperSelectedModel: Schema.NullOr(Schema.String).pipe( + Schema.withDecodingDefault(() => null as string | null), + ), }); export type ServerSettings = typeof ServerSettings.Type; @@ -151,5 +157,6 @@ export const ServerSettingsPatch =
Schema.Struct({ claudeAgent: Schema.optionalKey(ClaudeSettingsPatch), }), ), + whisperSelectedModel: Schema.optionalKey(Schema.NullOr(Schema.String)), }); export type ServerSettingsPatch = typeof ServerSettingsPatch.Type; diff --git a/packages/contracts/src/transcription.ts b/packages/contracts/src/transcription.ts new file mode 100644 index 0000000000..81a3a4a43b --- /dev/null +++ b/packages/contracts/src/transcription.ts @@ -0,0 +1,74 @@ +import { Schema } from "effect"; + +export const WHISPER_MODELS = [ + { id: "Xenova/whisper-tiny", label: "Tiny", size: "~39 MB", accuracy: "Fair", speed: "Fastest" }, + { id: "Xenova/whisper-base", label: "Base", size: "~142 MB", accuracy: "Good", speed: "Fast" }, + { + id: "Xenova/whisper-small", + label: "Small", + size: "~466 MB", + accuracy: "Very good", + speed: "Moderate", + }, + { + id: "Xenova/whisper-medium", + label: "Medium", + size: "~1.5 GB", + accuracy: "Excellent", + speed: "Slow", + }, +] as const; + +export type WhisperModelDefinition = (typeof WHISPER_MODELS)[number]; + +export const TranscribeInput = Schema.Struct({ + audio: Schema.String, + language: Schema.optional(Schema.String), +}); +export type TranscribeInput = typeof TranscribeInput.Type; + +export const TranscribeResult = Schema.Struct({ + text: Schema.String, +}); +export type TranscribeResult = typeof TranscribeResult.Type; + +export const TranscriptionCleanupInput = Schema.Struct({ + rawText: Schema.String, + language: Schema.optional(Schema.String), +}); +export type TranscriptionCleanupInput = typeof TranscriptionCleanupInput.Type; + +export const TranscriptionCleanupResult = Schema.Struct({ + cleanedText: Schema.String, +}); +export type TranscriptionCleanupResult = typeof TranscriptionCleanupResult.Type; + +export const WhisperInstallModelInput = Schema.Struct({ + modelId: Schema.String, +}); +export type WhisperInstallModelInput = typeof WhisperInstallModelInput.Type; + +export const WhisperDeleteModelInput = Schema.Struct({ + modelId: 
Schema.String, +}); +export type WhisperDeleteModelInput = typeof WhisperDeleteModelInput.Type; + +export const WhisperDownloadProgressPayload = Schema.Struct({ + modelId: Schema.String, + progress: Schema.Number, + file: Schema.String, + status: Schema.Literals(["downloading", "complete", "error"]), + error: Schema.optional(Schema.String), +}); +export type WhisperDownloadProgressPayload = typeof WhisperDownloadProgressPayload.Type; + +export const TRANSCRIPTION_WS_METHODS = { + transcribe: "transcription.transcribe", + cleanup: "transcription.cleanup", + installModel: "whisper.installModel", + deleteModel: "whisper.deleteModel", +} as const; + +export const WHISPER_WS_CHANNELS = { + downloadProgress: "whisper.downloadProgress", +} as const; diff --git a/packages/contracts/src/ws.ts b/packages/contracts/src/ws.ts index 5d0591b4c6..94119a16de 100644 --- a/packages/contracts/src/ws.ts +++ b/packages/contracts/src/ws.ts @@ -53,6 +53,15 @@ import { JIRA_WS_CHANNELS, JIRA_WS_METHODS, } from "./jira"; +import { + TranscribeInput, + TranscriptionCleanupInput, + WhisperInstallModelInput, + WhisperDeleteModelInput, + WhisperDownloadProgressPayload, + TRANSCRIPTION_WS_METHODS, + WHISPER_WS_CHANNELS, +} from "./transcription"; // ── WebSocket RPC Method Names ─────────────────────────────────────── @@ -105,6 +114,11 @@ export const WS_METHODS = { jiraListIssues: JIRA_WS_METHODS.listIssues, jiraGetIssue: JIRA_WS_METHODS.getIssue, jiraGetAttachment: JIRA_WS_METHODS.getAttachment, + + transcriptionTranscribe: TRANSCRIPTION_WS_METHODS.transcribe, + transcriptionCleanup: TRANSCRIPTION_WS_METHODS.cleanup, + whisperInstallModel: TRANSCRIPTION_WS_METHODS.installModel, + whisperDeleteModel: TRANSCRIPTION_WS_METHODS.deleteModel, } as const; // ── Push Event Channels ────────────────────────────────────────────── @@ -116,6 +130,7 @@ export const WS_CHANNELS = { serverConfigUpdated: "server.configUpdated", serverProvidersUpdated: "server.providersUpdated", jiraConnectionStatusChanged: 
JIRA_WS_CHANNELS.connectionStatusChanged, + whisperDownloadProgress: WHISPER_WS_CHANNELS.downloadProgress, } as const; // -- Tagged Union of all request body schemas ───────────────────────── @@ -186,6 +201,11 @@ const WebSocketRequestBody = Schema.Union([ tagRequestBody(WS_METHODS.jiraListIssues, JiraListIssuesInput), tagRequestBody(WS_METHODS.jiraGetIssue, JiraGetIssueInput), tagRequestBody(WS_METHODS.jiraGetAttachment, JiraGetAttachmentInput), + + tagRequestBody(WS_METHODS.transcriptionTranscribe, TranscribeInput), + tagRequestBody(WS_METHODS.transcriptionCleanup, TranscriptionCleanupInput), + tagRequestBody(WS_METHODS.whisperInstallModel, WhisperInstallModelInput), + tagRequestBody(WS_METHODS.whisperDeleteModel, WhisperDeleteModelInput), ]); export const WebSocketRequest = Schema.Struct({ @@ -224,6 +244,7 @@ export interface WsPushPayloadByChannel { readonly [WS_CHANNELS.terminalEvent]: typeof TerminalEvent.Type; readonly [ORCHESTRATION_WS_CHANNELS.domainEvent]: OrchestrationEvent; readonly [WS_CHANNELS.jiraConnectionStatusChanged]: typeof JiraConnectionStatus.Type; + readonly [WS_CHANNELS.whisperDownloadProgress]: typeof WhisperDownloadProgressPayload.Type; } export type WsPushChannel = keyof WsPushPayloadByChannel; @@ -262,6 +283,10 @@ export const WsPushJiraConnectionStatusChanged = makeWsPushSchema( WS_CHANNELS.jiraConnectionStatusChanged, JiraConnectionStatus, ); +export const WsPushWhisperDownloadProgress = makeWsPushSchema( + WS_CHANNELS.whisperDownloadProgress, + WhisperDownloadProgressPayload, +); export const WsPushChannelSchema = Schema.Literals([ WS_CHANNELS.gitActionProgress, @@ -271,6 +296,7 @@ export const WsPushChannelSchema = Schema.Literals([ WS_CHANNELS.terminalEvent, ORCHESTRATION_WS_CHANNELS.domainEvent, WS_CHANNELS.jiraConnectionStatusChanged, + WS_CHANNELS.whisperDownloadProgress, ]); export type WsPushChannelSchema = typeof WsPushChannelSchema.Type; @@ -282,6 +308,7 @@ export const WsPush = Schema.Union([ WsPushTerminalEvent, 
WsPushOrchestrationDomainEvent, WsPushJiraConnectionStatusChanged, + WsPushWhisperDownloadProgress, ]); export type WsPush = typeof WsPush.Type;