From 1b5980deb2d32351612fe80369e940d463c04d16 Mon Sep 17 00:00:00 2001 From: Chris Scott <99081550+chriswritescode-dev@users.noreply.github.com> Date: Sun, 31 May 2026 23:46:42 -0400 Subject: [PATCH 1/5] loop: stt-warmup completed after 6 iterations --- docs/features/stt.md | 6 + frontend/src/hooks/useSTT.test.tsx | 113 ++++++++++++ frontend/src/hooks/useSTT.ts | 21 ++- frontend/src/lib/audioRecorder.test.ts | 241 ++++++++++++++++++++++++- frontend/src/lib/audioRecorder.ts | 103 +++++++---- 5 files changed, 444 insertions(+), 40 deletions(-) create mode 100644 frontend/src/hooks/useSTT.test.tsx diff --git a/docs/features/stt.md b/docs/features/stt.md index bb8c85ea..d8c9b919 100644 --- a/docs/features/stt.md +++ b/docs/features/stt.md @@ -69,6 +69,12 @@ Any OpenAI-compatible transcription API works: - Self-hosted Whisper servers - Local STT servers with OpenAI-compatible API +### Performance + +After the first microphone press, the browser audio pipeline stays prepared so subsequent recordings start faster. The audio context and worklet processor are retained between recordings; only the microphone track is stopped after each use. Resources are released entirely when external STT is disabled or the voice input UI unmounts. + +This optimization applies only to the external API provider. It does not affect the initial permission prompt — the browser still asks for microphone access on the first recording. + ## Using Voice Input ### Tap-to-Start / Tap-to-Stop diff --git a/frontend/src/hooks/useSTT.test.tsx b/frontend/src/hooks/useSTT.test.tsx new file mode 100644 index 00000000..e0ac6523 --- /dev/null +++ b/frontend/src/hooks/useSTT.test.tsx @@ -0,0 +1,113 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest' +import { renderHook, act, waitFor } from '@testing-library/react' +import { useSTT } from './useSTT' + +type MockRecorder = { + start: ReturnType + stop: ReturnType + abort: ReturnType + dispose: ReturnType + setOnStateChange: ReturnType + setOnError: ReturnType + setOnDataAvailable: ReturnType +} + +const mocks = vi.hoisted(() => ({ + useSettings: vi.fn(), + AudioRecorder: vi.fn(), + getWebSpeechRecognizer: vi.fn(), + isWebRecognitionSupported: vi.fn(), +})) + +vi.mock('@/hooks/useSettings', () => ({ + useSettings: mocks.useSettings, +})) + +vi.mock('@/lib/audioRecorder', () => ({ + AudioRecorder: mocks.AudioRecorder, +})) + +vi.mock('@/lib/webSpeechRecognizer', () => ({ + getWebSpeechRecognizer: mocks.getWebSpeechRecognizer, + isWebRecognitionSupported: mocks.isWebRecognitionSupported, +})) + +const externalSTTPreferences = { + preferences: { + stt: { + enabled: true, + provider: 'external' as const, + endpoint: 'https://api.openai.com', + apiKey: 'test-key', + model: 'whisper-1', + language: 'en-US', + }, + }, +} + +describe('useSTT external provider lifecycle', () => { + let mockRecorder: MockRecorder + + beforeEach(() => { + vi.clearAllMocks() + + mockRecorder = { + start: vi.fn().mockResolvedValue(undefined), + stop: vi.fn(), + abort: vi.fn(), + dispose: vi.fn(), + setOnStateChange: vi.fn(), + setOnError: vi.fn(), + setOnDataAvailable: vi.fn(), + } + + mocks.AudioRecorder.mockImplementation(() => mockRecorder) + mocks.useSettings.mockReturnValue(externalSTTPreferences) + mocks.getWebSpeechRecognizer.mockReturnValue({ + start: vi.fn(), + stop: vi.fn(), + abort: vi.fn(), + clearCallbacks: vi.fn(), + onResult: vi.fn(), + onInterimResult: vi.fn(), + onError: vi.fn(), + onEnd: vi.fn(), + onStart: vi.fn(), + }) + mocks.isWebRecognitionSupported.mockReturnValue(true) + }) + + it('does not start external recording until startRecording is called', async () => { + const { result } = renderHook(() => useSTT()) + + await waitFor(() => { + expect(mocks.AudioRecorder).toHaveBeenCalledTimes(1) + }) + + expect(mockRecorder.start).not.toHaveBeenCalled() + expect(mockRecorder.setOnStateChange).toHaveBeenCalledTimes(1) + expect(mockRecorder.setOnError).toHaveBeenCalledTimes(1) + expect(mockRecorder.setOnDataAvailable).toHaveBeenCalledTimes(1) + + await act(async () => { + await result.current.startRecording() + }) + + expect(mockRecorder.start).toHaveBeenCalledTimes(1) + }) + + it('disposes external recorder resources on unmount', async () => { + const { unmount } = renderHook(() => useSTT()) + + await waitFor(() => { + expect(mocks.AudioRecorder).toHaveBeenCalledTimes(1) + }) + + const recorder = mockRecorder + + unmount() + + expect(recorder.dispose).toHaveBeenCalledTimes(1) + expect(recorder.abort).not.toHaveBeenCalled() + }) +}) diff --git a/frontend/src/hooks/useSTT.ts b/frontend/src/hooks/useSTT.ts index 384cdf1b..edd79d15 100644 --- a/frontend/src/hooks/useSTT.ts +++ b/frontend/src/hooks/useSTT.ts @@ -185,6 +185,14 @@ export function useSTT(userId = 'default') { }) }, []) + const disposeAudioRecorder = useCallback(() => { + if (audioRecorder.current) { + audioRecorder.current.dispose() + audioRecorder.current = null + } + recorderConfiguredRef.current = false + }, []) + useEffect(() => { if (!isEnabled || !isExternalProvider) { return @@ -200,11 +208,9 @@ export function useSTT(userId = 'default') { } return () => { - if (audioRecorder.current) { - audioRecorder.current.abort() - } + disposeAudioRecorder() } - }, [isEnabled, isExternalProvider, setupAudioRecorder]) + }, [isEnabled, isExternalProvider, setupAudioRecorder, disposeAudioRecorder]) const clearStartupTimeout = useCallback(() => { if (startupTimeoutRef.current) { @@ -214,8 +220,8 @@ export function useSTT(userId = 'default') { }, []) const abortAndResetOnTimeout = useCallback(() => { - if (isExternalProvider && audioRecorder.current) { - audioRecorder.current.abort() + if (isExternalProvider) { + disposeAudioRecorder() } else { recognizer.current.abort() } @@ -224,7 +230,7 @@ export function useSTT(userId = 'default') { setState('idle') setIsError(true) setError('Microphone start timed out') - }, [isExternalProvider]) + }, [isExternalProvider, disposeAudioRecorder]) const startRecording = useCallback(async (): Promise => { if (!isSupported) { @@ -252,6 +258,7 @@ export function useSTT(userId = 'default') { if (!audioRecorder.current) { audioRecorder.current = new AudioRecorder() setupAudioRecorder(audioRecorder.current) + recorderConfiguredRef.current = true } try { diff --git a/frontend/src/lib/audioRecorder.test.ts b/frontend/src/lib/audioRecorder.test.ts index d0b3773b..142563bf 100644 --- a/frontend/src/lib/audioRecorder.test.ts +++ b/frontend/src/lib/audioRecorder.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect } from 'vitest' +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest' import { AudioRecorder, downsampleAndConvert, encodeWavFromInt16 } from './audioRecorder' const blobToArrayBuffer = (blob: Blob): Promise => @@ -158,3 +158,242 @@ describe('AudioRecorder.isSupported', () => { }).not.toThrow() }) }) + +describe('AudioRecorder.prepare', () => { + let originalAudioContext: typeof window.AudioContext + let originalGetUserMedia: (typeof navigator.mediaDevices)['getUserMedia'] | undefined + let mockAddModule: ReturnType + + beforeEach(() => { + originalAudioContext = window.AudioContext + originalGetUserMedia = navigator.mediaDevices?.getUserMedia + + mockAddModule = vi.fn().mockResolvedValue(undefined) + + const mockSource = { connect: vi.fn(), disconnect: vi.fn() } + + const MockAudioContext = vi.fn().mockImplementation(() => ({ + state: 'running', + sampleRate: 16000, + audioWorklet: { addModule: mockAddModule }, + createMediaStreamSource: vi.fn().mockReturnValue(mockSource), + createScriptProcessor: vi.fn().mockReturnValue({ + connect: vi.fn(), + disconnect: vi.fn(), + onaudioprocess: null, + }), + resume: vi.fn().mockResolvedValue(undefined), + close: vi.fn().mockResolvedValue(undefined), + })) + + window.AudioContext = MockAudioContext as unknown as typeof window.AudioContext + + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: vi.fn() }, + writable: true, + configurable: true, + }) + }) + + afterEach(() => { + window.AudioContext = originalAudioContext + if (originalGetUserMedia) { + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: originalGetUserMedia }, + writable: true, + configurable: true, + }) + } + }) + + it('prepares the audio context and worklet without requesting microphone access', async () => { + const recorder = new AudioRecorder() + await recorder.prepare() + + expect(navigator.mediaDevices.getUserMedia).not.toHaveBeenCalled() + expect(window.AudioContext).toHaveBeenCalledTimes(1) + expect(mockAddModule).toHaveBeenCalledOnce() + expect(mockAddModule).toHaveBeenCalledWith('/audio-worklet-processor.js') + }) + + it('reuses the same AudioContext and worklet when prepare() precedes start()', async () => { + const originalAudioWorkletNode = (window as any).AudioWorkletNode + + const mockWorkletNode = { + port: { + onmessage: null as ((e: MessageEvent) => void) | null, + postMessage: vi.fn(), + }, + disconnect: vi.fn(), + } + const MockAudioWorkletNode = vi.fn().mockImplementation(() => mockWorkletNode) + ;(window as any).AudioWorkletNode = MockAudioWorkletNode + + const mockTrack = { stop: vi.fn(), kind: 'audio' } + const mockMediaStream = { + getTracks: vi.fn().mockReturnValue([mockTrack]), + getAudioTracks: vi.fn().mockReturnValue([mockTrack]), + } + ;(navigator.mediaDevices as any).getUserMedia = vi.fn().mockResolvedValue(mockMediaStream) + + const recorder = new AudioRecorder() + await recorder.prepare() + + mockAddModule.mockClear() + + await recorder.start() + + expect(mockAddModule).not.toHaveBeenCalled() + expect(window.AudioContext).toHaveBeenCalledTimes(1) + expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1) + + recorder.stop() + + ;(window as any).AudioWorkletNode = originalAudioWorkletNode + }) + + it('reuses the prepared audio context and loaded worklet across recordings', async () => { + const originalAudioWorkletNode = (window as any).AudioWorkletNode + const originalAudioContext = window.AudioContext + + const mockAddModule = vi.fn().mockResolvedValue(undefined) + const mockClose = vi.fn().mockResolvedValue(undefined) + const mockResume = vi.fn().mockResolvedValue(undefined) + const mockTrack = { stop: vi.fn(), kind: 'audio' } + const mockSource = { connect: vi.fn(), disconnect: vi.fn() } + + const MockAudioContext = vi.fn().mockImplementation(() => ({ + state: 'running', + sampleRate: 16000, + audioWorklet: { addModule: mockAddModule }, + createMediaStreamSource: vi.fn().mockReturnValue(mockSource), + createScriptProcessor: vi.fn().mockReturnValue({ + connect: vi.fn(), + disconnect: vi.fn(), + onaudioprocess: null, + }), + resume: mockResume, + close: mockClose, + })) + window.AudioContext = MockAudioContext as unknown as typeof window.AudioContext + + const mockWorkletNode = { + port: { + onmessage: null as ((e: MessageEvent) => void) | null, + postMessage: vi.fn(), + }, + disconnect: vi.fn(), + } + const MockAudioWorkletNode = vi.fn().mockImplementation(() => mockWorkletNode) + ;(window as any).AudioWorkletNode = MockAudioWorkletNode + + const originalGetUserMedia = navigator.mediaDevices?.getUserMedia + const mockGetUserMedia = vi.fn().mockResolvedValue({ + getTracks: () => [mockTrack], + getAudioTracks: () => [mockTrack], + }) + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: mockGetUserMedia }, + writable: true, + configurable: true, + }) + + const recorder = new AudioRecorder() + + await recorder.start() + recorder.stop() + + await recorder.start() + recorder.stop() + + recorder.dispose() + + expect(mockGetUserMedia).toHaveBeenCalledTimes(2) + expect(MockAudioContext).toHaveBeenCalledTimes(1) + expect(mockAddModule).toHaveBeenCalledTimes(1) + expect(mockTrack.stop).toHaveBeenCalledTimes(2) + expect(mockClose).toHaveBeenCalledTimes(1) + + window.AudioContext = originalAudioContext + ;(window as any).AudioWorkletNode = originalAudioWorkletNode + if (originalGetUserMedia) { + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: originalGetUserMedia }, + writable: true, + configurable: true, + }) + } + }) +}) + +describe('AudioRecorder lifecycle cancellation', () => { + let originalAudioContext: typeof window.AudioContext + let originalAudioWorkletNode: unknown + let originalGetUserMedia: (typeof navigator.mediaDevices)['getUserMedia'] | undefined + let mockTrack: { stop: ReturnType; kind: string } + + beforeEach(() => { + originalAudioContext = window.AudioContext + originalAudioWorkletNode = (window as any).AudioWorkletNode + originalGetUserMedia = navigator.mediaDevices?.getUserMedia + + mockTrack = { stop: vi.fn(), kind: 'audio' } + + const MockAudioContext = vi.fn().mockImplementation(() => ({ + state: 'running', + sampleRate: 16000, + audioWorklet: { addModule: vi.fn().mockResolvedValue(undefined) }, + createMediaStreamSource: vi.fn(), + createScriptProcessor: vi.fn(), + resume: vi.fn().mockResolvedValue(undefined), + close: vi.fn().mockResolvedValue(undefined), + })) + + window.AudioContext = MockAudioContext as unknown as typeof window.AudioContext + ;(window as any).AudioWorkletNode = vi.fn().mockImplementation(() => ({ + port: { onmessage: null, postMessage: vi.fn() }, + disconnect: vi.fn(), + })) + }) + + afterEach(() => { + window.AudioContext = originalAudioContext + ;(window as any).AudioWorkletNode = originalAudioWorkletNode + if (originalGetUserMedia) { + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: originalGetUserMedia }, + writable: true, + configurable: true, + }) + } + }) + + it('cleans up and does not enter recording state when dispose is called during async startup', async () => { + let resolveGetUserMedia: (stream: MediaStream) => void + const deferredGetUserMedia = new Promise((resolve) => { + resolveGetUserMedia = resolve + }) + + const mockGetUserMedia = vi.fn().mockReturnValue(deferredGetUserMedia) + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: mockGetUserMedia }, + writable: true, + configurable: true, + }) + + const recorder = new AudioRecorder() + + const startPromise = recorder.start() + + recorder.dispose() + + const stream = { getTracks: () => [mockTrack], getAudioTracks: () => [mockTrack] } as unknown as MediaStream + resolveGetUserMedia!(stream) + + await startPromise + + expect(recorder.getState()).toBe('idle') + expect(mockTrack.stop).toHaveBeenCalled() + expect(window.AudioContext).not.toHaveBeenCalled() + }) +}) diff --git a/frontend/src/lib/audioRecorder.ts b/frontend/src/lib/audioRecorder.ts index 526eacd6..aa2469a9 100644 --- a/frontend/src/lib/audioRecorder.ts +++ b/frontend/src/lib/audioRecorder.ts @@ -110,6 +110,32 @@ export class AudioRecorder { ) } + async prepare(): Promise { + if (!AudioRecorder.isSupported()) { + throw new Error('Audio recording is not supported in this browser') + } + + const ctx = this.getReusableAudioContext() + + if (ctx.state === 'suspended') { + await ctx.resume() + } + + if (ctx.audioWorklet) { + await ensureWorkletLoaded(ctx) + } + } + + private getReusableAudioContext(): AudioContext { + if (this.audioContext && this.audioContext.state !== 'closed') { + return this.audioContext + } + this.audioContext = new AudioContext({ + sampleRate: this.options.sampleRate, + }) + return this.audioContext + } + getState(): AudioRecorderState { return this.state } @@ -151,33 +177,39 @@ export class AudioRecorder { }, }) - this.audioContext = new AudioContext({ - sampleRate: this.options.sampleRate, - }) + if (this.isAborted) { + this.mediaStream.getTracks().forEach(t => t.stop()) + this.mediaStream = null + return + } + + await this.prepare() + + if (this.isAborted) { + if (this.mediaStream) { + this.mediaStream.getTracks().forEach(t => t.stop()) + this.mediaStream = null + } + return + } + + const ctx = this.audioContext! + this.source = ctx.createMediaStreamSource(this.mediaStream) - this.source = this.audioContext.createMediaStreamSource(this.mediaStream) - - if (this.audioContext.audioWorklet) { - try { - await ensureWorkletLoaded(this.audioContext) - this.workletNode = new AudioWorkletNode(this.audioContext, 'recorder-processor', { - processorOptions: { targetSampleRate: this.options.sampleRate }, - }) - this.workletNode.port.onmessage = (e: MessageEvent) => { - this.chunks.push(e.data) - this.totalSamples += e.data.length - } - this.source.connect(this.workletNode) - } catch (error) { - this.audioContext.close() - this.audioContext = null - throw new Error('Failed to load audio worklet processor', { cause: error }) + if (ctx.audioWorklet) { + this.workletNode = new AudioWorkletNode(ctx, 'recorder-processor', { + processorOptions: { targetSampleRate: this.options.sampleRate }, + }) + this.workletNode.port.onmessage = (e: MessageEvent) => { + this.chunks.push(e.data) + this.totalSamples += e.data.length } - } else if (this.audioContext) { + this.source.connect(this.workletNode) + } else { const bufferSize = 4096 - this.processor = this.audioContext.createScriptProcessor(bufferSize, 1, 1) + this.processor = ctx.createScriptProcessor(bufferSize, 1, 1) const targetRate = this.options.sampleRate ?? 16000 - const inputRate = this.audioContext.sampleRate + const inputRate = ctx.sampleRate this.processor.onaudioprocess = (e) => { const inputData = e.inputBuffer.getChannelData(0) const int16Chunk = downsampleAndConvert(inputData, inputRate, targetRate) @@ -190,7 +222,7 @@ export class AudioRecorder { this.setState('recording') } catch (error) { this.setState('error') - this.cleanup() + this.cleanupRecording(true) if (error instanceof DOMException) { if (error.name === 'NotAllowedError') { @@ -213,14 +245,21 @@ export class AudioRecorder { this.processRecording() } this.resetRecordingState() - this.cleanup() + this.cleanupRecording(false) this.setState('stopped') } abort(): void { this.isAborted = true this.resetRecordingState() - this.cleanup() + this.cleanupRecording(false) + this.setState('idle') + } + + dispose(): void { + this.isAborted = true + this.resetRecordingState() + this.cleanupRecording(true) this.setState('idle') } @@ -244,7 +283,7 @@ export class AudioRecorder { } } - private cleanup(): void { + private cleanupRecording(closeAudioContext: boolean): void { if (this.workletNode) { this.workletNode.port.onmessage = null this.workletNode.port.postMessage('stop') @@ -263,15 +302,15 @@ export class AudioRecorder { this.source = null } - if (this.audioContext && this.audioContext.state !== 'closed') { - this.audioContext.close() - this.audioContext = null - } - if (this.mediaStream) { this.mediaStream.getTracks().forEach(track => track.stop()) this.mediaStream = null } + + if (closeAudioContext && this.audioContext && this.audioContext.state !== 'closed') { + this.audioContext.close() + this.audioContext = null + } } private resetRecordingState(): void { From 5f7134289242c0a4259983f32bf1454c4e0a6b53 Mon Sep 17 00:00:00 2001 From: Chris Scott <99081550+chriswritescode-dev@users.noreply.github.com> Date: Mon, 1 Jun 2026 09:01:21 -0400 Subject: [PATCH 2/5] refactor: extract STT startup helpers and simplify audio recorder --- docs/features/stt.md | 2 +- frontend/src/hooks/useSTT.test.tsx | 3 + frontend/src/hooks/useSTT.ts | 124 +++++++++++-------------- frontend/src/lib/audioRecorder.test.ts | 110 +++++----------------- frontend/src/lib/audioRecorder.ts | 27 +++--- 5 files changed, 93 insertions(+), 173 deletions(-) diff --git a/docs/features/stt.md b/docs/features/stt.md index d8c9b919..9705adae 100644 --- a/docs/features/stt.md +++ b/docs/features/stt.md @@ -71,7 +71,7 @@ Any OpenAI-compatible transcription API works: ### Performance -After the first microphone press, the browser audio pipeline stays prepared so subsequent recordings start faster. The audio context and worklet processor are retained between recordings; only the microphone track is stopped after each use. Resources are released entirely when external STT is disabled or the voice input UI unmounts. +When external voice input is enabled, the browser audio pipeline is warmed up ahead of time so the first and subsequent recordings start faster. The audio context and worklet processor are prepared without requesting microphone access, and are retained between recordings; only the microphone track is stopped after each use. Resources are released entirely when external STT is disabled or the voice input UI unmounts. This optimization applies only to the external API provider. It does not affect the initial permission prompt — the browser still asks for microphone access on the first recording. diff --git a/frontend/src/hooks/useSTT.test.tsx b/frontend/src/hooks/useSTT.test.tsx index e0ac6523..73a9b548 100644 --- a/frontend/src/hooks/useSTT.test.tsx +++ b/frontend/src/hooks/useSTT.test.tsx @@ -7,6 +7,7 @@ type MockRecorder = { stop: ReturnType abort: ReturnType dispose: ReturnType + prepare: ReturnType setOnStateChange: ReturnType setOnError: ReturnType setOnDataAvailable: ReturnType @@ -56,6 +57,7 @@ describe('useSTT external provider lifecycle', () => { stop: vi.fn(), abort: vi.fn(), dispose: vi.fn(), + prepare: vi.fn().mockResolvedValue(undefined), setOnStateChange: vi.fn(), setOnError: vi.fn(), setOnDataAvailable: vi.fn(), @@ -85,6 +87,7 @@ describe('useSTT external provider lifecycle', () => { }) expect(mockRecorder.start).not.toHaveBeenCalled() + expect(mockRecorder.prepare).toHaveBeenCalledTimes(1) expect(mockRecorder.setOnStateChange).toHaveBeenCalledTimes(1) expect(mockRecorder.setOnError).toHaveBeenCalledTimes(1) expect(mockRecorder.setOnDataAvailable).toHaveBeenCalledTimes(1) diff --git a/frontend/src/hooks/useSTT.ts b/frontend/src/hooks/useSTT.ts index edd79d15..909f264f 100644 --- a/frontend/src/hooks/useSTT.ts +++ b/frontend/src/hooks/useSTT.ts @@ -185,6 +185,17 @@ export function useSTT(userId = 'default') { }) }, []) + const ensureAudioRecorder = useCallback((): AudioRecorder => { + if (!audioRecorder.current) { + audioRecorder.current = new AudioRecorder() + } + if (!recorderConfiguredRef.current) { + setupAudioRecorder(audioRecorder.current) + recorderConfiguredRef.current = true + } + return audioRecorder.current + }, [setupAudioRecorder]) + const disposeAudioRecorder = useCallback(() => { if (audioRecorder.current) { audioRecorder.current.dispose() @@ -198,19 +209,12 @@ export function useSTT(userId = 'default') { return } - if (!audioRecorder.current) { - audioRecorder.current = new AudioRecorder() - } - - if (!recorderConfiguredRef.current) { - setupAudioRecorder(audioRecorder.current) - recorderConfiguredRef.current = true - } + void ensureAudioRecorder().prepare().catch(() => undefined) return () => { disposeAudioRecorder() } - }, [isEnabled, isExternalProvider, setupAudioRecorder, disposeAudioRecorder]) + }, [isEnabled, isExternalProvider, ensureAudioRecorder, disposeAudioRecorder]) const clearStartupTimeout = useCallback(() => { if (startupTimeoutRef.current) { @@ -232,6 +236,37 @@ export function useSTT(userId = 'default') { setError('Microphone start timed out') }, [isExternalProvider, disposeAudioRecorder]) + const runStartupWithTimeout = useCallback( + async (startup: () => Promise, startOpId: number): Promise => { + try { + const startupPromise = startup() + const timeoutPromise = new Promise((_, reject) => { + startupTimeoutRef.current = setTimeout(() => { + if (startOpIdRef.current !== startOpId) return + reject(new Error('Microphone start timed out')) + }, STT_START_TIMEOUT_MS) + }) + + await Promise.race([startupPromise, timeoutPromise]) + clearStartupTimeout() + + return startOpIdRef.current === startOpId + } catch (err) { + clearStartupTimeout() + if (startOpIdRef.current !== startOpId) return false + setIsProcessing(false) + if (err instanceof Error && err.message === 'Microphone start timed out') { + abortAndResetOnTimeout() + return false + } + setIsError(true) + setError(err instanceof Error ? err.message : 'Failed to start recording') + return false + } + }, + [clearStartupTimeout, abortAndResetOnTimeout], + ) + const startRecording = useCallback(async (): Promise => { if (!isSupported) { setError('Speech recognition is not supported in this browser') @@ -255,42 +290,14 @@ export function useSTT(userId = 'default') { clearStartupTimeout() if (isExternalProvider) { - if (!audioRecorder.current) { - audioRecorder.current = new AudioRecorder() - setupAudioRecorder(audioRecorder.current) - recorderConfiguredRef.current = true - } - - try { - setIsProcessing(true) - - const startupPromise = audioRecorder.current.start() - const timeoutPromise = new Promise((_, reject) => { - startupTimeoutRef.current = setTimeout(() => { - if (startOpIdRef.current !== startOpId) return - reject(new Error('Microphone start timed out')) - }, STT_START_TIMEOUT_MS) - }) - - await Promise.race([startupPromise, timeoutPromise]) - clearStartupTimeout() + const recorder = ensureAudioRecorder() - if (startOpIdRef.current !== startOpId) return false - - setIsProcessing(false) - return true - } catch (err) { - clearStartupTimeout() - if (startOpIdRef.current !== startOpId) return false + setIsProcessing(true) + const started = await runStartupWithTimeout(() => recorder.start(), startOpId) + if (started) { setIsProcessing(false) - if (err instanceof Error && err.message === 'Microphone start timed out') { - abortAndResetOnTimeout() - return false - } - setIsError(true) - setError(err instanceof Error ? err.message : 'Failed to start recording') - return false } + return started } else { const options: SpeechRecognitionOptions = { language: config.language, @@ -298,37 +305,10 @@ export function useSTT(userId = 'default') { maxAlternatives: 1, } - try { - setIsProcessing(true) - - const startupPromise = recognizer.current.start(options) - const timeoutPromise = new Promise((_, reject) => { - startupTimeoutRef.current = setTimeout(() => { - if (startOpIdRef.current !== startOpId) return - reject(new Error('Microphone start timed out')) - }, STT_START_TIMEOUT_MS) - }) - - await Promise.race([startupPromise, timeoutPromise]) - clearStartupTimeout() - - if (startOpIdRef.current !== startOpId) return false - - return true - } catch (err) { - clearStartupTimeout() - if (startOpIdRef.current !== startOpId) return false - setIsProcessing(false) - if (err instanceof Error && err.message === 'Microphone start timed out') { - abortAndResetOnTimeout() - return false - } - setIsError(true) - setError(err instanceof Error ? err.message : 'Failed to start recording') - return false - } + setIsProcessing(true) + return runStartupWithTimeout(() => recognizer.current.start(options), startOpId) } - }, [isSupported, isEnabled, isExternalProvider, config.language, setupAudioRecorder, clearStartupTimeout, abortAndResetOnTimeout]) + }, [isSupported, isEnabled, isExternalProvider, config.language, clearStartupTimeout, ensureAudioRecorder, runStartupWithTimeout]) const stopRecording = useCallback(() => { if (isExternalProvider && audioRecorder.current) { diff --git a/frontend/src/lib/audioRecorder.test.ts b/frontend/src/lib/audioRecorder.test.ts index 142563bf..5d050236 100644 --- a/frontend/src/lib/audioRecorder.test.ts +++ b/frontend/src/lib/audioRecorder.test.ts @@ -161,18 +161,25 @@ describe('AudioRecorder.isSupported', () => { describe('AudioRecorder.prepare', () => { let originalAudioContext: typeof window.AudioContext + let originalAudioWorkletNode: unknown let originalGetUserMedia: (typeof navigator.mediaDevices)['getUserMedia'] | undefined let mockAddModule: ReturnType + let mockClose: ReturnType + let mockTrack: { stop: ReturnType; kind: string } + let mockGetUserMedia: ReturnType + let MockAudioContext: ReturnType beforeEach(() => { originalAudioContext = window.AudioContext + originalAudioWorkletNode = (window as any).AudioWorkletNode originalGetUserMedia = navigator.mediaDevices?.getUserMedia mockAddModule = vi.fn().mockResolvedValue(undefined) - + mockClose = vi.fn().mockResolvedValue(undefined) + mockTrack = { stop: vi.fn(), kind: 'audio' } const mockSource = { connect: vi.fn(), disconnect: vi.fn() } - const MockAudioContext = vi.fn().mockImplementation(() => ({ + MockAudioContext = vi.fn().mockImplementation(() => ({ state: 'running', sampleRate: 16000, audioWorklet: { addModule: mockAddModule }, @@ -183,13 +190,21 @@ describe('AudioRecorder.prepare', () => { onaudioprocess: null, }), resume: vi.fn().mockResolvedValue(undefined), - close: vi.fn().mockResolvedValue(undefined), + close: mockClose, })) - window.AudioContext = MockAudioContext as unknown as typeof window.AudioContext + ;(window as any).AudioWorkletNode = vi.fn().mockImplementation(() => ({ + port: { onmessage: null, postMessage: vi.fn() }, + disconnect: vi.fn(), + })) + + mockGetUserMedia = vi.fn().mockResolvedValue({ + getTracks: () => [mockTrack], + getAudioTracks: () => [mockTrack], + }) Object.defineProperty(navigator, 'mediaDevices', { - value: { getUserMedia: vi.fn() }, + value: { getUserMedia: mockGetUserMedia }, writable: true, configurable: true, }) @@ -197,6 +212,7 @@ describe('AudioRecorder.prepare', () => { afterEach(() => { window.AudioContext = originalAudioContext + ;(window as any).AudioWorkletNode = originalAudioWorkletNode if (originalGetUserMedia) { Object.defineProperty(navigator, 'mediaDevices', { value: { getUserMedia: originalGetUserMedia }, @@ -210,32 +226,13 @@ describe('AudioRecorder.prepare', () => { const recorder = new AudioRecorder() await recorder.prepare() - expect(navigator.mediaDevices.getUserMedia).not.toHaveBeenCalled() - expect(window.AudioContext).toHaveBeenCalledTimes(1) + expect(mockGetUserMedia).not.toHaveBeenCalled() + expect(MockAudioContext).toHaveBeenCalledTimes(1) expect(mockAddModule).toHaveBeenCalledOnce() expect(mockAddModule).toHaveBeenCalledWith('/audio-worklet-processor.js') }) it('reuses the same AudioContext and worklet when prepare() precedes start()', async () => { - const originalAudioWorkletNode = (window as any).AudioWorkletNode - - const mockWorkletNode = { - port: { - onmessage: null as ((e: MessageEvent) => void) | null, - postMessage: vi.fn(), - }, - disconnect: vi.fn(), - } - const MockAudioWorkletNode = vi.fn().mockImplementation(() => mockWorkletNode) - ;(window as any).AudioWorkletNode = MockAudioWorkletNode - - const mockTrack = { stop: vi.fn(), kind: 'audio' } - const mockMediaStream = { - getTracks: vi.fn().mockReturnValue([mockTrack]), - getAudioTracks: vi.fn().mockReturnValue([mockTrack]), - } - ;(navigator.mediaDevices as any).getUserMedia = vi.fn().mockResolvedValue(mockMediaStream) - const recorder = new AudioRecorder() await recorder.prepare() @@ -244,60 +241,13 @@ describe('AudioRecorder.prepare', () => { await recorder.start() expect(mockAddModule).not.toHaveBeenCalled() - expect(window.AudioContext).toHaveBeenCalledTimes(1) - expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1) + expect(MockAudioContext).toHaveBeenCalledTimes(1) + expect(mockGetUserMedia).toHaveBeenCalledTimes(1) recorder.stop() - - ;(window as any).AudioWorkletNode = originalAudioWorkletNode }) it('reuses the prepared audio context and loaded worklet across recordings', async () => { - const originalAudioWorkletNode = (window as any).AudioWorkletNode - const originalAudioContext = window.AudioContext - - const mockAddModule = vi.fn().mockResolvedValue(undefined) - const mockClose = vi.fn().mockResolvedValue(undefined) - const mockResume = vi.fn().mockResolvedValue(undefined) - const mockTrack = { stop: vi.fn(), kind: 'audio' } - const mockSource = { connect: vi.fn(), disconnect: vi.fn() } - - const MockAudioContext = vi.fn().mockImplementation(() => ({ - state: 'running', - sampleRate: 16000, - audioWorklet: { addModule: mockAddModule }, - createMediaStreamSource: vi.fn().mockReturnValue(mockSource), - createScriptProcessor: vi.fn().mockReturnValue({ - connect: vi.fn(), - disconnect: vi.fn(), - onaudioprocess: null, - }), - resume: mockResume, - close: mockClose, - })) - window.AudioContext = MockAudioContext as unknown as typeof window.AudioContext - - const mockWorkletNode = { - port: { - onmessage: null as ((e: MessageEvent) => void) | null, - postMessage: vi.fn(), - }, - disconnect: vi.fn(), - } - const MockAudioWorkletNode = vi.fn().mockImplementation(() => mockWorkletNode) - ;(window as any).AudioWorkletNode = MockAudioWorkletNode - - const originalGetUserMedia = navigator.mediaDevices?.getUserMedia - const mockGetUserMedia = vi.fn().mockResolvedValue({ - getTracks: () => [mockTrack], - getAudioTracks: () => [mockTrack], - }) - Object.defineProperty(navigator, 'mediaDevices', { - value: { getUserMedia: mockGetUserMedia }, - writable: true, - configurable: true, - }) - const recorder = new AudioRecorder() await recorder.start() @@ -313,16 +263,6 @@ describe('AudioRecorder.prepare', () => { expect(mockAddModule).toHaveBeenCalledTimes(1) expect(mockTrack.stop).toHaveBeenCalledTimes(2) expect(mockClose).toHaveBeenCalledTimes(1) - - window.AudioContext = originalAudioContext - ;(window as any).AudioWorkletNode = originalAudioWorkletNode - if (originalGetUserMedia) { - Object.defineProperty(navigator, 'mediaDevices', { - value: { getUserMedia: originalGetUserMedia }, - writable: true, - configurable: true, - }) - } }) }) diff --git a/frontend/src/lib/audioRecorder.ts b/frontend/src/lib/audioRecorder.ts index aa2469a9..c02721d6 100644 --- a/frontend/src/lib/audioRecorder.ts +++ b/frontend/src/lib/audioRecorder.ts @@ -117,13 +117,13 @@ export class AudioRecorder { const ctx = this.getReusableAudioContext() - if (ctx.state === 'suspended') { - await ctx.resume() - } - if (ctx.audioWorklet) { await ensureWorkletLoaded(ctx) } + + if (ctx.state === 'suspended') { + await ctx.resume() + } } private getReusableAudioContext(): AudioContext { @@ -178,18 +178,14 @@ export class AudioRecorder { }) if (this.isAborted) { - this.mediaStream.getTracks().forEach(t => t.stop()) - this.mediaStream = null + this.cleanupRecording(true) return } await this.prepare() if (this.isAborted) { - if (this.mediaStream) { - this.mediaStream.getTracks().forEach(t => t.stop()) - this.mediaStream = null - } + this.cleanupRecording(true) return } @@ -250,16 +246,17 @@ export class AudioRecorder { } abort(): void { - this.isAborted = true - this.resetRecordingState() - this.cleanupRecording(false) - this.setState('idle') + this.teardown(false) } dispose(): void { + this.teardown(true) + } + + private teardown(closeAudioContext: boolean): void { this.isAborted = true this.resetRecordingState() - this.cleanupRecording(true) + this.cleanupRecording(closeAudioContext) this.setState('idle') } From bed1b8cd418645485c2c1c7b830aa7156a885fd2 Mon Sep 17 00:00:00 2001 From: Chris Scott <99081550+chriswritescode-dev@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:08:43 -0400 Subject: [PATCH 3/5] feat(stt): add energy-based VAD with silence gate and auto-stop - New VoiceActivityDetector class: adaptive noise floor, per-frame RMS classification, trailing-silence endpointing, configurable thresholds - AudioWorkletProcessor computes RMS in existing flush loop and posts {samples, rms} instead of a bare Int16Array - AudioRecorder integrates VAD via handleVadFrame() in both worklet and ScriptProcessor paths; auto-calls stop() on trailing silence; silence gate in processRecording() fires onNoSpeech instead of sending empty audio to the transcription API (fixes Whisper hallucination bug) - useSTT wires setOnNoSpeech to reset isProcessing/isRecording cleanly without surfacing an error - computeRms() exported helper for ScriptProcessor fallback path --- frontend/public/audio-worklet-processor.js | 10 +- frontend/src/hooks/useSTT.test.tsx | 22 ++++ frontend/src/hooks/useSTT.ts | 7 ++ frontend/src/lib/audioRecorder.test.ts | 115 ++++++++++++++++++ frontend/src/lib/audioRecorder.ts | 49 +++++++- .../src/lib/voiceActivityDetector.test.ts | 94 ++++++++++++++ frontend/src/lib/voiceActivityDetector.ts | 73 +++++++++++ 7 files changed, 364 insertions(+), 6 deletions(-) create mode 100644 frontend/src/lib/voiceActivityDetector.test.ts create mode 100644 frontend/src/lib/voiceActivityDetector.ts diff --git a/frontend/public/audio-worklet-processor.js b/frontend/public/audio-worklet-processor.js index 0a0e584d..848dd5b2 100644 --- a/frontend/public/audio-worklet-processor.js +++ b/frontend/public/audio-worklet-processor.js @@ -59,12 +59,16 @@ class RecorderProcessor extends AudioWorkletProcessor { } _flushBuffer() { - const int16 = new Int16Array(this._buffer.length) - for (let i = 0; i < this._buffer.length; i++) { + const length = this._buffer.length + const int16 = new Int16Array(length) + let sumSquares = 0 + for (let i = 0; i < length; i++) { const sample = Math.max(-1, Math.min(1, this._buffer[i])) + sumSquares += sample * sample int16[i] = sample < 0 ? sample * 32768 : sample * 32767 } - this.port.postMessage(int16, [int16.buffer]) + const rms = length > 0 ? Math.sqrt(sumSquares / length) : 0 + this.port.postMessage({ samples: int16, rms }, [int16.buffer]) this._buffer = [] } } diff --git a/frontend/src/hooks/useSTT.test.tsx b/frontend/src/hooks/useSTT.test.tsx index 73a9b548..e4898043 100644 --- a/frontend/src/hooks/useSTT.test.tsx +++ b/frontend/src/hooks/useSTT.test.tsx @@ -11,6 +11,7 @@ type MockRecorder = { setOnStateChange: ReturnType setOnError: ReturnType setOnDataAvailable: ReturnType + setOnNoSpeech: ReturnType } const mocks = vi.hoisted(() => ({ @@ -61,6 +62,7 @@ describe('useSTT external provider lifecycle', () => { setOnStateChange: vi.fn(), setOnError: vi.fn(), setOnDataAvailable: vi.fn(), + setOnNoSpeech: vi.fn(), } mocks.AudioRecorder.mockImplementation(() => mockRecorder) @@ -91,6 +93,7 @@ describe('useSTT external provider lifecycle', () => { expect(mockRecorder.setOnStateChange).toHaveBeenCalledTimes(1) expect(mockRecorder.setOnError).toHaveBeenCalledTimes(1) expect(mockRecorder.setOnDataAvailable).toHaveBeenCalledTimes(1) + expect(mockRecorder.setOnNoSpeech).toHaveBeenCalledTimes(1) await act(async () => { await result.current.startRecording() @@ -99,6 +102,25 @@ describe('useSTT external provider lifecycle', () => { expect(mockRecorder.start).toHaveBeenCalledTimes(1) }) + it('clears processing without an error when no speech is detected', async () => { + const { result } = renderHook(() => useSTT()) + + await waitFor(() => { + expect(mockRecorder.setOnNoSpeech).toHaveBeenCalledTimes(1) + }) + + const onNoSpeech = mockRecorder.setOnNoSpeech.mock.calls[0][0] as () => void + + act(() => { + onNoSpeech() + }) + + expect(result.current.isProcessing).toBe(false) + expect(result.current.isRecording).toBe(false) + expect(result.current.isError).toBe(false) + expect(result.current.error).toBeNull() + }) + it('disposes external recorder resources on unmount', async () => { const { unmount } = renderHook(() => useSTT()) diff --git a/frontend/src/hooks/useSTT.ts b/frontend/src/hooks/useSTT.ts index 909f264f..f720f533 100644 --- a/frontend/src/hooks/useSTT.ts +++ b/frontend/src/hooks/useSTT.ts @@ -139,6 +139,13 @@ export function useSTT(userId = 'default') { }, 3000) }) + recorder.setOnNoSpeech(() => { + setIsProcessing(false) + setIsRecording(false) + setInterimTranscript('') + setState('idle') + }) + recorder.setOnDataAvailable(async (blob) => { if (lastProcessedBlobRef.current === blob) { return diff --git a/frontend/src/lib/audioRecorder.test.ts b/frontend/src/lib/audioRecorder.test.ts index 5d050236..cadaddc1 100644 --- a/frontend/src/lib/audioRecorder.test.ts +++ b/frontend/src/lib/audioRecorder.test.ts @@ -337,3 +337,118 @@ describe('AudioRecorder lifecycle cancellation', () => { expect(window.AudioContext).not.toHaveBeenCalled() }) }) + +describe('AudioRecorder voice activity detection', () => { + const SAMPLE_RATE = 16000 + const msToSamples = (ms: number): number => Math.round((ms / 1000) * SAMPLE_RATE) + + let originalAudioContext: typeof window.AudioContext + let originalAudioWorkletNode: unknown + let originalGetUserMedia: (typeof navigator.mediaDevices)['getUserMedia'] | undefined + let mockWorkletNode: { port: { onmessage: ((e: MessageEvent) => void) | null; postMessage: ReturnType }; disconnect: ReturnType } + + type Frame = { samples: Int16Array; rms: number } + const feed = (rms: number, ms: number): void => { + const frame: Frame = { samples: new Int16Array(msToSamples(ms)), rms } + mockWorkletNode.port.onmessage?.({ data: frame } as MessageEvent) + } + + beforeEach(() => { + originalAudioContext = window.AudioContext + originalAudioWorkletNode = (window as any).AudioWorkletNode + originalGetUserMedia = navigator.mediaDevices?.getUserMedia + + const mockSource = { connect: vi.fn(), disconnect: vi.fn() } + mockWorkletNode = { + port: { onmessage: null, postMessage: vi.fn() }, + disconnect: vi.fn(), + } + + const MockAudioContext = vi.fn().mockImplementation(() => ({ + state: 'running', + sampleRate: SAMPLE_RATE, + audioWorklet: { addModule: vi.fn().mockResolvedValue(undefined) }, + createMediaStreamSource: vi.fn().mockReturnValue(mockSource), + createScriptProcessor: vi.fn(), + resume: vi.fn().mockResolvedValue(undefined), + close: vi.fn().mockResolvedValue(undefined), + })) + window.AudioContext = MockAudioContext as unknown as typeof window.AudioContext + + ;(window as any).AudioWorkletNode = vi.fn().mockImplementation(() => mockWorkletNode) + + const mockTrack = { stop: vi.fn(), kind: 'audio' } + Object.defineProperty(navigator, 'mediaDevices', { + value: { + getUserMedia: vi.fn().mockResolvedValue({ + getTracks: () => [mockTrack], + getAudioTracks: () => [mockTrack], + }), + }, + writable: true, + configurable: true, + }) + }) + + afterEach(() => { + window.AudioContext = originalAudioContext + ;(window as any).AudioWorkletNode = originalAudioWorkletNode + if (originalGetUserMedia) { + Object.defineProperty(navigator, 'mediaDevices', { + value: { getUserMedia: originalGetUserMedia }, + writable: true, + configurable: true, + }) + } + }) + + it('does not emit audio and signals no-speech when the recording is silent', async () => { + const onDataAvailable = vi.fn() + const onNoSpeech = vi.fn() + const recorder = new AudioRecorder() + recorder.setOnDataAvailable(onDataAvailable) + recorder.setOnNoSpeech(onNoSpeech) + + await recorder.start() + for (let i = 0; i < 5; i++) { + feed(0.0005, 100) + } + recorder.stop() + + expect(onNoSpeech).toHaveBeenCalledTimes(1) + expect(onDataAvailable).not.toHaveBeenCalled() + expect(recorder.getState()).toBe('stopped') + }) + + it('emits audio when speech is detected', async () => { + const onDataAvailable = vi.fn() + const onNoSpeech = vi.fn() + const recorder = new AudioRecorder() + recorder.setOnDataAvailable(onDataAvailable) + recorder.setOnNoSpeech(onNoSpeech) + + await recorder.start() + for (let i = 0; i < 3; i++) { + feed(0.2, 100) + } + recorder.stop() + + expect(onDataAvailable).toHaveBeenCalledTimes(1) + expect(onNoSpeech).not.toHaveBeenCalled() + }) + + it('auto-stops after trailing silence once speech has been detected', async () => { + const onDataAvailable = vi.fn() + const recorder = new AudioRecorder({ vad: { minSpeechMs: 50, silenceTimeoutMs: 200 } }) + recorder.setOnDataAvailable(onDataAvailable) + + await recorder.start() + feed(0.2, 100) + feed(0.0005, 100) + feed(0.0005, 100) + + expect(recorder.getState()).toBe('stopped') + expect(onDataAvailable).toHaveBeenCalledTimes(1) + expect(mockWorkletNode.port.onmessage).toBeNull() + }) +}) diff --git a/frontend/src/lib/audioRecorder.ts b/frontend/src/lib/audioRecorder.ts index c02721d6..1afd9891 100644 --- a/frontend/src/lib/audioRecorder.ts +++ b/frontend/src/lib/audioRecorder.ts @@ -1,8 +1,11 @@ +import { VoiceActivityDetector, type VadOptions } from './voiceActivityDetector' + export type AudioRecorderState = 'idle' | 'recording' | 'stopped' | 'error' export interface AudioRecorderOptions { sampleRate?: number channelCount?: number + vad?: Partial> } const DEFAULT_OPTIONS: AudioRecorderOptions = { @@ -49,6 +52,18 @@ export function downsampleAndConvert(input: Float32Array, inputRate: number, tar return output } +export function computeRms(samples: Int16Array): number { + if (samples.length === 0) { + return 0 + } + let sumSquares = 0 + for (let i = 0; i < samples.length; i++) { + const normalized = samples[i] / 32768 + sumSquares += normalized * normalized + } + return Math.sqrt(sumSquares / samples.length) +} + export function encodeWavFromInt16(samples: Int16Array, sampleRate: number, channels: number): Blob { const dataLength = samples.length * 2 const bufferSize = 44 + dataLength @@ -91,10 +106,12 @@ export class AudioRecorder { private state: AudioRecorderState = 'idle' private options: AudioRecorderOptions private isAborted: boolean = false + private vad: VoiceActivityDetector | null = null private onStateChange?: (state: AudioRecorderState) => void private onError?: (error: string) => void private onDataAvailable?: (blob: Blob) => void + private onNoSpeech?: () => void constructor(options: AudioRecorderOptions = {}) { this.options = { ...DEFAULT_OPTIONS, ...options } @@ -152,6 +169,10 @@ export class AudioRecorder { this.onDataAvailable = callback } + setOnNoSpeech(callback: () => void): void { + this.onNoSpeech = callback + } + private setState(newState: AudioRecorderState): void { this.state = newState this.onStateChange?.(newState) @@ -168,6 +189,10 @@ export class AudioRecorder { this.isAborted = false this.chunks = [] this.totalSamples = 0 + this.vad = new VoiceActivityDetector({ + sampleRate: this.options.sampleRate ?? 16000, + ...this.options.vad, + }) this.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: { @@ -196,9 +221,11 @@ export class AudioRecorder { this.workletNode = new AudioWorkletNode(ctx, 'recorder-processor', { processorOptions: { targetSampleRate: this.options.sampleRate }, }) - this.workletNode.port.onmessage = (e: MessageEvent) => { - this.chunks.push(e.data) - this.totalSamples += e.data.length + this.workletNode.port.onmessage = (e: MessageEvent<{ samples: Int16Array; rms: number }>) => { + const { samples, rms } = e.data + this.chunks.push(samples) + this.totalSamples += samples.length + this.handleVadFrame(rms, samples.length) } this.source.connect(this.workletNode) } else { @@ -211,6 +238,7 @@ export class AudioRecorder { const int16Chunk = downsampleAndConvert(inputData, inputRate, targetRate) this.chunks.push(int16Chunk) this.totalSamples += int16Chunk.length + this.handleVadFrame(computeRms(int16Chunk), int16Chunk.length) } this.source.connect(this.processor) } @@ -260,11 +288,26 @@ export class AudioRecorder { this.setState('idle') } + private handleVadFrame(rms: number, frameSamples: number): void { + if (!this.vad) { + return + } + const { shouldAutoStop } = this.vad.process(rms, frameSamples) + if (shouldAutoStop) { + this.stop() + } + } + private processRecording(): void { if (this.isAborted || this.chunks.length === 0 || this.totalSamples === 0) { return } + if (this.vad && !this.vad.hasSpeech) { + this.onNoSpeech?.() + return + } + try { const merged = new Int16Array(this.totalSamples) let offset = 0 diff --git a/frontend/src/lib/voiceActivityDetector.test.ts b/frontend/src/lib/voiceActivityDetector.test.ts new file mode 100644 index 00000000..1327788b --- /dev/null +++ b/frontend/src/lib/voiceActivityDetector.test.ts @@ -0,0 +1,94 @@ +import { describe, it, expect } from 'vitest' +import { VoiceActivityDetector } from './voiceActivityDetector' + +const SAMPLE_RATE = 16000 +const msToSamples = (ms: number): number => Math.round((ms / 1000) * SAMPLE_RATE) + +describe('VoiceActivityDetector', () => { + it('classifies loud frames as speech and quiet frames as silence', () => { + const vad = new VoiceActivityDetector({ sampleRate: SAMPLE_RATE }) + + expect(vad.process(0.2, msToSamples(100)).isSpeech).toBe(true) + expect(vad.process(0.0005, msToSamples(100)).isSpeech).toBe(false) + }) + + it('reports hasSpeech only after cumulative speech exceeds minSpeechMs', () => { + const vad = new VoiceActivityDetector({ sampleRate: SAMPLE_RATE, minSpeechMs: 150 }) + + vad.process(0.2, msToSamples(100)) + expect(vad.hasSpeech).toBe(false) + + vad.process(0.2, msToSamples(100)) + expect(vad.hasSpeech).toBe(true) + }) + + it('does not flag pure silence as speech', () => { + const vad = new VoiceActivityDetector({ sampleRate: SAMPLE_RATE }) + + for (let i = 0; i < 10; i++) { + vad.process(0.0008, msToSamples(100)) + } + + expect(vad.hasSpeech).toBe(false) + }) + + it('auto-stops after trailing silence once speech has started', () => { + const vad = new VoiceActivityDetector({ + sampleRate: SAMPLE_RATE, + minSpeechMs: 50, + silenceTimeoutMs: 200, + }) + + expect(vad.process(0.2, msToSamples(100)).shouldAutoStop).toBe(false) + expect(vad.process(0.0005, msToSamples(100)).shouldAutoStop).toBe(false) + expect(vad.process(0.0005, msToSamples(100)).shouldAutoStop).toBe(true) + }) + + it('does not auto-stop during leading silence before any speech', () => { + const vad = new VoiceActivityDetector({ + sampleRate: SAMPLE_RATE, + silenceTimeoutMs: 200, + }) + + for (let i = 0; i < 10; i++) { + expect(vad.process(0.0005, msToSamples(100)).shouldAutoStop).toBe(false) + } + }) + + it('never auto-stops when silenceTimeoutMs is 0', () => { + const vad = new VoiceActivityDetector({ + sampleRate: SAMPLE_RATE, + minSpeechMs: 50, + silenceTimeoutMs: 0, + }) + + vad.process(0.2, msToSamples(100)) + for (let i = 0; i < 20; i++) { + expect(vad.process(0.0005, msToSamples(100)).shouldAutoStop).toBe(false) + } + }) + + it('resets accumulated speech and silence state', () => { + const vad = new VoiceActivityDetector({ sampleRate: SAMPLE_RATE, minSpeechMs: 50 }) + + vad.process(0.2, msToSamples(100)) + expect(vad.hasSpeech).toBe(true) + + vad.reset() + expect(vad.hasSpeech).toBe(false) + expect(vad.process(0.0005, msToSamples(100)).shouldAutoStop).toBe(false) + }) + + it('resets trailing silence when speech resumes', () => { + const vad = new VoiceActivityDetector({ + sampleRate: SAMPLE_RATE, + minSpeechMs: 50, + silenceTimeoutMs: 200, + }) + + vad.process(0.2, msToSamples(100)) + vad.process(0.0005, msToSamples(100)) + vad.process(0.2, msToSamples(100)) + expect(vad.process(0.0005, msToSamples(100)).shouldAutoStop).toBe(false) + }) +}) diff --git a/frontend/src/lib/voiceActivityDetector.ts b/frontend/src/lib/voiceActivityDetector.ts new file mode 100644 index 00000000..3197a048 --- /dev/null +++ b/frontend/src/lib/voiceActivityDetector.ts @@ -0,0 +1,73 @@ +export interface VadOptions { + sampleRate: number + silenceFloor: number + speechMultiplier: number + silenceTimeoutMs: number + minSpeechMs: number + noiseFloorSmoothing: number +} + +export const DEFAULT_VAD_OPTIONS: Omit = { + silenceFloor: 0.008, + speechMultiplier: 2.5, + silenceTimeoutMs: 1500, + minSpeechMs: 150, + noiseFloorSmoothing: 0.95, +} + +export interface VadFrameResult { + isSpeech: boolean + shouldAutoStop: boolean +} + +export class VoiceActivityDetector { + private readonly options: VadOptions + private noiseFloor: number + private speechSamples = 0 + private trailingSilenceSamples = 0 + private speechStarted = false + + constructor(options: Partial & Pick) { + this.options = { ...DEFAULT_VAD_OPTIONS, ...options } + this.noiseFloor = this.options.silenceFloor + } + + process(rms: number, frameSamples: number): VadFrameResult { + const { silenceFloor, speechMultiplier, noiseFloorSmoothing, silenceTimeoutMs } = this.options + const threshold = Math.max(silenceFloor, this.noiseFloor * speechMultiplier) + const isSpeech = rms >= threshold + + if (isSpeech) { + this.speechStarted = true + this.speechSamples += frameSamples + this.trailingSilenceSamples = 0 + } else { + this.noiseFloor = noiseFloorSmoothing * this.noiseFloor + (1 - noiseFloorSmoothing) * rms + if (this.speechStarted) { + this.trailingSilenceSamples += frameSamples + } + } + + const shouldAutoStop = + silenceTimeoutMs > 0 && + this.speechStarted && + this.samplesToMs(this.trailingSilenceSamples) >= silenceTimeoutMs + + return { isSpeech, shouldAutoStop } + } + + get hasSpeech(): boolean { + return this.samplesToMs(this.speechSamples) >= this.options.minSpeechMs + } + + reset(): void { + this.noiseFloor = this.options.silenceFloor + this.speechSamples = 0 + this.trailingSilenceSamples = 0 + this.speechStarted = false + } + + private samplesToMs(samples: number): number { + return (samples / this.options.sampleRate) * 1000 + } +} From 9a56b233064fd1868ce3e5f8d05cc61f20b37438 Mon Sep 17 00:00:00 2001 From: Chris Scott <99081550+chriswritescode-dev@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:23:24 -0400 Subject: [PATCH 4/5] fix(stt): clear processing state when stopping a silent recording stopRecording() called recorder.stop() before setIsProcessing(true). For a silent take the recorder synchronously fires onNoSpeech (which sets processing false), so the subsequent setIsProcessing(true) won and the voice overlay was stuck in the processing state forever. Set processing before stop() so the recorder callback (onNoSpeech or onDataAvailable) has the final say on the processing flag. --- frontend/src/hooks/useSTT.test.tsx | 25 +++++++++++++++++++++++++ frontend/src/hooks/useSTT.ts | 3 +-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/frontend/src/hooks/useSTT.test.tsx b/frontend/src/hooks/useSTT.test.tsx index e4898043..d3a5aa82 100644 --- a/frontend/src/hooks/useSTT.test.tsx +++ b/frontend/src/hooks/useSTT.test.tsx @@ -121,6 +121,31 @@ describe('useSTT external provider lifecycle', () => { expect(result.current.error).toBeNull() }) + it('does not get stuck processing when stopping a silent recording', async () => { + const { result } = renderHook(() => useSTT()) + + await waitFor(() => { + expect(mockRecorder.setOnNoSpeech).toHaveBeenCalledTimes(1) + }) + + const onNoSpeech = mockRecorder.setOnNoSpeech.mock.calls[0][0] as () => void + mockRecorder.stop.mockImplementation(() => { + onNoSpeech() + }) + + await act(async () => { + await result.current.startRecording() + }) + + act(() => { + result.current.stopRecording() + }) + + expect(result.current.isProcessing).toBe(false) + expect(result.current.isRecording).toBe(false) + expect(result.current.isError).toBe(false) + }) + it('disposes external recorder resources on unmount', async () => { const { unmount } = renderHook(() => useSTT()) diff --git a/frontend/src/hooks/useSTT.ts b/frontend/src/hooks/useSTT.ts index f720f533..03e46bf8 100644 --- a/frontend/src/hooks/useSTT.ts +++ b/frontend/src/hooks/useSTT.ts @@ -318,12 +318,11 @@ export function useSTT(userId = 'default') { }, [isSupported, isEnabled, isExternalProvider, config.language, clearStartupTimeout, ensureAudioRecorder, runStartupWithTimeout]) const stopRecording = useCallback(() => { + setIsProcessing(true) if (isExternalProvider && audioRecorder.current) { audioRecorder.current.stop() - setIsProcessing(true) } else { recognizer.current.stop() - setIsProcessing(true) } }, [isExternalProvider]) From 7961268852f1989e2b29917ffe366d51fce7fa70 Mon Sep 17 00:00:00 2001 From: Chris Scott <99081550+chriswritescode-dev@users.noreply.github.com> Date: Mon, 1 Jun 2026 23:04:45 -0400 Subject: [PATCH 5/5] fix(stt): always emit a terminal signal so the voice overlay never sticks Two remaining stuck paths after the silent-stop fix: - processRecording() early-returned without firing onNoSpeech when a take had no captured audio (e.g. a quick start/stop under one worklet flush, totalSamples === 0). Now any non-emitting path (empty audio or no detected speech) fires onNoSpeech, while a genuine abort still returns silently. - stopRecording() set isProcessing true even when the recorder was no longer recording (double-stop or post auto-stop), where stop() skips processRecording entirely and nothing clears the flag. Guard the external path on recorder state before entering processing. --- frontend/src/hooks/useSTT.test.tsx | 19 +++++++++++++++++++ frontend/src/hooks/useSTT.ts | 6 +++++- frontend/src/lib/audioRecorder.test.ts | 15 +++++++++++++++ frontend/src/lib/audioRecorder.ts | 7 +++++-- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/frontend/src/hooks/useSTT.test.tsx b/frontend/src/hooks/useSTT.test.tsx index d3a5aa82..559f7c30 100644 --- a/frontend/src/hooks/useSTT.test.tsx +++ b/frontend/src/hooks/useSTT.test.tsx @@ -8,6 +8,7 @@ type MockRecorder = { abort: ReturnType dispose: ReturnType prepare: ReturnType + getState: ReturnType setOnStateChange: ReturnType setOnError: ReturnType setOnDataAvailable: ReturnType @@ -59,6 +60,7 @@ describe('useSTT external provider lifecycle', () => { abort: vi.fn(), dispose: vi.fn(), prepare: vi.fn().mockResolvedValue(undefined), + getState: vi.fn().mockReturnValue('recording'), setOnStateChange: vi.fn(), setOnError: vi.fn(), setOnDataAvailable: vi.fn(), @@ -146,6 +148,23 @@ describe('useSTT external provider lifecycle', () => { expect(result.current.isError).toBe(false) }) + it('ignores stopRecording when the recorder is not recording', async () => { + const { result } = renderHook(() => useSTT()) + + await waitFor(() => { + expect(mocks.AudioRecorder).toHaveBeenCalledTimes(1) + }) + + mockRecorder.getState.mockReturnValue('stopped') + + act(() => { + result.current.stopRecording() + }) + + expect(mockRecorder.stop).not.toHaveBeenCalled() + expect(result.current.isProcessing).toBe(false) + }) + it('disposes external recorder resources on unmount', async () => { const { unmount } = renderHook(() => useSTT()) diff --git a/frontend/src/hooks/useSTT.ts b/frontend/src/hooks/useSTT.ts index 03e46bf8..c820e4db 100644 --- a/frontend/src/hooks/useSTT.ts +++ b/frontend/src/hooks/useSTT.ts @@ -318,10 +318,14 @@ export function useSTT(userId = 'default') { }, [isSupported, isEnabled, isExternalProvider, config.language, clearStartupTimeout, ensureAudioRecorder, runStartupWithTimeout]) const stopRecording = useCallback(() => { - setIsProcessing(true) if (isExternalProvider && audioRecorder.current) { + if (audioRecorder.current.getState() !== 'recording') { + return + } + setIsProcessing(true) audioRecorder.current.stop() } else { + setIsProcessing(true) recognizer.current.stop() } }, [isExternalProvider]) diff --git a/frontend/src/lib/audioRecorder.test.ts b/frontend/src/lib/audioRecorder.test.ts index cadaddc1..34c47e27 100644 --- a/frontend/src/lib/audioRecorder.test.ts +++ b/frontend/src/lib/audioRecorder.test.ts @@ -420,6 +420,21 @@ describe('AudioRecorder voice activity detection', () => { expect(recorder.getState()).toBe('stopped') }) + it('signals no-speech when stopped before any audio frame is captured', async () => { + const onDataAvailable = vi.fn() + const onNoSpeech = vi.fn() + const recorder = new AudioRecorder() + recorder.setOnDataAvailable(onDataAvailable) + recorder.setOnNoSpeech(onNoSpeech) + + await recorder.start() + recorder.stop() + + expect(onNoSpeech).toHaveBeenCalledTimes(1) + expect(onDataAvailable).not.toHaveBeenCalled() + expect(recorder.getState()).toBe('stopped') + }) + it('emits audio when speech is detected', async () => { const onDataAvailable = vi.fn() const onNoSpeech = vi.fn() diff --git a/frontend/src/lib/audioRecorder.ts b/frontend/src/lib/audioRecorder.ts index 1afd9891..67788d90 100644 --- a/frontend/src/lib/audioRecorder.ts +++ b/frontend/src/lib/audioRecorder.ts @@ -299,11 +299,14 @@ export class AudioRecorder { } private processRecording(): void { - if (this.isAborted || this.chunks.length === 0 || this.totalSamples === 0) { + if (this.isAborted) { return } - if (this.vad && !this.vad.hasSpeech) { + const hasAudio = this.chunks.length > 0 && this.totalSamples > 0 + const hasSpeech = !this.vad || this.vad.hasSpeech + + if (!hasAudio || !hasSpeech) { this.onNoSpeech?.() return }