Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/features/stt.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ Any OpenAI-compatible transcription API works:
- Self-hosted Whisper servers
- Local STT servers with OpenAI-compatible API

### Performance

When external voice input is enabled, the browser audio pipeline is warmed up ahead of time so that every recording, including the first, starts faster. The audio context and worklet processor are prepared without requesting microphone access, and are retained between recordings; only the microphone track is stopped after each use. Resources are released entirely when external STT is disabled, when the voice input UI unmounts, or when microphone startup times out.

This optimization applies only to the external API provider. It does not affect the initial permission prompt — the browser still asks for microphone access on the first recording.

## Using Voice Input

### Tap-to-Start / Tap-to-Stop
Expand Down
10 changes: 7 additions & 3 deletions frontend/public/audio-worklet-processor.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,16 @@ class RecorderProcessor extends AudioWorkletProcessor {
}

_flushBuffer() {
const int16 = new Int16Array(this._buffer.length)
for (let i = 0; i < this._buffer.length; i++) {
const length = this._buffer.length
const int16 = new Int16Array(length)
let sumSquares = 0
for (let i = 0; i < length; i++) {
const sample = Math.max(-1, Math.min(1, this._buffer[i]))
sumSquares += sample * sample
int16[i] = sample < 0 ? sample * 32768 : sample * 32767
}
this.port.postMessage(int16, [int16.buffer])
const rms = length > 0 ? Math.sqrt(sumSquares / length) : 0
this.port.postMessage({ samples: int16, rms }, [int16.buffer])
this._buffer = []
}
}
Expand Down
138 changes: 138 additions & 0 deletions frontend/src/hooks/useSTT.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import { describe, it, expect, vi, beforeEach } from 'vitest'
import { renderHook, act, waitFor } from '@testing-library/react'
import { useSTT } from './useSTT'

/**
 * Shape of the stand-in object returned by the mocked `AudioRecorder`:
 * each listed method is a vitest mock function.
 */
type MockRecorder = Record<
  | 'start'
  | 'stop'
  | 'abort'
  | 'dispose'
  | 'prepare'
  | 'setOnStateChange'
  | 'setOnError'
  | 'setOnDataAvailable'
  | 'setOnNoSpeech',
  ReturnType<typeof vi.fn>
>

// Mock functions shared by the module mocks below. They are created through
// vi.hoisted so the vi.mock factories can reference them.
const mocks = vi.hoisted(() => ({
useSettings: vi.fn(),
AudioRecorder: vi.fn(),
getWebSpeechRecognizer: vi.fn(),
isWebRecognitionSupported: vi.fn(),
}))

// Settings hook: each test controls the returned preferences via mocks.useSettings.
vi.mock('@/hooks/useSettings', () => ({
useSettings: mocks.useSettings,
}))

// Recorder class: replaced by a mock whose implementation is set in beforeEach.
vi.mock('@/lib/audioRecorder', () => ({
AudioRecorder: mocks.AudioRecorder,
}))

// Web Speech recognizer factory and its support check.
vi.mock('@/lib/webSpeechRecognizer', () => ({
getWebSpeechRecognizer: mocks.getWebSpeechRecognizer,
isWebRecognitionSupported: mocks.isWebRecognitionSupported,
}))

// STT section of the settings fixture: enabled, using the external provider.
const externalSTTConfig = {
  enabled: true,
  provider: 'external' as const,
  endpoint: 'https://api.openai.com',
  apiKey: 'test-key',
  model: 'whisper-1',
  language: 'en-US',
}

/** Value returned by the mocked `useSettings` in these tests. */
const externalSTTPreferences = {
  preferences: {
    stt: externalSTTConfig,
  },
}

/**
 * Lifecycle tests for `useSTT` when the external provider is configured.
 *
 * `AudioRecorder`, `useSettings` and the web speech recognizer are replaced by
 * mocks, so these tests only observe which recorder methods the hook calls and
 * which state flags it exposes.
 */
describe('useSTT external provider lifecycle', () => {
  let mockRecorder: MockRecorder

  beforeEach(() => {
    vi.clearAllMocks()

    mockRecorder = {
      start: vi.fn().mockResolvedValue(undefined),
      stop: vi.fn(),
      abort: vi.fn(),
      dispose: vi.fn(),
      prepare: vi.fn().mockResolvedValue(undefined),
      setOnStateChange: vi.fn(),
      setOnError: vi.fn(),
      setOnDataAvailable: vi.fn(),
      setOnNoSpeech: vi.fn(),
    }

    // The hook creates the recorder with `new AudioRecorder()`. A regular
    // function (not an arrow function) keeps the mock constructible, and
    // returning an object from a constructor call makes `new` yield it.
    mocks.AudioRecorder.mockImplementation(function () {
      return mockRecorder
    })
    mocks.useSettings.mockReturnValue(externalSTTPreferences)
    mocks.getWebSpeechRecognizer.mockReturnValue({
      start: vi.fn(),
      stop: vi.fn(),
      abort: vi.fn(),
      clearCallbacks: vi.fn(),
      onResult: vi.fn(),
      onInterimResult: vi.fn(),
      onError: vi.fn(),
      onEnd: vi.fn(),
      onStart: vi.fn(),
    })
    mocks.isWebRecognitionSupported.mockReturnValue(true)
  })

  it('does not start external recording until startRecording is called', async () => {
    const { result } = renderHook(() => useSTT())

    // Mounting creates one recorder and warms it up.
    await waitFor(() => {
      expect(mocks.AudioRecorder).toHaveBeenCalledTimes(1)
    })

    // Warm-up prepares the recorder and registers each callback once,
    // but does not start capturing.
    expect(mockRecorder.start).not.toHaveBeenCalled()
    expect(mockRecorder.prepare).toHaveBeenCalledTimes(1)
    expect(mockRecorder.setOnStateChange).toHaveBeenCalledTimes(1)
    expect(mockRecorder.setOnError).toHaveBeenCalledTimes(1)
    expect(mockRecorder.setOnDataAvailable).toHaveBeenCalledTimes(1)
    expect(mockRecorder.setOnNoSpeech).toHaveBeenCalledTimes(1)

    await act(async () => {
      await result.current.startRecording()
    })

    expect(mockRecorder.start).toHaveBeenCalledTimes(1)
  })

  it('clears processing without an error when no speech is detected', async () => {
    const { result } = renderHook(() => useSTT())

    await waitFor(() => {
      expect(mockRecorder.setOnNoSpeech).toHaveBeenCalledTimes(1)
    })

    // Grab the handler the hook registered and invoke it as the recorder would.
    const onNoSpeech = mockRecorder.setOnNoSpeech.mock.calls[0][0] as () => void

    act(() => {
      onNoSpeech()
    })

    expect(result.current.isProcessing).toBe(false)
    expect(result.current.isRecording).toBe(false)
    expect(result.current.isError).toBe(false)
    expect(result.current.error).toBeNull()
  })

  it('disposes external recorder resources on unmount', async () => {
    const { unmount } = renderHook(() => useSTT())

    await waitFor(() => {
      expect(mocks.AudioRecorder).toHaveBeenCalledTimes(1)
    })

    const recorder = mockRecorder

    unmount()

    // Unmount releases everything via dispose() and does not call abort().
    expect(recorder.dispose).toHaveBeenCalledTimes(1)
    expect(recorder.abort).not.toHaveBeenCalled()
  })
})
144 changes: 69 additions & 75 deletions frontend/src/hooks/useSTT.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,13 @@ export function useSTT(userId = 'default') {
}, 3000)
})

recorder.setOnNoSpeech(() => {
setIsProcessing(false)
setIsRecording(false)
setInterimTranscript('')
setState('idle')
})

recorder.setOnDataAvailable(async (blob) => {
if (lastProcessedBlobRef.current === blob) {
return
Expand Down Expand Up @@ -185,26 +192,36 @@ export function useSTT(userId = 'default') {
})
}, [])

useEffect(() => {
if (!isEnabled || !isExternalProvider) {
return
}

// Returns the shared AudioRecorder, creating it on first use and wiring its
// callbacks through setupAudioRecorder if they have not been configured yet.
const ensureAudioRecorder = useCallback((): AudioRecorder => {
  let recorder = audioRecorder.current
  if (!recorder) {
    recorder = new AudioRecorder()
    audioRecorder.current = recorder
  }

  if (recorderConfiguredRef.current) {
    return recorder
  }

  setupAudioRecorder(recorder)
  recorderConfiguredRef.current = true
  return recorder
}, [setupAudioRecorder])

// Disposes the current recorder (if any), drops the reference, and marks the
// callbacks as unconfigured so the next recorder gets wired up again.
const disposeAudioRecorder = useCallback(() => {
  const activeRecorder = audioRecorder.current
  if (activeRecorder) {
    activeRecorder.dispose()
    audioRecorder.current = null
  }
  recorderConfiguredRef.current = false
}, [])

// Warm up the external recorder while external STT is enabled, and release it
// when the effect is cleaned up (dependency change or unmount).
useEffect(() => {
  if (!isEnabled || !isExternalProvider) {
    return
  }

  // Best-effort warm-up: a failed prepare() is deliberately ignored here.
  void ensureAudioRecorder().prepare().catch(() => undefined)

  return () => {
    disposeAudioRecorder()
  }
}, [isEnabled, isExternalProvider, ensureAudioRecorder, disposeAudioRecorder])

const clearStartupTimeout = useCallback(() => {
if (startupTimeoutRef.current) {
Expand All @@ -214,8 +231,8 @@ export function useSTT(userId = 'default') {
}, [])

const abortAndResetOnTimeout = useCallback(() => {
if (isExternalProvider && audioRecorder.current) {
audioRecorder.current.abort()
if (isExternalProvider) {
disposeAudioRecorder()
} else {
recognizer.current.abort()
}
Expand All @@ -224,7 +241,38 @@ export function useSTT(userId = 'default') {
setState('idle')
setIsError(true)
setError('Microphone start timed out')
}, [isExternalProvider])
}, [isExternalProvider, disposeAudioRecorder])

// Runs `startup` against a STT_START_TIMEOUT_MS deadline.
// Resolves to true only if startup finished and `startOpId` is still the
// current start operation. On failure it resolves to false. For the current
// operation it also clears the processing flag, then either runs the timeout
// reset or records the error message.
const runStartupWithTimeout = useCallback(
  async (startup: () => Promise<void>, startOpId: number): Promise<boolean> => {
    const isCurrentAttempt = (): boolean => startOpIdRef.current === startOpId

    try {
      const pendingStartup = startup()
      const startupDeadline = new Promise<never>((_resolve, reject) => {
        startupTimeoutRef.current = setTimeout(() => {
          // A superseded attempt must not reject.
          if (isCurrentAttempt()) {
            reject(new Error('Microphone start timed out'))
          }
        }, STT_START_TIMEOUT_MS)
      })

      await Promise.race([pendingStartup, startupDeadline])
      clearStartupTimeout()

      return isCurrentAttempt()
    } catch (err) {
      clearStartupTimeout()
      if (!isCurrentAttempt()) {
        return false
      }

      setIsProcessing(false)
      const timedOut = err instanceof Error && err.message === 'Microphone start timed out'
      if (timedOut) {
        abortAndResetOnTimeout()
      } else {
        setIsError(true)
        setError(err instanceof Error ? err.message : 'Failed to start recording')
      }
      return false
    }
  },
  [clearStartupTimeout, abortAndResetOnTimeout],
)

const startRecording = useCallback(async (): Promise<boolean> => {
if (!isSupported) {
Expand All @@ -249,79 +297,25 @@ export function useSTT(userId = 'default') {
clearStartupTimeout()

if (isExternalProvider) {
if (!audioRecorder.current) {
audioRecorder.current = new AudioRecorder()
setupAudioRecorder(audioRecorder.current)
}

try {
setIsProcessing(true)

const startupPromise = audioRecorder.current.start()
const timeoutPromise = new Promise<never>((_, reject) => {
startupTimeoutRef.current = setTimeout(() => {
if (startOpIdRef.current !== startOpId) return
reject(new Error('Microphone start timed out'))
}, STT_START_TIMEOUT_MS)
})

await Promise.race([startupPromise, timeoutPromise])
clearStartupTimeout()

if (startOpIdRef.current !== startOpId) return false
const recorder = ensureAudioRecorder()

setIsProcessing(true)
const started = await runStartupWithTimeout(() => recorder.start(), startOpId)
if (started) {
setIsProcessing(false)
return true
} catch (err) {
clearStartupTimeout()
if (startOpIdRef.current !== startOpId) return false
setIsProcessing(false)
if (err instanceof Error && err.message === 'Microphone start timed out') {
abortAndResetOnTimeout()
return false
}
setIsError(true)
setError(err instanceof Error ? err.message : 'Failed to start recording')
return false
}
return started
} else {
const options: SpeechRecognitionOptions = {
language: config.language,
interimResults: true,
maxAlternatives: 1,
}

try {
setIsProcessing(true)

const startupPromise = recognizer.current.start(options)
const timeoutPromise = new Promise<never>((_, reject) => {
startupTimeoutRef.current = setTimeout(() => {
if (startOpIdRef.current !== startOpId) return
reject(new Error('Microphone start timed out'))
}, STT_START_TIMEOUT_MS)
})

await Promise.race([startupPromise, timeoutPromise])
clearStartupTimeout()

if (startOpIdRef.current !== startOpId) return false

return true
} catch (err) {
clearStartupTimeout()
if (startOpIdRef.current !== startOpId) return false
setIsProcessing(false)
if (err instanceof Error && err.message === 'Microphone start timed out') {
abortAndResetOnTimeout()
return false
}
setIsError(true)
setError(err instanceof Error ? err.message : 'Failed to start recording')
return false
}
setIsProcessing(true)
return runStartupWithTimeout(() => recognizer.current.start(options), startOpId)
}
}, [isSupported, isEnabled, isExternalProvider, config.language, setupAudioRecorder, clearStartupTimeout, abortAndResetOnTimeout])
}, [isSupported, isEnabled, isExternalProvider, config.language, clearStartupTimeout, ensureAudioRecorder, runStartupWithTimeout])

const stopRecording = useCallback(() => {
if (isExternalProvider && audioRecorder.current) {
Expand Down
Loading