feat(voice): play TTS through tablet speakers via ElevenLabs proxy
All checks were successful
Deploy / deploy (push) Successful in 2m58s

Stage 2 of voice integration — centralizes TTS on the tablet so the
Python satellite no longer needs ElevenLabs credentials or mpv.

- app/api/voice/tts — POST {text, agent}, proxies to ElevenLabs
  streaming endpoint with flash_v2_5 default, returns audio/mpeg.
  Per-agent voice id via COSMO_TTS_VOICE / LUSYA_TTS_VOICE env.
- VoiceOverlay — on response/error events, fetches TTS audio and plays it
  via HTMLAudioElement; on a wake event, stops playback (barge-in). The
  response dismiss timer scales with text length (80 ms per character,
  minimum 6 s) so long responses are not cut off.
- Autoplay caveat: browser may block first playback until user taps
  anywhere on the page (FKB: enable Force Autoplay to bypass).
This commit is contained in:
Cosmo
2026-04-23 12:52:26 +00:00
parent 51c3d6016a
commit a780fc7bd5
2 changed files with 135 additions and 3 deletions

View File

@@ -34,6 +34,55 @@ export default function VoiceOverlay() {
dismissTimer.current = setTimeout(() => setState('idle'), ms)
}
// The <audio> element currently playing (or about to play) a TTS clip.
const audioRef = useRef<HTMLAudioElement | null>(null)
// Object URL backing the current clip; must be revoked to free the blob.
const audioUrlRef = useRef<string | null>(null)
// Abort handle for the TTS request that is currently in flight, if any.
// Lets stopAudio() cancel a clip that has been requested but not yet played.
const ttsAbortRef = useRef<AbortController | null>(null)

/**
 * Stops text-to-speech output completely: cancels a pending TTS request,
 * halts the current playback and releases the blob URL behind it.
 * Safe to call when nothing is playing.
 */
const stopAudio = () => {
  // Cancel the in-flight request first so a late response cannot start
  // playing after a barge-in or after the component has unmounted.
  ttsAbortRef.current?.abort()
  ttsAbortRef.current = null

  const audio = audioRef.current
  audioRef.current = null
  if (audio) {
    // Detach handlers: clearing `src` below may fire events on this element.
    audio.onended = null
    audio.onerror = null
    try {
      audio.pause()
      audio.src = ''
    } catch {
      // Best effort: a failure while tearing down must not break the overlay.
    }
  }

  const url = audioUrlRef.current
  audioUrlRef.current = null
  if (url) URL.revokeObjectURL(url)
}

/**
 * Fetches speech for `textToSpeak` from the TTS proxy and plays it.
 *
 * Any previous clip or pending request is stopped first, so at most one clip
 * is ever active. If stopAudio() (or a newer playTTS call) runs while this
 * request is still loading, the request is aborted and nothing is played.
 *
 * @param textToSpeak Text to synthesize; an empty string only stops playback.
 * @param agentId Agent identifier sent to the endpoint as `agent`.
 * @returns A promise that settles once playback has started or the attempt
 *          has been abandoned; it never rejects.
 */
const playTTS = async (textToSpeak: string, agentId: Agent): Promise<void> => {
  stopAudio()
  if (!textToSpeak) return

  const controller = new AbortController()
  const { signal } = controller
  ttsAbortRef.current = controller

  try {
    const r = await fetch('/api/voice/tts', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: textToSpeak, agent: agentId }),
      signal,
    })
    if (!r.ok) {
      console.warn('TTS endpoint error:', r.status)
      return
    }
    const blob = await r.blob()
    // Superseded while the body was downloading: do not create or play anything.
    if (signal.aborted) return

    const url = URL.createObjectURL(blob)
    const audio = new Audio(url)
    // Frees this clip's resources, but only if it is still the current one.
    const release = () => {
      if (audioUrlRef.current === url) {
        URL.revokeObjectURL(url)
        audioUrlRef.current = null
      }
      if (audioRef.current === audio) audioRef.current = null
    }
    audio.onended = release
    audio.onerror = release
    audioUrlRef.current = url
    audioRef.current = audio

    try {
      await audio.play()
    } catch (err) {
      // pause() from stopAudio() rejects a pending play(); that is not an error.
      if (signal.aborted) return
      console.warn('Audio autoplay blocked:', err)
      // Nothing will retry this clip, so free the blob now instead of leaking it.
      release()
    }
  } catch (err) {
    // An aborted fetch rejects; that is the expected outcome of a barge-in.
    if (signal.aborted) return
    console.warn('TTS fetch failed:', err)
  } finally {
    if (ttsAbortRef.current === controller) ttsAbortRef.current = null
  }
}
useEffect(() => {
let es: EventSource | null = null
let retry: ReturnType<typeof setTimeout> | null = null
@@ -45,12 +94,15 @@ export default function VoiceOverlay() {
es.onmessage = (e) => {
try {
const evt: VoiceEvent = JSON.parse(e.data)
const currentAgent: Agent = evt.agent ?? agent
if (evt.agent) setAgent(evt.agent)
if (evt.event === 'wake') {
// Barge-in: cut any ongoing TTS when user speaks again
stopAudio()
setState('wake')
setText('')
scheduleDismiss(20000) // safety net: 20s max without command
scheduleDismiss(20000)
} else if (evt.event === 'command') {
setState('command')
setText(evt.text || '')
@@ -58,11 +110,13 @@ export default function VoiceOverlay() {
} else if (evt.event === 'response') {
setState('response')
setText(evt.text || '')
scheduleDismiss(6000)
if (evt.text) playTTS(evt.text, currentAgent)
scheduleDismiss(Math.max(6000, (evt.text?.length || 0) * 80))
} else if (evt.event === 'error') {
setState('error')
setText(evt.text || 'Ошибка')
scheduleDismiss(4000)
if (evt.text) playTTS(evt.text, currentAgent)
scheduleDismiss(5000)
} else if (evt.event === 'idle') {
clearDismiss()
setState('idle')
@@ -82,9 +136,12 @@ export default function VoiceOverlay() {
return () => {
closedByUs = true
clearDismiss()
stopAudio()
if (retry) clearTimeout(retry)
es?.close()
}
// agent is intentionally omitted — we always read from ref via the evt
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [])
const isActive = state !== 'idle'