From a780fc7bd5b618a0a0d98fbf2d508e11e329b025 Mon Sep 17 00:00:00 2001
From: Cosmo
Date: Thu, 23 Apr 2026 12:52:26 +0000
Subject: [PATCH] feat(voice): play TTS through tablet speakers via ElevenLabs proxy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 2 of voice integration — centralizes TTS on the tablet so the
Python satellite no longer needs ElevenLabs credentials or mpv.

- app/api/voice/tts — POST {text, agent}, proxies to ElevenLabs
  streaming endpoint with flash_v2_5 default, returns audio/mpeg.
  Per-agent voice id via COSMO_TTS_VOICE / LUSYA_TTS_VOICE env.
  Replies 502 (elevenlabs_unreachable) when the upstream request itself
  fails instead of crashing the handler.
- VoiceOverlay — on response/error events fetches TTS and plays via
  HTMLAudioElement; on wake event stops playback (barge-in). Dismiss
  timer extended by text length so long responses do not cut off.
  A TTS request still in flight is aborted on barge-in, on unmount and
  when a newer response arrives, so stale audio never starts late.
- Autoplay caveat: browser may block first playback until user taps
  anywhere on the page (FKB: enable Force Autoplay to bypass).
---
 app/api/voice/tts/route.ts  | 100 +++++++++++++++++++++++++++++++++++
 components/VoiceOverlay.tsx |  79 ++++++++++++++++++++++++++--
 2 files changed, 176 insertions(+), 3 deletions(-)
 create mode 100644 app/api/voice/tts/route.ts

diff --git a/app/api/voice/tts/route.ts b/app/api/voice/tts/route.ts
new file mode 100644
index 0000000..0979241
--- /dev/null
+++ b/app/api/voice/tts/route.ts
@@ -0,0 +1,100 @@
+export const dynamic = 'force-dynamic'
+export const runtime = 'nodejs'
+
+import { NextResponse } from 'next/server'
+
+const ELEVENLABS_BASE = 'https://api.elevenlabs.io/v1'
+const DEFAULT_MODEL = 'eleven_flash_v2_5'
+const MAX_TEXT_LENGTH = 4000
+
+/**
+ * Resolve the ElevenLabs voice id for an agent from the environment.
+ * `lusya` reads LUSYA_TTS_VOICE; any other value reads COSMO_TTS_VOICE.
+ * Returns null when the variable is unset or empty.
+ */
+function getVoiceId(agent: string | undefined): string | null {
+  if (agent === 'lusya') return process.env.LUSYA_TTS_VOICE || null
+  return process.env.COSMO_TTS_VOICE || null
+}
+
+/**
+ * POST {text, agent}: proxy the text to the ElevenLabs streaming endpoint and
+ * return the audio/mpeg stream.
+ *
+ * Replies 400 for missing or over-long text, 503 when the API key or the
+ * agent's voice id is not configured, and 502 when ElevenLabs cannot be
+ * reached or answers with an error.
+ */
+export async function POST(req: Request) {
+  const apiKey = process.env.ELEVENLABS_API_KEY
+  if (!apiKey) {
+    return NextResponse.json({ error: 'tts_not_configured' }, { status: 503 })
+  }
+
+  const body = await req.json().catch(() => null)
+  const text = typeof body?.text === 'string' ? body.text.trim() : ''
+  const agent = typeof body?.agent === 'string' ? body.agent : 'cosmo'
+
+  if (!text) {
+    return NextResponse.json({ error: 'text required' }, { status: 400 })
+  }
+  if (text.length > MAX_TEXT_LENGTH) {
+    return NextResponse.json({ error: `text too long (>${MAX_TEXT_LENGTH})` }, { status: 400 })
+  }
+
+  const voiceId = getVoiceId(agent)
+  if (!voiceId) {
+    return NextResponse.json({ error: `no voice configured for agent=${agent}` }, { status: 503 })
+  }
+
+  const model = process.env.ELEVENLABS_MODEL || DEFAULT_MODEL
+
+  let upstream: Response
+  try {
+    upstream = await fetch(
+      `${ELEVENLABS_BASE}/text-to-speech/${encodeURIComponent(voiceId)}/stream?output_format=mp3_44100_64`,
+      {
+        method: 'POST',
+        headers: {
+          'xi-api-key': apiKey,
+          Accept: 'audio/mpeg',
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({
+          text,
+          model_id: model,
+          voice_settings: {
+            stability: 0.45,
+            similarity_boost: 0.75,
+            style: 0.25,
+            use_speaker_boost: true,
+          },
+        }),
+      }
+    )
+  } catch (err) {
+    // fetch() rejects on DNS or connection failures; answer with a gateway
+    // error instead of letting the handler fail with an opaque 500.
+    const detail = err instanceof Error ? err.message : String(err)
+    return NextResponse.json(
+      { error: 'elevenlabs_unreachable', detail: detail.slice(0, 300) },
+      { status: 502 }
+    )
+  }
+
+  if (!upstream.ok || !upstream.body) {
+    const errText = await upstream.text().catch(() => '')
+    return NextResponse.json(
+      { error: `elevenlabs_${upstream.status}`, detail: errText.slice(0, 300) },
+      { status: 502 }
+    )
+  }
+
+  return new Response(upstream.body, {
+    headers: {
+      'Content-Type': 'audio/mpeg',
+      'Cache-Control': 'no-store',
+      'X-Accel-Buffering': 'no',
+    },
+  })
+}
diff --git a/components/VoiceOverlay.tsx b/components/VoiceOverlay.tsx
index 5dd141c..4985be6 100644
--- a/components/VoiceOverlay.tsx
+++ b/components/VoiceOverlay.tsx
@@ -34,6 +34,71 @@ export default function VoiceOverlay() {
     dismissTimer.current = setTimeout(() => setState('idle'), ms)
   }
 
+  const audioRef = useRef<HTMLAudioElement | null>(null)
+  const audioUrlRef = useRef<string | null>(null)
+  // Controller for the TTS request currently in flight (null when none).
+  const ttsAbortRef = useRef<AbortController | null>(null)
+
+  // Stop playback, cancel any in-flight TTS request and release the blob URL.
+  const stopAudio = () => {
+    // Abort first: otherwise a fetch started before a barge-in would still
+    // resolve later and begin playing over the user.
+    ttsAbortRef.current?.abort()
+    ttsAbortRef.current = null
+    if (audioRef.current) {
+      try {
+        audioRef.current.pause()
+        audioRef.current.src = ''
+      } catch {}
+      audioRef.current = null
+    }
+    if (audioUrlRef.current) {
+      URL.revokeObjectURL(audioUrlRef.current)
+      audioUrlRef.current = null
+    }
+  }
+
+  // Fetch speech for `textToSpeak` from /api/voice/tts and play it, replacing
+  // whatever was playing or loading before.
+  const playTTS = async (textToSpeak: string, agentId: Agent) => {
+    stopAudio()
+    if (!textToSpeak) return
+    const controller = new AbortController()
+    ttsAbortRef.current = controller
+    try {
+      const r = await fetch('/api/voice/tts', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ text: textToSpeak, agent: agentId }),
+        signal: controller.signal,
+      })
+      if (!r.ok) {
+        console.warn('TTS endpoint error:', r.status)
+        return
+      }
+      const blob = await r.blob()
+      // Superseded by stopAudio() or a newer playTTS() while loading.
+      if (controller.signal.aborted) return
+      const url = URL.createObjectURL(blob)
+      audioUrlRef.current = url
+      const audio = new Audio(url)
+      audio.onended = () => {
+        if (audioUrlRef.current === url) {
+          URL.revokeObjectURL(url)
+          audioUrlRef.current = null
+        }
+      }
+      audioRef.current = audio
+      await audio.play().catch(err => {
+        console.warn('Audio autoplay blocked:', err)
+      })
+    } catch (err) {
+      // An abort is an intentional cancellation, not a failure worth logging.
+      if (controller.signal.aborted) return
+      console.warn('TTS fetch failed:', err)
+    }
+  }
+
   useEffect(() => {
     let es: EventSource | null = null
     let retry: ReturnType<typeof setTimeout> | null = null
@@ -45,12 +110,15 @@ export default function VoiceOverlay() {
       es.onmessage = (e) => {
         try {
           const evt: VoiceEvent = JSON.parse(e.data)
+          const currentAgent: Agent = evt.agent ?? agent
           if (evt.agent) setAgent(evt.agent)
 
           if (evt.event === 'wake') {
+            // Barge-in: cut any ongoing TTS when user speaks again
+            stopAudio()
             setState('wake')
             setText('')
-            scheduleDismiss(20000) // safety net: 20s max without command
+            scheduleDismiss(20000)
           } else if (evt.event === 'command') {
             setState('command')
             setText(evt.text || '')
@@ -58,11 +126,13 @@ export default function VoiceOverlay() {
           } else if (evt.event === 'response') {
             setState('response')
             setText(evt.text || '')
-            scheduleDismiss(6000)
+            if (evt.text) playTTS(evt.text, currentAgent)
+            scheduleDismiss(Math.max(6000, (evt.text?.length || 0) * 80))
           } else if (evt.event === 'error') {
             setState('error')
             setText(evt.text || 'Ошибка')
-            scheduleDismiss(4000)
+            if (evt.text) playTTS(evt.text, currentAgent)
+            scheduleDismiss(5000)
           } else if (evt.event === 'idle') {
             clearDismiss()
             setState('idle')
@@ -82,9 +152,12 @@ export default function VoiceOverlay() {
     return () => {
       closedByUs = true
       clearDismiss()
+      stopAudio()
       if (retry) clearTimeout(retry)
       es?.close()
     }
+    // agent is intentionally omitted: evt.agent is preferred; the closure value is only a mount-time fallback
+    // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [])
 
   const isActive = state !== 'idle'