feat(voice): play TTS through tablet speakers via ElevenLabs proxy

Stage 2 of voice integration — centralizes TTS on the tablet so the Python satellite no longer needs ElevenLabs credentials or mpv.

- app/api/voice/tts — POST {text, agent}; proxies to the ElevenLabs streaming endpoint (eleven_flash_v2_5 by default) and returns audio/mpeg. Per-agent voice id via the COSMO_TTS_VOICE / LUSYA_TTS_VOICE env vars.
- VoiceOverlay — on response/error events, fetches TTS and plays it via HTMLAudioElement; on a wake event, stops playback (barge-in). The dismiss timer is extended by text length so long responses are not cut off.
- Autoplay caveat: the browser may block the first playback until the user taps anywhere on the page (FKB: enable Force Autoplay to bypass).
2026-04-23 12:52:26 +00:00
parent 51c3d6016a
commit a780fc7bd5
2 changed files with 135 additions and 3 deletions
--- a/app/api/voice/tts/route.ts
+++ b/app/api/voice/tts/route.ts
@@ -0,0 +1,75 @@
+export const dynamic = 'force-dynamic' // route segment option; presumably opts this handler out of static caching — confirm in Next.js docs
+export const runtime = 'nodejs' // route segment option; presumably selects the Node.js runtime — confirm in Next.js docs
+
+import { NextResponse } from 'next/server'
+
+const ELEVENLABS_BASE = 'https://api.elevenlabs.io/v1' // prefix of the text-to-speech URL built in POST
+const DEFAULT_MODEL = 'eleven_flash_v2_5' // model_id used when ELEVENLABS_MODEL is not set
+
+// Voice id for `agent` from env: 'lusya' reads LUSYA_TTS_VOICE, anything else COSMO_TTS_VOICE; null when unset or empty.
+function getVoiceId(agent: string | undefined): string | null {
+  return (agent === 'lusya' ? process.env.LUSYA_TTS_VOICE : process.env.COSMO_TTS_VOICE) || null
+}
+
+/**
+ * POST /api/voice/tts — body `{ text, agent? }`; proxies ElevenLabs streaming TTS as audio/mpeg.
+ * Failures are JSON: 400 (missing/too-long text), 503 (no API key or voice), 502 (upstream error).
+ */
+export async function POST(req: Request): Promise<Response> {
+  const apiKey = process.env.ELEVENLABS_API_KEY
+  if (!apiKey) {
+    return NextResponse.json({ error: 'tts_not_configured' }, { status: 503 })
+  }
+
+  // Unparseable JSON becomes null, which then fails the `text` check below with a 400.
+  const body = await req.json().catch(() => null)
+  const text = typeof body?.text === 'string' ? body.text.trim() : ''
+  const agent = typeof body?.agent === 'string' ? body.agent : 'cosmo'
+  if (!text) {
+    return NextResponse.json({ error: 'text required' }, { status: 400 })
+  }
+  if (text.length > 4000) {
+    return NextResponse.json({ error: 'text too long (>4000)' }, { status: 400 })
+  }
+
+  const voiceId = getVoiceId(agent)
+  if (!voiceId) {
+    return NextResponse.json({ error: `no voice configured for agent=${agent}` }, { status: 503 })
+  }
+
+  let upstream: Response
+  try {
+    upstream = await fetch(
+      `${ELEVENLABS_BASE}/text-to-speech/${encodeURIComponent(voiceId)}/stream?output_format=mp3_44100_64`,
+      {
+        method: 'POST',
+        headers: { 'xi-api-key': apiKey, Accept: 'audio/mpeg', 'Content-Type': 'application/json' },
+        body: JSON.stringify({
+          text,
+          model_id: process.env.ELEVENLABS_MODEL || DEFAULT_MODEL,
+          voice_settings: { stability: 0.45, similarity_boost: 0.75, style: 0.25, use_speaker_boost: true },
+        }),
+      }
+    )
+  } catch (err) {
+    // fetch() rejects when the request cannot complete; answer with a JSON 502 instead of throwing.
+    const detail = err instanceof Error ? err.message : String(err)
+    return NextResponse.json(
+      { error: 'elevenlabs_unreachable', detail: detail.slice(0, 300) },
+      { status: 502 }
+    )
+  }
+
+  if (!upstream.ok || !upstream.body) {
+    const errText = await upstream.text().catch(() => '')
+    return NextResponse.json(
+      { error: `elevenlabs_${upstream.status}`, detail: errText.slice(0, 300) },
+      { status: 502 }
+    )
+  }
+
+  // Hand the upstream body stream to the client as-is, marked uncacheable.
+  return new Response(upstream.body, {
+    headers: { 'Content-Type': 'audio/mpeg', 'Cache-Control': 'no-store', 'X-Accel-Buffering': 'no' },
+  })
+}
--- a/components/VoiceOverlay.tsx
+++ b/components/VoiceOverlay.tsx
@@ -34,6 +34,55 @@ export default function VoiceOverlay() {
    dismissTimer.current = setTimeout(() => setState('idle'), ms)
  }

+  const audioRef = useRef<HTMLAudioElement | null>(null) // audio element for the current TTS playback; null when none
+  const audioUrlRef = useRef<string | null>(null) // object URL backing that audio, kept so it can be revoked
+
+  // Halts any in-progress TTS: pauses and detaches the audio element, then frees its object URL.
+  const stopAudio = () => {
+    const player = audioRef.current
+    if (player) {
+      // Best-effort: a failure to pause/reset must not prevent the cleanup below.
+      try { player.pause(); player.src = '' } catch {}
+      audioRef.current = null
+    }
+    const objectUrl = audioUrlRef.current
+    if (!objectUrl) return
+    URL.revokeObjectURL(objectUrl)
+    audioUrlRef.current = null
+  }
+
+  // Fetches TTS audio for `textToSpeak` and plays it, replacing any current playback. The Audio
+  // element is registered in audioRef before the fetch, so a stopAudio() during the request
+  // (barge-in, or a newer playTTS call) clears the ref and this call drops its stale result.
+  const playTTS = async (textToSpeak: string, agentId: Agent) => {
+    stopAudio()
+    if (!textToSpeak) return
+    const audio = new Audio()
+    audioRef.current = audio
+    try {
+      const r = await fetch('/api/voice/tts', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ text: textToSpeak, agent: agentId }),
+      })
+      if (!r.ok) return console.warn('TTS endpoint error:', r.status)
+      const blob = await r.blob()
+      // Stopped or superseded while loading: do not play (or allocate a URL for) stale audio.
+      if (audioRef.current !== audio) return
+      const url = URL.createObjectURL(blob)
+      audioUrlRef.current = url
+      audio.onended = () => {
+        if (audioUrlRef.current !== url) return
+        URL.revokeObjectURL(url)
+        audioUrlRef.current = null
+      }
+      audio.src = url
+      await audio.play().catch(err => console.warn('Audio autoplay blocked:', err))
+    } catch (err) {
+      console.warn('TTS fetch failed:', err)
+    }
+  }
+
  useEffect(() => {
    let es: EventSource | null = null
    let retry: ReturnType<typeof setTimeout> | null = null
@@ -45,12 +94,15 @@ export default function VoiceOverlay() {
      es.onmessage = (e) => {
        try {
          const evt: VoiceEvent = JSON.parse(e.data)
+          const currentAgent: Agent = evt.agent ?? agent
          if (evt.agent) setAgent(evt.agent)

          if (evt.event === 'wake') {
+            // Barge-in: cut any ongoing TTS when user speaks again
+            stopAudio()
            setState('wake')
            setText('')
-            scheduleDismiss(20000) // safety net: 20s max without command
+            scheduleDismiss(20000)
          } else if (evt.event === 'command') {
            setState('command')
            setText(evt.text || '')
@@ -58,11 +110,13 @@ export default function VoiceOverlay() {
          } else if (evt.event === 'response') {
            setState('response')
            setText(evt.text || '')
-            scheduleDismiss(6000)
+            if (evt.text) playTTS(evt.text, currentAgent)
+            scheduleDismiss(Math.max(6000, (evt.text?.length || 0) * 80))
          } else if (evt.event === 'error') {
            setState('error')
            setText(evt.text || 'Ошибка')
-            scheduleDismiss(4000)
+            if (evt.text) playTTS(evt.text, currentAgent)
+            scheduleDismiss(5000)
          } else if (evt.event === 'idle') {
            clearDismiss()
            setState('idle')
@@ -82,9 +136,12 @@ export default function VoiceOverlay() {
    return () => {
      closedByUs = true
      clearDismiss()
+      stopAudio()
      if (retry) clearTimeout(retry)
      es?.close()
    }
+    // agent is intentionally omitted — the effect runs once; evt.agent is preferred and the captured agent is only a fallback
+    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [])

  const isActive = state !== 'idle'