feat(voice): play TTS through tablet speakers via ElevenLabs proxy
All checks were successful
Deploy / deploy (push) Successful in 2m58s

Stage 2 of voice integration — centralizes TTS on the tablet so the
Python satellite no longer needs ElevenLabs credentials or mpv.

- app/api/voice/tts — POST {text, agent}; proxies to the ElevenLabs
  streaming endpoint (model eleven_flash_v2_5 by default) and returns
  audio/mpeg. Per-agent voice ID via the COSMO_TTS_VOICE /
  LUSYA_TTS_VOICE env vars.
- VoiceOverlay — on response/error events fetches TTS and plays via
  HTMLAudioElement; on wake event stops playback (barge-in). Dismiss
  timer extended by text length so long responses do not cut off.
- Autoplay caveat: browser may block first playback until user taps
  anywhere on the page (FKB: enable Force Autoplay to bypass).
This commit is contained in:
Cosmo
2026-04-23 12:52:26 +00:00
parent 51c3d6016a
commit a780fc7bd5
2 changed files with 135 additions and 3 deletions

View File

@@ -0,0 +1,75 @@
// Next.js route segment config: opt this route out of static rendering so the
// handler runs on every request.
export const dynamic = 'force-dynamic'
// Next.js route segment config: execute this handler on the Node.js runtime.
export const runtime = 'nodejs'
import { NextResponse } from 'next/server'
// Base URL of the ElevenLabs v1 API; endpoint paths are appended to it.
const ELEVENLABS_BASE = 'https://api.elevenlabs.io/v1'
// Model id sent upstream when the ELEVENLABS_MODEL env var is unset or empty.
const DEFAULT_MODEL = 'eleven_flash_v2_5'
/**
 * Looks up the configured voice id for an agent.
 *
 * The agent `'lusya'` reads `LUSYA_TTS_VOICE`; every other value (including
 * `undefined`) reads `COSMO_TTS_VOICE`.
 *
 * @param agent - Agent name taken from the request, if any.
 * @returns The voice id, or `null` when the selected env var is unset or empty.
 */
function getVoiceId(agent: string | undefined): string | null {
  const configured =
    agent === 'lusya' ? process.env.LUSYA_TTS_VOICE : process.env.COSMO_TTS_VOICE
  // Unset and empty-string values both count as "not configured".
  return configured ? configured : null
}
/**
 * Route handler for POST requests: proxies a text-to-speech request to the
 * ElevenLabs streaming endpoint and streams the resulting MP3 back.
 *
 * Expects a JSON body `{ text: string, agent?: string }`. `agent` defaults to
 * `'cosmo'` when absent or not a string.
 *
 * Responses:
 * - 200 `audio/mpeg` — the upstream audio stream, passed through unbuffered.
 * - 400 — `text` is missing/blank or longer than 4000 characters.
 * - 503 — `ELEVENLABS_API_KEY` is unset, or no voice id is configured for the agent.
 * - 502 — ElevenLabs could not be reached, returned a non-2xx status, or
 *   returned no body.
 *
 * @param req - Incoming request carrying the JSON payload.
 * @returns The audio stream on success, otherwise a JSON error object.
 */
export async function POST(req: Request): Promise<Response> {
  const apiKey = process.env.ELEVENLABS_API_KEY
  if (!apiKey) {
    return NextResponse.json({ error: 'tts_not_configured' }, { status: 503 })
  }

  // A missing or malformed JSON body degrades to null and is then rejected
  // below as "text required" rather than throwing.
  const body = await req.json().catch(() => null)
  const text = typeof body?.text === 'string' ? body.text.trim() : ''
  const agent = typeof body?.agent === 'string' ? body.agent : 'cosmo'
  if (!text) {
    return NextResponse.json({ error: 'text required' }, { status: 400 })
  }
  if (text.length > 4000) {
    return NextResponse.json({ error: 'text too long (>4000)' }, { status: 400 })
  }

  const voiceId = getVoiceId(agent)
  if (!voiceId) {
    return NextResponse.json({ error: `no voice configured for agent=${agent}` }, { status: 503 })
  }

  const model = process.env.ELEVENLABS_MODEL || DEFAULT_MODEL

  let upstream: Response
  try {
    upstream = await fetch(
      `${ELEVENLABS_BASE}/text-to-speech/${encodeURIComponent(voiceId)}/stream?output_format=mp3_44100_64`,
      {
        method: 'POST',
        headers: {
          'xi-api-key': apiKey,
          Accept: 'audio/mpeg',
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          text,
          model_id: model,
          voice_settings: {
            stability: 0.45,
            similarity_boost: 0.75,
            style: 0.25,
            use_speaker_boost: true,
          },
        }),
        // Cancel the upstream request if the caller aborts, so synthesis does
        // not keep running for audio nobody will receive.
        signal: req.signal,
      }
    )
  } catch (err: unknown) {
    // fetch rejects on network-level failures (DNS, connection reset, abort).
    // Report them as a gateway error like any other upstream failure instead
    // of letting the handler throw.
    const detail = err instanceof Error ? err.message : String(err)
    return NextResponse.json(
      { error: 'elevenlabs_unreachable', detail: detail.slice(0, 300) },
      { status: 502 }
    )
  }

  if (!upstream.ok || !upstream.body) {
    // Include a truncated copy of the upstream error text to aid debugging.
    const errText = await upstream.text().catch(() => '')
    return NextResponse.json(
      { error: `elevenlabs_${upstream.status}`, detail: errText.slice(0, 300) },
      { status: 502 }
    )
  }

  // Pass the upstream stream straight through; the headers disable caching
  // and ask a fronting reverse proxy not to buffer the audio.
  return new Response(upstream.body, {
    headers: {
      'Content-Type': 'audio/mpeg',
      'Cache-Control': 'no-store',
      'X-Accel-Buffering': 'no',
    },
  })
}