feat(voice): play TTS through tablet speakers via ElevenLabs proxy
All checks were successful
Deploy / deploy (push) Successful in 2m58s
All checks were successful
Deploy / deploy (push) Successful in 2m58s
Stage 2 of voice integration — centralizes TTS on the tablet so the
Python satellite no longer needs ElevenLabs credentials or mpv.
- app/api/voice/tts — POST {text, agent}; proxies to the ElevenLabs
streaming text-to-speech endpoint (default model eleven_flash_v2_5,
overridable via ELEVENLABS_MODEL) and returns audio/mpeg.
Per-agent voice id via COSMO_TTS_VOICE / LUSYA_TTS_VOICE env.
- VoiceOverlay — on response/error events it fetches TTS audio and plays it
via HTMLAudioElement; a wake event stops playback (barge-in). For responses,
the dismiss timer scales with text length (80 ms per character, minimum 6 s)
so long responses are not cut off.
- Autoplay caveat: browser may block first playback until user taps
anywhere on the page (FKB: enable Force Autoplay to bypass).
This commit is contained in:
75
app/api/voice/tts/route.ts
Normal file
75
app/api/voice/tts/route.ts
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
// Next.js route segment config: handle this route at request time instead of
// serving a statically generated/cached result.
export const dynamic = 'force-dynamic'
|
||||||
|
// Next.js route segment config: run this handler on the Node.js runtime.
export const runtime = 'nodejs'
|
||||||
|
|
||||||
|
import { NextResponse } from 'next/server'
|
||||||
|
|
||||||
|
/** Base URL of the ElevenLabs REST API (v1). */
const ELEVENLABS_BASE = 'https://api.elevenlabs.io/v1'

/** Model id used when the `ELEVENLABS_MODEL` environment variable is not set. */
const DEFAULT_MODEL = 'eleven_flash_v2_5'

/**
 * Resolves the ElevenLabs voice id for an agent from the environment.
 *
 * `'lusya'` reads `LUSYA_TTS_VOICE`; every other value (including
 * `undefined`) reads `COSMO_TTS_VOICE`.
 *
 * @param agent - Agent identifier taken from the request body.
 * @returns The configured voice id, or `null` when the variable is unset,
 *   empty, or contains only whitespace.
 */
function getVoiceId(agent: string | undefined): string | null {
  const raw = agent === 'lusya' ? process.env.LUSYA_TTS_VOICE : process.env.COSMO_TTS_VOICE
  // Trim so a stray space/newline in the env value is neither treated as a
  // valid id nor sent upstream as part of the URL path.
  const voiceId = raw?.trim()
  return voiceId ? voiceId : null
}
|
||||||
|
|
||||||
|
/**
 * POST /api/voice/tts — proxies a text-to-speech request to ElevenLabs.
 *
 * Request body (JSON): `{ text: string, agent?: string }`. `text` is trimmed
 * and must be 1–4000 characters long; `agent` defaults to `'cosmo'`.
 *
 * Responses:
 * - 200 `audio/mpeg` — the upstream MP3 body, passed through as a stream.
 * - 400 — `text` is missing/empty or longer than 4000 characters.
 * - 499 — the caller aborted the request before ElevenLabs responded.
 * - 502 — ElevenLabs was unreachable, or answered with a non-OK status or
 *   an empty body.
 * - 503 — `ELEVENLABS_API_KEY` or the agent's voice id is not configured.
 *
 * @param req - Incoming request; its abort signal is forwarded upstream.
 * @returns The audio stream on success, otherwise a JSON error object.
 */
export async function POST(req: Request): Promise<Response> {
  const apiKey = process.env.ELEVENLABS_API_KEY
  if (!apiKey) {
    return NextResponse.json({ error: 'tts_not_configured' }, { status: 503 })
  }

  // A missing or malformed JSON body becomes `null` and is rejected by the
  // `text` check below rather than throwing.
  const body = await req.json().catch(() => null)
  const text = typeof body?.text === 'string' ? body.text.trim() : ''
  const agent = typeof body?.agent === 'string' ? body.agent : 'cosmo'

  if (!text) {
    return NextResponse.json({ error: 'text required' }, { status: 400 })
  }
  if (text.length > 4000) {
    return NextResponse.json({ error: 'text too long (>4000)' }, { status: 400 })
  }

  const voiceId = getVoiceId(agent)
  if (!voiceId) {
    return NextResponse.json({ error: `no voice configured for agent=${agent}` }, { status: 503 })
  }

  const model = process.env.ELEVENLABS_MODEL || DEFAULT_MODEL

  let upstream: Response
  try {
    upstream = await fetch(
      `${ELEVENLABS_BASE}/text-to-speech/${encodeURIComponent(voiceId)}/stream?output_format=mp3_44100_64`,
      {
        method: 'POST',
        headers: {
          'xi-api-key': apiKey,
          Accept: 'audio/mpeg',
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          text,
          model_id: model,
          voice_settings: {
            stability: 0.45,
            similarity_boost: 0.75,
            style: 0.25,
            use_speaker_boost: true,
          },
        }),
        // Cancel the upstream request when the caller goes away (e.g. the
        // client aborts its fetch on barge-in).
        signal: req.signal,
      }
    )
  } catch (err: unknown) {
    // fetch rejects on network/DNS/TLS failure or abort; report it in the
    // same JSON error shape as every other failure path.
    if (req.signal.aborted) {
      return NextResponse.json({ error: 'client_aborted' }, { status: 499 })
    }
    const detail = err instanceof Error ? err.message : String(err)
    return NextResponse.json(
      { error: 'elevenlabs_unreachable', detail: detail.slice(0, 300) },
      { status: 502 }
    )
  }

  if (!upstream.ok || !upstream.body) {
    const errText = await upstream.text().catch(() => '')
    return NextResponse.json(
      // Upstream detail is truncated to keep the error payload small.
      { error: `elevenlabs_${upstream.status}`, detail: errText.slice(0, 300) },
      { status: 502 }
    )
  }

  // Hand the upstream body to the client as-is so it is streamed.
  return new Response(upstream.body, {
    headers: {
      'Content-Type': 'audio/mpeg',
      'Cache-Control': 'no-store',
      // Hint to an nginx-style reverse proxy not to buffer the response.
      'X-Accel-Buffering': 'no',
    },
  })
}
|
||||||
@@ -34,6 +34,55 @@ export default function VoiceOverlay() {
|
|||||||
dismissTimer.current = setTimeout(() => setState('idle'), ms)
|
dismissTimer.current = setTimeout(() => setState('idle'), ms)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Audio element that is currently playing (or about to play), if any.
const audioRef = useRef<HTMLAudioElement | null>(null)
// Object URL backing the current audio element; must be revoked when done.
const audioUrlRef = useRef<string | null>(null)
// Abort handle for the TTS request that is currently in flight, if any.
const ttsAbortRef = useRef<AbortController | null>(null)

/**
 * Stops TTS immediately: cancels a pending TTS request, halts any audio that
 * is playing, and releases the blob URL that backed it.
 */
const stopAudio = () => {
  // Abort first so a request still downloading cannot start playing after
  // we were asked to stop.
  ttsAbortRef.current?.abort()
  ttsAbortRef.current = null

  if (audioRef.current) {
    try {
      audioRef.current.pause()
      audioRef.current.src = ''
    } catch {
      // Best-effort teardown; the element is dropped regardless.
    }
    audioRef.current = null
  }
  if (audioUrlRef.current) {
    URL.revokeObjectURL(audioUrlRef.current)
    audioUrlRef.current = null
  }
}

/**
 * Fetches speech for `textToSpeak` from `/api/voice/tts` and plays it.
 *
 * Any earlier playback or pending request is stopped first. If `stopAudio`
 * or a newer `playTTS` call happens while this request is still downloading,
 * the result is discarded instead of being played. Failures are logged with
 * `console.warn` and never thrown.
 *
 * @param textToSpeak - Text to synthesize; an empty string only stops audio.
 * @param agentId - Agent whose voice the endpoint should use.
 */
const playTTS = async (textToSpeak: string, agentId: Agent) => {
  stopAudio()
  if (!textToSpeak) return

  const controller = new AbortController()
  ttsAbortRef.current = controller

  try {
    const r = await fetch('/api/voice/tts', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: textToSpeak, agent: agentId }),
      signal: controller.signal,
    })
    if (!r.ok) {
      console.warn('TTS endpoint error:', r.status)
      return
    }
    const blob = await r.blob()
    // The download may have completed just as we were superseded.
    if (controller.signal.aborted) return

    const url = URL.createObjectURL(blob)
    audioUrlRef.current = url
    const audio = new Audio(url)
    audio.onended = () => {
      // Only revoke if this clip is still the current one; otherwise
      // stopAudio has already released it.
      if (audioUrlRef.current === url) {
        URL.revokeObjectURL(url)
        audioUrlRef.current = null
      }
    }
    audioRef.current = audio
    await audio.play().catch(err => {
      console.warn('Audio autoplay blocked:', err)
    })
  } catch (err) {
    // An abort is an expected outcome of barge-in, not a failure.
    if (controller.signal.aborted) return
    console.warn('TTS fetch failed:', err)
  } finally {
    // Clear the handle unless a newer request has already replaced it.
    if (ttsAbortRef.current === controller) ttsAbortRef.current = null
  }
}
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
let es: EventSource | null = null
|
let es: EventSource | null = null
|
||||||
let retry: ReturnType<typeof setTimeout> | null = null
|
let retry: ReturnType<typeof setTimeout> | null = null
|
||||||
@@ -45,12 +94,15 @@ export default function VoiceOverlay() {
|
|||||||
es.onmessage = (e) => {
|
es.onmessage = (e) => {
|
||||||
try {
|
try {
|
||||||
const evt: VoiceEvent = JSON.parse(e.data)
|
const evt: VoiceEvent = JSON.parse(e.data)
|
||||||
|
const currentAgent: Agent = evt.agent ?? agent
|
||||||
if (evt.agent) setAgent(evt.agent)
|
if (evt.agent) setAgent(evt.agent)
|
||||||
|
|
||||||
if (evt.event === 'wake') {
|
if (evt.event === 'wake') {
|
||||||
|
// Barge-in: cut any ongoing TTS when user speaks again
|
||||||
|
stopAudio()
|
||||||
setState('wake')
|
setState('wake')
|
||||||
setText('')
|
setText('')
|
||||||
scheduleDismiss(20000) // safety net: 20s max without command
|
scheduleDismiss(20000)
|
||||||
} else if (evt.event === 'command') {
|
} else if (evt.event === 'command') {
|
||||||
setState('command')
|
setState('command')
|
||||||
setText(evt.text || '')
|
setText(evt.text || '')
|
||||||
@@ -58,11 +110,13 @@ export default function VoiceOverlay() {
|
|||||||
} else if (evt.event === 'response') {
|
} else if (evt.event === 'response') {
|
||||||
setState('response')
|
setState('response')
|
||||||
setText(evt.text || '')
|
setText(evt.text || '')
|
||||||
scheduleDismiss(6000)
|
if (evt.text) playTTS(evt.text, currentAgent)
|
||||||
|
scheduleDismiss(Math.max(6000, (evt.text?.length || 0) * 80))
|
||||||
} else if (evt.event === 'error') {
|
} else if (evt.event === 'error') {
|
||||||
setState('error')
|
setState('error')
|
||||||
setText(evt.text || 'Ошибка')
|
setText(evt.text || 'Ошибка')
|
||||||
scheduleDismiss(4000)
|
if (evt.text) playTTS(evt.text, currentAgent)
|
||||||
|
scheduleDismiss(5000)
|
||||||
} else if (evt.event === 'idle') {
|
} else if (evt.event === 'idle') {
|
||||||
clearDismiss()
|
clearDismiss()
|
||||||
setState('idle')
|
setState('idle')
|
||||||
@@ -82,9 +136,12 @@ export default function VoiceOverlay() {
|
|||||||
return () => {
|
return () => {
|
||||||
closedByUs = true
|
closedByUs = true
|
||||||
clearDismiss()
|
clearDismiss()
|
||||||
|
stopAudio()
|
||||||
if (retry) clearTimeout(retry)
|
if (retry) clearTimeout(retry)
|
||||||
es?.close()
|
es?.close()
|
||||||
}
|
}
|
||||||
|
// agent is intentionally omitted — handlers use evt.agent and only fall back to the agent value captured when this effect was set up
|
||||||
|
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||||
}, [])
|
}, [])
|
||||||
|
|
||||||
const isActive = state !== 'idle'
|
const isActive = state !== 'idle'
|
||||||
|
|||||||
Reference in New Issue
Block a user