'use client'

/**
 * Voice controller.
 *
 * UX:
 * - Idle: a crossed-out microphone button. Tap = "activate the assistant"
 *   (a user gesture is required for the AudioContext to start).
 * - Active: the wake model is loaded once and the wake-word listener runs in
 *   the background (wake word: «Космо»).
 * - Wake word detected → MicVAD starts → onSpeechEnd → STT → chat (the reply
 *   is presumably rendered/spoken by a separate overlay that listens to the
 *   `voice-local` events — confirm against VoiceOverlay).
 * - Tap while `listening` = manual trigger, for when the wake word does not
 *   fire reliably.
 * - Tap in any other active state, or a long press in any state, shuts the
 *   wake listener and the microphone down completely.
 */

import { createElement, useEffect, useRef, useState } from 'react'
import { Mic, MicOff } from 'lucide-react'
import { WakeWordDetector } from '@/lib/wake-word'
import { floatToWav } from '@/lib/audio-wav'
import { vlog, vwarn, verror } from '@/lib/debug'

type Agent = 'cosmo' | 'lusya'
type ControllerState = 'idle' | 'loading' | 'listening' | 'recording' | 'busy' | 'error'

/** The subset of the MicVAD instance this component drives. */
interface VadHandle {
  start?: () => unknown
  pause?: () => unknown
  destroy?: () => unknown
}

/** The subset of the wake-word detector this component drives after start-up. */
interface WakeHandle {
  stop?: () => unknown
  pause?: () => unknown
  resume?: () => unknown
}

const AGENT: Agent = 'cosmo'
const WAKE_THRESHOLD = 0.5
/** Sample rate passed to the WAV encoder for VAD audio. */
const SAMPLE_RATE = 16000
/** Utterances shorter than 0.4 s are ignored. */
const MIN_UTTERANCE_SAMPLES = SAMPLE_RATE * 0.4
/** Press duration after which a press counts as a long press. */
const LONG_PRESS_MS = 700

/** Broadcasts a `voice-local` CustomEvent on `window` describing a voice state change. */
function emitLocal(event: string, agent: Agent, text?: string) {
  window.dispatchEvent(
    new CustomEvent('voice-local', {
      detail: { event, agent, text, timestamp: new Date().toISOString() },
    }),
  )
}

/** Reads a string field (`name` / `message`) from an unknown thrown value. */
function errorField(e: unknown, key: 'name' | 'message'): string | undefined {
  if (typeof e !== 'object' || e === null) return undefined
  const value = (e as Record<string, unknown>)[key]
  return typeof value === 'string' ? value : undefined
}

/**
 * Runs a best-effort action (sync or async). Failures are logged, never thrown.
 * The action itself is invoked synchronously, so statement order is preserved
 * even when the returned promise is not awaited.
 */
async function bestEffort(label: string, action: () => unknown): Promise<void> {
  try {
    await action()
  } catch (e) {
    vwarn(`[voice] ${label} failed (ignored):`, e)
  }
}

export default function VoiceController() {
  const [state, setState] = useState<ControllerState>('idle')
  const wakeRef = useRef<WakeHandle | null>(null)
  const vadRef = useRef<VadHandle | null>(null)
  const busyRef = useRef(false)
  // Bumped by start(), stop() and unmount. Async work captures the value and
  // bails out when it changed, so a late completion cannot revive a stopped
  // session (and keep the microphone open behind an idle UI).
  const sessionRef = useRef(0)
  // Primitive long-press detection.
  const pressTimer = useRef<ReturnType<typeof setTimeout> | null>(null)
  const longPressed = useRef(false)

  const clearPressTimer = () => {
    if (pressTimer.current) clearTimeout(pressTimer.current)
    pressTimer.current = null
  }

  useEffect(() => {
    vlog('[VoiceController] mounted, state=idle, ждём тап на микрофон')

    // The overlay's X button dispatches `voice-cancel` → pause the VAD
    // (NOT destroy — otherwise the next wake waits 1-2 s for re-initialisation).
    const onCancel = () => {
      vlog('[voice] cancel — пауза VAD')
      void bestEffort('vad.pause', () => vadRef.current?.pause?.())
      busyRef.current = false
      void bestEffort('wake.resume', () => wakeRef.current?.resume?.())
      setState(wakeRef.current ? 'listening' : 'idle')
      emitLocal('idle', AGENT)
    }
    window.addEventListener('voice-cancel', onCancel)

    return () => {
      window.removeEventListener('voice-cancel', onCancel)
      // Invalidate in-flight start()/initVAD() and the pending long-press timer.
      sessionRef.current += 1
      if (pressTimer.current) clearTimeout(pressTimer.current)
      pressTimer.current = null
      void bestEffort('vad.destroy', () => vadRef.current?.destroy?.())
      void bestEffort('wake.stop', () => wakeRef.current?.stop?.())
      vadRef.current = null
      wakeRef.current = null
    }
  }, [])

  /**
   * Handles a phrase captured by the VAD: encodes it as WAV, sends it to STT,
   * then forwards the recognised text to the chat endpoint.
   *
   * Ignored while another phrase is in flight or when the audio is shorter
   * than 0.4 s; in those cases the VAD keeps running and the wake listener
   * stays paused, so the next utterance is still captured.
   */
  const handleSpeechEnd = async (audio: Float32Array): Promise<void> => {
    if (busyRef.current) return
    if (audio.length < MIN_UTTERANCE_SAMPLES) return
    busyRef.current = true
    setState('busy')
    emitLocal('listening', AGENT)
    try {
      const wav = floatToWav(audio, SAMPLE_RATE)
      const sttResp = await fetch('/api/voice/stt', {
        method: 'POST',
        headers: { 'Content-Type': 'audio/wav' },
        body: wav,
      })
      if (!sttResp.ok) throw new Error(`stt ${sttResp.status}`)
      // The payload is external input: only accept `text` when it is a string.
      const payload = (await sttResp.json()) as { text?: unknown } | null
      const userText = typeof payload?.text === 'string' ? payload.text.trim() : ''
      if (userText.length < 2) {
        emitLocal('idle', AGENT)
        return
      }
      const chatResp = await fetch('/api/voice/chat', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: userText, agent: AGENT }),
      })
      if (!chatResp.ok) throw new Error(`chat ${chatResp.status}`)
    } catch (e) {
      verror('[voice] pipeline error:', e)
      emitLocal('error', AGENT, 'Не получилось')
    } finally {
      busyRef.current = false
      // Pause the VAD — it is reused on the next wake (no re-init).
      void bestEffort('vad.pause', () => vadRef.current?.pause?.())
      // Resume the wake listener — back to background listening.
      void bestEffort('wake.resume', () => wakeRef.current?.resume?.())
      setState((s) => (s === 'busy' ? 'listening' : s))
    }
  }

  /**
   * One-time VAD initialisation. The instance is created without being
   * started and is reused on every wake; without reuse each recording would
   * begin with a ~1-2 s pause. On failure the error is logged and emitted,
   * and `vadRef` stays null.
   */
  const initVAD = async (): Promise<void> => {
    if (vadRef.current) return
    const session = sessionRef.current
    try {
      const { MicVAD } = await import('@ricky0123/vad-web')

      // Suppress the library's "VAD |" debug logs. The original console.debug
      // is stashed on the console object so repeated inits do not nest wrappers.
      const patchable = console as Console & { _vadOrig?: Console['debug'] }
      const originalDebug = patchable._vadOrig ?? console.debug
      patchable._vadOrig = originalDebug
      console.debug = (...args: unknown[]) => {
        const first = args[0]
        if (
          typeof first === 'string' &&
          (first.startsWith('VAD |') || first.startsWith('using default audio'))
        ) {
          return
        }
        originalDebug.apply(console, args)
      }

      const vad = await MicVAD.new({
        model: 'v5',
        baseAssetPath: '/vad/',
        onnxWASMBasePath: '/vad/',
        logLevel: 'error',
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- the onnxruntime handle type is not exported to this module
        ortConfig: (ort: any) => {
          ort.env.wasm.numThreads = 1
          ort.env.wasm.simd = true
        },
        positiveSpeechThreshold: 0.6,
        negativeSpeechThreshold: 0.45,
        minSpeechMs: 160,
        redemptionMs: 750,
        onSpeechStart: () => emitLocal('wake', AGENT),
        onSpeechEnd: handleSpeechEnd,
      })

      if (session !== sessionRef.current) {
        // stop()/unmount happened while the model was loading: do not keep a
        // VAD (and its microphone stream) that nobody will ever release.
        void bestEffort('vad.destroy (stale init)', () => vad.destroy?.())
        return
      }
      vadRef.current = vad
      // start() is not called here — the wake handler starts it.
      vlog('[voice] VAD preloaded (paused)')
    } catch (e) {
      verror('[voice] VAD init failed:', errorField(e, 'name'), errorField(e, 'message'), e)
      // The wake listener is kept alive; the caller resumes it.
      emitLocal('error', AGENT, `VAD: ${errorField(e, 'message')?.slice(0, 60) || 'init'}`)
    }
  }

  /**
   * Reacts to a wake-word hit (or a manual trigger): pauses the wake listener
   * and starts the VAD, initialising it first if needed. If the VAD cannot be
   * initialised or started, the wake listener is resumed and the state returns
   * to `listening` instead of staying stuck in `recording`.
   */
  const onWakeDetected = async (score: number): Promise<void> => {
    vlog(`[wake] cosmo score=${score.toFixed(3)}`)
    if (busyRef.current) return
    const session = sessionRef.current
    // Pause wake so VAD initialisation and the spoken command do not
    // re-trigger the wake word on echo.
    void bestEffort('wake.pause', () => wakeRef.current?.pause?.())
    setState('recording')
    emitLocal('wake', AGENT)

    if (!vadRef.current) await initVAD()
    // stop()/unmount ran while the VAD was loading — nothing left to start.
    if (session !== sessionRef.current) return

    const vad = vadRef.current
    let started = false
    if (vad) {
      try {
        // start() may be sync or async depending on the library version.
        await vad.start?.()
        started = true
      } catch (e) {
        verror('[voice] VAD start failed:', e)
      }
    }
    if (!started) {
      void bestEffort('wake.resume', () => wakeRef.current?.resume?.())
      setState((s) => (s === 'recording' ? 'listening' : s))
    }
  }

  /**
   * Activates the assistant: unlocks audio playback, probes microphone
   * permission, then starts the wake-word listener. Only runs from the
   * `idle` and `error` states.
   */
  const start = async (): Promise<void> => {
    if (state !== 'idle' && state !== 'error') return
    sessionRef.current += 1
    const session = sessionRef.current
    setState('loading')

    // 0. "Audio unlock" — iOS Safari / Android Chrome refuse to play sound
    //    without a user gesture. The wake word fires on its own, so a later
    //    TTS playback would be silently rejected. Create the shared
    //    AudioContext right now (the tap is the gesture) and keep it on
    //    `window` — VoiceOverlay plays through it.
    try {
      const w = window as typeof window & {
        __voicePlaybackCtx?: AudioContext
        webkitAudioContext?: typeof AudioContext
      }
      if (!w.__voicePlaybackCtx) {
        const Ctx = w.AudioContext ?? w.webkitAudioContext
        if (Ctx) w.__voicePlaybackCtx = new Ctx()
      }
      const ctx = w.__voicePlaybackCtx
      if (ctx && ctx.state === 'suspended') await ctx.resume()
      vlog('[voice] playback AudioContext state=', ctx?.state)
    } catch (e) {
      vwarn('[voice] AudioContext init failed:', errorField(e, 'message'))
    }

    // 1. Ask for microphone permission separately; the probe stream is
    //    released immediately.
    try {
      const probe = await navigator.mediaDevices.getUserMedia({ audio: true })
      probe.getTracks().forEach((t) => t.stop())
    } catch (e) {
      const name = errorField(e, 'name')
      verror('[voice] mic permission failed:', name, errorField(e, 'message'))
      if (session !== sessionRef.current) return
      setState('error')
      emitLocal(
        'error',
        AGENT,
        name === 'NotAllowedError' ? 'Нет доступа к микрофону' : 'Микрофон не открылся',
      )
      return
    }
    if (session !== sessionRef.current) return

    // 2. Start the wake-word listener.
    try {
      // Periodically log the max score as a heartbeat, so it is visible that
      // inference is running.
      let maxScore = 0
      let scoreCount = 0
      const wake = new WakeWordDetector({
        modelPath: '/wake/cosmo.onnx',
        threshold: WAKE_THRESHOLD,
        onWake: (s) => onWakeDetected(s),
        onScore: (s) => {
          if (s > maxScore) maxScore = s
          scoreCount++
          if (scoreCount % 25 === 0) {
            vlog(`[wake] alive · max score за окно=${maxScore.toFixed(3)} · scoreCount=${scoreCount}`)
            maxScore = 0
          }
          if (s > 0.15) vlog(`[wake] score=${s.toFixed(3)}`)
        },
        onError: (e) => vwarn('[wake] error', e),
      })
      await wake.start()
      if (session !== sessionRef.current) {
        // stop()/unmount ran while the model was loading: release the listener
        // instead of installing it behind an idle UI.
        await bestEffort('wake.stop (stale start)', () => wake.stop?.())
        return
      }
      wakeRef.current = wake
      setState('listening')
      // The VAD is NOT preloaded here — its second getUserMedia interferes
      // with the wake-word audio. It is loaded on the first wake (~1-2 s) and
      // reused afterwards.
    } catch (e) {
      verror('[wake] init failed:', e)
      if (session !== sessionRef.current) return
      setState('error')
      emitLocal('error', AGENT, `Wake: ${errorField(e, 'message')?.slice(0, 60) || 'init'}`)
    }
  }

  /** Shuts everything down: destroys the VAD, stops the wake listener, returns to `idle`. */
  const stop = async (): Promise<void> => {
    // Invalidate any start()/initVAD() still in flight.
    sessionRef.current += 1
    void bestEffort('vad.pause', () => vadRef.current?.pause?.())
    void bestEffort('vad.destroy', () => vadRef.current?.destroy?.())
    vadRef.current = null
    const wake = wakeRef.current
    wakeRef.current = null
    await bestEffort('wake.stop', () => wake?.stop?.())
    setState('idle')
    emitLocal('idle', AGENT)
  }

  /**
   * Short tap: activates from `idle`/`error`, acts as a manual wake trigger
   * while `listening`, and turns everything off in any other state.
   */
  const onTap = async (): Promise<void> => {
    vlog(`[VoiceController] tap! state=${state}`)
    if (state === 'idle' || state === 'error') {
      await start()
    } else if (state === 'listening') {
      // Manual trigger — emulate a wake event.
      void onWakeDetected(1.0)
    } else {
      await stop()
    }
  }

  /** Long press always switches off (escape hatch from a bad state). */
  const onLongPress = async (): Promise<void> => {
    await stop()
  }

  const onPointerDown = () => {
    clearPressTimer()
    longPressed.current = false
    pressTimer.current = setTimeout(() => {
      longPressed.current = true
      void onLongPress()
    }, LONG_PRESS_MS)
  }

  const onPointerUp = () => {
    clearPressTimer()
    if (!longPressed.current) void onTap()
  }

  const isActive = state === 'listening' || state === 'recording' || state === 'busy'
  const isLoading = state === 'loading'

  // NOTE(review): the original markup of this component was lost (the source
  // contained an empty `return ( )`). This is a minimal reconstruction from
  // the handlers and flags computed above; restore the original styling,
  // labels and class names. Written with createElement so the module is valid
  // whether the file is compiled as .ts or .tsx.
  return createElement(
    'button',
    {
      type: 'button',
      onPointerDown,
      onPointerUp,
      // Abandon a pending long press when the pointer is cancelled or leaves.
      onPointerCancel: clearPressTimer,
      onPointerLeave: clearPressTimer,
      'aria-pressed': isActive,
      'aria-busy': isLoading,
      'data-state': state,
    },
    createElement(isActive ? Mic : MicOff, { 'aria-hidden': true }),
  )
}