feat(voice): server-side LLM/STT — porting Python satellite into tablet

Шаг 1 миграции голосового стека из home-voice-assistant в сам tablet: - /api/voice/chat — Claude Haiku 4.5 с tool-loop (max 4 раунда), prompt caching на system + старой истории, история в /data/voice-history/. Эмитит command/response/error в voice-bus → орб моргает как раньше. - /api/voice/stt — Groq whisper-large-v3-turbo, multipart или raw audio. - lib/voice-text.ts — порт clean_for_speech (без pymorphy3, время в именительном падеже) и strip_fillers + RESET_PATTERNS. - lib/voice-executors.ts — tool executors через loopback fetch на существующие /api/voice/tools/* и /api/voice/timer. - Поддержка ANTHROPIC_PROXY/GROQ_PROXY (fallback на HTTPS_PROXY). После деплоя нужны GROQ_API_KEY и ANTHROPIC_API_KEY в tablet.env. Шаги 2 (push-to-talk в браузере) и 3 (wake-word) — отдельно.
2026-04-27 08:24:19 +00:00
parent a97dd11f25
commit eeac2eefb3
10 changed files with 1215 additions and 4 deletions
--- a/app/api/voice/chat/route.ts
+++ b/app/api/voice/chat/route.ts
@@ -0,0 +1,155 @@
+// Route-level config exports. Presumably consumed by Next.js as route segment
+// config: no static caching of this handler, and the Node.js runtime (the
+// handler relies on Node-only modules such as `undici`) — confirm against the
+// framework version in use.
+export const dynamic = 'force-dynamic'
+export const runtime = 'nodejs'
+
+import { NextResponse } from 'next/server'
+import Anthropic from '@anthropic-ai/sdk'
+import { ProxyAgent } from 'undici'
+
+import { voiceBus } from '@/lib/voice-bus'
+import { systemPrompt } from '@/lib/voice-prompts'
+import { TOOL_SCHEMAS } from '@/lib/voice-tool-schemas'
+import { executeTool } from '@/lib/voice-executors'
+import { cleanForSpeech, stripFillers, isResetCommand } from '@/lib/voice-text'
+import {
+  loadHistory, saveHistory, resetHistory,
+  buildMessagesWithCache, stripCacheControl, HistoryMessage,
+} from '@/lib/voice-history'
+
+// Model id; overridable through ANTHROPIC_MODEL.
+const MODEL = process.env.ANTHROPIC_MODEL || 'claude-haiku-4-5'
+// Per-response output token cap; overridable through VOICE_MAX_TOKENS.
+// A malformed or non-positive override falls back to the default instead of
+// turning into NaN, which would make every API request fail validation.
+const DEFAULT_MAX_TOKENS = 300
+const parsedMaxTokens = Number.parseInt(process.env.VOICE_MAX_TOKENS || '', 10)
+const MAX_TOKENS =
+  Number.isInteger(parsedMaxTokens) && parsedMaxTokens > 0 ? parsedMaxTokens : DEFAULT_MAX_TOKENS
+// Upper bound on model round-trips (tool calls included) per voice request.
+const MAX_TOOL_ROUNDS = 4
+
+// Module-level cache so the SDK client is constructed at most once per process.
+let _client: Anthropic | null = null
+
+/**
+ * Returns the shared Anthropic client, building it on first use.
+ *
+ * Reads ANTHROPIC_API_KEY (required) and routes traffic through
+ * ANTHROPIC_PROXY, falling back to HTTPS_PROXY, when either is set.
+ *
+ * @throws Error when ANTHROPIC_API_KEY is missing or empty.
+ */
+function client(): Anthropic {
+  if (_client === null) {
+    const key = process.env.ANTHROPIC_API_KEY
+    if (!key) throw new Error('ANTHROPIC_API_KEY not set')
+
+    const proxyUrl = process.env.ANTHROPIC_PROXY || process.env.HTTPS_PROXY || ''
+    // The cast is needed because `dispatcher` is an undici extension to fetch options.
+    const fetchOptions = proxyUrl.length > 0
+      ? ({ dispatcher: new ProxyAgent(proxyUrl) } as any)
+      : undefined
+
+    _client = new Anthropic({ apiKey: key, fetchOptions })
+  }
+  return _client
+}
+
+/**
+ * Publishes a timestamped `voice` event on the shared voice bus.
+ *
+ * @param event - Event name (this file emits `command`, `response`, `error`).
+ * @param agent - Which assistant persona the event belongs to.
+ * @param text - Optional text attached to the event.
+ */
+function emitVoice(event: string, agent: 'cosmo' | 'lusya', text?: string) {
+  const timestamp = new Date().toISOString()
+  const payload = { event, agent, text, timestamp }
+  voiceBus.emit('voice', payload)
+}
+
+// Identifier of the assistant persona a request is addressed to.
+type AgentId = 'cosmo' | 'lusya'
+
+/**
+ * Voice chat endpoint: sends the user's utterance to the model, runs any
+ * requested tools (up to MAX_TOOL_ROUNDS model round-trips), persists the
+ * conversation and returns the speech-ready reply.
+ *
+ * Request body (JSON): `{ text: string, agent?: 'lusya' }`; any other `agent`
+ * value is treated as `'cosmo'`.
+ *
+ * Responses:
+ * - 400 `{ error: 'text required' }` when the body is not JSON or `text` is blank.
+ * - 200 `{ text, reset: true }` when the utterance is a reset command.
+ * - 502 `{ error: 'llm_failed', detail, text }` when the model/tool loop throws.
+ * - 200 `{ text }` otherwise (a canned message when the model produced no text).
+ *
+ * Side effects: emits `command` / `response` / `error` events on the voice bus
+ * and rewrites the agent's stored history on success.
+ */
+export async function POST(req: Request) {
+  const body = await req.json().catch(() => null)
+  if (!body || typeof body.text !== 'string' || !body.text.trim()) {
+    return NextResponse.json({ error: 'text required' }, { status: 400 })
+  }
+  const userText: string = body.text.trim()
+  const agent: AgentId = body.agent === 'lusya' ? 'lusya' : 'cosmo'
+
+  // Echo the recognized command to the voice bus.
+  emitVoice('command', agent, userText)
+
+  // Reset command: wipe the stored history and answer with a canned reply.
+  if (isResetCommand(userText)) {
+    await resetHistory(agent)
+    const msg = 'Начинаю новую сессию.'
+    emitVoice('response', agent, msg)
+    return NextResponse.json({ text: msg, reset: true })
+  }
+
+  // Load the stored history and append the new user turn.
+  const history = await loadHistory(agent)
+  history.push({ role: 'user', content: userText })
+
+  const systemBlocks: Anthropic.TextBlockParam[] = [
+    {
+      type: 'text',
+      text: systemPrompt(agent),
+      cache_control: { type: 'ephemeral' },
+    },
+  ]
+
+  const apiMessages: Anthropic.MessageParam[] = buildMessagesWithCache(history) as any
+
+  // Text produced in each round, kept separately so that rounds are joined
+  // with a space instead of being glued together ("...check.The weather...").
+  const spokenParts: string[] = []
+  // Everything in apiMessages after this index is new and gets persisted below.
+  const initialUserIdx = history.length - 1
+
+  try {
+    const c = client()
+    for (let round = 0; round < MAX_TOOL_ROUNDS; round++) {
+      const t0 = Date.now()
+      const resp = await c.messages.create({
+        model: MODEL,
+        max_tokens: MAX_TOKENS,
+        system: systemBlocks,
+        messages: apiMessages,
+        tools: TOOL_SCHEMAS,
+      })
+
+      const usage = resp.usage as any
+      console.log(
+        `[voice/chat] ${agent} round ${round + 1} ${Date.now() - t0}ms · ` +
+        `stop=${resp.stop_reason} · in=${usage?.input_tokens} out=${usage?.output_tokens} ` +
+        `cache_r=${usage?.cache_read_input_tokens || 0} cache_w=${usage?.cache_creation_input_tokens || 0}`
+      )
+
+      // Split the response content into spoken text and tool calls.
+      let roundText = ''
+      const toolUses: Anthropic.ToolUseBlock[] = []
+      for (const block of resp.content) {
+        if (block.type === 'text') roundText += block.text
+        else if (block.type === 'tool_use') toolUses.push(block)
+      }
+      if (roundText) spokenParts.push(roundText)
+
+      if (resp.stop_reason === 'tool_use' && toolUses.length) {
+        // Keep the assistant turn verbatim: tool_result blocks refer to its tool_use ids.
+        apiMessages.push({ role: 'assistant', content: resp.content as any })
+
+        const toolResults: Anthropic.ToolResultBlockParam[] = []
+        for (const tu of toolUses) {
+          console.log(`[voice/chat] tool ${tu.name}(${JSON.stringify(tu.input).slice(0, 200)})`)
+          const result = await executeTool(tu.name, tu.input, agent)
+          toolResults.push({
+            type: 'tool_result',
+            tool_use_id: tu.id,
+            content: JSON.stringify(result),
+          })
+        }
+        apiMessages.push({ role: 'user', content: toolResults })
+        continue
+      }
+
+      // end_turn / max_tokens / stop_sequence: the final answer is ready.
+      // A response cut off by max_tokens can still carry a (possibly incomplete)
+      // tool_use block. It never receives a tool_result, so persisting it would
+      // leave the saved history invalid and make later requests for this agent
+      // fail. Drop such blocks, and skip the turn entirely if nothing remains.
+      const finalContent = toolUses.length
+        ? resp.content.filter((block) => block.type !== 'tool_use')
+        : resp.content
+      if (finalContent.length) {
+        apiMessages.push({ role: 'assistant', content: finalContent as any })
+      }
+      break
+    }
+  } catch (e: unknown) {
+    const detail = e instanceof Error && e.message ? e.message : String(e)
+    console.error('[voice/chat] anthropic error:', detail)
+    const msg = 'Что-то сломалось.'
+    emitVoice('error', agent, msg)
+    return NextResponse.json({ error: 'llm_failed', detail, text: msg }, { status: 502 })
+  }
+
+  const finalText = spokenParts.join(' ')
+  if (!finalText.trim()) {
+    // Nothing to say: report it, and leave the stored history untouched.
+    const msg = 'Не получил ответ.'
+    emitVoice('error', agent, msg)
+    return NextResponse.json({ text: msg }, { status: 200 })
+  }
+
+  // Persist every turn after the initial user turn (including tool_use / tool_result).
+  const newTurns = apiMessages.slice(initialUserIdx + 1)
+  for (const turn of newTurns) {
+    history.push({
+      role: turn.role as 'user' | 'assistant',
+      content: stripCacheControl(turn.content),
+    } as HistoryMessage)
+  }
+  await saveHistory(agent, history)
+
+  const cleaned = cleanForSpeech(stripFillers(finalText))
+  emitVoice('response', agent, cleaned)
+  return NextResponse.json({ text: cleaned })
+}
--- a/app/api/voice/stt/route.ts
+++ b/app/api/voice/stt/route.ts
@@ -0,0 +1,71 @@
+// Route-level config exports. Presumably consumed by Next.js as route segment
+// config: no static caching of this handler, and the Node.js runtime (the
+// handler uses `Buffer` and a Node HTTP proxy agent) — confirm against the
+// framework version in use.
+export const dynamic = 'force-dynamic'
+export const runtime = 'nodejs'
+
+import { NextResponse } from 'next/server'
+import Groq from 'groq-sdk'
+import { HttpsProxyAgent } from 'https-proxy-agent'
+import { toFile } from 'groq-sdk/uploads'
+
+// Transcription model id; overridable through GROQ_STT_MODEL.
+const STT_MODEL = process.env.GROQ_STT_MODEL || 'whisper-large-v3-turbo'
+
+// Module-level cache so the SDK client is constructed at most once per process.
+let _client: Groq | null = null
+
+/**
+ * Returns the shared Groq client, building it on first use.
+ *
+ * Reads GROQ_API_KEY (required) and routes traffic through GROQ_PROXY,
+ * falling back to HTTPS_PROXY, when either is set.
+ *
+ * @throws Error when GROQ_API_KEY is missing or empty.
+ */
+function client(): Groq {
+  if (_client === null) {
+    const key = process.env.GROQ_API_KEY
+    if (!key) throw new Error('GROQ_API_KEY not set')
+
+    const proxyUrl = process.env.GROQ_PROXY || process.env.HTTPS_PROXY || ''
+    const httpAgent = proxyUrl.length > 0 ? new HttpsProxyAgent(proxyUrl) : undefined
+
+    _client = new Groq({ apiKey: key, httpAgent })
+  }
+  return _client
+}
+
+// File extension to use for an upload, keyed by MIME type (without parameters).
+const AUDIO_EXTENSIONS = new Map<string, string>([
+  ['audio/webm', 'webm'],
+  ['video/webm', 'webm'],
+  ['audio/wav', 'wav'],
+  ['audio/x-wav', 'wav'],
+  ['audio/wave', 'wav'],
+  ['audio/mpeg', 'mp3'],
+  ['audio/mp3', 'mp3'],
+  ['audio/mp4', 'mp4'],
+  ['audio/m4a', 'm4a'],
+  ['audio/x-m4a', 'm4a'],
+  ['audio/ogg', 'ogg'],
+  ['audio/flac', 'flac'],
+  ['audio/x-flac', 'flac'],
+])
+
+// Builds a default upload file name whose extension matches the MIME type;
+// unknown or missing types keep the historical `audio.webm`.
+function defaultAudioName(mime: string): string {
+  const essence = (mime.split(';')[0] ?? '').trim().toLowerCase()
+  return `audio.${AUDIO_EXTENSIONS.get(essence) ?? 'webm'}`
+}
+
+/**
+ * Speech-to-text endpoint.
+ *
+ * Accepts either multipart/form-data with a "file" field, or raw audio in the
+ * request body (Content-Type: audio/*, e.g. audio/webm).
+ *
+ * Responses:
+ * - 200 `{ text: string }` with the transcription (empty string if none).
+ * - 400 `{ error }` when the file field is missing, the body is empty, or the
+ *   body cannot be read.
+ * - 502 `{ error: 'stt_failed', detail }` when the transcription call throws.
+ */
+export async function POST(req: Request) {
+  let audio: { name: string; data: Buffer; mime: string }
+
+  const ct = req.headers.get('content-type') || ''
+
+  try {
+    // Media types are case-insensitive, so compare in lower case.
+    if (ct.toLowerCase().startsWith('multipart/form-data')) {
+      const fd = await req.formData()
+      const file = fd.get('file')
+      if (!(file instanceof Blob)) {
+        return NextResponse.json({ error: 'file field required' }, { status: 400 })
+      }
+      const ab = await file.arrayBuffer()
+      const mime = file.type || 'audio/webm'
+      // Only File-like entries carry a name; a bare Blob gets a derived one.
+      const uploadName = 'name' in file && typeof file.name === 'string' ? file.name : ''
+      audio = {
+        name: uploadName || defaultAudioName(mime),
+        data: Buffer.from(ab),
+        mime,
+      }
+    } else {
+      const ab = await req.arrayBuffer()
+      if (!ab.byteLength) {
+        return NextResponse.json({ error: 'empty body' }, { status: 400 })
+      }
+      audio = {
+        // Name the upload after the declared type rather than always ".webm",
+        // so the extension does not contradict the actual audio format.
+        name: defaultAudioName(ct),
+        data: Buffer.from(ab),
+        mime: ct || 'audio/webm',
+      }
+    }
+  } catch {
+    return NextResponse.json({ error: 'failed_to_read_body' }, { status: 400 })
+  }
+
+  try {
+    const file = await toFile(audio.data, audio.name, { type: audio.mime })
+    const result = await client().audio.transcriptions.create({
+      file,
+      model: STT_MODEL,
+      language: 'ru',
+    })
+    const text = (result as any).text || ''
+    return NextResponse.json({ text })
+  } catch (e: unknown) {
+    const detail = e instanceof Error && e.message ? e.message : String(e)
+    console.error('[voice/stt] groq error:', detail)
+    return NextResponse.json({ error: 'stt_failed', detail }, { status: 502 })
+  }
+}