Two-agent voice assistant (Cosmo + Люся) via OpenClaw Gateway. Streaming STT (Groq) + LLM + TTS (ElevenLabs) pipeline with keep-alive sessions, barge-in, and daily conversation sessions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
import io
|
|
import wave
|
|
|
|
from .config import groq_client, STT_PROVIDER, WHISPER_MODEL, WHISPER_LANG, log
|
|
|
|
|
|
def transcribe_groq_bytes(
    wav_bytes: bytes,
    *,
    model: str = "whisper-large-v3-turbo",
    language: str = "ru",
) -> str:
    """Send in-memory WAV bytes to Groq for transcription, without touching disk.

    Args:
        wav_bytes: A complete WAV file as bytes.
        model: Model identifier passed to the Groq transcription endpoint.
        language: Language code passed to the Groq transcription endpoint.

    Returns:
        The ``text`` attribute of the transcription result.

    Any exception raised by the Groq client propagates to the caller.
    """
    buf = io.BytesIO(wav_bytes)
    # The buffer is given a file name so the upload carries a ".wav" name
    # (presumably used by the client/API to detect the format - confirm).
    buf.name = "audio.wav"
    result = groq_client.audio.transcriptions.create(
        file=buf,
        model=model,
        language=language,
    )
    return result.text
|
|
|
|
|
|
def frames_to_wav(
    frames: list[bytes],
    *,
    sample_rate: int = 16000,
    channels: int = 1,
    sample_width: int = 2,
) -> bytes:
    """Pack raw PCM chunks into a complete WAV file held in memory.

    Args:
        frames: Raw PCM chunks; they are concatenated in order. An empty
            list produces a header-only WAV with zero audio frames.
        sample_rate: Frame rate written to the WAV header, in Hz.
        channels: Channel count written to the WAV header.
        sample_width: Bytes per sample written to the WAV header.

    Returns:
        The bytes of the resulting WAV file (header followed by the data).
    """
    buf = io.BytesIO()
    # The context manager closes the writer (finalising the header) even if
    # writing raises. Closing the writer leaves ``buf`` itself open, so
    # ``getvalue()`` is still valid afterwards.
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b"".join(frames))
    return buf.getvalue()
|
|
|
|
|
|
# Lazily created faster-whisper model, shared by every fallback transcription.
_whisper_model = None


def _get_whisper_model():
    """Return the shared faster-whisper model, loading it on first use."""
    global _whisper_model
    if _whisper_model is None:
        # Imported lazily so the dependency is only needed when the
        # fallback path is actually taken.
        from faster_whisper import WhisperModel

        _whisper_model = WhisperModel(
            WHISPER_MODEL, device="cpu", compute_type="int8"
        )
    return _whisper_model


def transcribe(frames: list[bytes]) -> str:
    """Transcribe raw PCM audio chunks to text.

    The chunks are wrapped into an in-memory WAV. When ``STT_PROVIDER`` is
    ``"groq"`` the WAV is sent to Groq; otherwise a local faster-whisper
    model transcribes it from a temporary file.

    Args:
        frames: Raw PCM chunks as accepted by ``frames_to_wav``.

    Returns:
        The recognised text, or ``""`` when there is no audio or when any
        error occurs (errors are logged and printed, never raised).
    """
    # No chunks, or only empty chunks: nothing to recognise, so skip the
    # encoding step and the backend call entirely.
    if not any(frames):
        return ""

    try:
        wav_bytes = frames_to_wav(frames)

        if STT_PROVIDER == "groq":
            return transcribe_groq_bytes(wav_bytes)

        # Whisper fallback: the audio is handed over as a file on disk.
        import os
        import tempfile

        # Resolve the model first so a load failure creates no temp file.
        model = _get_whisper_model()

        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            # Close the handle before the model opens the path by name.
            with tmp:
                tmp.write(wav_bytes)
            segments, _ = model.transcribe(tmp.name, language=WHISPER_LANG)
            # Consume the segments here, before the file is removed below.
            return " ".join(s.text for s in segments).strip()
        finally:
            os.unlink(tmp.name)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and fall
        # back to an empty transcript.
        log.exception("STT ошибка")
        print(f"⚠️ Ошибка распознавания речи: {e}")
        return ""
|