refactor: VAD upgrade, retry, dead code cleanup, AGENT removal

- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS
- audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response
- llm: retry with exponential backoff on network errors and 5xx
- llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150
- tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default)
- tts: remove dead start_barge_in_listener / was_barge_in helpers
- config: drop AGENT/LUSYA_AGENT — routing happens via session_key
- modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record()
- docs: full rewrite of README and CLAUDE.md to match current architecture
This commit is contained in:
2026-04-16 17:10:59 +03:00
parent a885cbe74b
commit a9001aef92
9 changed files with 541 additions and 358 deletions

View File

@@ -1,10 +1,12 @@
import os
import sys
import subprocess
import threading
from elevenlabs import VoiceSettings
from .config import AUDIO_SINK, AGENTS, SILENCE_THRESHOLD, log
from .config import (
AUDIO_SINK, AGENTS, log,
BARGE_IN_ENABLED, BARGE_IN_THRESHOLD, BARGE_IN_WARMUP,
)
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
@@ -40,45 +42,7 @@ def is_speaking() -> bool:
return _current_process is not None and _current_process.poll() is None
_barge_in_flag = threading.Event()
def start_barge_in_listener():
    """Start a background thread that watches the microphone during TTS.

    The barge-in flag is cleared first. The worker thread then reads
    1024-sample frames at 16 kHz for as long as ``is_speaking()`` is true.
    When the mean absolute amplitude of a frame exceeds
    ``SILENCE_THRESHOLD * 1.5`` it sets the barge-in flag, calls
    ``stop_speaking()`` and exits.

    Returns:
        The started daemon ``threading.Thread``.
    """
    _barge_in_flag.clear()

    def _listen():
        audio = None
        stream = None
        try:
            import pyaudio
            import numpy as np

            audio = pyaudio.PyAudio()
            stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                                input=True, frames_per_buffer=1024)
            warmup = 8  # ~0.5 s warm-up so the echo of the TTS start is ignored
            frames_read = 0
            while is_speaking():
                data = stream.read(1024, exception_on_overflow=False)
                frames_read += 1
                # `<=` so that exactly `warmup` frames are skipped.
                if frames_read <= warmup:
                    continue
                # Widen before abs(): abs(-32768) wraps around in int16.
                samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
                amplitude = np.abs(samples).mean()
                # Threshold is slightly higher than the one used for recording.
                if amplitude > SILENCE_THRESHOLD * 1.5:
                    _barge_in_flag.set()
                    stop_speaking()
                    break
        except Exception:
            # Runs in a daemon thread: log instead of silently swallowing.
            log.exception("Barge-in ошибка")
        finally:
            # Release each resource independently; cleanup is best-effort.
            releases = []
            if stream is not None:
                releases += [stream.stop_stream, stream.close]
            if audio is not None:
                releases.append(audio.terminate)
            for release in releases:
                try:
                    release()
                except Exception:
                    pass

    t = threading.Thread(target=_listen, daemon=True)
    t.start()
    return t
def was_barge_in() -> bool:
    """Report whether the module-level barge-in event is currently set."""
    flag_is_set = _barge_in_flag.is_set()
    return flag_is_set
def _mpv_cmd() -> list[str]:
"""Команда mpv для воспроизведения из stdin"""
mpv_bin = os.getenv("MPV_PATH", "mpv")
cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal"]
if AUDIO_SINK:
@@ -87,13 +51,19 @@ def _mpv_cmd() -> list[str]:
return cmd
def speak(text: str, agent_id: str = "cosmo"):
def speak(text: str, agent_id: str = "cosmo") -> bool:
    """Speak ``text`` aloud.

    When ``BARGE_IN_ENABLED`` is truthy, playback goes through
    ``_speak_with_barge_in`` and its result is returned; otherwise
    ``_speak_elevenlabs`` is called directly.

    Any exception is logged, reported on stdout, followed by the error
    sound, and turned into a ``False`` result.

    Returns:
        True if playback was interrupted by voice, False otherwise.
    """
    interrupted = False
    try:
        if not BARGE_IN_ENABLED:
            _speak_elevenlabs(text, agent_id)
        else:
            interrupted = _speak_with_barge_in(text, agent_id)
    except Exception as err:
        log.exception("TTS ошибка")
        print(f"⚠️ Ошибка воспроизведения: {err}")
        play_error_sound()
        return False
    return interrupted
def _speak_elevenlabs(text: str, agent_id: str):
@@ -107,11 +77,11 @@ def _speak_elevenlabs(text: str, agent_id: str):
return
voice_settings = VoiceSettings(
stability=0.4, # ниже = живее интонация (для multilingual_v2)
stability=0.4,
similarity_boost=0.8,
style=0.1, # выше = эмоциональнее
style=0.1,
use_speaker_boost=True,
speed=1.1
speed=1.1,
)
audio_stream = client.text_to_speech.convert(
@@ -120,7 +90,7 @@ def _speak_elevenlabs(text: str, agent_id: str):
model_id=ELEVENLABS_MODEL,
output_format="mp3_22050_32",
voice_settings=voice_settings,
optimize_streaming_latency=3
optimize_streaming_latency=3,
)
with _process_lock:
@@ -148,9 +118,74 @@ def _speak_elevenlabs(text: str, agent_id: str):
_current_process = None
def _speak_with_barge_in(text: str, agent_id: str) -> bool:
    """Run TTS in a background thread while listening to the microphone.

    ``_speak_elevenlabs`` runs in a daemon thread, and the calling thread
    blocks in ``_listen_for_barge_in`` until that thread finishes or the
    listener interrupts playback.

    Returns:
        True if playback was interrupted by the listener.

    Raises:
        Exception: whatever ``_speak_elevenlabs`` raised in the worker
            thread, re-raised here when playback was NOT interrupted, so
            the caller's error handling still sees TTS failures.
    """
    errors: list[BaseException] = []

    def _run_tts():
        # Exceptions raised in a thread never reach the thread that started
        # it; capture them so they can be re-raised after join().
        try:
            _speak_elevenlabs(text, agent_id)
        except Exception as e:
            errors.append(e)

    t = threading.Thread(target=_run_tts, daemon=True)
    t.start()
    try:
        interrupted = _listen_for_barge_in(t.is_alive)
    finally:
        # Always wait for the TTS thread, even if the listener itself failed.
        t.join()

    # NOTE(review): after a deliberate interruption the worker presumably
    # fails on the killed player (e.g. a broken pipe); that is treated as
    # expected and not reported — confirm against _speak_elevenlabs.
    if errors and not interrupted:
        raise errors[0]
    return interrupted
def _listen_for_barge_in(still_alive) -> bool:
    """Watch the microphone for speech while ``still_alive()`` returns true.

    Frames of 30 ms at 16 kHz are read from the default input. The first
    ``BARGE_IN_WARMUP`` seconds are skipped. A frame counts as speech when
    its mean absolute amplitude reaches ``BARGE_IN_THRESHOLD`` and, if
    webrtcvad is available, the VAD also classifies it as speech. After 8
    consecutive speech frames (~240 ms) ``stop_speaking()`` is called.

    Barge-in is optional, so every failure here (missing audio libraries,
    microphone not opening, read errors) is logged and reported as
    "not interrupted" rather than raised.

    Args:
        still_alive: zero-argument callable; listening continues while it
            returns a truthy value.

    Returns:
        True if speech was detected and playback was stopped, else False.
    """
    try:
        import pyaudio
        import numpy as np
    except ImportError as e:
        # Without these the listener cannot run; let TTS play uninterrupted.
        log.warning(f"Barge-in: не удалось импортировать pyaudio/numpy: {e}")
        return False

    try:
        import webrtcvad
        vad = webrtcvad.Vad(3)  # maximum aggressiveness — fewer false hits on echo
    except Exception:
        vad = None  # fall back to the amplitude gate alone

    SR = 16000
    FRAME_MS = 30
    FRAME_SAMPLES = int(SR * FRAME_MS / 1000)
    warmup_frames = int(BARGE_IN_WARMUP * 1000 / FRAME_MS)
    required_speech_frames = 8  # ~240 ms in a row

    audio = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=SR,
                            input=True, frames_per_buffer=FRAME_SAMPLES)
    except Exception as e:
        log.warning(f"Barge-in: не открылся мик: {e}")
        # PyAudio may have initialised even though opening the stream failed.
        if audio is not None:
            try:
                audio.terminate()
            except Exception:
                pass
        return False

    interrupted = False
    speech_streak = 0
    frames_read = 0
    try:
        while still_alive():
            data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
            frames_read += 1
            # `<=` so that exactly `warmup_frames` frames are skipped.
            if frames_read <= warmup_frames:
                continue
            # Widen before abs(): abs(-32768) wraps around in int16.
            samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
            amplitude = float(np.abs(samples).mean())
            if amplitude < BARGE_IN_THRESHOLD:
                speech_streak = 0
                continue
            if vad is None or vad.is_speech(data, SR):
                speech_streak += 1
                if speech_streak >= required_speech_frames:
                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
                    stop_speaking()
                    interrupted = True
                    break
            else:
                speech_streak = 0
    except Exception:
        log.exception("Barge-in ошибка")
    finally:
        # Release each resource independently so that one failing call does
        # not skip the others; cleanup is best-effort.
        for release in (stream.stop_stream, stream.close, audio.terminate):
            try:
                release()
            except Exception:
                pass
    return interrupted
def _play_sound_file(filename: str, wait: bool = False):
"""Воспроизводит файл из папки sounds/ через mpv.
wait=True — блокирует до конца воспроизведения."""
sounds_dir = os.path.join(os.path.dirname(__file__), "..", "sounds")
path = os.path.normpath(os.path.join(sounds_dir, filename))
mpv_bin = os.getenv("MPV_PATH", "mpv")
@@ -162,7 +197,6 @@ def _play_sound_file(filename: str, wait: bool = False):
def play_activation_sound():
"""Звук активации — неблокирующий"""
try:
_play_sound_file("Success_Cosmo.mp3", wait=False)
except Exception as e:
@@ -170,7 +204,6 @@ def play_activation_sound():
def play_error_sound():
"""Звук ошибки — 'не получилось'"""
try:
_play_sound_file("Error_Cosmo.mp3")
except Exception as e: