refactor: VAD upgrade, retry, dead code cleanup, AGENT removal

- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS
- audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response
- llm: retry with exponential backoff on network errors and 5xx
- llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150
- tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default)
- tts: remove dead start_barge_in_listener / was_barge_in helpers
- config: drop AGENT/LUSYA_AGENT — routing happens via session_key
- modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record()
- docs: full rewrite of README and CLAUDE.md to match current architecture
2026-04-16 17:10:59 +03:00
parent a885cbe74b
commit a9001aef92
9 changed files with 541 additions and 358 deletions
--- a/satellite/tts.py
+++ b/satellite/tts.py
@@ -1,10 +1,12 @@
 import os
-import sys
 import subprocess
 import threading
 from elevenlabs import VoiceSettings

-from .config import AUDIO_SINK, AGENTS, SILENCE_THRESHOLD, log
+from .config import (
+    AUDIO_SINK, AGENTS, log,
+    BARGE_IN_ENABLED, BARGE_IN_THRESHOLD, BARGE_IN_WARMUP,
+)

 ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
@@ -40,45 +42,7 @@ def is_speaking() -> bool:
        return _current_process is not None and _current_process.poll() is None


-_barge_in_flag = threading.Event()
-
-def start_barge_in_listener():
-    """Запускает фоновый поток VAD — если услышал голос во время TTS, ставит флаг barge-in."""
-    _barge_in_flag.clear()
-
-    def _listen():
-        import pyaudio
-        import numpy as np
-        try:
-            audio = pyaudio.PyAudio()
-            stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
-                                input=True, frames_per_buffer=1024)
-            warmup = 8  # ~0.5s прогрев чтобы не словить эхо начала TTS
-            i = 0
-            while is_speaking():
-                data = stream.read(1024, exception_on_overflow=False)
-                i += 1
-                if i < warmup:
-                    continue
-                amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
-                if amplitude > SILENCE_THRESHOLD * 1.5:  # порог чуть выше чем для записи
-                    _barge_in_flag.set()
-                    stop_speaking()
-                    break
-            stream.stop_stream()
-            audio.terminate()
-        except Exception:
-            pass
-
-    t = threading.Thread(target=_listen, daemon=True)
-    t.start()
-    return t
-
-def was_barge_in() -> bool:
-    return _barge_in_flag.is_set()
-
 def _mpv_cmd() -> list[str]:
-    """Команда mpv для воспроизведения из stdin"""
    mpv_bin = os.getenv("MPV_PATH", "mpv")
    cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal"]
    if AUDIO_SINK:
@@ -87,13 +51,19 @@ def _mpv_cmd() -> list[str]:
    return cmd


-def speak(text: str, agent_id: str = "cosmo"):
+def speak(text: str, agent_id: str = "cosmo") -> bool:
+    """Озвучивает text. Если BARGE_IN_ENABLED — слушает мик и может прерваться.
+    Возвращает True если был прерван голосом."""
    try:
+        if BARGE_IN_ENABLED:
+            return _speak_with_barge_in(text, agent_id)
        _speak_elevenlabs(text, agent_id)
+        return False
    except Exception as e:
        log.exception("TTS ошибка")
        print(f"⚠️  Ошибка воспроизведения: {e}")
        play_error_sound()
+        return False


 def _speak_elevenlabs(text: str, agent_id: str):
@@ -107,11 +77,11 @@ def _speak_elevenlabs(text: str, agent_id: str):
        return

    voice_settings = VoiceSettings(
-        stability=0.4,         # ниже = живее интонация (для multilingual_v2)
+        stability=0.4,
        similarity_boost=0.8,
-        style=0.1,              # выше = эмоциональнее
+        style=0.1,
        use_speaker_boost=True,
-        speed=1.1
+        speed=1.1,
    )

    audio_stream = client.text_to_speech.convert(
@@ -120,7 +90,7 @@ def _speak_elevenlabs(text: str, agent_id: str):
        model_id=ELEVENLABS_MODEL,
        output_format="mp3_22050_32",
        voice_settings=voice_settings,
-        optimize_streaming_latency=3
+        optimize_streaming_latency=3,
    )

    with _process_lock:
@@ -148,9 +118,74 @@ def _speak_elevenlabs(text: str, agent_id: str):
                _current_process = None


+def _speak_with_barge_in(text: str, agent_id: str) -> bool:
+    """Запускает TTS в фоновом потоке и параллельно слушает мик через VAD.
+    Если обнаружена сильная речь — прерывает TTS. Возвращает True если прервали."""
+    t = threading.Thread(target=_speak_elevenlabs, args=(text, agent_id), daemon=True)
+    t.start()
+    interrupted = _listen_for_barge_in(lambda: t.is_alive())
+    t.join()
+    return interrupted
+
+
+def _listen_for_barge_in(still_alive) -> bool:
+    """Ждёт речь на входе пока still_alive() == True. Возвращает True если прервал."""
+    import pyaudio
+    import numpy as np
+    try:
+        import webrtcvad
+        vad = webrtcvad.Vad(3)  # максимум агрессивности — меньше ложных на эхо
+    except Exception:
+        vad = None
+
+    SR = 16000
+    FRAME_MS = 30
+    FRAME_SAMPLES = int(SR * FRAME_MS / 1000)
+    warmup_frames = int(BARGE_IN_WARMUP * 1000 / FRAME_MS)
+    required_speech_frames = 8  # ~240 мс подряд
+
+    try:
+        audio = pyaudio.PyAudio()
+        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=SR,
+                            input=True, frames_per_buffer=FRAME_SAMPLES)
+    except Exception as e:
+        log.warning(f"Barge-in: не открылся мик: {e}")
+        return False
+
+    interrupted = False
+    speech_streak = 0
+    i = 0
+    try:
+        while still_alive():
+            data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
+            i += 1
+            if i < warmup_frames:
+                continue
+            amplitude = float(np.abs(np.frombuffer(data, dtype=np.int16)).mean())
+            if amplitude < BARGE_IN_THRESHOLD:
+                speech_streak = 0
+                continue
+            if vad is None or vad.is_speech(data, SR):
+                speech_streak += 1
+                if speech_streak >= required_speech_frames:
+                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
+                    stop_speaking()
+                    interrupted = True
+                    break
+            else:
+                speech_streak = 0
+    except Exception:
+        log.exception("Barge-in ошибка")
+    finally:
+        try:
+            stream.stop_stream()
+            audio.terminate()
+        except Exception:
+            pass
+    return interrupted
+
+
 def _play_sound_file(filename: str, wait: bool = False):
-    """Воспроизводит файл из папки sounds/ через mpv.
-    wait=True — блокирует до конца воспроизведения."""
    sounds_dir = os.path.join(os.path.dirname(__file__), "..", "sounds")
    path = os.path.normpath(os.path.join(sounds_dir, filename))
    mpv_bin = os.getenv("MPV_PATH", "mpv")
@@ -162,7 +197,6 @@ def _play_sound_file(filename: str, wait: bool = False):


 def play_activation_sound():
-    """Звук активации — неблокирующий"""
    try:
        _play_sound_file("Success_Cosmo.mp3", wait=False)
    except Exception as e:
@@ -170,7 +204,6 @@ def play_activation_sound():


 def play_error_sound():
-    """Звук ошибки — 'не получилось'"""
    try:
        _play_sound_file("Error_Cosmo.mp3")
    except Exception as e: