refactor: VAD upgrade, retry, dead code cleanup, AGENT removal

- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS
- audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response
- llm: retry with exponential backoff on network errors and 5xx
- llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150
- tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default)
- tts: remove dead start_barge_in_listener / was_barge_in helpers
- config: drop AGENT/LUSYA_AGENT — routing happens via session_key
- modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record()
- docs: full rewrite of README and CLAUDE.md to match current architecture
2026-04-16 17:10:59 +03:00
parent a885cbe74b
commit a9001aef92
9 changed files with 541 additions and 358 deletions
--- a/satellite/audio.py
+++ b/satellite/audio.py
@@ -2,22 +2,59 @@ import os
 import pyaudio
 import numpy as np

-from .config import SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION, log
+from .config import (
+    SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION,
+    FOLLOWUP_TIMEOUT, VAD_AGGRESSIVENESS, log,
+)
 from .stt import transcribe

 ECHO_WARMUP = float(os.getenv("ECHO_WARMUP", "0.5"))  # сек пропуска в начале — гасит эхо от TTS

+try:
+    import webrtcvad
+    _vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
+    _VAD_OK = True
+except Exception as e:
+    log.warning(f"webrtcvad недоступен, fallback на RMS: {e}")
+    _vad = None
+    _VAD_OK = False
+
+# webrtcvad требует фрейм 10/20/30 мс при 8/16/32/48 кГц
+SAMPLE_RATE = 16000
+FRAME_MS = 30
+FRAME_SAMPLES = int(SAMPLE_RATE * FRAME_MS / 1000)  # 480
+FRAME_BYTES = FRAME_SAMPLES * 2                      # int16
+
+
+def _is_speech(frame: bytes) -> bool:
+    """Единое решение по VAD: webrtcvad + RMS-гейт, чтобы не ловить шёпот и эхо."""
+    amplitude = float(np.abs(np.frombuffer(frame, dtype=np.int16)).mean())
+    if amplitude < SILENCE_THRESHOLD:
+        return False
+    if _VAD_OK:
+        try:
+            return _vad.is_speech(frame, SAMPLE_RATE)
+        except Exception:
+            pass
+    return True  # RMS уже прошёл — считаем речью
+
+
+def record(initial_silence_timeout: float | None = None) -> str:
+    """Запись до тишины + STT.
+    initial_silence_timeout — через сколько секунд выйти если пользователь вообще не начал говорить.
+    По умолчанию FOLLOWUP_TIMEOUT (короткое ожидание после ответа бота).
+    """
+    if initial_silence_timeout is None:
+        initial_silence_timeout = FOLLOWUP_TIMEOUT

-def record() -> str:
-    """Запись до тишины (VAD) + STT. Игнорирует ECHO_WARMUP в начале."""
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=pyaudio.paInt16,
            channels=1,
-            rate=16000,
+            rate=SAMPLE_RATE,
            input=True,
-            frames_per_buffer=1024,
+            frames_per_buffer=FRAME_SAMPLES,
        )
    except Exception as e:
        log.exception("Не удалось открыть микрофон")
@@ -25,30 +62,38 @@ def record() -> str:
        return ""

    print("🎙️  Говори...")
-    frames = []
-    silent_chunks = 0
+    frames: list[bytes] = []
    speaking_started = False
-    max_chunks = int(16000 / 1024 * MAX_DURATION)
-    silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
-    warmup_chunks = int(16000 / 1024 * ECHO_WARMUP)
+    trailing_silence = 0  # фреймы тишины после начала речи
+    initial_silence  = 0  # фреймы тишины до начала речи
+
+    max_frames              = int(MAX_DURATION * 1000 / FRAME_MS)
+    warmup_frames           = int(ECHO_WARMUP * 1000 / FRAME_MS)
+    silence_frames_needed   = int(SILENCE_DURATION * 1000 / FRAME_MS)
+    initial_silence_limit   = int(initial_silence_timeout * 1000 / FRAME_MS)

    try:
-        for i in range(max_chunks):
-            data = stream.read(1024, exception_on_overflow=False)
-            if i < warmup_chunks:
-                continue  # гасим эхо от TTS / звука активации
+        for i in range(max_frames):
+            data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
+            if i < warmup_frames:
+                continue
            frames.append(data)

-            amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
-
-            if amplitude > SILENCE_THRESHOLD:
+            if _is_speech(data):
                speaking_started = True
-                silent_chunks = 0
-            elif speaking_started:
-                silent_chunks += 1
-                if silent_chunks >= silence_chunks_needed:
-                    print("🔇 Конец речи")
-                    break
+                trailing_silence = 0
+            else:
+                if speaking_started:
+                    trailing_silence += 1
+                    if trailing_silence >= silence_frames_needed:
+                        print("🔇 Конец речи")
+                        break
+                else:
+                    initial_silence += 1
+                    if initial_silence >= initial_silence_limit:
+                        print("😴 Пользователь молчит, выхожу")
+                        speaking_started = False
+                        break
    except Exception as e:
        log.exception("Ошибка при записи аудио")
        print(f"⚠️  Ошибка записи: {e}")
--- a/satellite/config.py
+++ b/satellite/config.py
@@ -20,15 +20,14 @@ logging.basicConfig(
 log = logging.getLogger("cosmo")

 # OpenClaw Gateway — Cosmo (по умолчанию)
+# Роутинг к нужному агенту делается через x-openclaw-session-key, поэтому AGENT не нужен.
 GATEWAY_URL   = os.getenv("GATEWAY_URL", "http://192.168.31.103:18789")
 GATEWAY_TOKEN = os.getenv("GATEWAY_TOKEN")
-AGENT         = os.getenv("AGENT", "openclaw/main")
 VOICE_MODEL   = os.getenv("VOICE_MODEL", "openai/gpt-4o-mini")

 # OpenClaw Gateway — Люся
 LUSYA_GATEWAY_URL   = os.getenv("LUSYA_GATEWAY_URL", "http://192.168.31.103:18790")
 LUSYA_GATEWAY_TOKEN = os.getenv("LUSYA_GATEWAY_TOKEN", GATEWAY_TOKEN)
-LUSYA_AGENT         = os.getenv("LUSYA_AGENT", "openclaw/wife")
 LUSYA_VOICE_MODEL   = os.getenv("LUSYA_VOICE_MODEL", VOICE_MODEL)

 # Keep-alive HTTP сессии — переиспользуют TCP/TLS соединения
@@ -46,20 +45,16 @@ AGENTS = {
    "cosmo": {
        "name": "Cosmo",
        "gateway_url": GATEWAY_URL,
-        "token": GATEWAY_TOKEN,
-        "agent": AGENT,
        "voice_model": VOICE_MODEL,
-        "session_key": os.getenv("COSMO_SESSION_KEY", "voice:home:cosmo"),
+        "session_key": os.getenv("COSMO_SESSION_KEY", "agent:main:voice:home"),
        "tts_voice": os.getenv("COSMO_TTS_VOICE", ""),
        "session": _make_session(GATEWAY_TOKEN),
    },
    "lusya": {
        "name": "Люся",
        "gateway_url": LUSYA_GATEWAY_URL,
-        "token": LUSYA_GATEWAY_TOKEN,
-        "agent": LUSYA_AGENT,
        "voice_model": LUSYA_VOICE_MODEL,
-        "session_key": os.getenv("LUSYA_SESSION_KEY", "voice:home:lusya"),
+        "session_key": os.getenv("LUSYA_SESSION_KEY", "agent:wife:voice:home"),
        "tts_voice": os.getenv("LUSYA_TTS_VOICE", ""),
        "session": _make_session(LUSYA_GATEWAY_TOKEN),
    },
@@ -73,6 +68,18 @@ SILENCE_THRESHOLD = int(os.getenv("SILENCE_THRESHOLD", "500"))
 SILENCE_DURATION  = float(os.getenv("SILENCE_DURATION", "1.5"))
 MAX_DURATION      = int(os.getenv("MAX_DURATION", "15"))
 FOLLOWUP_TIMEOUT  = float(os.getenv("FOLLOWUP_TIMEOUT", "8"))
+VAD_AGGRESSIVENESS = int(os.getenv("VAD_AGGRESSIVENESS", "2"))  # webrtcvad 0..3
+
+# LLM
+VOICE_MAX_TOKENS  = int(os.getenv("VOICE_MAX_TOKENS", "300"))
+LLM_RETRIES       = int(os.getenv("LLM_RETRIES", "3"))
+
+# Barge-in (прерывание TTS голосом)
+# Работает только при разнесённых колонке/мике или в наушниках — иначе эхо собственного TTS
+# будет триггерить прерывание. По умолчанию выключен.
+BARGE_IN_ENABLED   = os.getenv("BARGE_IN_ENABLED", "false").lower() in ("1", "true", "yes")
+BARGE_IN_THRESHOLD = int(os.getenv("BARGE_IN_THRESHOLD", "1500"))  # RMS, обычно > SILENCE_THRESHOLD
+BARGE_IN_WARMUP    = float(os.getenv("BARGE_IN_WARMUP", "0.8"))    # сек пропуска в начале TTS

 # Groq client
 groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
--- a/satellite/llm.py
+++ b/satellite/llm.py
@@ -1,13 +1,13 @@
 import json
 import os
 import re
+import time
 import requests

-from .config import AGENTS, log
+from .config import AGENTS, VOICE_MAX_TOKENS, LLM_RETRIES, log
 from .text import clean_for_speech, find_sentence_end
 from .tts import speak, play_error_sound

-# Ключ голосовой сессии — Cosmo работает как полноценный агент
 VOICE_SESSION_KEY = os.getenv("VOICE_SESSION_KEY", "agent:main:voice:home")

 # "stream" — режем по предложениям (быстро, но рваная интонация)
@@ -26,67 +26,86 @@ FILLER_PATTERNS = re.compile(
    r'(?:(?:сейчас посмотрю|дай мне секунду|дай секунду|проверяю|загружаю|узнаю'
    r'|смотрю|одну секунду|я сейчас посмотрю|я проверю|попробую другой источник'
    r'|нужны конкретные числа|дай мне загрузить)[^.!?]*[.!?]?\s*)+',
-    re.IGNORECASE
+    re.IGNORECASE,
 )

+
 def strip_fillers(text: str) -> str:
    return FILLER_PATTERNS.sub('', text).strip()


-
-
 def is_reset_command(text: str) -> bool:
    return bool(RESET_PATTERNS.search(text))


-def ask_agent_stream(text: str, conv=None, agent_id: str = "cosmo") -> str:
+def _post_with_retry(session, url, headers, payload):
+    """POST с экспоненциальным backoff. Retry на сетевые ошибки и 5xx; 4xx — сразу вверх."""
+    last_exc = None
+    for attempt in range(LLM_RETRIES):
+        try:
+            resp = session.post(url, headers=headers, json=payload, stream=True, timeout=60)
+            if resp.status_code >= 500:
+                raise requests.HTTPError(f"{resp.status_code} {resp.text[:200]}", response=resp)
+            resp.raise_for_status()
+            return resp
+        except (requests.ConnectionError, requests.Timeout, requests.HTTPError) as e:
+            last_exc = e
+            # 4xx (кроме 408/429) не ретраим
+            resp = getattr(e, "response", None)
+            if isinstance(e, requests.HTTPError) and resp is not None:
+                if resp.status_code < 500 and resp.status_code not in (408, 429):
+                    raise
+            if attempt == LLM_RETRIES - 1:
+                raise
+            delay = 0.5 * (2 ** attempt)
+            log.warning(f"Gateway retry {attempt + 1}/{LLM_RETRIES} через {delay:.1f}s: {e}")
+            time.sleep(delay)
+    raise last_exc  # unreachable
+
+
+def ask_agent_stream(text: str, agent_id: str = "cosmo") -> str:
    """Отправляет запрос к OpenClaw gateway и озвучивает ответ."""
    def _maybe_speak(t: str):
        if t.strip():
            speak(t, agent_id)

    cfg = AGENTS.get(agent_id, AGENTS["cosmo"])
-    gateway_url = cfg["gateway_url"]
-    session = cfg["session"]
-    agent = cfg["agent"]
-
    session_key = cfg.get("session_key", VOICE_SESSION_KEY)

+    payload = {
+        "stream": True,
+        "messages": [{"role": "user", "content": text}],
+        "max_tokens": VOICE_MAX_TOKENS,
+    }
+    headers = {
+        "x-ocplatform-model": cfg["voice_model"],
+        "x-openclaw-session-key": session_key,
+    }
+
    try:
-        resp = session.post(
-            f"{gateway_url}/v1/chat/completions",
-            headers={
-                "x-ocplatform-model": cfg["voice_model"],
-                "x-openclaw-session-key": session_key,
-            },
-            json={
-                "model": agent,
-                "stream": True,
-                "messages": [{"role": "user", "content": text}],
-                "max_tokens": 150,
-            },
-            stream=True,
-            timeout=60,
+        resp = _post_with_retry(
+            cfg["session"], f"{cfg['gateway_url']}/v1/chat/completions", headers, payload,
        )
-        resp.raise_for_status()
    except requests.ConnectionError:
-        log.exception("Gateway недоступен")
+        log.exception("Gateway недоступен после retry")
        msg = "Не могу связаться с сервером, попробуй ещё раз."
        print(f"⚠️  {msg}")
        play_error_sound()
        _maybe_speak(msg)
        return msg
    except requests.Timeout:
-        log.exception("Gateway таймаут")
+        log.exception("Gateway таймаут после retry")
        msg = "Сервер не ответил вовремя, попробуй ещё раз."
        print(f"⚠️  {msg}")
        play_error_sound()
        _maybe_speak(msg)
        return msg
-    except requests.HTTPError:
-        log.exception(f"Gateway HTTP ошибка {resp.status_code}")
+    except requests.HTTPError as e:
+        status = e.response.status_code if e.response is not None else "?"
+        body = e.response.text if e.response is not None else ""
+        log.exception(f"Gateway HTTP {status}")
        msg = "Ошибка сервера, попробуй ещё раз."
-        print(f"⚠️  Gateway {resp.status_code}: {resp.text}")
+        print(f"⚠️  Gateway {status}: {body[:200]}")
        play_error_sound()
        _maybe_speak(msg)
        return msg
@@ -98,25 +117,25 @@ def ask_agent_stream(text: str, conv=None, agent_id: str = "cosmo") -> str:
        for line in resp.iter_lines():
            if not line or line == b"data: [DONE]":
                continue
-            if line.startswith(b"data: "):
-                try:
-                    chunk = json.loads(line[6:])
-                    delta = chunk["choices"][0]["delta"].get("content", "")
-                    if not delta:
-                        continue
-
-                    full_text += delta
-                    buffer += delta
-
-                    if TTS_MODE == "stream":
-                        last_punct = find_sentence_end(buffer, min_len=120)
-                        if last_punct > -1:
-                            sentence = clean_for_speech(buffer[:last_punct + 1])
-                            _maybe_speak(sentence)
-                            buffer = buffer[last_punct + 1:].lstrip()
-
-                except (json.JSONDecodeError, KeyError, IndexError):
+            if not line.startswith(b"data: "):
+                continue
+            try:
+                chunk = json.loads(line[6:])
+                delta = chunk["choices"][0]["delta"].get("content", "")
+                if not delta:
                    continue
+
+                full_text += delta
+                buffer += delta
+
+                if TTS_MODE == "stream":
+                    last_punct = find_sentence_end(buffer, min_len=120)
+                    if last_punct > -1:
+                        sentence = clean_for_speech(strip_fillers(buffer[:last_punct + 1]))
+                        _maybe_speak(sentence)
+                        buffer = buffer[last_punct + 1:].lstrip()
+            except (json.JSONDecodeError, KeyError, IndexError):
+                continue
    except Exception as e:
        log.exception("Ошибка при чтении стрима")
        print(f"⚠️  Стрим прервался: {e}")
@@ -132,6 +151,6 @@ def ask_agent_stream(text: str, conv=None, agent_id: str = "cosmo") -> str:
        _maybe_speak(result)
    else:
        if buffer.strip():
-            _maybe_speak(clean_for_speech(buffer))
+            _maybe_speak(clean_for_speech(strip_fillers(buffer)))

    return result
--- a/satellite/modes.py
+++ b/satellite/modes.py
@@ -1,9 +1,8 @@
 import os
-import sys

-from .config import GATEWAY_URL, AGENT, AGENTS, log
+from .config import GATEWAY_URL, AGENTS, FOLLOWUP_TIMEOUT, MAX_DURATION, log
 from .audio import record
-from .tts import speak, stop_speaking, is_speaking, start_barge_in_listener, was_barge_in
+from .tts import speak, stop_speaking
 from .llm import ask_agent_stream, is_reset_command, VOICE_SESSION_KEY

 WAKE_THRESHOLD = float(os.getenv("WAKE_THRESHOLD", "0.5"))
@@ -24,7 +23,6 @@ def _handle_reset(text: str, agent_id: str) -> bool:
                "x-openclaw-session-key": cfg.get("session_key", VOICE_SESSION_KEY),
            },
            json={
-                "model": cfg["agent"],
                "stream": False,
                "messages": [{"role": "user", "content": "/new"}],
            },
@@ -40,11 +38,15 @@ def _handle_reset(text: str, agent_id: str) -> bool:


 def _conversation_loop(agent_id: str, agent_name: str = "Cosmo"):
-    """Основной цикл диалога — слушает и отвечает пока пользователь говорит."""
+    """Основной цикл диалога.
+    Первая запись — с большим таймаутом (MAX_DURATION), дальше — короткий FOLLOWUP_TIMEOUT."""
+    first = True
    while True:
-        text = record()
+        timeout = MAX_DURATION if first else FOLLOWUP_TIMEOUT
+        first = False
+        text = record(initial_silence_timeout=timeout)
        if not text:
-            print(f"😴 Тишина, жду активации...\n")
+            print("😴 Тишина, жду активации...\n")
            return

        print(f"📝 Ты → {agent_name}: {text}")
@@ -59,7 +61,6 @@ def _conversation_loop(agent_id: str, agent_name: str = "Cosmo"):
 def run_with_enter():
    print("\n🦞 Cosmo Satellite запущен (режим: Enter для активации)")
    print(f"   Gateway : {GATEWAY_URL}")
-    print(f"   Агент   : {AGENT}")
    print("\nНажми Enter → говори → получи ответ. Ctrl+C для выхода.\n")

    while True:
@@ -97,7 +98,6 @@ def run_with_porcupine():
                        input=True, frames_per_buffer=1280)

    print("✅ Слушаю через OpenWakeWord...")
-    # print("\nСкажи 'Космо' или 'Люся'...\n")  # TODO: после подключения Люси

    try:
        while True:
@@ -110,12 +110,7 @@ def run_with_porcupine():
                    print(f"PREDICTION cosmo: {cosmo_score:.3f}")

                if cosmo_score > WAKE_THRESHOLD:
-                    if is_speaking():
-                        # Barge-in: прерываем TTS
-                        print("✋ Barge-in: прерываю ответ")
-                        stop_speaking()
-                        cosmo_model.reset()
-                        continue
+                    stop_speaking()  # на случай если TTS ещё играет
                    stream.stop_stream()
                    _conversation_loop("cosmo", "Cosmo")
                    cosmo_model.reset()
@@ -124,10 +119,8 @@ def run_with_porcupine():

                # TODO: Люся — раскомментировать когда модель готова
                # lusya_score = lusya_model.predict(pcm)["lusya"]
-                # if lusya_score > 0.1:
-                #     print(f"PREDICTION lusya: {lusya_score:.3f}")
-                # if lusya_score > 0.5:
-                #     print("✅ Услышала 'Люся'!")
+                # if lusya_score > WAKE_THRESHOLD:
+                #     stop_speaking()
                #     stream.stop_stream()
                #     _conversation_loop("lusya", "Люся")
                #     lusya_model.reset()
--- a/satellite/tts.py
+++ b/satellite/tts.py
@@ -1,10 +1,12 @@
 import os
-import sys
 import subprocess
 import threading
 from elevenlabs import VoiceSettings

-from .config import AUDIO_SINK, AGENTS, SILENCE_THRESHOLD, log
+from .config import (
+    AUDIO_SINK, AGENTS, log,
+    BARGE_IN_ENABLED, BARGE_IN_THRESHOLD, BARGE_IN_WARMUP,
+)

 ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
@@ -40,45 +42,7 @@ def is_speaking() -> bool:
        return _current_process is not None and _current_process.poll() is None


-_barge_in_flag = threading.Event()
-
-def start_barge_in_listener():
-    """Запускает фоновый поток VAD — если услышал голос во время TTS, ставит флаг barge-in."""
-    _barge_in_flag.clear()
-
-    def _listen():
-        import pyaudio
-        import numpy as np
-        try:
-            audio = pyaudio.PyAudio()
-            stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
-                                input=True, frames_per_buffer=1024)
-            warmup = 8  # ~0.5s прогрев чтобы не словить эхо начала TTS
-            i = 0
-            while is_speaking():
-                data = stream.read(1024, exception_on_overflow=False)
-                i += 1
-                if i < warmup:
-                    continue
-                amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
-                if amplitude > SILENCE_THRESHOLD * 1.5:  # порог чуть выше чем для записи
-                    _barge_in_flag.set()
-                    stop_speaking()
-                    break
-            stream.stop_stream()
-            audio.terminate()
-        except Exception:
-            pass
-
-    t = threading.Thread(target=_listen, daemon=True)
-    t.start()
-    return t
-
-def was_barge_in() -> bool:
-    return _barge_in_flag.is_set()
-
 def _mpv_cmd() -> list[str]:
-    """Команда mpv для воспроизведения из stdin"""
    mpv_bin = os.getenv("MPV_PATH", "mpv")
    cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal"]
    if AUDIO_SINK:
@@ -87,13 +51,19 @@ def _mpv_cmd() -> list[str]:
    return cmd


-def speak(text: str, agent_id: str = "cosmo"):
+def speak(text: str, agent_id: str = "cosmo") -> bool:
+    """Озвучивает text. Если BARGE_IN_ENABLED — слушает мик и может прерваться.
+    Возвращает True если был прерван голосом."""
    try:
+        if BARGE_IN_ENABLED:
+            return _speak_with_barge_in(text, agent_id)
        _speak_elevenlabs(text, agent_id)
+        return False
    except Exception as e:
        log.exception("TTS ошибка")
        print(f"⚠️  Ошибка воспроизведения: {e}")
        play_error_sound()
+        return False


 def _speak_elevenlabs(text: str, agent_id: str):
@@ -107,11 +77,11 @@ def _speak_elevenlabs(text: str, agent_id: str):
        return

    voice_settings = VoiceSettings(
-        stability=0.4,         # ниже = живее интонация (для multilingual_v2)
+        stability=0.4,
        similarity_boost=0.8,
-        style=0.1,              # выше = эмоциональнее
+        style=0.1,
        use_speaker_boost=True,
-        speed=1.1
+        speed=1.1,
    )

    audio_stream = client.text_to_speech.convert(
@@ -120,7 +90,7 @@ def _speak_elevenlabs(text: str, agent_id: str):
        model_id=ELEVENLABS_MODEL,
        output_format="mp3_22050_32",
        voice_settings=voice_settings,
-        optimize_streaming_latency=3
+        optimize_streaming_latency=3,
    )

    with _process_lock:
@@ -148,9 +118,74 @@ def _speak_elevenlabs(text: str, agent_id: str):
                _current_process = None


+def _speak_with_barge_in(text: str, agent_id: str) -> bool:
+    """Запускает TTS в фоновом потоке и параллельно слушает мик через VAD.
+    Если обнаружена сильная речь — прерывает TTS. Возвращает True если прервали."""
+    t = threading.Thread(target=_speak_elevenlabs, args=(text, agent_id), daemon=True)
+    t.start()
+    interrupted = _listen_for_barge_in(lambda: t.is_alive())
+    t.join()
+    return interrupted
+
+
+def _listen_for_barge_in(still_alive) -> bool:
+    """Ждёт речь на входе пока still_alive() == True. Возвращает True если прервал."""
+    import pyaudio
+    import numpy as np
+    try:
+        import webrtcvad
+        vad = webrtcvad.Vad(3)  # максимум агрессивности — меньше ложных на эхо
+    except Exception:
+        vad = None
+
+    SR = 16000
+    FRAME_MS = 30
+    FRAME_SAMPLES = int(SR * FRAME_MS / 1000)
+    warmup_frames = int(BARGE_IN_WARMUP * 1000 / FRAME_MS)
+    required_speech_frames = 8  # ~240 мс подряд
+
+    try:
+        audio = pyaudio.PyAudio()
+        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=SR,
+                            input=True, frames_per_buffer=FRAME_SAMPLES)
+    except Exception as e:
+        log.warning(f"Barge-in: не открылся мик: {e}")
+        return False
+
+    interrupted = False
+    speech_streak = 0
+    i = 0
+    try:
+        while still_alive():
+            data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
+            i += 1
+            if i < warmup_frames:
+                continue
+            amplitude = float(np.abs(np.frombuffer(data, dtype=np.int16)).mean())
+            if amplitude < BARGE_IN_THRESHOLD:
+                speech_streak = 0
+                continue
+            if vad is None or vad.is_speech(data, SR):
+                speech_streak += 1
+                if speech_streak >= required_speech_frames:
+                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
+                    stop_speaking()
+                    interrupted = True
+                    break
+            else:
+                speech_streak = 0
+    except Exception:
+        log.exception("Barge-in ошибка")
+    finally:
+        try:
+            stream.stop_stream()
+            audio.terminate()
+        except Exception:
+            pass
+    return interrupted
+
+
 def _play_sound_file(filename: str, wait: bool = False):
-    """Воспроизводит файл из папки sounds/ через mpv.
-    wait=True — блокирует до конца воспроизведения."""
    sounds_dir = os.path.join(os.path.dirname(__file__), "..", "sounds")
    path = os.path.normpath(os.path.join(sounds_dir, filename))
    mpv_bin = os.getenv("MPV_PATH", "mpv")
@@ -162,7 +197,6 @@ def _play_sound_file(filename: str, wait: bool = False):


 def play_activation_sound():
-    """Звук активации — неблокирующий"""
    try:
        _play_sound_file("Success_Cosmo.mp3", wait=False)
    except Exception as e:
@@ -170,7 +204,6 @@ def play_activation_sound():


 def play_error_sound():
-    """Звук ошибки — 'не получилось'"""
    try:
        _play_sound_file("Error_Cosmo.mp3")
    except Exception as e: