diff --git a/satellite/tts.py b/satellite/tts.py
index d7d9bbf..1463112 100644
--- a/satellite/tts.py
+++ b/satellite/tts.py
@@ -4,7 +4,7 @@ import subprocess
 import threading
 
 from elevenlabs import VoiceSettings
-from .config import AUDIO_SINK, AGENTS, log
+from .config import AUDIO_SINK, AGENTS, SILENCE_THRESHOLD, log
 
 ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
 ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
@@ -40,6 +40,81 @@ def is_speaking() -> bool:
     return _current_process is not None and _current_process.poll() is None
 
 
+_barge_in_flag = threading.Event()
+
+
+def start_barge_in_listener() -> threading.Thread:
+    """Start a background VAD thread that detects barge-in during TTS.
+
+    Clears the barge-in flag, then spawns a daemon thread that reads the
+    default input device for as long as ``is_speaking()`` is true. When a
+    chunk's mean absolute amplitude exceeds ``SILENCE_THRESHOLD * 1.5`` the
+    thread sets the flag, calls ``stop_speaking()`` and exits.
+
+    Returns:
+        The started listener thread.
+    """
+    _barge_in_flag.clear()
+
+    def _listen():
+        import logging
+
+        import numpy as np
+        import pyaudio
+
+        logger = logging.getLogger(__name__)
+        rate = 16000
+        chunk = 1024
+        # Skip the first ~0.5 s (8 chunks of 1024 frames at 16 kHz) so the
+        # echo of the TTS onset is not mistaken for the user speaking.
+        warmup_chunks = 8
+        # Slightly higher than the threshold used for recording.
+        threshold = SILENCE_THRESHOLD * 1.5
+
+        audio = None
+        stream = None
+        try:
+            audio = pyaudio.PyAudio()
+            stream = audio.open(format=pyaudio.paInt16, channels=1, rate=rate,
+                                input=True, frames_per_buffer=chunk)
+            chunks_read = 0
+            while is_speaking():
+                data = stream.read(chunk, exception_on_overflow=False)
+                chunks_read += 1
+                if chunks_read <= warmup_chunks:
+                    continue
+                # Widen to int32 first: abs() of int16 -32768 wraps around.
+                samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
+                if np.abs(samples).mean() > threshold:
+                    _barge_in_flag.set()
+                    stop_speaking()
+                    break
+        except Exception:
+            # Barge-in is best-effort: never let a microphone failure escape
+            # the thread, but record it instead of swallowing it silently.
+            logger.exception("barge-in listener failed")
+        finally:
+            # Release the input device on every exit path.
+            try:
+                if stream is not None:
+                    stream.stop_stream()
+                    stream.close()
+            except Exception:
+                logger.exception("failed to close barge-in input stream")
+            finally:
+                if audio is not None:
+                    audio.terminate()
+
+    listener = threading.Thread(target=_listen, daemon=True)
+    listener.start()
+    return listener
+
+
+def was_barge_in() -> bool:
+    """Return True if the listener flagged a barge-in since the last start."""
+    return _barge_in_flag.is_set()
+
+
 def _mpv_cmd() -> list[str]:
     """Команда mpv для воспроизведения из stdin"""
     mpv_bin = os.getenv("MPV_PATH", "mpv")