feat: VAD-based barge-in during TTS playback
This commit is contained in:
@@ -4,7 +4,7 @@ import subprocess
|
||||
import threading
|
||||
from elevenlabs import VoiceSettings
|
||||
|
||||
from .config import AUDIO_SINK, AGENTS, log
|
||||
from .config import AUDIO_SINK, AGENTS, SILENCE_THRESHOLD, log
|
||||
|
||||
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
||||
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
|
||||
@@ -40,6 +40,43 @@ def is_speaking() -> bool:
|
||||
return _current_process is not None and _current_process.poll() is None
|
||||
|
||||
|
||||
# Barge-in flag shared between the listener thread and its callers:
# cleared by start_barge_in_listener(), set by the listener thread when it
# detects sound above the threshold, and read by was_barge_in().
_barge_in_flag = threading.Event()
|
||||
|
||||
def start_barge_in_listener():
    """Start a background VAD thread that flags a barge-in during TTS playback.

    Clears the module-level barge-in flag, then spawns a daemon thread that
    reads 16 kHz mono 16-bit microphone audio in 1024-sample frames for as
    long as ``is_speaking()`` is true. When the mean absolute amplitude of a
    frame exceeds ``SILENCE_THRESHOLD * 1.5``, the flag is set,
    ``stop_speaking()`` is called and the thread exits.

    Failures inside the listener (for example, the input device cannot be
    opened) are logged and never propagate to the caller; the audio stream
    and the PyAudio instance are always released.

    Returns:
        The started ``threading.Thread``.
    """
    _barge_in_flag.clear()

    def _listen():
        import logging

        import pyaudio
        import numpy as np

        frames_per_buffer = 1024
        warmup = 8  # ~0.5 s warm-up so the echo of the TTS start is not picked up
        audio = None
        stream = None
        try:
            audio = pyaudio.PyAudio()
            stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                                input=True, frames_per_buffer=frames_per_buffer)
            # Threshold is slightly higher than the one used for recording.
            threshold = SILENCE_THRESHOLD * 1.5
            frame_index = 0
            while is_speaking():
                data = stream.read(frames_per_buffer, exception_on_overflow=False)
                frame_index += 1
                if frame_index < warmup:
                    continue
                # Widen to int32 first: abs(-32768) overflows in int16 and
                # would stay negative, dragging the mean amplitude down.
                samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
                amplitude = np.abs(samples).mean()
                if amplitude > threshold:
                    _barge_in_flag.set()
                    stop_speaking()
                    break
        except Exception:
            # Barge-in is best-effort: report the failure instead of hiding it,
            # but never let it escape the listener thread.
            logging.getLogger(__name__).warning(
                "barge-in listener failed", exc_info=True
            )
        finally:
            # Always release the audio device, even when the loop raised.
            if stream is not None:
                try:
                    stream.stop_stream()
                    stream.close()
                except Exception:
                    logging.getLogger(__name__).debug(
                        "failed to close barge-in input stream", exc_info=True
                    )
            if audio is not None:
                try:
                    audio.terminate()
                except Exception:
                    logging.getLogger(__name__).debug(
                        "failed to terminate PyAudio", exc_info=True
                    )

    t = threading.Thread(target=_listen, daemon=True)
    t.start()
    return t
|
||||
|
||||
def was_barge_in() -> bool:
    """Report whether the barge-in flag is currently set."""
    flag_is_set = _barge_in_flag.is_set()
    return flag_is_set
|
||||
|
||||
def _mpv_cmd() -> list[str]:
|
||||
"""Команда mpv для воспроизведения из stdin"""
|
||||
mpv_bin = os.getenv("MPV_PATH", "mpv")
|
||||
|
||||
Reference in New Issue
Block a user