refactor: VAD upgrade, retry, dead code cleanup, AGENT removal

- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS
- audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response
- llm: retry with exponential backoff on network errors and 5xx
- llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150
- tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default)
- tts: remove dead start_barge_in_listener / was_barge_in helpers
- config: drop AGENT/LUSYA_AGENT — routing happens via session_key
- modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record()
- docs: full rewrite of README and CLAUDE.md to match current architecture
This commit is contained in:
2026-04-16 17:10:59 +03:00
parent a885cbe74b
commit a9001aef92
9 changed files with 541 additions and 358 deletions

View File

@@ -2,22 +2,59 @@ import os
import pyaudio
import numpy as np
from .config import SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION, log
from .config import (
SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION,
FOLLOWUP_TIMEOUT, VAD_AGGRESSIVENESS, log,
)
from .stt import transcribe
# Module-level audio setup: echo warm-up delay, optional webrtcvad detector,
# and the frame geometry used for capture.
ECHO_WARMUP = float(os.getenv("ECHO_WARMUP", "0.5")) # seconds skipped at the start of a recording — suppresses echo from TTS
# webrtcvad is optional: if importing it or constructing the detector fails
# for any reason, a warning is logged and _VAD_OK is set to False so callers
# can fall back to the amplitude-only check.
try:
import webrtcvad
_vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
_VAD_OK = True
except Exception as e:
log.warning(f"webrtcvad недоступен, fallback на RMS: {e}")
_vad = None
_VAD_OK = False
# webrtcvad requires 10/20/30 ms frames at 8/16/32/48 kHz
SAMPLE_RATE = 16000
FRAME_MS = 30
FRAME_SAMPLES = int(SAMPLE_RATE * FRAME_MS / 1000) # 480 samples per frame
FRAME_BYTES = FRAME_SAMPLES * 2 # 2 bytes per int16 sample
def _is_speech(frame: bytes) -> bool:
"""Единое решение по VAD: webrtcvad + RMS-гейт, чтобы не ловить шёпот и эхо."""
amplitude = float(np.abs(np.frombuffer(frame, dtype=np.int16)).mean())
if amplitude < SILENCE_THRESHOLD:
return False
if _VAD_OK:
try:
return _vad.is_speech(frame, SAMPLE_RATE)
except Exception:
pass
return True # RMS уже прошёл — считаем речью
def record(initial_silence_timeout: float | None = None) -> str:
"""Запись до тишины + STT.
initial_silence_timeout — через сколько секунд выйти если пользователь вообще не начал говорить.
По умолчанию FOLLOWUP_TIMEOUT (короткое ожидание после ответа бота).
"""
if initial_silence_timeout is None:
initial_silence_timeout = FOLLOWUP_TIMEOUT
def record() -> str:
"""Запись до тишины (VAD) + STT. Игнорирует ECHO_WARMUP в начале."""
try:
audio = pyaudio.PyAudio()
stream = audio.open(
format=pyaudio.paInt16,
channels=1,
rate=16000,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=1024,
frames_per_buffer=FRAME_SAMPLES,
)
except Exception as e:
log.exception("Не удалось открыть микрофон")
@@ -25,30 +62,38 @@ def record() -> str:
return ""
print("🎙️ Говори...")
frames = []
silent_chunks = 0
frames: list[bytes] = []
speaking_started = False
max_chunks = int(16000 / 1024 * MAX_DURATION)
silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
warmup_chunks = int(16000 / 1024 * ECHO_WARMUP)
trailing_silence = 0 # фреймы тишины после начала речи
initial_silence = 0 # фреймы тишины до начала речи
max_frames = int(MAX_DURATION * 1000 / FRAME_MS)
warmup_frames = int(ECHO_WARMUP * 1000 / FRAME_MS)
silence_frames_needed = int(SILENCE_DURATION * 1000 / FRAME_MS)
initial_silence_limit = int(initial_silence_timeout * 1000 / FRAME_MS)
try:
for i in range(max_chunks):
data = stream.read(1024, exception_on_overflow=False)
if i < warmup_chunks:
continue # гасим эхо от TTS / звука активации
for i in range(max_frames):
data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
if i < warmup_frames:
continue
frames.append(data)
amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
if amplitude > SILENCE_THRESHOLD:
if _is_speech(data):
speaking_started = True
silent_chunks = 0
elif speaking_started:
silent_chunks += 1
if silent_chunks >= silence_chunks_needed:
print("🔇 Конец речи")
break
trailing_silence = 0
else:
if speaking_started:
trailing_silence += 1
if trailing_silence >= silence_frames_needed:
print("🔇 Конец речи")
break
else:
initial_silence += 1
if initial_silence >= initial_silence_limit:
print("😴 Пользователь молчит, выхожу")
speaking_started = False
break
except Exception as e:
log.exception("Ошибка при записи аудио")
print(f"⚠️ Ошибка записи: {e}")