Files
Daniil Klimov a9001aef92 refactor: VAD upgrade, retry, dead code cleanup, AGENT removal
- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS
- audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response
- llm: retry with exponential backoff on network errors and 5xx
- llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150
- tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default)
- tts: remove dead start_barge_in_listener / was_barge_in helpers
- config: drop AGENT/LUSYA_AGENT — routing happens via session_key
- modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record()
- docs: full rewrite of README and CLAUDE.md to match current architecture
2026-04-16 17:10:59 +03:00

112 lines
4.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import pyaudio
import numpy as np
from .config import (
SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION,
FOLLOWUP_TIMEOUT, VAD_AGGRESSIVENESS, log,
)
from .stt import transcribe
ECHO_WARMUP = float(os.getenv("ECHO_WARMUP", "0.5")) # сек пропуска в начале — гасит эхо от TTS
try:
import webrtcvad
_vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
_VAD_OK = True
except Exception as e:
log.warning(f"webrtcvad недоступен, fallback на RMS: {e}")
_vad = None
_VAD_OK = False
# webrtcvad требует фрейм 10/20/30 мс при 8/16/32/48 кГц
SAMPLE_RATE = 16000
FRAME_MS = 30
FRAME_SAMPLES = int(SAMPLE_RATE * FRAME_MS / 1000) # 480
FRAME_BYTES = FRAME_SAMPLES * 2 # int16
def _is_speech(frame: bytes) -> bool:
"""Единое решение по VAD: webrtcvad + RMS-гейт, чтобы не ловить шёпот и эхо."""
amplitude = float(np.abs(np.frombuffer(frame, dtype=np.int16)).mean())
if amplitude < SILENCE_THRESHOLD:
return False
if _VAD_OK:
try:
return _vad.is_speech(frame, SAMPLE_RATE)
except Exception:
pass
return True # RMS уже прошёл — считаем речью
def record(initial_silence_timeout: float | None = None) -> str:
"""Запись до тишины + STT.
initial_silence_timeout — через сколько секунд выйти если пользователь вообще не начал говорить.
По умолчанию FOLLOWUP_TIMEOUT (короткое ожидание после ответа бота).
"""
if initial_silence_timeout is None:
initial_silence_timeout = FOLLOWUP_TIMEOUT
try:
audio = pyaudio.PyAudio()
stream = audio.open(
format=pyaudio.paInt16,
channels=1,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=FRAME_SAMPLES,
)
except Exception as e:
log.exception("Не удалось открыть микрофон")
print(f"⚠️ Ошибка микрофона: {e}")
return ""
print("🎙️ Говори...")
frames: list[bytes] = []
speaking_started = False
trailing_silence = 0 # фреймы тишины после начала речи
initial_silence = 0 # фреймы тишины до начала речи
max_frames = int(MAX_DURATION * 1000 / FRAME_MS)
warmup_frames = int(ECHO_WARMUP * 1000 / FRAME_MS)
silence_frames_needed = int(SILENCE_DURATION * 1000 / FRAME_MS)
initial_silence_limit = int(initial_silence_timeout * 1000 / FRAME_MS)
try:
for i in range(max_frames):
data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
if i < warmup_frames:
continue
frames.append(data)
if _is_speech(data):
speaking_started = True
trailing_silence = 0
else:
if speaking_started:
trailing_silence += 1
if trailing_silence >= silence_frames_needed:
print("🔇 Конец речи")
break
else:
initial_silence += 1
if initial_silence >= initial_silence_limit:
print("😴 Пользователь молчит, выхожу")
speaking_started = False
break
except Exception as e:
log.exception("Ошибка при записи аудио")
print(f"⚠️ Ошибка записи: {e}")
finally:
stream.stop_stream()
audio.terminate()
if not speaking_started:
return ""
text = transcribe(frames)
# отсекаем мусор от эха (одиночные знаки препинания, пробелы)
if not text or not text.strip() or len(text.strip()) < 2:
return ""
return text