- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS - audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response - llm: retry with exponential backoff on network errors and 5xx - llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150 - tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default) - tts: remove dead start_barge_in_listener / was_barge_in helpers - config: drop AGENT/LUSYA_AGENT — routing happens via session_key - modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record() - docs: full rewrite of README and CLAUDE.md to match current architecture
211 lines
6.8 KiB
Python
211 lines
6.8 KiB
Python
import os
|
|
import subprocess
|
|
import threading
|
|
from elevenlabs import VoiceSettings
|
|
|
|
from .config import (
|
|
AUDIO_SINK, AGENTS, log,
|
|
BARGE_IN_ENABLED, BARGE_IN_THRESHOLD, BARGE_IN_WARMUP,
|
|
)
|
|
|
|
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
|
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
|
|
|
|
_elevenlabs_client = None
|
|
_current_process: subprocess.Popen | None = None
|
|
_process_lock = threading.Lock()
|
|
|
|
|
|
def _get_elevenlabs():
    """Return the module-wide ElevenLabs client, creating it on the first call.

    The client is built with ELEVENLABS_API_KEY and cached in the
    ``_elevenlabs_client`` global, so later calls return the same object.
    """
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client

    # Imported here rather than at module top, so elevenlabs.client is only
    # imported once a client is actually requested.
    from elevenlabs.client import ElevenLabs

    _elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    return _elevenlabs_client
|
|
|
|
|
|
def stop_speaking():
    """Interrupt the playback that is currently running, if any (barge-in).

    The player process is asked to terminate and given up to one second to
    exit. If it is still alive after that it is killed and then waited on, so
    the child is always reaped before this function returns. In every case
    the ``_current_process`` global is reset to None.
    """
    global _current_process
    with _process_lock:
        proc = _current_process
        _current_process = None
        if proc is None or proc.poll() is not None:
            # Nothing is playing (or the player has already exited).
            return

        proc.terminate()
        try:
            proc.wait(timeout=1)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed child; without this wait() it would linger as
            # a zombie until something else happens to collect it.
            proc.wait()
|
|
|
|
|
|
def is_speaking() -> bool:
    """Return True while a player process is registered and has not exited."""
    with _process_lock:
        proc = _current_process
        if proc is None:
            return False
        # poll() returns None for as long as the child is still running.
        return proc.poll() is None
|
|
|
|
|
|
def _mpv_cmd() -> list[str]:
    """Build the mpv argument list used to play audio piped to its stdin.

    The executable comes from the MPV_PATH environment variable (default
    ``"mpv"``). When AUDIO_SINK is set, playback is directed to that
    PulseAudio device. The trailing ``"-"`` makes mpv read from stdin.
    """
    player = os.getenv("MPV_PATH", "mpv")
    device_args = [f"--audio-device=pulse/{AUDIO_SINK}"] if AUDIO_SINK else []
    return [player, "--no-video", "--really-quiet", "--no-terminal", *device_args, "-"]
|
|
|
|
|
|
def speak(text: str, agent_id: str = "cosmo") -> bool:
    """Speak *text* aloud using the voice of *agent_id*.

    With BARGE_IN_ENABLED set, the microphone is monitored during playback
    and detected speech can cut the playback short.

    Returns:
        True if playback was interrupted by voice. False otherwise, including
        when any error occurred: errors are logged, printed, and followed by
        the error sound instead of being raised.
    """
    try:
        if not BARGE_IN_ENABLED:
            _speak_elevenlabs(text, agent_id)
            return False
        return _speak_with_barge_in(text, agent_id)
    except Exception as err:
        log.exception("TTS ошибка")
        print(f"⚠️ Ошибка воспроизведения: {err}")
        play_error_sound()
    return False
|
|
|
|
|
|
def _speak_elevenlabs(text: str, agent_id: str):
    """Synthesize *text* with ElevenLabs and stream the audio into mpv.

    The voice id is the ``tts_voice`` entry of ``AGENTS[agent_id]``; an
    unknown *agent_id* falls back to the ``"cosmo"`` agent. When no voice is
    configured the problem is logged and printed and nothing is played.

    The call blocks until playback finishes or the player exits early (for
    example because stop_speaking() terminated it). Failures while streaming
    are logged and the player is killed and reaped; they are not re-raised.
    Errors raised before streaming starts (creating the client, starting mpv)
    propagate to the caller.
    """
    global _current_process
    client = _get_elevenlabs()

    # Look the fallback up lazily, so a missing "cosmo" entry cannot raise
    # KeyError when agent_id itself is configured.
    agent_cfg = AGENTS.get(agent_id)
    if agent_cfg is None:
        agent_cfg = AGENTS.get("cosmo", {})
    voice_id = agent_cfg.get("tts_voice", "")

    if not voice_id:
        log.error(f"tts_voice не задан для {agent_id}")
        print(f"⚠️ tts_voice не задан для {agent_id}")
        return

    voice_settings = VoiceSettings(
        stability=0.4,
        similarity_boost=0.8,
        style=0.1,
        use_speaker_boost=True,
        speed=1.1,
    )

    audio_stream = client.text_to_speech.convert(
        text=text,
        voice_id=voice_id,
        model_id=ELEVENLABS_MODEL,
        output_format="mp3_22050_32",
        voice_settings=voice_settings,
        optimize_streaming_latency=3,
    )

    # Register the player under the lock so stop_speaking()/is_speaking()
    # always observe a consistent value.
    with _process_lock:
        _current_process = subprocess.Popen(
            _mpv_cmd(), stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        proc = _current_process

    try:
        for chunk in audio_stream:
            if proc.poll() is not None:
                # The player has exited (e.g. it was stopped); stop feeding it.
                break
            try:
                proc.stdin.write(chunk)
            except BrokenPipeError:
                break
        try:
            proc.stdin.close()
        except BrokenPipeError:
            # The player is already gone, so flushing buffered audio failed.
            # This is the expected outcome of an interruption, not an error.
            pass
        proc.wait()
    except Exception:
        # Previously swallowed silently; log it so mid-stream failures are
        # visible, then make sure the player is stopped and reaped.
        log.exception("TTS: ошибка потоковой передачи аудио")
        proc.kill()
        proc.wait()
    finally:
        with _process_lock:
            # Only clear the global if it still refers to our process;
            # stop_speaking() or a newer utterance may have replaced it.
            if _current_process is proc:
                _current_process = None
|
|
|
|
|
|
def _speak_with_barge_in(text: str, agent_id: str) -> bool:
    """Run TTS in a background thread while listening to the microphone.

    The listener runs for as long as the TTS thread is alive and stops the
    playback when it detects speech. The TTS thread is always joined before
    returning.

    Returns:
        True if the playback was interrupted, False otherwise.
    """
    worker = threading.Thread(
        target=_speak_elevenlabs,
        args=(text, agent_id),
        daemon=True,
    )
    worker.start()
    was_interrupted = _listen_for_barge_in(worker.is_alive)
    worker.join()
    return was_interrupted
|
|
|
|
|
|
def _listen_for_barge_in(still_alive) -> bool:
    """Listen on the microphone for speech while ``still_alive()`` is true.

    Audio is read in 30 ms frames at 16 kHz. The first BARGE_IN_WARMUP
    seconds of frames are ignored. After that a frame counts as speech when
    its mean absolute amplitude reaches BARGE_IN_THRESHOLD and, if webrtcvad
    is available, the VAD also classifies it as speech. Eight consecutive
    speech frames (~240 ms) trigger stop_speaking().

    Args:
        still_alive: zero-argument callable; listening continues while it
            returns a truthy value.

    Returns:
        True if playback was interrupted, False otherwise. False is also
        returned when the audio dependencies or the microphone are
        unavailable, or when an error occurs while listening.
    """
    try:
        import numpy as np
        import pyaudio
    except ImportError as e:
        # Degrade like the "microphone did not open" case instead of raising
        # while the TTS thread is still playing.
        log.warning(f"Barge-in: недоступны аудио-зависимости: {e}")
        return False

    try:
        import webrtcvad
        vad = webrtcvad.Vad(3)  # most aggressive mode: fewer false hits on echo
    except Exception:
        vad = None  # fall back to the amplitude gate alone

    sample_rate = 16000
    frame_ms = 30
    frame_samples = int(sample_rate * frame_ms / 1000)
    warmup_frames = int(BARGE_IN_WARMUP * 1000 / frame_ms)
    required_speech_frames = 8  # ~240 ms in a row

    audio = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                            input=True, frames_per_buffer=frame_samples)
    except Exception as e:
        log.warning(f"Barge-in: не открылся мик: {e}")
        if audio is not None:
            # PyAudio() succeeded but open() failed: release PortAudio.
            try:
                audio.terminate()
            except Exception as term_err:
                log.warning(f"Barge-in: ошибка при закрытии аудио: {term_err}")
        return False

    interrupted = False
    speech_streak = 0
    frames_read = 0
    try:
        while still_alive():
            data = stream.read(frame_samples, exception_on_overflow=False)
            frames_read += 1
            if frames_read <= warmup_frames:
                # Skip exactly warmup_frames frames (the old `<` skipped one fewer).
                continue

            # Widen to int32 first: abs(-32768) overflows in int16 and would
            # understate the level of clipped (very loud) input.
            samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
            amplitude = float(np.abs(samples).mean())
            if amplitude < BARGE_IN_THRESHOLD:
                speech_streak = 0
                continue

            if vad is None or vad.is_speech(data, sample_rate):
                speech_streak += 1
                if speech_streak >= required_speech_frames:
                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
                    stop_speaking()
                    interrupted = True
                    break
            else:
                speech_streak = 0
    except Exception:
        log.exception("Barge-in ошибка")
    finally:
        # Run every cleanup step independently so one failure cannot skip the
        # rest (previously the stream was never closed, and a failing
        # stop_stream() prevented terminate()).
        for release in (stream.stop_stream, stream.close, audio.terminate):
            try:
                release()
            except Exception as e:
                log.warning(f"Barge-in: ошибка при закрытии аудио: {e}")
    return interrupted
|
|
|
|
|
|
def _play_sound_file(filename: str, wait: bool = False):
    """Play *filename* from the ``sounds`` directory through mpv.

    The ``sounds`` directory is resolved one level above this module's
    directory. The mpv executable comes from the MPV_PATH environment
    variable (default ``"mpv"``).

    Args:
        filename: name of the file inside the ``sounds`` directory.
        wait: when True, block until mpv exits; otherwise start mpv and
            return immediately.
    """
    path = os.path.normpath(
        os.path.join(os.path.dirname(__file__), "..", "sounds", filename)
    )
    argv = [
        os.getenv("MPV_PATH", "mpv"),
        "--no-video",
        "--really-quiet",
        "--no-terminal",
        path,
    ]
    # subprocess.run blocks until the player exits; Popen only starts it.
    launch = subprocess.run if wait else subprocess.Popen
    launch(argv, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
|
|
|
|
|
def play_activation_sound():
    """Start playing the activation sound without waiting for it to finish.

    Any failure to start playback is logged as a warning and not raised.
    """
    try:
        _play_sound_file("Success_Cosmo.mp3")
    except Exception as err:
        log.warning(f"Ошибка звука активации: {err}")
|
|
|
|
|
|
def play_error_sound():
    """Start playing the error sound without waiting for it to finish.

    Any failure to start playback is logged as a warning and not raised.
    """
    try:
        _play_sound_file("Error_Cosmo.mp3", wait=False)
    except Exception as err:
        log.warning(f"Ошибка звука ошибки: {err}")
|