"""Text-to-speech playback through ElevenLabs and mpv, with optional barge-in.

Audio returned by ElevenLabs is piped into an ``mpv`` subprocess. When
``BARGE_IN_ENABLED`` is set, the microphone is monitored while speech is
playing and playback is stopped as soon as sustained speech is detected.
"""

import os
import subprocess
import threading

from elevenlabs import VoiceSettings

from .config import (
    AUDIO_SINK,
    AGENTS,
    log,
    BARGE_IN_ENABLED,
    BARGE_IN_THRESHOLD,
    BARGE_IN_WARMUP,
)

ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")

# Lazily created ElevenLabs client (see _get_elevenlabs).
_elevenlabs_client = None
# The mpv process currently playing TTS audio, if any. Guarded by _process_lock.
_current_process: subprocess.Popen | None = None
_process_lock = threading.Lock()


def _get_elevenlabs():
    """Return the shared ElevenLabs client, creating it on first use."""
    global _elevenlabs_client
    if _elevenlabs_client is None:
        # Imported lazily so the client module is only loaded when needed.
        from elevenlabs.client import ElevenLabs

        _elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    return _elevenlabs_client


def stop_speaking():
    """Interrupt the current playback (barge-in).

    Terminates the tracked mpv process if it is still running, escalating to
    ``kill()`` when it does not exit within one second, and clears the
    tracked process.
    """
    global _current_process
    with _process_lock:
        if _current_process and _current_process.poll() is None:
            _current_process.terminate()
            try:
                _current_process.wait(timeout=1)
            except subprocess.TimeoutExpired:
                _current_process.kill()
        _current_process = None


def is_speaking() -> bool:
    """Return True while a tracked mpv playback process is still running."""
    with _process_lock:
        return _current_process is not None and _current_process.poll() is None


def _mpv_cmd() -> list[str]:
    """Build the mpv command line that plays audio read from stdin.

    The binary comes from the ``MPV_PATH`` environment variable (default
    ``mpv``); when ``AUDIO_SINK`` is set, output is routed to that
    PulseAudio device.
    """
    mpv_bin = os.getenv("MPV_PATH", "mpv")
    cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal"]
    if AUDIO_SINK:
        cmd.append(f"--audio-device=pulse/{AUDIO_SINK}")
    cmd.append("-")  # read the media from stdin
    return cmd


def speak(text: str, agent_id: str = "cosmo") -> bool:
    """Speak *text* aloud using the voice configured for *agent_id*.

    When ``BARGE_IN_ENABLED`` is set, the microphone is monitored during
    playback and speech may be interrupted by the user's voice.

    Returns:
        True if playback was interrupted by voice, otherwise False. Any
        error is logged, reported on stdout, signalled with the error sound,
        and results in False.
    """
    try:
        if BARGE_IN_ENABLED:
            return _speak_with_barge_in(text, agent_id)
        _speak_elevenlabs(text, agent_id)
        return False
    except Exception as e:
        log.exception("TTS ошибка")
        print(f"⚠️ Ошибка воспроизведения: {e}")
        play_error_sound()
        return False


def _speak_elevenlabs(text: str, agent_id: str):
    """Synthesize *text* with ElevenLabs and stream it into an mpv process.

    The voice is looked up as ``tts_voice`` in ``AGENTS[agent_id]``, falling
    back to the ``"cosmo"`` agent for unknown ids; if it is empty the problem
    is reported and nothing is played. Blocks until playback finishes or the
    player is stopped (for example by ``stop_speaking``).

    Raises:
        Exception: any error raised while requesting or streaming the audio.
            The player process is killed and reaped before the error
            propagates, so the caller can report the failure.
    """
    global _current_process
    client = _get_elevenlabs()
    voice_id = AGENTS.get(agent_id, AGENTS["cosmo"]).get("tts_voice", "")
    if not voice_id:
        log.error(f"tts_voice не задан для {agent_id}")
        print(f"⚠️ tts_voice не задан для {agent_id}")
        return

    voice_settings = VoiceSettings(
        stability=0.4,
        similarity_boost=0.8,
        style=0.1,
        use_speaker_boost=True,
        speed=1.1,
    )
    audio_stream = client.text_to_speech.convert(
        text=text,
        voice_id=voice_id,
        model_id=ELEVENLABS_MODEL,
        output_format="mp3_22050_32",
        voice_settings=voice_settings,
        optimize_streaming_latency=3,
    )

    with _process_lock:
        _current_process = subprocess.Popen(
            _mpv_cmd(),
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        proc = _current_process

    try:
        for chunk in audio_stream:
            if proc.poll() is not None:
                # Player already exited (e.g. stopped by barge-in).
                break
            try:
                proc.stdin.write(chunk)
            except OSError:
                # Includes BrokenPipeError: the player went away mid-write.
                break
        try:
            proc.stdin.close()
        except OSError:
            # Flushing buffered audio into a stopped player is not an error.
            pass
        proc.wait()
    except Exception:
        # Do not swallow streaming/API failures: stop and reap the player,
        # then let the caller log the error and play the error sound.
        proc.kill()
        proc.wait()
        raise
    finally:
        with _process_lock:
            # Only clear the slot if it still refers to our own process.
            if _current_process is proc:
                _current_process = None


def _speak_with_barge_in(text: str, agent_id: str) -> bool:
    """Run TTS in a background thread while listening to the microphone.

    If sustained speech is detected on the microphone, TTS is interrupted.

    Returns:
        True if playback was interrupted, otherwise False.

    Raises:
        Exception: re-raises an error that occurred in the TTS thread, after
            the thread has finished.
    """
    errors: list[Exception] = []

    def _worker():
        # Capture the failure so it can be re-raised in the calling thread;
        # exceptions raised inside a thread never reach the caller otherwise.
        try:
            _speak_elevenlabs(text, agent_id)
        except Exception as exc:
            errors.append(exc)

    t = threading.Thread(target=_worker, daemon=True)
    t.start()
    try:
        interrupted = _listen_for_barge_in(t.is_alive)
    finally:
        # Always wait for the TTS thread, even if the listener failed.
        t.join()
    if errors:
        raise errors[0]
    return interrupted


def _listen_for_barge_in(still_alive) -> bool:
    """Wait for speech on the microphone while ``still_alive()`` is true.

    A frame counts as speech when its mean absolute amplitude reaches
    ``BARGE_IN_THRESHOLD`` and, if webrtcvad is available, the VAD agrees.
    Once enough consecutive speech frames are seen, ``stop_speaking()`` is
    called. Frames within the ``BARGE_IN_WARMUP`` period are ignored.

    Args:
        still_alive: zero-argument callable; listening stops once it returns
            a falsy value.

    Returns:
        True if playback was interrupted, otherwise False (including when the
        audio libraries or the microphone are unavailable).
    """
    try:
        import numpy as np
        import pyaudio
    except ImportError as e:
        # Degrade to "no barge-in" instead of failing while TTS is playing.
        log.warning(f"Barge-in: не удалось импортировать pyaudio/numpy: {e}")
        return False

    try:
        import webrtcvad

        vad = webrtcvad.Vad(3)  # maximum aggressiveness: fewer false hits on echo
    except Exception:
        vad = None  # fall back to the amplitude check alone

    SR = 16000
    FRAME_MS = 30
    FRAME_SAMPLES = int(SR * FRAME_MS / 1000)
    warmup_frames = int(BARGE_IN_WARMUP * 1000 / FRAME_MS)
    required_speech_frames = 8  # ~240 ms in a row

    audio = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SR,
            input=True,
            frames_per_buffer=FRAME_SAMPLES,
        )
    except Exception as e:
        log.warning(f"Barge-in: не открылся мик: {e}")
        if audio is not None:
            # Release PortAudio if only opening the stream failed.
            try:
                audio.terminate()
            except Exception:
                pass  # best-effort cleanup
        return False

    interrupted = False
    speech_streak = 0
    i = 0
    try:
        while still_alive():
            data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
            i += 1
            # `i` is 1-based here, so `<=` skips exactly `warmup_frames` frames.
            if i <= warmup_frames:
                continue
            amplitude = float(np.abs(np.frombuffer(data, dtype=np.int16)).mean())
            if amplitude < BARGE_IN_THRESHOLD:
                speech_streak = 0
                continue
            if vad is None or vad.is_speech(data, SR):
                speech_streak += 1
                if speech_streak >= required_speech_frames:
                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
                    stop_speaking()
                    interrupted = True
                    break
            else:
                speech_streak = 0
    except Exception:
        log.exception("Barge-in ошибка")
    finally:
        # Run every cleanup step independently so one failure cannot skip
        # the others and leave the device open.
        for cleanup in (stream.stop_stream, stream.close, audio.terminate):
            try:
                cleanup()
            except Exception:
                pass  # best-effort cleanup
    return interrupted


def _play_sound_file(filename: str, wait: bool = False):
    """Play *filename* from the sibling ``sounds`` directory using mpv.

    Args:
        filename: name of the file inside ``../sounds`` relative to this module.
        wait: when True, block until mpv exits; otherwise start it and return.
    """
    sounds_dir = os.path.join(os.path.dirname(__file__), "..", "sounds")
    path = os.path.normpath(os.path.join(sounds_dir, filename))
    mpv_bin = os.getenv("MPV_PATH", "mpv")
    cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal", path]
    if wait:
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    else:
        subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


def play_activation_sound():
    """Play the activation sound without blocking; failures are only logged."""
    try:
        _play_sound_file("Success_Cosmo.mp3", wait=False)
    except Exception as e:
        log.warning(f"Ошибка звука активации: {e}")


def play_error_sound():
    """Play the error sound without blocking; failures are only logged."""
    try:
        _play_sound_file("Error_Cosmo.mp3")
    except Exception as e:
        log.warning(f"Ошибка звука ошибки: {e}")