# home-voice-assistant/satellite/tts.py

import os
import subprocess
import threading
from elevenlabs import VoiceSettings

from .config import (
    AUDIO_SINK, AGENTS, log,
    BARGE_IN_ENABLED, BARGE_IN_THRESHOLD, BARGE_IN_WARMUP,
)

# ElevenLabs settings, read from the environment once at import time.
# An unset API key falls back to the empty string.
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")

# ElevenLabs client, created lazily on first use by _get_elevenlabs().
_elevenlabs_client = None
# The mpv process currently playing TTS audio, or None when nothing is playing.
_current_process: subprocess.Popen | None = None
# Guards every read and write of _current_process.
_process_lock = threading.Lock()


def _get_elevenlabs():
    """Return the shared ElevenLabs client, constructing it on first call.

    The ``elevenlabs.client`` import is deferred until a client is actually
    needed; later calls reuse the instance cached in ``_elevenlabs_client``.
    """
    global _elevenlabs_client
    if _elevenlabs_client is not None:
        return _elevenlabs_client

    from elevenlabs.client import ElevenLabs
    _elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    return _elevenlabs_client


def stop_speaking():
    """Interrupt the current playback (barge-in).

    Does nothing when no player is running. Otherwise the player is asked to
    terminate, given one second to exit, killed if it has not, and then
    forgotten by clearing ``_current_process``.
    """
    global _current_process
    with _process_lock:
        player = _current_process
        if not player or player.poll() is not None:
            return
        player.terminate()
        try:
            player.wait(timeout=1)
        except subprocess.TimeoutExpired:
            player.kill()
        _current_process = None


def is_speaking() -> bool:
    """Return True while a TTS player process exists and has not exited."""
    with _process_lock:
        player = _current_process
        if player is None:
            return False
        return player.poll() is None


def _mpv_cmd() -> list[str]:
    """Build the mpv command line that plays audio piped to its stdin.

    The binary comes from ``MPV_PATH`` (default ``mpv``). When ``AUDIO_SINK``
    is set, output is directed to that PulseAudio device. The trailing ``-``
    makes mpv read the media from standard input.
    """
    player = os.getenv("MPV_PATH", "mpv")
    sink_args = [f"--audio-device=pulse/{AUDIO_SINK}"] if AUDIO_SINK else []
    return [player, "--no-video", "--really-quiet", "--no-terminal", *sink_args, "-"]


def speak(text: str, agent_id: str = "cosmo") -> bool:
    """Speak ``text`` aloud using the voice of ``agent_id``.

    When ``BARGE_IN_ENABLED`` is set, the microphone is monitored during
    playback and speech from the user can cut the playback short.

    Returns:
        True if playback was interrupted by voice, False otherwise. False is
        also returned when an error occurred; in that case the error is
        logged, printed, and the error sound is played.
    """
    try:
        if not BARGE_IN_ENABLED:
            _speak_elevenlabs(text, agent_id)
            return False
        return _speak_with_barge_in(text, agent_id)
    except Exception as err:
        log.exception("TTS ошибка")
        print(f"⚠️  Ошибка воспроизведения: {err}")
        play_error_sound()
        return False


def _speak_elevenlabs(text: str, agent_id: str):
    """Synthesize ``text`` with ElevenLabs and stream the audio into mpv.

    The voice id is the ``tts_voice`` entry of the agent's record in
    ``AGENTS``; an unknown ``agent_id`` falls back to the ``"cosmo"`` record.
    If no voice is configured, the problem is logged and printed and nothing
    is played.

    While playback runs, the player process is published in
    ``_current_process`` so that ``stop_speaking()`` and ``is_speaking()``
    can act on it. The reference is cleared again when this function leaves,
    unless another process has replaced it in the meantime.

    Args:
        text: The text to speak.
        agent_id: Key into ``AGENTS`` selecting the voice.

    Raises:
        Exception: Any error raised while requesting the audio, starting the
            player, or streaming chunks to it. On a streaming error the
            player is killed and reaped before the error is re-raised, so the
            caller can report the failure instead of it being lost silently.
    """
    global _current_process
    client = _get_elevenlabs()
    voice_id = AGENTS.get(agent_id, AGENTS["cosmo"]).get("tts_voice", "")

    if not voice_id:
        log.error(f"tts_voice не задан для {agent_id}")
        print(f"⚠️  tts_voice не задан для {agent_id}")
        return

    voice_settings = VoiceSettings(
        stability=0.4,
        similarity_boost=0.8,
        style=0.1,
        use_speaker_boost=True,
        speed=1.1,
    )

    audio_stream = client.text_to_speech.convert(
        text=text,
        voice_id=voice_id,
        model_id=ELEVENLABS_MODEL,
        output_format="mp3_22050_32",
        voice_settings=voice_settings,
        optimize_streaming_latency=3,
    )

    with _process_lock:
        _current_process = subprocess.Popen(
            _mpv_cmd(), stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        proc = _current_process

    try:
        for chunk in audio_stream:
            # The player has exited (e.g. stop_speaking() terminated it).
            if proc.poll() is not None:
                break
            try:
                proc.stdin.write(chunk)
            except BrokenPipeError:
                break
        try:
            # The pipe is buffered, so close() flushes pending bytes and can
            # hit a broken pipe when the player is already gone. That is the
            # normal outcome of an interruption, not an error.
            proc.stdin.close()
        except BrokenPipeError:
            pass
        proc.wait()
    except Exception:
        # Stop the player, release the pipe and reap the process, then let
        # the caller see the failure.
        proc.kill()
        try:
            proc.stdin.close()
        except OSError:
            pass
        proc.wait()
        raise
    finally:
        with _process_lock:
            if _current_process is proc:
                _current_process = None


def _speak_with_barge_in(text: str, agent_id: str) -> bool:
    """Play TTS in a background thread while listening for barge-in.

    ``_speak_elevenlabs`` runs in a daemon thread; the calling thread
    monitors the microphone via ``_listen_for_barge_in`` for as long as that
    thread is alive.

    Args:
        text: The text to speak.
        agent_id: Agent whose voice is used.

    Returns:
        True if playback was interrupted by detected speech, otherwise False.

    Raises:
        Exception: An error raised by the TTS thread is re-raised here after
            the thread has finished, so the caller's error handling sees it.
            Errors from the listener propagate as well, but only after the
            TTS thread has been joined.
    """
    failures: list[Exception] = []

    def _worker() -> None:
        # An exception escaping a thread never reaches the thread that
        # started it, so record it for the caller to re-raise.
        try:
            _speak_elevenlabs(text, agent_id)
        except Exception as exc:
            failures.append(exc)

    t = threading.Thread(target=_worker, daemon=True)
    t.start()
    try:
        interrupted = _listen_for_barge_in(t.is_alive)
    finally:
        # Always wait for playback to end, even when the listener failed,
        # so the caller never continues while speech is still playing.
        t.join()
    if failures:
        raise failures[0]
    return interrupted


def _listen_for_barge_in(still_alive) -> bool:
    """Watch the microphone for speech while ``still_alive()`` returns True.

    Audio is read in 30 ms frames at 16 kHz. The first frames, covering
    roughly ``BARGE_IN_WARMUP`` seconds, are ignored. After that a frame
    counts as speech when its mean absolute amplitude reaches
    ``BARGE_IN_THRESHOLD`` and, if ``webrtcvad`` is available, the VAD also
    classifies it as speech. Eight speech frames in a row trigger
    ``stop_speaking()``.

    Args:
        still_alive: Zero-argument callable; listening ends once it returns
            a falsy value.

    Returns:
        True if playback was interrupted, otherwise False. False is also
        returned when the audio libraries cannot be imported, the microphone
        cannot be opened, or an error occurs while listening; these problems
        are logged rather than raised.
    """
    try:
        import pyaudio
        import numpy as np
    except ImportError as e:
        # Without these libraries barge-in is impossible; playback carries on.
        log.warning(f"Barge-in: не удалось импортировать зависимости: {e}")
        return False

    try:
        import webrtcvad
        vad = webrtcvad.Vad(3)  # highest aggressiveness: fewer false triggers on echo
    except Exception:
        vad = None  # fall back to the amplitude check alone

    SR = 16000
    FRAME_MS = 30
    FRAME_SAMPLES = int(SR * FRAME_MS / 1000)
    warmup_frames = int(BARGE_IN_WARMUP * 1000 / FRAME_MS)
    required_speech_frames = 8  # ~240 ms in a row

    audio = None
    stream = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=SR,
                            input=True, frames_per_buffer=FRAME_SAMPLES)
    except Exception as e:
        log.warning(f"Barge-in: не открылся мик: {e}")
        # Release the PyAudio instance if it was created before the failure.
        _release_barge_in_audio(audio, stream)
        return False

    interrupted = False
    speech_streak = 0
    i = 0
    try:
        while still_alive():
            data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
            i += 1
            if i < warmup_frames:
                continue
            amplitude = float(np.abs(np.frombuffer(data, dtype=np.int16)).mean())
            if amplitude < BARGE_IN_THRESHOLD:
                speech_streak = 0
                continue
            if vad is None or vad.is_speech(data, SR):
                speech_streak += 1
                if speech_streak >= required_speech_frames:
                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
                    stop_speaking()
                    interrupted = True
                    break
            else:
                speech_streak = 0
    except Exception:
        log.exception("Barge-in ошибка")
    finally:
        _release_barge_in_audio(audio, stream)
    return interrupted


def _release_barge_in_audio(audio, stream) -> None:
    """Release the microphone stream and PyAudio instance, best effort.

    Either argument may be None. Each step runs on its own so that a failure
    in one does not prevent the remaining resources from being released.
    """
    steps = []
    if stream is not None:
        steps.extend((stream.stop_stream, stream.close))
    if audio is not None:
        steps.append(audio.terminate)
    for step in steps:
        try:
            step()
        except Exception as e:
            log.warning(f"Barge-in: ошибка при освобождении аудио: {e}")


def _play_sound_file(filename: str, wait: bool = False):
    """Play a file from the ``sounds`` directory next to this package via mpv.

    The player binary comes from ``MPV_PATH`` (default ``mpv``). When
    ``AUDIO_SINK`` is set, the sound is sent to that PulseAudio device, the
    same device selection ``_mpv_cmd`` applies to speech, so notification
    sounds and speech come out of the same output.

    Args:
        filename: File name inside the ``sounds`` directory.
        wait: If True, block until playback finishes; otherwise start the
            player and return immediately.
    """
    sounds_dir = os.path.join(os.path.dirname(__file__), "..", "sounds")
    path = os.path.normpath(os.path.join(sounds_dir, filename))
    mpv_bin = os.getenv("MPV_PATH", "mpv")
    cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal"]
    if AUDIO_SINK:
        cmd.append(f"--audio-device=pulse/{AUDIO_SINK}")
    cmd.append(path)
    if wait:
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    else:
        subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


def play_activation_sound():
    """Start the activation chime without blocking; a failure is only logged."""
    try:
        _play_sound_file("Success_Cosmo.mp3")
    except Exception as err:
        log.warning(f"Ошибка звука активации: {err}")


def play_error_sound():
    """Start the error sound without blocking; a failure is only logged."""
    try:
        _play_sound_file("Error_Cosmo.mp3", wait=False)
    except Exception as err:
        log.warning(f"Ошибка звука ошибки: {err}")