Files
cosmo-voice-assistant/cosmo/tts.py
Daniil Klimov 110d9cde29 Mac M1 optimizations, fix train pipeline, add Hey Cosmo wake word model
- Fix install_mac.sh: use venv + Python 3.12 (3.14 incompatible with ML libs)
- Fix run_mac.sh: activate venv, add CPU thread optimization env vars
- Fix agent.py: remove f-string from SYSTEM_PROMPT template (NameError on import)
- Add missing deps: sounddevice, pydub, imageio-ffmpeg, omegaconf
- Optimize for M1: torch.inference_mode, set_num_threads, OMP/MKL tuning
- Switch to qwen2.5:3b for faster LLM responses on Mac
- Switch Whisper to medium model with auto compute (small+int8 had poor Russian)
- Add initial_prompt for better Russian transcription
- Add open_app tool for native macOS app launching
- Fix TTS: sanitize Latin text to Cyrillic for Silero compatibility
- Fix wake word echo: add cooldown after TTS, reset model state, raise threshold
- Make "Слушаю" TTS synchronous to avoid mic interference
- Fix train Dockerfile: remove tensorflow/onnx2tf (only ONNX needed), fix deps
- Fix train.sh: use wget for dataset download, add --shm-size=2g
- Add trained hey_cosmo.onnx wake word model
- Add TODO section to CLAUDE.md (ChatterBox TTS, Ollama Modelfile ideas)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 11:19:53 +03:00

101 lines
3.8 KiB
Python

"""
TTS модуль на базе Silero V4 (torch.hub) + sounddevice.
Silero — лучший русскоязычный офлайн TTS.
Модель скачивается автоматически при первом запуске (~50 MB).
"""
import threading
import numpy as np
import sounddevice as sd
from loguru import logger
try:
import torch
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
class TTS:
    """Offline Russian text-to-speech: Silero (loaded via ``torch.hub``) + sounddevice.

    Configuration is read from two optional sections of ``config``:

    * ``tts``: ``enabled`` (default ``True``), ``silero_speaker`` (default
      ``"xenia"``) and ``sample_rate`` (default ``48000``).
    * ``performance``: ``num_threads`` (default ``4``), passed to
      ``torch.set_num_threads``.

    If TTS is disabled, torch is not installed, or the model fails to load,
    the instance stays usable: ``say`` only logs the text instead of speaking.
    """

    # Case-insensitive (pattern, replacement) pairs for frequent application
    # names, so they are spoken in Cyrillic instead of being stripped along
    # with the rest of the Latin text.
    _TRANSLITERATIONS = (
        (r'safari', 'Сафари'),
        (r'chrome', 'Хром'),
        (r'telegram', 'Телеграм'),
        (r'webstorm', 'ВебШторм'),
        (r'vs\s?code', 'ВиЭс Код'),
    )

    # Spoken when nothing pronounceable is left after sanitizing.
    _FALLBACK_PHRASE = "Готово"

    def __init__(self, config: dict):
        """Read settings from ``config`` and load the model when TTS is usable.

        Args:
            config: Application configuration mapping. Missing or null
                ``tts`` / ``performance`` sections fall back to defaults.
        """
        # ``or {}`` also covers a section that is present but null
        # (e.g. an empty YAML section), which ``.get(key, {})`` does not.
        tts_cfg = config.get("tts") or {}
        self.enabled = tts_cfg.get("enabled", True)
        self.speaker = tts_cfg.get("silero_speaker", "xenia")
        self.sample_rate = tts_cfg.get("sample_rate", 48000)
        # Serializes synthesis + playback between concurrent say() callers.
        self._lock = threading.Lock()
        self._model = None

        if not self.enabled:
            return

        if not TORCH_AVAILABLE:
            logger.warning("torch не установлен — TTS отключён")
            self.enabled = False
            return

        # CPU inference tuning (added for Apple Silicon).
        num_threads = (config.get("performance") or {}).get("num_threads", 4)
        torch.set_num_threads(num_threads)
        self._load_model()

    def _load_model(self):
        """Load the Silero ``v4_ru`` model through ``torch.hub``.

        Any failure is logged and disables TTS instead of propagating, so the
        rest of the application keeps running without speech output.
        """
        try:
            logger.info(f"Загружаю Silero TTS (голос: {self.speaker}, {self.sample_rate} Hz)...")
            # torch.hub caches the model in ~/.cache/torch/hub
            model, _ = torch.hub.load(
                repo_or_dir="snakers4/silero-models",
                model="silero_tts",
                language="ru",
                speaker="v4_ru",
                trust_repo=True,
            )
            self._model = model
            logger.info("Silero TTS готов")
        except Exception as e:
            # Best-effort boundary: a broken download/model must not crash startup.
            logger.error(f"Ошибка загрузки Silero TTS: {e}")
            logger.warning("TTS отключён")
            self.enabled = False

    @staticmethod
    def _sanitize_text(text: str) -> str:
        """Replace or remove Latin text so the Russian model can read the result.

        Known application names are transliterated to Cyrillic regardless of
        letter case; every remaining run of Latin letters is removed and
        whitespace is collapsed.

        Args:
            text: Text to prepare. ``None`` or an empty string is accepted.

        Returns:
            The sanitized text, or the fallback phrase "Готово" when nothing
            is left to pronounce.
        """
        import re

        if not text:
            return TTS._FALLBACK_PHRASE

        # Transliterate frequent English names before the generic Latin strip
        # below would delete them.
        for pattern, replacement in TTS._TRANSLITERATIONS:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        # Drop the remaining Latin words so that Silero does not hang on them.
        text = re.sub(r'[A-Za-z]+', '', text)
        # Collapse the gaps left behind by the removals.
        text = re.sub(r'\s+', ' ', text).strip()
        return text if text else TTS._FALLBACK_PHRASE

    def say(self, text: str):
        """Speak ``text`` synchronously.

        When TTS is disabled or no model is loaded, the text is only logged.
        Otherwise the text is sanitized, synthesized and played; the call
        returns after ``sd.wait()``. Synthesis and playback errors are logged
        rather than raised.

        Args:
            text: Text to pronounce.
        """
        if not self.enabled or self._model is None:
            logger.info(f"[TTS]: {text}")
            return

        text = self._sanitize_text(text)
        logger.debug(f"TTS: '{text}'")

        # Hold the lock for synthesis and playback so concurrent callers
        # (e.g. say_async threads) speak one after another.
        with self._lock:
            try:
                with torch.inference_mode():
                    audio = self._model.apply_tts(
                        text=text,
                        speaker=self.speaker,
                        sample_rate=self.sample_rate,
                    )
                # Use .numpy() when the result offers it; otherwise convert generically.
                audio_np = audio.numpy() if hasattr(audio, "numpy") else np.array(audio)
                sd.play(audio_np, samplerate=self.sample_rate)
                sd.wait()
            except Exception as e:
                # Best-effort: a failed utterance must not take down the caller.
                logger.error(f"Ошибка TTS: {e}")

    def say_async(self, text: str):
        """Speak ``text`` in a background daemon thread and return immediately.

        Args:
            text: Text to pronounce; handled exactly as in ``say``.
        """
        t = threading.Thread(target=self.say, args=(text,), daemon=True)
        t.start()