- Fix install_mac.sh: use venv + Python 3.12 (3.14 incompatible with ML libs) - Fix run_mac.sh: activate venv, add CPU thread optimization env vars - Fix agent.py: remove f-string from SYSTEM_PROMPT template (NameError on import) - Add missing deps: sounddevice, pydub, imageio-ffmpeg, omegaconf - Optimize for M1: torch.inference_mode, set_num_threads, OMP/MKL tuning - Switch to qwen2.5:3b for faster LLM responses on Mac - Switch Whisper to medium model with auto compute (small+int8 had poor Russian) - Add initial_prompt for better Russian transcription - Add open_app tool for native macOS app launching - Fix TTS: sanitize Latin text to Cyrillic for Silero compatibility - Fix wake word echo: add cooldown after TTS, reset model state, raise threshold - Make "Слушаю" TTS synchronous to avoid mic interference - Fix train Dockerfile: remove tensorflow/onnx2tf (only ONNX needed), fix deps - Fix train.sh: use wget for dataset download, add --shm-size=2g - Add trained hey_cosmo.onnx wake word model - Add TODO section to CLAUDE.md (ChatterBox TTS, Ollama Modelfile ideas) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
90 lines
3.4 KiB
Python
90 lines
3.4 KiB
Python
"""
|
||
STT module built on RealtimeSTT.
Uses faster-whisper + Silero VAD under the hood.
Supports streaming: partial transcriptions are available while the user is speaking.
|
||
"""
|
||
|
||
import threading
from collections.abc import Callable

from RealtimeSTT import AudioToTextRecorder
from loguru import logger
|
||
|
||
|
||
class Transcriber:
|
||
def __init__(self, config: dict):
|
||
whisper_cfg = config["whisper"]
|
||
audio_cfg = config["audio"]
|
||
|
||
self._recorder: AudioToTextRecorder | None = None
|
||
self._config = {
|
||
"model": whisper_cfg["model_size"],
|
||
"language": whisper_cfg["language"],
|
||
"device": whisper_cfg["device"],
|
||
"compute_type": whisper_cfg["compute_type"],
|
||
# Подсказка для Whisper — улучшает распознавание русского
|
||
"initial_prompt": whisper_cfg.get("initial_prompt", ""),
|
||
# Silero VAD параметры
|
||
"silero_sensitivity": 0.4,
|
||
"webrtc_sensitivity": 3,
|
||
"post_speech_silence_duration": audio_cfg["silence_duration"],
|
||
"min_length_of_recording": 0.5,
|
||
"min_gap_between_recordings": 0.01,
|
||
# Отключаем wake word в RealtimeSTT — используем свой
|
||
"wakeword_backend": "none",
|
||
# Не запускать в режиме непрерывного прослушивания
|
||
"use_microphone": True,
|
||
"spinner": False,
|
||
"level": 0, # минимальный лог уровень внутри RealtimeSTT
|
||
}
|
||
|
||
logger.info(
|
||
f"Инициализирую RealtimeSTT: модель={whisper_cfg['model_size']}, "
|
||
f"device={whisper_cfg['device']}, compute={whisper_cfg['compute_type']}"
|
||
)
|
||
self._init_recorder()
|
||
|
||
def _init_recorder(self):
|
||
try:
|
||
self._recorder = AudioToTextRecorder(**self._config)
|
||
logger.info("RealtimeSTT готов")
|
||
except Exception as e:
|
||
logger.error(f"Ошибка инициализации RealtimeSTT: {e}")
|
||
raise
|
||
|
||
def record_and_transcribe(self, on_partial: callable = None) -> str:
|
||
"""
|
||
Записывает команду и транскрибирует.
|
||
on_partial(text) — опциональный колбэк для частичных результатов.
|
||
Возвращает финальный текст.
|
||
"""
|
||
if self._recorder is None:
|
||
self._init_recorder()
|
||
|
||
result_holder = []
|
||
done_event = threading.Event()
|
||
|
||
def on_text(text: str):
|
||
result_holder.append(text)
|
||
done_event.set()
|
||
|
||
# Partial results — показываем что слышим в реальном времени
|
||
if on_partial:
|
||
self._recorder.on_realtime_transcription_update = on_partial
|
||
|
||
logger.info("Слушаю команду...")
|
||
self._recorder.text(on_text)
|
||
done_event.wait(timeout=12.0)
|
||
|
||
text = result_holder[0].strip() if result_holder else ""
|
||
if text:
|
||
logger.info(f"Транскрипция: '{text}'")
|
||
else:
|
||
logger.info("Команда не распознана (тишина или таймаут)")
|
||
return text
|
||
|
||
def shutdown(self):
|
||
if self._recorder:
|
||
try:
|
||
self._recorder.shutdown()
|
||
except Exception:
|
||
pass
|