Switch wake word from Porcupine to openwakeword + training pipeline

- Add training/ pipeline (step_1..step_5) and own-samples flow
- record_wav.py: single-shot and long-record modes, RMS-based silence filter
- remove_silent.py: drop silent samples and renumber the remaining ones
- modes.py: openwakeword inference with reset() and quiet predictions; Lusya block left commented out for later
- stt.py: drop local faster-whisper fallback, Groq-only
- config.py: remove unused STT_PROVIDER/WHISPER_*
- llm.py: replace __import__("os") hack with a proper import
- tts.py: remove debug traceback in play_error_sound
- requirements.txt: add openwakeword/sounddevice/scipy, drop faster-whisper
- deploy/setup.sh: validate that ELEVENLABS_API_KEY and WAKE_WORD_COSMO are present
- README.md, CLAUDE.md, project_roadmap memory: updated to reflect the new architecture
2026-04-13 15:40:44 +03:00
parent 0a89bf5105
commit 780f6f0084
13 changed files with 378 additions and 140 deletions
--- a/satellite/config.py
+++ b/satellite/config.py
@@ -63,11 +63,6 @@ AGENTS = {
    },
 }

-# STT
-STT_PROVIDER  = os.getenv("STT_PROVIDER", "groq")
-WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
-WHISPER_LANG  = os.getenv("WHISPER_LANGUAGE", "ru")
-
 # Audio (на Pi: PulseAudio BT sink)
 AUDIO_SINK = os.getenv("AUDIO_SINK", "")

--- a/satellite/llm.py
+++ b/satellite/llm.py
@@ -1,14 +1,15 @@
 import json
+import os
 import re
 import requests
 from datetime import date

-from .config import GATEWAY_URL, VOICE_MODEL, AGENT, AGENTS, log
+from .config import AGENTS, log
 from .text import clean_for_speech, find_sentence_end
 from .tts import speak, play_error_sound

 SYSTEM_PROMPT = "Отвечай кратко, 1-2 предложения, без markdown, без эмодзи."
-MAX_HISTORY = int(__import__("os").getenv("MAX_HISTORY", "20"))
+MAX_HISTORY = int(os.getenv("MAX_HISTORY", "20"))

 RESET_PATTERNS = re.compile(
    r"(начни|начать|создай|открой|давай).{0,10}(новую|новый|чистую|чистый).{0,10}(сессию|сессия|диалог|разговор|чат)"
--- a/satellite/modes.py
+++ b/satellite/modes.py
@@ -75,83 +75,67 @@ def run_with_enter():


 def run_with_porcupine():
-    """Режим продакшн — два wake word через Porcupine (для Pi)"""
-    import pvporcupine
-    import struct
-
-    from .config import AGENTS
-
-    porcupine_key    = os.getenv("PORCUPINE_KEY")
-    wake_word_cosmo  = os.getenv("WAKE_WORD_COSMO")
-    wake_word_lusya  = os.getenv("WAKE_WORD_LUSYA")
-
-    if not porcupine_key:
-        print("❌ PORCUPINE_KEY не задан в .env")
-        sys.exit(1)
-
-    keyword_paths = []
-    wake_word_map = []
-
-    if wake_word_cosmo:
-        keyword_paths.append(wake_word_cosmo)
-        wake_word_map.append("cosmo")
-    if wake_word_lusya:
-        keyword_paths.append(wake_word_lusya)
-        wake_word_map.append("lusya")
-
-    if not keyword_paths:
-        print("❌ WAKE_WORD_COSMO или WAKE_WORD_LUSYA не заданы в .env")
-        sys.exit(1)
-
+    import numpy as np
    import pyaudio
+    from openwakeword.model import Model

-    porcupine = pvporcupine.create(
-        access_key=porcupine_key,
-        keyword_paths=keyword_paths,
+    cosmo_model = Model(
+        wakeword_models=[os.getenv("WAKE_WORD_COSMO")],
+        inference_framework="onnx",
    )
+    # TODO: подключить Люсю — раскомментировать когда модель lusya обучена
+    # lusya_model = Model(
+    #     wakeword_models=[os.getenv("WAKE_WORD_LUSYA")],
+    #     inference_framework="onnx",
+    # )

    audio = pyaudio.PyAudio()
-    stream = audio.open(
-        rate=porcupine.sample_rate,
-        channels=1,
-        format=pyaudio.paInt16,
-        input=True,
-        frames_per_buffer=porcupine.frame_length,
-    )
+    # OpenWakeWord ожидает 16 kHz mono PCM 16-bit, фреймы по 1280 семплов (80 мс)
+    stream = audio.open(rate=16000, channels=1, format=pyaudio.paInt16,
+                        input=True, frames_per_buffer=1280)

-    print("\n🦞 Cosmo Satellite запущен (режим: wake word)")
-    for agent_id in wake_word_map:
-        cfg = AGENTS[agent_id]
-        print(f"   {cfg['name']:6s} : {cfg['gateway_url']} → {cfg['agent']}")
-    print(f"\nСкажи 'Космо' или 'Люся'...\n")
+    print("✅ Слушаю через OpenWakeWord...")
+    print("\nСкажи 'Космо'...\n")
+    # print("\nСкажи 'Космо' или 'Люся'...\n")  # TODO: после подключения Люси

    try:
        while True:
            try:
-                pcm = stream.read(porcupine.frame_length)
-                pcm = struct.unpack_from("h" * porcupine.frame_length, pcm)
+                pcm = stream.read(1280, exception_on_overflow=False)
+                pcm = np.frombuffer(pcm, dtype=np.int16)

-                keyword_index = porcupine.process(pcm)
-                if keyword_index >= 0:
-                    agent_id = wake_word_map[keyword_index]
-                    agent_name = AGENTS[agent_id]["name"]
-                    stop_speaking()  # barge-in
-                    print(f"✅ Услышал '{agent_name}'!")
+                cosmo_score = cosmo_model.predict(pcm)["cosmo"]
+                if cosmo_score > 0.1:
+                    print(f"PREDICTION cosmo: {cosmo_score:.3f}")

-                    # отпускаем микрофон на время диалога
+                if cosmo_score > 0.5:
+                    print("✅ Услышал 'Космо'!")
                    stream.stop_stream()
-                    _conversation_loop(agent_id, agent_name)
+                    _conversation_loop("cosmo", "Cosmo")
+                    cosmo_model.reset()
                    stream.start_stream()
+                    continue
+
+                # TODO: Люся — раскомментировать когда модель готова
+                # lusya_score = lusya_model.predict(pcm)["lusya"]
+                # if lusya_score > 0.1:
+                #     print(f"PREDICTION lusya: {lusya_score:.3f}")
+                # if lusya_score > 0.5:
+                #     print("✅ Услышала 'Люся'!")
+                #     stream.stop_stream()
+                #     _conversation_loop("lusya", "Люся")
+                #     lusya_model.reset()
+                #     stream.start_stream()
+                #     continue

            except KeyboardInterrupt:
                raise
            except Exception as e:
-                log.exception("Непредвиденная ошибка в цикле Porcupine")
+                log.exception("Непредвиденная ошибка в wake-word цикле")
                print(f"⚠️  Ошибка: {e} — продолжаю слушать...\n")

    except KeyboardInterrupt:
        print("\n👋 Выход")
    finally:
-        stream.stop_stream()
+        stream.close()
        audio.terminate()
-        porcupine.delete()
--- a/satellite/stt.py
+++ b/satellite/stt.py
@@ -1,23 +1,11 @@
 import io
 import wave

-from .config import groq_client, STT_PROVIDER, WHISPER_MODEL, WHISPER_LANG, log
-
-
-def transcribe_groq_bytes(wav_bytes: bytes) -> str:
-    """Отправляет WAV байты в Groq без записи на диск"""
-    buf = io.BytesIO(wav_bytes)
-    buf.name = "audio.wav"
-    result = groq_client.audio.transcriptions.create(
-        file=buf,
-        model="whisper-large-v3-turbo",
-        language="ru",
-    )
-    return result.text
+from .config import groq_client, log


 def frames_to_wav(frames: list[bytes]) -> bytes:
-    """Конвертирует сырые PCM фреймы в WAV в памяти"""
+    """Сырые PCM-фреймы → WAV в памяти (без диска)."""
    buf = io.BytesIO()
    wf = wave.open(buf, "wb")
    wf.setnchannels(1)
@@ -29,26 +17,17 @@ def frames_to_wav(frames: list[bytes]) -> bytes:


 def transcribe(frames: list[bytes]) -> str:
-    """Транскрибирует аудио фреймы — всё в памяти, без диска"""
+    """STT через Groq whisper-large-v3-turbo. Всё в памяти."""
    try:
        wav_bytes = frames_to_wav(frames)
-
-        if STT_PROVIDER == "groq":
-            return transcribe_groq_bytes(wav_bytes)
-
-        # Whisper fallback — нужен файл на диске
-        import tempfile
-        import os
-        from faster_whisper import WhisperModel
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            f.write(wav_bytes)
-            tmp_path = f.name
-        try:
-            model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
-            segments, _ = model.transcribe(tmp_path, language=WHISPER_LANG)
-            return " ".join(s.text for s in segments).strip()
-        finally:
-            os.unlink(tmp_path)
+        buf = io.BytesIO(wav_bytes)
+        buf.name = "audio.wav"
+        result = groq_client.audio.transcriptions.create(
+            file=buf,
+            model="whisper-large-v3-turbo",
+            language="ru",
+        )
+        return result.text
    except Exception as e:
        log.exception("STT ошибка")
        print(f"⚠️  Ошибка распознавания речи: {e}")
--- a/satellite/tts.py
+++ b/satellite/tts.py
@@ -133,9 +133,6 @@ def play_activation_sound():

 def play_error_sound():
    """Звук ошибки — 'не получилось'"""
-    import traceback
-    print("🔴 play_error_sound вызван из:")
-    traceback.print_stack()
    try:
        _play_sound_file("Error_Cosmo.mp3")
    except Exception as e: