Switch wake-word detection from Porcupine to openwakeword and add a training pipeline

- Add training/ pipeline (step_1..step_5) and own-samples flow
- record_wav.py with single-shot and long-record modes, RMS-based silence filter
- remove_silent.py to drop silent samples and renumber
- modes.py: openwakeword inference with reset() after each dialog and predictions printed only above a 0.1 score; Lusya block left commented out until its model is trained
- stt.py: drop local faster-whisper fallback, Groq-only
- config.py: remove unused STT_PROVIDER/WHISPER_*
- llm.py: replace __import__("os") hack with proper import
- tts.py: remove debug traceback in play_error_sound
- requirements.txt: add openwakeword/sounddevice/scipy, drop faster-whisper
- deploy/setup.sh: validate ELEVENLABS_API_KEY and WAKE_WORD_COSMO presence
- README.md, CLAUDE.md, project_roadmap memory updated to reflect new architecture
This commit is contained in:
2026-04-13 15:40:44 +03:00
parent 0a89bf5105
commit 780f6f0084
13 changed files with 378 additions and 140 deletions

View File

@@ -63,11 +63,6 @@ AGENTS = {
},
}
# STT
STT_PROVIDER = os.getenv("STT_PROVIDER", "groq")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
WHISPER_LANG = os.getenv("WHISPER_LANGUAGE", "ru")
# Audio (на Pi: PulseAudio BT sink)
AUDIO_SINK = os.getenv("AUDIO_SINK", "")

View File

@@ -1,14 +1,15 @@
import json
import os
import re
import requests
from datetime import date
from .config import GATEWAY_URL, VOICE_MODEL, AGENT, AGENTS, log
from .config import AGENTS, log
from .text import clean_for_speech, find_sentence_end
from .tts import speak, play_error_sound
SYSTEM_PROMPT = "Отвечай кратко, 1-2 предложения, без markdown, без эмодзи."
MAX_HISTORY = int(__import__("os").getenv("MAX_HISTORY", "20"))
MAX_HISTORY = int(os.getenv("MAX_HISTORY", "20"))
RESET_PATTERNS = re.compile(
r"(начни|начать|создай|открой|давай).{0,10}(новую|новый|чистую|чистый).{0,10}(сессию|сессия|диалог|разговор|чат)"

View File

@@ -75,83 +75,67 @@ def run_with_enter():
def run_with_porcupine():
"""Режим продакшн — два wake word через Porcupine (для Pi)"""
import pvporcupine
import struct
from .config import AGENTS
porcupine_key = os.getenv("PORCUPINE_KEY")
wake_word_cosmo = os.getenv("WAKE_WORD_COSMO")
wake_word_lusya = os.getenv("WAKE_WORD_LUSYA")
if not porcupine_key:
print("❌ PORCUPINE_KEY не задан в .env")
sys.exit(1)
keyword_paths = []
wake_word_map = []
if wake_word_cosmo:
keyword_paths.append(wake_word_cosmo)
wake_word_map.append("cosmo")
if wake_word_lusya:
keyword_paths.append(wake_word_lusya)
wake_word_map.append("lusya")
if not keyword_paths:
print("❌ WAKE_WORD_COSMO или WAKE_WORD_LUSYA не заданы в .env")
sys.exit(1)
import numpy as np
import pyaudio
from openwakeword.model import Model
porcupine = pvporcupine.create(
access_key=porcupine_key,
keyword_paths=keyword_paths,
cosmo_model = Model(
wakeword_models=[os.getenv("WAKE_WORD_COSMO")],
inference_framework="onnx",
)
# TODO: подключить Люсю — раскомментировать когда модель lusya обучена
# lusya_model = Model(
# wakeword_models=[os.getenv("WAKE_WORD_LUSYA")],
# inference_framework="onnx",
# )
audio = pyaudio.PyAudio()
stream = audio.open(
rate=porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=porcupine.frame_length,
)
# OpenWakeWord ожидает 16 kHz mono PCM 16-bit, фреймы по 1280 семплов (80 мс)
stream = audio.open(rate=16000, channels=1, format=pyaudio.paInt16,
input=True, frames_per_buffer=1280)
print("\n🦞 Cosmo Satellite запущен (режим: wake word)")
for agent_id in wake_word_map:
cfg = AGENTS[agent_id]
print(f" {cfg['name']:6s} : {cfg['gateway_url']}{cfg['agent']}")
print(f"\nСкажи 'Космо' или 'Люся'...\n")
print("✅ Слушаю через OpenWakeWord...")
print("\nСкажи 'Космо'...\n")
# print("\nСкажи 'Космо' или 'Люся'...\n") # TODO: после подключения Люси
try:
while True:
try:
pcm = stream.read(porcupine.frame_length)
pcm = struct.unpack_from("h" * porcupine.frame_length, pcm)
pcm = stream.read(1280, exception_on_overflow=False)
pcm = np.frombuffer(pcm, dtype=np.int16)
keyword_index = porcupine.process(pcm)
if keyword_index >= 0:
agent_id = wake_word_map[keyword_index]
agent_name = AGENTS[agent_id]["name"]
stop_speaking() # barge-in
print(f"✅ Услышал '{agent_name}'!")
cosmo_score = cosmo_model.predict(pcm)["cosmo"]
if cosmo_score > 0.1:
print(f"PREDICTION cosmo: {cosmo_score:.3f}")
# отпускаем микрофон на время диалога
if cosmo_score > 0.5:
print("✅ Услышал 'Космо'!")
stream.stop_stream()
_conversation_loop(agent_id, agent_name)
_conversation_loop("cosmo", "Cosmo")
cosmo_model.reset()
stream.start_stream()
continue
# TODO: Люся — раскомментировать когда модель готова
# lusya_score = lusya_model.predict(pcm)["lusya"]
# if lusya_score > 0.1:
# print(f"PREDICTION lusya: {lusya_score:.3f}")
# if lusya_score > 0.5:
# print("✅ Услышала 'Люся'!")
# stream.stop_stream()
# _conversation_loop("lusya", "Люся")
# lusya_model.reset()
# stream.start_stream()
# continue
except KeyboardInterrupt:
raise
except Exception as e:
log.exception("Непредвиденная ошибка в цикле Porcupine")
log.exception("Непредвиденная ошибка в wake-word цикле")
print(f"⚠️ Ошибка: {e} — продолжаю слушать...\n")
except KeyboardInterrupt:
print("\n👋 Выход")
finally:
stream.stop_stream()
stream.close()
audio.terminate()
porcupine.delete()

View File

@@ -1,23 +1,11 @@
import io
import wave
from .config import groq_client, STT_PROVIDER, WHISPER_MODEL, WHISPER_LANG, log
def transcribe_groq_bytes(wav_bytes: bytes) -> str:
"""Отправляет WAV байты в Groq без записи на диск"""
buf = io.BytesIO(wav_bytes)
buf.name = "audio.wav"
result = groq_client.audio.transcriptions.create(
file=buf,
model="whisper-large-v3-turbo",
language="ru",
)
return result.text
from .config import groq_client, log
def frames_to_wav(frames: list[bytes]) -> bytes:
"""Конвертирует сырые PCM фреймы в WAV в памяти"""
"""Сырые PCM-фреймы WAV в памяти (без диска)."""
buf = io.BytesIO()
wf = wave.open(buf, "wb")
wf.setnchannels(1)
@@ -29,26 +17,17 @@ def frames_to_wav(frames: list[bytes]) -> bytes:
def transcribe(frames: list[bytes]) -> str:
"""Транскрибирует аудио фреймы — всё в памяти, без диска"""
"""STT через Groq whisper-large-v3-turbo. Всё в памяти."""
try:
wav_bytes = frames_to_wav(frames)
if STT_PROVIDER == "groq":
return transcribe_groq_bytes(wav_bytes)
# Whisper fallback — нужен файл на диске
import tempfile
import os
from faster_whisper import WhisperModel
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
f.write(wav_bytes)
tmp_path = f.name
try:
model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
segments, _ = model.transcribe(tmp_path, language=WHISPER_LANG)
return " ".join(s.text for s in segments).strip()
finally:
os.unlink(tmp_path)
buf = io.BytesIO(wav_bytes)
buf.name = "audio.wav"
result = groq_client.audio.transcriptions.create(
file=buf,
model="whisper-large-v3-turbo",
language="ru",
)
return result.text
except Exception as e:
log.exception("STT ошибка")
print(f"⚠️ Ошибка распознавания речи: {e}")

View File

@@ -133,9 +133,6 @@ def play_activation_sound():
def play_error_sound():
"""Звук ошибки — 'не получилось'"""
import traceback
print("🔴 play_error_sound вызван из:")
traceback.print_stack()
try:
_play_sound_file("Error_Cosmo.mp3")
except Exception as e: