Switch wake-word detection from Porcupine to openwakeword and add a training pipeline
- Add training/ pipeline (step_1..step_5) and own-samples flow
- record_wav.py with single-shot and long-record modes, RMS-based silence filter
- remove_silent.py to drop silent samples and renumber
- modes.py: openwakeword inference with reset() after each conversation; prediction scores are printed only above 0.1; Lusya block left commented out for later
- stt.py: drop local faster-whisper fallback, Groq-only
- config.py: remove unused STT_PROVIDER/WHISPER_*
- llm.py: replace __import__("os") hack with proper import
- tts.py: remove debug traceback in play_error_sound
- requirements.txt: add openwakeword/sounddevice/scipy, drop faster-whisper
- deploy/setup.sh: validate ELEVENLABS_API_KEY and WAKE_WORD_COSMO presence
- README.md, CLAUDE.md, project_roadmap memory updated to reflect new architecture
This commit is contained in:
@@ -63,11 +63,6 @@ AGENTS = {
|
||||
},
|
||||
}
|
||||
|
||||
# STT
|
||||
STT_PROVIDER = os.getenv("STT_PROVIDER", "groq")
|
||||
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
|
||||
WHISPER_LANG = os.getenv("WHISPER_LANGUAGE", "ru")
|
||||
|
||||
# Audio (на Pi: PulseAudio BT sink)
|
||||
AUDIO_SINK = os.getenv("AUDIO_SINK", "")
|
||||
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
from datetime import date
|
||||
|
||||
from .config import GATEWAY_URL, VOICE_MODEL, AGENT, AGENTS, log
|
||||
from .config import AGENTS, log
|
||||
from .text import clean_for_speech, find_sentence_end
|
||||
from .tts import speak, play_error_sound
|
||||
|
||||
SYSTEM_PROMPT = "Отвечай кратко, 1-2 предложения, без markdown, без эмодзи."
|
||||
MAX_HISTORY = int(__import__("os").getenv("MAX_HISTORY", "20"))
|
||||
MAX_HISTORY = int(os.getenv("MAX_HISTORY", "20"))
|
||||
|
||||
RESET_PATTERNS = re.compile(
|
||||
r"(начни|начать|создай|открой|давай).{0,10}(новую|новый|чистую|чистый).{0,10}(сессию|сессия|диалог|разговор|чат)"
|
||||
|
||||
@@ -75,83 +75,67 @@ def run_with_enter():
|
||||
|
||||
|
||||
def run_with_porcupine():
|
||||
"""Режим продакшн — два wake word через Porcupine (для Pi)"""
|
||||
import pvporcupine
|
||||
import struct
|
||||
|
||||
from .config import AGENTS
|
||||
|
||||
porcupine_key = os.getenv("PORCUPINE_KEY")
|
||||
wake_word_cosmo = os.getenv("WAKE_WORD_COSMO")
|
||||
wake_word_lusya = os.getenv("WAKE_WORD_LUSYA")
|
||||
|
||||
if not porcupine_key:
|
||||
print("❌ PORCUPINE_KEY не задан в .env")
|
||||
sys.exit(1)
|
||||
|
||||
keyword_paths = []
|
||||
wake_word_map = []
|
||||
|
||||
if wake_word_cosmo:
|
||||
keyword_paths.append(wake_word_cosmo)
|
||||
wake_word_map.append("cosmo")
|
||||
if wake_word_lusya:
|
||||
keyword_paths.append(wake_word_lusya)
|
||||
wake_word_map.append("lusya")
|
||||
|
||||
if not keyword_paths:
|
||||
print("❌ WAKE_WORD_COSMO или WAKE_WORD_LUSYA не заданы в .env")
|
||||
sys.exit(1)
|
||||
|
||||
import numpy as np
|
||||
import pyaudio
|
||||
from openwakeword.model import Model
|
||||
|
||||
porcupine = pvporcupine.create(
|
||||
access_key=porcupine_key,
|
||||
keyword_paths=keyword_paths,
|
||||
cosmo_model = Model(
|
||||
wakeword_models=[os.getenv("WAKE_WORD_COSMO")],
|
||||
inference_framework="onnx",
|
||||
)
|
||||
# TODO: подключить Люсю — раскомментировать когда модель lusya обучена
|
||||
# lusya_model = Model(
|
||||
# wakeword_models=[os.getenv("WAKE_WORD_LUSYA")],
|
||||
# inference_framework="onnx",
|
||||
# )
|
||||
|
||||
audio = pyaudio.PyAudio()
|
||||
stream = audio.open(
|
||||
rate=porcupine.sample_rate,
|
||||
channels=1,
|
||||
format=pyaudio.paInt16,
|
||||
input=True,
|
||||
frames_per_buffer=porcupine.frame_length,
|
||||
)
|
||||
# OpenWakeWord ожидает 16 kHz mono PCM 16-bit, фреймы по 1280 семплов (80 мс)
|
||||
stream = audio.open(rate=16000, channels=1, format=pyaudio.paInt16,
|
||||
input=True, frames_per_buffer=1280)
|
||||
|
||||
print("\n🦞 Cosmo Satellite запущен (режим: wake word)")
|
||||
for agent_id in wake_word_map:
|
||||
cfg = AGENTS[agent_id]
|
||||
print(f" {cfg['name']:6s} : {cfg['gateway_url']} → {cfg['agent']}")
|
||||
print(f"\nСкажи 'Космо' или 'Люся'...\n")
|
||||
print("✅ Слушаю через OpenWakeWord...")
|
||||
print("\nСкажи 'Космо'...\n")
|
||||
# print("\nСкажи 'Космо' или 'Люся'...\n") # TODO: после подключения Люси
|
||||
|
||||
try:
|
||||
while True:
|
||||
try:
|
||||
pcm = stream.read(porcupine.frame_length)
|
||||
pcm = struct.unpack_from("h" * porcupine.frame_length, pcm)
|
||||
pcm = stream.read(1280, exception_on_overflow=False)
|
||||
pcm = np.frombuffer(pcm, dtype=np.int16)
|
||||
|
||||
keyword_index = porcupine.process(pcm)
|
||||
if keyword_index >= 0:
|
||||
agent_id = wake_word_map[keyword_index]
|
||||
agent_name = AGENTS[agent_id]["name"]
|
||||
stop_speaking() # barge-in
|
||||
print(f"✅ Услышал '{agent_name}'!")
|
||||
cosmo_score = cosmo_model.predict(pcm)["cosmo"]
|
||||
if cosmo_score > 0.1:
|
||||
print(f"PREDICTION cosmo: {cosmo_score:.3f}")
|
||||
|
||||
# отпускаем микрофон на время диалога
|
||||
if cosmo_score > 0.5:
|
||||
print("✅ Услышал 'Космо'!")
|
||||
stream.stop_stream()
|
||||
_conversation_loop(agent_id, agent_name)
|
||||
_conversation_loop("cosmo", "Cosmo")
|
||||
cosmo_model.reset()
|
||||
stream.start_stream()
|
||||
continue
|
||||
|
||||
# TODO: Люся — раскомментировать когда модель готова
|
||||
# lusya_score = lusya_model.predict(pcm)["lusya"]
|
||||
# if lusya_score > 0.1:
|
||||
# print(f"PREDICTION lusya: {lusya_score:.3f}")
|
||||
# if lusya_score > 0.5:
|
||||
# print("✅ Услышала 'Люся'!")
|
||||
# stream.stop_stream()
|
||||
# _conversation_loop("lusya", "Люся")
|
||||
# lusya_model.reset()
|
||||
# stream.start_stream()
|
||||
# continue
|
||||
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except Exception as e:
|
||||
log.exception("Непредвиденная ошибка в цикле Porcupine")
|
||||
log.exception("Непредвиденная ошибка в wake-word цикле")
|
||||
print(f"⚠️ Ошибка: {e} — продолжаю слушать...\n")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n👋 Выход")
|
||||
finally:
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
audio.terminate()
|
||||
porcupine.delete()
|
||||
|
||||
@@ -1,23 +1,11 @@
|
||||
import io
|
||||
import wave
|
||||
|
||||
from .config import groq_client, STT_PROVIDER, WHISPER_MODEL, WHISPER_LANG, log
|
||||
|
||||
|
||||
def transcribe_groq_bytes(wav_bytes: bytes) -> str:
|
||||
"""Отправляет WAV байты в Groq без записи на диск"""
|
||||
buf = io.BytesIO(wav_bytes)
|
||||
buf.name = "audio.wav"
|
||||
result = groq_client.audio.transcriptions.create(
|
||||
file=buf,
|
||||
model="whisper-large-v3-turbo",
|
||||
language="ru",
|
||||
)
|
||||
return result.text
|
||||
from .config import groq_client, log
|
||||
|
||||
|
||||
def frames_to_wav(frames: list[bytes]) -> bytes:
|
||||
"""Конвертирует сырые PCM фреймы в WAV в памяти"""
|
||||
"""Сырые PCM-фреймы → WAV в памяти (без диска)."""
|
||||
buf = io.BytesIO()
|
||||
wf = wave.open(buf, "wb")
|
||||
wf.setnchannels(1)
|
||||
@@ -29,26 +17,17 @@ def frames_to_wav(frames: list[bytes]) -> bytes:
|
||||
|
||||
|
||||
def transcribe(frames: list[bytes]) -> str:
|
||||
"""Транскрибирует аудио фреймы — всё в памяти, без диска"""
|
||||
"""STT через Groq whisper-large-v3-turbo. Всё в памяти."""
|
||||
try:
|
||||
wav_bytes = frames_to_wav(frames)
|
||||
|
||||
if STT_PROVIDER == "groq":
|
||||
return transcribe_groq_bytes(wav_bytes)
|
||||
|
||||
# Whisper fallback — нужен файл на диске
|
||||
import tempfile
|
||||
import os
|
||||
from faster_whisper import WhisperModel
|
||||
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
||||
f.write(wav_bytes)
|
||||
tmp_path = f.name
|
||||
try:
|
||||
model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
|
||||
segments, _ = model.transcribe(tmp_path, language=WHISPER_LANG)
|
||||
return " ".join(s.text for s in segments).strip()
|
||||
finally:
|
||||
os.unlink(tmp_path)
|
||||
buf = io.BytesIO(wav_bytes)
|
||||
buf.name = "audio.wav"
|
||||
result = groq_client.audio.transcriptions.create(
|
||||
file=buf,
|
||||
model="whisper-large-v3-turbo",
|
||||
language="ru",
|
||||
)
|
||||
return result.text
|
||||
except Exception as e:
|
||||
log.exception("STT ошибка")
|
||||
print(f"⚠️ Ошибка распознавания речи: {e}")
|
||||
|
||||
@@ -133,9 +133,6 @@ def play_activation_sound():
|
||||
|
||||
def play_error_sound():
|
||||
"""Звук ошибки — 'не получилось'"""
|
||||
import traceback
|
||||
print("🔴 play_error_sound вызван из:")
|
||||
traceback.print_stack()
|
||||
try:
|
||||
_play_sound_file("Error_Cosmo.mp3")
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user