Initial commit: Cosmo Voice Satellite

Two-agent voice assistant (Cosmo + Люся) via OpenClaw Gateway.
Streaming STT (Groq) + LLM + TTS (ElevenLabs) pipeline with
keep-alive sessions, barge-in, and daily conversation sessions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-12 13:34:08 +03:00
commit 7ca8268b78
16 changed files with 1143 additions and 0 deletions

0
satellite/__init__.py Normal file
View File

13
satellite/__main__.py Normal file
View File

@@ -0,0 +1,13 @@
"""
Cosmo Satellite — голосовой клиент для OpenClaw Gateway
Запуск: python -m satellite [--wake]
"""
import sys
from .modes import run_with_enter, run_with_porcupine
if __name__ == "__main__":
if "--wake" in sys.argv:
run_with_porcupine()
else:
run_with_enter()

104
satellite/audio.py Normal file
View File

@@ -0,0 +1,104 @@
import pyaudio
import numpy as np
from .config import SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION, log
from .stt import transcribe
def record() -> str:
"""Запись до тишины (VAD) + STT"""
try:
audio = pyaudio.PyAudio()
stream = audio.open(
format=pyaudio.paInt16,
channels=1,
rate=16000,
input=True,
frames_per_buffer=1024,
)
except Exception as e:
log.exception("Не удалось открыть микрофон")
print(f"⚠️ Ошибка микрофона: {e}")
return ""
print("🎙️ Говори...")
frames = []
silent_chunks = 0
speaking_started = False
max_chunks = int(16000 / 1024 * MAX_DURATION)
silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
try:
for _ in range(max_chunks):
data = stream.read(1024, exception_on_overflow=False)
frames.append(data)
amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
if amplitude > SILENCE_THRESHOLD:
speaking_started = True
silent_chunks = 0
elif speaking_started:
silent_chunks += 1
if silent_chunks >= silence_chunks_needed:
print("🔇 Конец речи")
break
except Exception as e:
log.exception("Ошибка при записи аудио")
print(f"⚠️ Ошибка записи: {e}")
finally:
stream.stop_stream()
audio.terminate()
if not speaking_started:
return ""
return transcribe(frames)
def record_with_timeout(timeout: float = 8.0) -> str:
"""Слушает timeout секунд, возвращает пусто если речи не было"""
try:
audio = pyaudio.PyAudio()
stream = audio.open(
format=pyaudio.paInt16,
channels=1,
rate=16000,
input=True,
frames_per_buffer=1024,
)
except Exception as e:
log.exception("Не удалось открыть микрофон (followup)")
print(f"⚠️ Ошибка микрофона: {e}")
return ""
frames = []
silent_chunks = 0
speaking_started = False
max_chunks = int(16000 / 1024 * timeout)
silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
try:
for _ in range(max_chunks):
data = stream.read(1024, exception_on_overflow=False)
frames.append(data)
amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
if amplitude > SILENCE_THRESHOLD:
speaking_started = True
silent_chunks = 0
elif speaking_started:
silent_chunks += 1
if silent_chunks >= silence_chunks_needed:
break
except Exception as e:
log.exception("Ошибка при записи аудио (followup)")
print(f"⚠️ Ошибка записи: {e}")
finally:
stream.stop_stream()
audio.terminate()
if not speaking_started:
return ""
return transcribe(frames)

85
satellite/config.py Normal file
View File

@@ -0,0 +1,85 @@
import os
import sys
import logging
import requests as _requests
from dotenv import load_dotenv
from groq import Groq
load_dotenv()
# Логгер — ошибки в файл + короткое сообщение в консоль
LOG_FILE = os.getenv("LOG_FILE", "errors.log")
logging.basicConfig(
level=logging.WARNING,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
handlers=[
logging.FileHandler(LOG_FILE, encoding="utf-8"),
],
)
log = logging.getLogger("cosmo")
# OpenClaw Gateway — Cosmo (по умолчанию)
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://192.168.31.103:18789")
GATEWAY_TOKEN = os.getenv("GATEWAY_TOKEN")
AGENT = os.getenv("AGENT", "openclaw/main")
VOICE_MODEL = os.getenv("VOICE_MODEL", "openai/gpt-4o-mini")
# OpenClaw Gateway — Люся
LUSYA_GATEWAY_URL = os.getenv("LUSYA_GATEWAY_URL", "http://192.168.31.103:18790")
LUSYA_GATEWAY_TOKEN = os.getenv("LUSYA_GATEWAY_TOKEN", GATEWAY_TOKEN)
LUSYA_AGENT = os.getenv("LUSYA_AGENT", "openclaw/wife")
LUSYA_VOICE_MODEL = os.getenv("LUSYA_VOICE_MODEL", VOICE_MODEL)
# Keep-alive HTTP сессии — переиспользуют TCP/TLS соединения
def _make_session(token: str) -> _requests.Session:
s = _requests.Session()
s.headers.update({
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
})
return s
# Конфиги агентов по имени
AGENTS = {
"cosmo": {
"name": "Cosmo",
"gateway_url": GATEWAY_URL,
"token": GATEWAY_TOKEN,
"agent": AGENT,
"voice_model": VOICE_MODEL,
"tts_voice": os.getenv("COSMO_TTS_VOICE", ""),
"session": _make_session(GATEWAY_TOKEN),
},
"lusya": {
"name": "Люся",
"gateway_url": LUSYA_GATEWAY_URL,
"token": LUSYA_GATEWAY_TOKEN,
"agent": LUSYA_AGENT,
"voice_model": LUSYA_VOICE_MODEL,
"tts_voice": os.getenv("LUSYA_TTS_VOICE", ""),
"session": _make_session(LUSYA_GATEWAY_TOKEN),
},
}
# STT
STT_PROVIDER = os.getenv("STT_PROVIDER", "groq")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
WHISPER_LANG = os.getenv("WHISPER_LANGUAGE", "ru")
# Audio (на Pi: PulseAudio BT sink)
AUDIO_SINK = os.getenv("AUDIO_SINK", "")
# VAD
SILENCE_THRESHOLD = int(os.getenv("SILENCE_THRESHOLD", "500"))
SILENCE_DURATION = float(os.getenv("SILENCE_DURATION", "1.5"))
MAX_DURATION = int(os.getenv("MAX_DURATION", "15"))
FOLLOWUP_TIMEOUT = float(os.getenv("FOLLOWUP_TIMEOUT", "8"))
# Groq client
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
if not GATEWAY_TOKEN:
print("❌ GATEWAY_TOKEN не задан в .env")
sys.exit(1)

141
satellite/llm.py Normal file
View File

@@ -0,0 +1,141 @@
import json
import re
import requests
from datetime import date
from .config import GATEWAY_URL, VOICE_MODEL, AGENT, AGENTS, log
from .text import clean_for_speech, find_sentence_end
from .tts import speak
SYSTEM_PROMPT = "Отвечай кратко, 1-2 предложения, без markdown, без эмодзи."
MAX_HISTORY = int(__import__("os").getenv("MAX_HISTORY", "20"))
RESET_PATTERNS = re.compile(
r"(начни|начать|создай|открой|давай).{0,10}(новую|новый|чистую|чистый).{0,10}(сессию|сессия|диалог|разговор|чат)"
r"|"
r"(сбрось|очисти|обнови).{0,10}(сессию|диалог|разговор|чат|историю|контекст)",
re.IGNORECASE,
)
class Conversation:
"""Хранит историю сообщений — одна сессия на день"""
def __init__(self, agent_id: str = "cosmo"):
self.agent_id = agent_id
self.created_date = date.today()
self.messages = [{"role": "system", "content": SYSTEM_PROMPT}]
def is_expired(self) -> bool:
return date.today() != self.created_date
def reset(self):
self.created_date = date.today()
self.messages = [{"role": "system", "content": SYSTEM_PROMPT}]
def add_user(self, text: str):
self.messages.append({"role": "user", "content": text})
self._trim()
def add_assistant(self, text: str):
self.messages.append({"role": "assistant", "content": text})
self._trim()
def _trim(self):
if len(self.messages) > MAX_HISTORY + 1:
self.messages = [self.messages[0]] + self.messages[-(MAX_HISTORY):]
def is_reset_command(text: str) -> bool:
return bool(RESET_PATTERNS.search(text))
def ask_agent_stream(text: str, conv: "Conversation | None" = None, agent_id: str = "cosmo") -> str:
if conv is None:
conv = Conversation(agent_id)
conv.add_user(text)
cfg = AGENTS.get(agent_id, AGENTS["cosmo"])
gateway_url = cfg["gateway_url"]
session = cfg["session"]
agent = cfg["agent"]
try:
resp = session.post(
f"{gateway_url}/v1/chat/completions",
headers={"x-openclaw-model": cfg["voice_model"]},
json={
"model": agent,
"stream": True,
"messages": conv.messages,
"max_tokens": 150,
},
stream=True,
timeout=60,
)
resp.raise_for_status()
except requests.ConnectionError:
log.exception("Gateway недоступен")
msg = "Не могу связаться с сервером, попробуй ещё раз."
print(f"⚠️ {msg}")
speak(msg, agent_id)
return msg
except requests.Timeout:
log.exception("Gateway таймаут")
msg = "Сервер не ответил вовремя, попробуй ещё раз."
print(f"⚠️ {msg}")
speak(msg, agent_id)
return msg
except requests.HTTPError:
log.exception(f"Gateway HTTP ошибка {resp.status_code}")
msg = "Ошибка сервера, попробуй ещё раз."
print(f"⚠️ Gateway {resp.status_code}: {resp.text}")
speak(msg, agent_id)
return msg
full_text = ""
buffer = ""
try:
for line in resp.iter_lines():
if not line or line == b"data: [DONE]":
continue
if line.startswith(b"data: "):
try:
chunk = json.loads(line[6:])
delta = chunk["choices"][0]["delta"].get("content", "")
if not delta:
continue
full_text += delta
buffer += delta
last_punct = find_sentence_end(buffer, min_len=60)
if last_punct > -1:
sentence = clean_for_speech(buffer[:last_punct + 1])
if sentence.strip():
print(f"🔊 Говорю: {sentence}")
speak(sentence, agent_id)
buffer = buffer[last_punct + 1:].lstrip()
except (json.JSONDecodeError, KeyError, IndexError):
continue
except Exception as e:
log.exception("Ошибка при чтении стрима")
print(f"⚠️ Стрим прервался: {e}")
# Остаток
if buffer.strip():
sentence = clean_for_speech(buffer)
if sentence:
speak(sentence, agent_id)
if not full_text:
msg = "Не получил ответ, попробуй ещё раз."
speak(msg, agent_id)
return msg
result = clean_for_speech(full_text)
conv.add_assistant(full_text)
return result

170
satellite/modes.py Normal file
View File

@@ -0,0 +1,170 @@
import os
import sys
from .config import GATEWAY_URL, AGENT, FOLLOWUP_TIMEOUT, log
from .audio import record, record_with_timeout
from .tts import play_activation_sound, speak, stop_speaking
from .llm import ask_agent_stream, Conversation, is_reset_command
# Персистентные сессии — одна на день для каждого агента
_sessions: dict[str, Conversation] = {}
def _get_session(agent_id: str) -> Conversation:
"""Возвращает текущую сессию, создаёт новую если день сменился"""
conv = _sessions.get(agent_id)
if conv is None or conv.is_expired():
conv = Conversation(agent_id=agent_id)
_sessions[agent_id] = conv
print(f"🆕 Новая сессия для {agent_id}")
return conv
def _handle_reset(text: str, agent_id: str) -> bool:
"""Проверяет команду сброса. Возвращает True если сброс произошёл."""
if is_reset_command(text):
_sessions[agent_id] = Conversation(agent_id=agent_id)
msg = "Начинаю новую сессию."
print(f"🔄 {msg}")
speak(msg, agent_id)
return True
return False
def run_with_enter():
print("\n🦞 Cosmo Satellite запущен (режим: Enter для активации)")
print(f" Gateway : {GATEWAY_URL}")
print(f" Агент : {AGENT}")
print("\nНажми Enter → говори → получи ответ. Ctrl+C для выхода.\n")
while True:
try:
input("⏎ Нажми Enter и говори...")
stop_speaking() # barge-in: прервать если ещё говорит
play_activation_sound()
conv = _get_session("cosmo")
while True:
text = record()
if not text:
print("⚠️ Ничего не распознано")
break
print(f"📝 Ты: {text}")
if _handle_reset(text, "cosmo"):
conv = _get_session("cosmo")
break
response = ask_agent_stream(text, conv=conv)
print(f"🤖 Cosmo: {response}\n")
print(f"👂 Слушаю продолжение ({int(FOLLOWUP_TIMEOUT)} сек)...")
followup = record_with_timeout(timeout=FOLLOWUP_TIMEOUT)
if not followup:
print("😴 Нет продолжения, жду активации...\n")
break
text = followup
except KeyboardInterrupt:
print("\n👋 Выход")
break
except Exception as e:
log.exception("Непредвиденная ошибка в цикле Enter")
print(f"⚠️ Ошибка: {e} — продолжаю работу...\n")
def run_with_porcupine():
"""Режим продакшн — два wake word через Porcupine (для Pi)"""
import pvporcupine
import struct
from .config import AGENTS
porcupine_key = os.getenv("PORCUPINE_KEY")
wake_word_cosmo = os.getenv("WAKE_WORD_COSMO")
wake_word_lusya = os.getenv("WAKE_WORD_LUSYA")
if not porcupine_key:
print("❌ PORCUPINE_KEY не задан в .env")
sys.exit(1)
keyword_paths = []
wake_word_map = []
if wake_word_cosmo:
keyword_paths.append(wake_word_cosmo)
wake_word_map.append("cosmo")
if wake_word_lusya:
keyword_paths.append(wake_word_lusya)
wake_word_map.append("lusya")
if not keyword_paths:
print("❌ WAKE_WORD_COSMO или WAKE_WORD_LUSYA не заданы в .env")
sys.exit(1)
import pyaudio
porcupine = pvporcupine.create(
access_key=porcupine_key,
keyword_paths=keyword_paths,
)
audio = pyaudio.PyAudio()
stream = audio.open(
rate=porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
frames_per_buffer=porcupine.frame_length,
)
print("\n🦞 Cosmo Satellite запущен (режим: wake word)")
for agent_id in wake_word_map:
cfg = AGENTS[agent_id]
print(f" {cfg['name']:6s} : {cfg['gateway_url']}{cfg['agent']}")
print(f"\nСкажи 'Космо' или 'Люся'...\n")
try:
while True:
try:
pcm = stream.read(porcupine.frame_length)
pcm = struct.unpack_from("h" * porcupine.frame_length, pcm)
keyword_index = porcupine.process(pcm)
if keyword_index >= 0:
agent_id = wake_word_map[keyword_index]
agent_name = AGENTS[agent_id]["name"]
stop_speaking() # barge-in: прервать если ещё говорит
print(f"✅ Услышал '{agent_name}'!")
play_activation_sound()
conv = _get_session(agent_id)
text = record()
if not text:
continue
print(f"📝 Ты → {agent_name}: {text}")
if _handle_reset(text, agent_id):
continue
response = ask_agent_stream(text, conv=conv, agent_id=agent_id)
print(f"🤖 {agent_name}: {response}\n")
except KeyboardInterrupt:
raise
except Exception as e:
log.exception("Непредвиденная ошибка в цикле Porcupine")
print(f"⚠️ Ошибка: {e} — продолжаю слушать...\n")
except KeyboardInterrupt:
print("\n👋 Выход")
finally:
stream.stop_stream()
audio.terminate()
porcupine.delete()

55
satellite/stt.py Normal file
View File

@@ -0,0 +1,55 @@
import io
import wave
from .config import groq_client, STT_PROVIDER, WHISPER_MODEL, WHISPER_LANG, log
def transcribe_groq_bytes(wav_bytes: bytes) -> str:
"""Отправляет WAV байты в Groq без записи на диск"""
buf = io.BytesIO(wav_bytes)
buf.name = "audio.wav"
result = groq_client.audio.transcriptions.create(
file=buf,
model="whisper-large-v3-turbo",
language="ru",
)
return result.text
def frames_to_wav(frames: list[bytes]) -> bytes:
"""Конвертирует сырые PCM фреймы в WAV в памяти"""
buf = io.BytesIO()
wf = wave.open(buf, "wb")
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(16000)
wf.writeframes(b"".join(frames))
wf.close()
return buf.getvalue()
def transcribe(frames: list[bytes]) -> str:
"""Транскрибирует аудио фреймы — всё в памяти, без диска"""
try:
wav_bytes = frames_to_wav(frames)
if STT_PROVIDER == "groq":
return transcribe_groq_bytes(wav_bytes)
# Whisper fallback — нужен файл на диске
import tempfile
import os
from faster_whisper import WhisperModel
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
f.write(wav_bytes)
tmp_path = f.name
try:
model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
segments, _ = model.transcribe(tmp_path, language=WHISPER_LANG)
return " ".join(s.text for s in segments).strip()
finally:
os.unlink(tmp_path)
except Exception as e:
log.exception("STT ошибка")
print(f"⚠️ Ошибка распознавания речи: {e}")
return ""

67
satellite/text.py Normal file
View File

@@ -0,0 +1,67 @@
import re
def clean_for_speech(text: str) -> str:
text = re.sub(r'\*+', '', text) # убрать **жирный**
text = re.sub(r'#+\s', '', text) # убрать ## заголовки
text = re.sub(r'- ', '', text) # убрать тире списков
text = re.sub(r'\[.*?\]\(.*?\)', '', text) # убрать ссылки
text = re.sub(r'\n+', '. ', text) # переносы → точки
text = re.sub(r'\s+', ' ', text) # лишние пробелы
text = re.sub(r'(\d+)\.(\s)', r'\1\2', text)
return text.strip()
def find_sentence_end(text: str, min_len: int = 60) -> int:
"""Ищет конец предложения, игнорируя ложные точки"""
if len(text) < min_len:
return -1
for match in re.finditer(r'[.!?]', text):
pos = match.start()
if pos < min_len:
continue
before_1 = text[max(0, pos-1):pos] # 1 символ до
before_3 = text[max(0, pos-3):pos] # 3 символа до
after_2 = text[pos+1:pos+3] # 2 символа после
after_stripped = after_2.lstrip()
# 1. Цифра.Цифра → "0.76", "3.14"
if before_1.isdigit() and after_2[:1].isdigit():
continue
# 2. Цифра. Цифра → "1. 2 ГБ"
if before_1.isdigit() and after_stripped[:1].isdigit():
continue
# 3. Аббревиатуры → "ГБ.", "МБ.", "км.", "шт.", "руб.", "млн.", "млрд."
abbrevs = ["гб", "мб", "кб", "тб", "км", "см", "мм", "шт",
"руб", "млн", "млрд", "тыс", "кг", "гр", "мл",
"gb", "mb", "kb", "tb", "km", "ms", "kb"]
if any(before_3.lower().endswith(a) for a in abbrevs):
continue
# 4. Одиночная заглавная буква → "А.", "В.", "США." (инициалы/аббр.)
if len(before_3.strip()) == 1 and before_3.strip().isupper():
continue
# 5. После точки строчная буква → "load avg. нормально"
if after_stripped and after_stripped[0].islower():
continue
# 6. Многоточие → "..."
if text[pos:pos+3] == "...":
continue
# 7. Точка внутри URL или IP → "192.168.1.1", "example.com"
if before_1.isdigit() or (after_2[:1].isdigit() and "." in before_3):
continue
# 8. Процент с точкой → "95.5%"
if "%" in after_2[:2]:
continue
return pos
return -1

110
satellite/tts.py Normal file
View File

@@ -0,0 +1,110 @@
import os
import sys
import subprocess
import threading
from .config import AUDIO_SINK, AGENTS, log
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
_elevenlabs_client = None
_current_process: subprocess.Popen | None = None
_process_lock = threading.Lock()
def _get_elevenlabs():
global _elevenlabs_client
if _elevenlabs_client is None:
from elevenlabs.client import ElevenLabs
_elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
return _elevenlabs_client
def stop_speaking():
"""Прерывает текущее воспроизведение (barge-in)"""
global _current_process
with _process_lock:
if _current_process and _current_process.poll() is None:
_current_process.terminate()
try:
_current_process.wait(timeout=1)
except subprocess.TimeoutExpired:
_current_process.kill()
_current_process = None
def is_speaking() -> bool:
with _process_lock:
return _current_process is not None and _current_process.poll() is None
def _mpv_cmd() -> list[str]:
"""Команда mpv для воспроизведения из stdin"""
cmd = ["mpv", "--no-video", "--really-quiet", "--no-terminal"]
if AUDIO_SINK:
cmd.append(f"--audio-device=pulse/{AUDIO_SINK}")
cmd.append("-")
return cmd
def speak(text: str, agent_id: str = "cosmo"):
try:
_speak_elevenlabs(text, agent_id)
except Exception as e:
log.exception("TTS ошибка")
print(f"⚠️ Ошибка воспроизведения: {e}")
def _speak_elevenlabs(text: str, agent_id: str):
global _current_process
client = _get_elevenlabs()
voice_id = AGENTS.get(agent_id, AGENTS["cosmo"]).get("tts_voice", "")
if not voice_id:
log.error(f"tts_voice не задан для {agent_id}")
print(f"⚠️ tts_voice не задан для {agent_id}")
return
audio_stream = client.text_to_speech.convert(
text=text,
voice_id=voice_id,
model_id=ELEVENLABS_MODEL,
output_format="mp3_44100_128",
)
with _process_lock:
_current_process = subprocess.Popen(
_mpv_cmd(), stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
)
proc = _current_process
try:
for chunk in audio_stream:
if proc.poll() is not None:
break
try:
proc.stdin.write(chunk)
except BrokenPipeError:
break
proc.stdin.close()
proc.wait()
except Exception:
proc.kill()
finally:
with _process_lock:
if _current_process is proc:
_current_process = None
def play_activation_sound():
"""Звук активации после wake word"""
try:
if sys.platform == "darwin":
subprocess.run(["afplay", "/System/Library/Sounds/Glass.aiff"])
else:
subprocess.run(["paplay", "/usr/share/sounds/freedesktop/stereo/bell.oga"])
except Exception as e:
log.exception("Ошибка звука активации")
print(f"⚠️ Ошибка звука: {e}")