refactor: VAD upgrade, retry, dead code cleanup, AGENT removal
- audio: switch VAD to webrtcvad with RMS gate + fallback to RMS
- audio: honor FOLLOWUP_TIMEOUT — short silence wait after bot response
- llm: retry with exponential backoff on network errors and 5xx
- llm: VOICE_MAX_TOKENS env (default 300) instead of hardcoded 150
- tts: optional VAD-based barge-in (BARGE_IN_ENABLED, off by default)
- tts: remove dead start_barge_in_listener / was_barge_in helpers
- config: drop AGENT/LUSYA_AGENT — routing happens via session_key
- modes: remove unused imports, pass FOLLOWUP_TIMEOUT to follow-up record()
- docs: full rewrite of README and CLAUDE.md to match current architecture
This commit is contained in:
@@ -2,22 +2,59 @@ import os
|
||||
import pyaudio
|
||||
import numpy as np
|
||||
|
||||
from .config import SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION, log
|
||||
from .config import (
|
||||
SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION,
|
||||
FOLLOWUP_TIMEOUT, VAD_AGGRESSIVENESS, log,
|
||||
)
|
||||
from .stt import transcribe
|
||||
|
||||
ECHO_WARMUP = float(os.getenv("ECHO_WARMUP", "0.5")) # сек пропуска в начале — гасит эхо от TTS
|
||||
|
||||
try:
|
||||
import webrtcvad
|
||||
_vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
|
||||
_VAD_OK = True
|
||||
except Exception as e:
|
||||
log.warning(f"webrtcvad недоступен, fallback на RMS: {e}")
|
||||
_vad = None
|
||||
_VAD_OK = False
|
||||
|
||||
# webrtcvad требует фрейм 10/20/30 мс при 8/16/32/48 кГц
|
||||
SAMPLE_RATE = 16000
|
||||
FRAME_MS = 30
|
||||
FRAME_SAMPLES = int(SAMPLE_RATE * FRAME_MS / 1000) # 480
|
||||
FRAME_BYTES = FRAME_SAMPLES * 2 # int16
|
||||
|
||||
|
||||
def _is_speech(frame: bytes) -> bool:
|
||||
"""Единое решение по VAD: webrtcvad + RMS-гейт, чтобы не ловить шёпот и эхо."""
|
||||
amplitude = float(np.abs(np.frombuffer(frame, dtype=np.int16)).mean())
|
||||
if amplitude < SILENCE_THRESHOLD:
|
||||
return False
|
||||
if _VAD_OK:
|
||||
try:
|
||||
return _vad.is_speech(frame, SAMPLE_RATE)
|
||||
except Exception:
|
||||
pass
|
||||
return True # RMS уже прошёл — считаем речью
|
||||
|
||||
|
||||
def record(initial_silence_timeout: float | None = None) -> str:
|
||||
"""Запись до тишины + STT.
|
||||
initial_silence_timeout — через сколько секунд выйти если пользователь вообще не начал говорить.
|
||||
По умолчанию FOLLOWUP_TIMEOUT (короткое ожидание после ответа бота).
|
||||
"""
|
||||
if initial_silence_timeout is None:
|
||||
initial_silence_timeout = FOLLOWUP_TIMEOUT
|
||||
|
||||
def record() -> str:
|
||||
"""Запись до тишины (VAD) + STT. Игнорирует ECHO_WARMUP в начале."""
|
||||
try:
|
||||
audio = pyaudio.PyAudio()
|
||||
stream = audio.open(
|
||||
format=pyaudio.paInt16,
|
||||
channels=1,
|
||||
rate=16000,
|
||||
rate=SAMPLE_RATE,
|
||||
input=True,
|
||||
frames_per_buffer=1024,
|
||||
frames_per_buffer=FRAME_SAMPLES,
|
||||
)
|
||||
except Exception as e:
|
||||
log.exception("Не удалось открыть микрофон")
|
||||
@@ -25,30 +62,38 @@ def record() -> str:
|
||||
return ""
|
||||
|
||||
print("🎙️ Говори...")
|
||||
frames = []
|
||||
silent_chunks = 0
|
||||
frames: list[bytes] = []
|
||||
speaking_started = False
|
||||
max_chunks = int(16000 / 1024 * MAX_DURATION)
|
||||
silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
|
||||
warmup_chunks = int(16000 / 1024 * ECHO_WARMUP)
|
||||
trailing_silence = 0 # фреймы тишины после начала речи
|
||||
initial_silence = 0 # фреймы тишины до начала речи
|
||||
|
||||
max_frames = int(MAX_DURATION * 1000 / FRAME_MS)
|
||||
warmup_frames = int(ECHO_WARMUP * 1000 / FRAME_MS)
|
||||
silence_frames_needed = int(SILENCE_DURATION * 1000 / FRAME_MS)
|
||||
initial_silence_limit = int(initial_silence_timeout * 1000 / FRAME_MS)
|
||||
|
||||
try:
|
||||
for i in range(max_chunks):
|
||||
data = stream.read(1024, exception_on_overflow=False)
|
||||
if i < warmup_chunks:
|
||||
continue # гасим эхо от TTS / звука активации
|
||||
for i in range(max_frames):
|
||||
data = stream.read(FRAME_SAMPLES, exception_on_overflow=False)
|
||||
if i < warmup_frames:
|
||||
continue
|
||||
frames.append(data)
|
||||
|
||||
amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
|
||||
|
||||
if amplitude > SILENCE_THRESHOLD:
|
||||
if _is_speech(data):
|
||||
speaking_started = True
|
||||
silent_chunks = 0
|
||||
elif speaking_started:
|
||||
silent_chunks += 1
|
||||
if silent_chunks >= silence_chunks_needed:
|
||||
print("🔇 Конец речи")
|
||||
break
|
||||
trailing_silence = 0
|
||||
else:
|
||||
if speaking_started:
|
||||
trailing_silence += 1
|
||||
if trailing_silence >= silence_frames_needed:
|
||||
print("🔇 Конец речи")
|
||||
break
|
||||
else:
|
||||
initial_silence += 1
|
||||
if initial_silence >= initial_silence_limit:
|
||||
print("😴 Пользователь молчит, выхожу")
|
||||
speaking_started = False
|
||||
break
|
||||
except Exception as e:
|
||||
log.exception("Ошибка при записи аудио")
|
||||
print(f"⚠️ Ошибка записи: {e}")
|
||||
|
||||
@@ -20,15 +20,14 @@ logging.basicConfig(
|
||||
log = logging.getLogger("cosmo")
|
||||
|
||||
# OpenClaw Gateway — Cosmo (по умолчанию)
|
||||
# Роутинг к нужному агенту делается через x-openclaw-session-key, поэтому AGENT не нужен.
|
||||
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://192.168.31.103:18789")
|
||||
GATEWAY_TOKEN = os.getenv("GATEWAY_TOKEN")
|
||||
AGENT = os.getenv("AGENT", "openclaw/main")
|
||||
VOICE_MODEL = os.getenv("VOICE_MODEL", "openai/gpt-4o-mini")
|
||||
|
||||
# OpenClaw Gateway — Люся
|
||||
LUSYA_GATEWAY_URL = os.getenv("LUSYA_GATEWAY_URL", "http://192.168.31.103:18790")
|
||||
LUSYA_GATEWAY_TOKEN = os.getenv("LUSYA_GATEWAY_TOKEN", GATEWAY_TOKEN)
|
||||
LUSYA_AGENT = os.getenv("LUSYA_AGENT", "openclaw/wife")
|
||||
LUSYA_VOICE_MODEL = os.getenv("LUSYA_VOICE_MODEL", VOICE_MODEL)
|
||||
|
||||
# Keep-alive HTTP сессии — переиспользуют TCP/TLS соединения
|
||||
@@ -46,20 +45,16 @@ AGENTS = {
|
||||
"cosmo": {
|
||||
"name": "Cosmo",
|
||||
"gateway_url": GATEWAY_URL,
|
||||
"token": GATEWAY_TOKEN,
|
||||
"agent": AGENT,
|
||||
"voice_model": VOICE_MODEL,
|
||||
"session_key": os.getenv("COSMO_SESSION_KEY", "voice:home:cosmo"),
|
||||
"session_key": os.getenv("COSMO_SESSION_KEY", "agent:main:voice:home"),
|
||||
"tts_voice": os.getenv("COSMO_TTS_VOICE", ""),
|
||||
"session": _make_session(GATEWAY_TOKEN),
|
||||
},
|
||||
"lusya": {
|
||||
"name": "Люся",
|
||||
"gateway_url": LUSYA_GATEWAY_URL,
|
||||
"token": LUSYA_GATEWAY_TOKEN,
|
||||
"agent": LUSYA_AGENT,
|
||||
"voice_model": LUSYA_VOICE_MODEL,
|
||||
"session_key": os.getenv("LUSYA_SESSION_KEY", "voice:home:lusya"),
|
||||
"session_key": os.getenv("LUSYA_SESSION_KEY", "agent:wife:voice:home"),
|
||||
"tts_voice": os.getenv("LUSYA_TTS_VOICE", ""),
|
||||
"session": _make_session(LUSYA_GATEWAY_TOKEN),
|
||||
},
|
||||
@@ -73,6 +68,18 @@ SILENCE_THRESHOLD = int(os.getenv("SILENCE_THRESHOLD", "500"))
|
||||
SILENCE_DURATION = float(os.getenv("SILENCE_DURATION", "1.5"))
|
||||
MAX_DURATION = int(os.getenv("MAX_DURATION", "15"))
|
||||
FOLLOWUP_TIMEOUT = float(os.getenv("FOLLOWUP_TIMEOUT", "8"))
|
||||
VAD_AGGRESSIVENESS = int(os.getenv("VAD_AGGRESSIVENESS", "2")) # webrtcvad 0..3
|
||||
|
||||
# LLM
|
||||
VOICE_MAX_TOKENS = int(os.getenv("VOICE_MAX_TOKENS", "300"))
|
||||
LLM_RETRIES = int(os.getenv("LLM_RETRIES", "3"))
|
||||
|
||||
# Barge-in (прерывание TTS голосом)
|
||||
# Работает только при разнесённых колонке/мике или в наушниках — иначе эхо собственного TTS
|
||||
# будет триггерить прерывание. По умолчанию выключен.
|
||||
BARGE_IN_ENABLED = os.getenv("BARGE_IN_ENABLED", "false").lower() in ("1", "true", "yes")
|
||||
BARGE_IN_THRESHOLD = int(os.getenv("BARGE_IN_THRESHOLD", "1500")) # RMS, обычно > SILENCE_THRESHOLD
|
||||
BARGE_IN_WARMUP = float(os.getenv("BARGE_IN_WARMUP", "0.8")) # сек пропуска в начале TTS
|
||||
|
||||
# Groq client
|
||||
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
||||
|
||||
117
satellite/llm.py
117
satellite/llm.py
@@ -1,13 +1,13 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import requests
|
||||
|
||||
from .config import AGENTS, log
|
||||
from .config import AGENTS, VOICE_MAX_TOKENS, LLM_RETRIES, log
|
||||
from .text import clean_for_speech, find_sentence_end
|
||||
from .tts import speak, play_error_sound
|
||||
|
||||
# Ключ голосовой сессии — Cosmo работает как полноценный агент
|
||||
VOICE_SESSION_KEY = os.getenv("VOICE_SESSION_KEY", "agent:main:voice:home")
|
||||
|
||||
# "stream" — режем по предложениям (быстро, но рваная интонация)
|
||||
@@ -26,67 +26,86 @@ FILLER_PATTERNS = re.compile(
|
||||
r'(?:(?:сейчас посмотрю|дай мне секунду|дай секунду|проверяю|загружаю|узнаю'
|
||||
r'|смотрю|одну секунду|я сейчас посмотрю|я проверю|попробую другой источник'
|
||||
r'|нужны конкретные числа|дай мне загрузить)[^.!?]*[.!?]?\s*)+',
|
||||
re.IGNORECASE
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def strip_fillers(text: str) -> str:
    """Remove every FILLER_PATTERNS match from text and trim the result."""
    without_fillers = FILLER_PATTERNS.sub('', text)
    return without_fillers.strip()
|
||||
|
||||
|
||||
|
||||
|
||||
def is_reset_command(text: str) -> bool:
    """Report whether RESET_PATTERNS finds a match anywhere in text."""
    found = RESET_PATTERNS.search(text)
    return bool(found)
|
||||
|
||||
|
||||
def ask_agent_stream(text: str, conv=None, agent_id: str = "cosmo") -> str:
|
||||
def _post_with_retry(session, url, headers, payload):
    """POST a streaming request, retrying transient failures with exponential backoff.

    Connection errors, timeouts, 5xx responses and 408/429 are retried with
    delays of 0.5 s, 1 s, 2 s, ... Any other 4xx is re-raised immediately.

    Args:
        session: object with a requests-style ``post`` method (presumably a
            ``requests.Session`` — confirm against the caller).
        url: full endpoint URL.
        headers: extra request headers.
        payload: request body, sent via ``json=``.

    Returns:
        The response of the first successful attempt, opened with ``stream=True``.

    Raises:
        requests.ConnectionError, requests.Timeout, requests.HTTPError: the
            error of the last attempt, or a non-retryable 4xx straight away.
    """
    # LLM_RETRIES comes from the environment. With 0 or a negative value the
    # loop would never run and the function used to fall through to
    # `raise None` (a TypeError), so always make at least one attempt.
    attempts = max(1, LLM_RETRIES)
    for attempt in range(attempts):
        try:
            resp = session.post(url, headers=headers, json=payload, stream=True, timeout=60)
            if resp.status_code >= 500:
                raise requests.HTTPError(f"{resp.status_code} {resp.text[:200]}", response=resp)
            resp.raise_for_status()
            return resp
        except (requests.ConnectionError, requests.Timeout, requests.HTTPError) as e:
            failed = getattr(e, "response", None)
            # 4xx other than 408/429 will not get better on retry.
            if isinstance(e, requests.HTTPError) and failed is not None:
                if failed.status_code < 500 and failed.status_code not in (408, 429):
                    raise
            if attempt == attempts - 1:
                raise
            # Only close on the retry path: when the error is re-raised the
            # caller may still want to read the response body.
            if failed is not None:
                failed.close()
            delay = 0.5 * (2 ** attempt)
            log.warning(f"Gateway retry {attempt + 1}/{LLM_RETRIES} через {delay:.1f}s: {e}")
            time.sleep(delay)
    # Every iteration either returns or raises, and there is at least one.
    raise RuntimeError("unreachable")
|
||||
|
||||
|
||||
def ask_agent_stream(text: str, agent_id: str = "cosmo") -> str:
|
||||
"""Отправляет запрос к OpenClaw gateway и озвучивает ответ."""
|
||||
def _maybe_speak(t: str):
|
||||
if t.strip():
|
||||
speak(t, agent_id)
|
||||
|
||||
cfg = AGENTS.get(agent_id, AGENTS["cosmo"])
|
||||
gateway_url = cfg["gateway_url"]
|
||||
session = cfg["session"]
|
||||
agent = cfg["agent"]
|
||||
|
||||
session_key = cfg.get("session_key", VOICE_SESSION_KEY)
|
||||
|
||||
payload = {
|
||||
"stream": True,
|
||||
"messages": [{"role": "user", "content": text}],
|
||||
"max_tokens": VOICE_MAX_TOKENS,
|
||||
}
|
||||
headers = {
|
||||
"x-ocplatform-model": cfg["voice_model"],
|
||||
"x-openclaw-session-key": session_key,
|
||||
}
|
||||
|
||||
try:
|
||||
resp = session.post(
|
||||
f"{gateway_url}/v1/chat/completions",
|
||||
headers={
|
||||
"x-ocplatform-model": cfg["voice_model"],
|
||||
"x-openclaw-session-key": session_key,
|
||||
},
|
||||
json={
|
||||
"model": agent,
|
||||
"stream": True,
|
||||
"messages": [{"role": "user", "content": text}],
|
||||
"max_tokens": 150,
|
||||
},
|
||||
stream=True,
|
||||
timeout=60,
|
||||
resp = _post_with_retry(
|
||||
cfg["session"], f"{cfg['gateway_url']}/v1/chat/completions", headers, payload,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except requests.ConnectionError:
|
||||
log.exception("Gateway недоступен")
|
||||
log.exception("Gateway недоступен после retry")
|
||||
msg = "Не могу связаться с сервером, попробуй ещё раз."
|
||||
print(f"⚠️ {msg}")
|
||||
play_error_sound()
|
||||
_maybe_speak(msg)
|
||||
return msg
|
||||
except requests.Timeout:
|
||||
log.exception("Gateway таймаут")
|
||||
log.exception("Gateway таймаут после retry")
|
||||
msg = "Сервер не ответил вовремя, попробуй ещё раз."
|
||||
print(f"⚠️ {msg}")
|
||||
play_error_sound()
|
||||
_maybe_speak(msg)
|
||||
return msg
|
||||
except requests.HTTPError:
|
||||
log.exception(f"Gateway HTTP ошибка {resp.status_code}")
|
||||
except requests.HTTPError as e:
|
||||
status = e.response.status_code if e.response is not None else "?"
|
||||
body = e.response.text if e.response is not None else ""
|
||||
log.exception(f"Gateway HTTP {status}")
|
||||
msg = "Ошибка сервера, попробуй ещё раз."
|
||||
print(f"⚠️ Gateway {resp.status_code}: {resp.text}")
|
||||
print(f"⚠️ Gateway {status}: {body[:200]}")
|
||||
play_error_sound()
|
||||
_maybe_speak(msg)
|
||||
return msg
|
||||
@@ -98,25 +117,25 @@ def ask_agent_stream(text: str, conv=None, agent_id: str = "cosmo") -> str:
|
||||
for line in resp.iter_lines():
|
||||
if not line or line == b"data: [DONE]":
|
||||
continue
|
||||
if line.startswith(b"data: "):
|
||||
try:
|
||||
chunk = json.loads(line[6:])
|
||||
delta = chunk["choices"][0]["delta"].get("content", "")
|
||||
if not delta:
|
||||
continue
|
||||
|
||||
full_text += delta
|
||||
buffer += delta
|
||||
|
||||
if TTS_MODE == "stream":
|
||||
last_punct = find_sentence_end(buffer, min_len=120)
|
||||
if last_punct > -1:
|
||||
sentence = clean_for_speech(buffer[:last_punct + 1])
|
||||
_maybe_speak(sentence)
|
||||
buffer = buffer[last_punct + 1:].lstrip()
|
||||
|
||||
except (json.JSONDecodeError, KeyError, IndexError):
|
||||
if not line.startswith(b"data: "):
|
||||
continue
|
||||
try:
|
||||
chunk = json.loads(line[6:])
|
||||
delta = chunk["choices"][0]["delta"].get("content", "")
|
||||
if not delta:
|
||||
continue
|
||||
|
||||
full_text += delta
|
||||
buffer += delta
|
||||
|
||||
if TTS_MODE == "stream":
|
||||
last_punct = find_sentence_end(buffer, min_len=120)
|
||||
if last_punct > -1:
|
||||
sentence = clean_for_speech(strip_fillers(buffer[:last_punct + 1]))
|
||||
_maybe_speak(sentence)
|
||||
buffer = buffer[last_punct + 1:].lstrip()
|
||||
except (json.JSONDecodeError, KeyError, IndexError):
|
||||
continue
|
||||
except Exception as e:
|
||||
log.exception("Ошибка при чтении стрима")
|
||||
print(f"⚠️ Стрим прервался: {e}")
|
||||
@@ -132,6 +151,6 @@ def ask_agent_stream(text: str, conv=None, agent_id: str = "cosmo") -> str:
|
||||
_maybe_speak(result)
|
||||
else:
|
||||
if buffer.strip():
|
||||
_maybe_speak(clean_for_speech(buffer))
|
||||
_maybe_speak(clean_for_speech(strip_fillers(buffer)))
|
||||
|
||||
return result
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from .config import GATEWAY_URL, AGENT, AGENTS, log
|
||||
from .config import GATEWAY_URL, AGENTS, FOLLOWUP_TIMEOUT, MAX_DURATION, log
|
||||
from .audio import record
|
||||
from .tts import speak, stop_speaking, is_speaking, start_barge_in_listener, was_barge_in
|
||||
from .tts import speak, stop_speaking
|
||||
from .llm import ask_agent_stream, is_reset_command, VOICE_SESSION_KEY
|
||||
|
||||
WAKE_THRESHOLD = float(os.getenv("WAKE_THRESHOLD", "0.5"))
|
||||
@@ -24,7 +23,6 @@ def _handle_reset(text: str, agent_id: str) -> bool:
|
||||
"x-openclaw-session-key": cfg.get("session_key", VOICE_SESSION_KEY),
|
||||
},
|
||||
json={
|
||||
"model": cfg["agent"],
|
||||
"stream": False,
|
||||
"messages": [{"role": "user", "content": "/new"}],
|
||||
},
|
||||
@@ -40,11 +38,15 @@ def _handle_reset(text: str, agent_id: str) -> bool:
|
||||
|
||||
|
||||
def _conversation_loop(agent_id: str, agent_name: str = "Cosmo"):
|
||||
"""Основной цикл диалога — слушает и отвечает пока пользователь говорит."""
|
||||
"""Основной цикл диалога.
|
||||
Первая запись — с большим таймаутом (MAX_DURATION), дальше — короткий FOLLOWUP_TIMEOUT."""
|
||||
first = True
|
||||
while True:
|
||||
text = record()
|
||||
timeout = MAX_DURATION if first else FOLLOWUP_TIMEOUT
|
||||
first = False
|
||||
text = record(initial_silence_timeout=timeout)
|
||||
if not text:
|
||||
print(f"😴 Тишина, жду активации...\n")
|
||||
print("😴 Тишина, жду активации...\n")
|
||||
return
|
||||
|
||||
print(f"📝 Ты → {agent_name}: {text}")
|
||||
@@ -59,7 +61,6 @@ def _conversation_loop(agent_id: str, agent_name: str = "Cosmo"):
|
||||
def run_with_enter():
|
||||
print("\n🦞 Cosmo Satellite запущен (режим: Enter для активации)")
|
||||
print(f" Gateway : {GATEWAY_URL}")
|
||||
print(f" Агент : {AGENT}")
|
||||
print("\nНажми Enter → говори → получи ответ. Ctrl+C для выхода.\n")
|
||||
|
||||
while True:
|
||||
@@ -97,7 +98,6 @@ def run_with_porcupine():
|
||||
input=True, frames_per_buffer=1280)
|
||||
|
||||
print("✅ Слушаю через OpenWakeWord...")
|
||||
# print("\nСкажи 'Космо' или 'Люся'...\n") # TODO: после подключения Люси
|
||||
|
||||
try:
|
||||
while True:
|
||||
@@ -110,12 +110,7 @@ def run_with_porcupine():
|
||||
print(f"PREDICTION cosmo: {cosmo_score:.3f}")
|
||||
|
||||
if cosmo_score > WAKE_THRESHOLD:
|
||||
if is_speaking():
|
||||
# Barge-in: прерываем TTS
|
||||
print("✋ Barge-in: прерываю ответ")
|
||||
stop_speaking()
|
||||
cosmo_model.reset()
|
||||
continue
|
||||
stop_speaking() # на случай если TTS ещё играет
|
||||
stream.stop_stream()
|
||||
_conversation_loop("cosmo", "Cosmo")
|
||||
cosmo_model.reset()
|
||||
@@ -124,10 +119,8 @@ def run_with_porcupine():
|
||||
|
||||
# TODO: Люся — раскомментировать когда модель готова
|
||||
# lusya_score = lusya_model.predict(pcm)["lusya"]
|
||||
# if lusya_score > 0.1:
|
||||
# print(f"PREDICTION lusya: {lusya_score:.3f}")
|
||||
# if lusya_score > 0.5:
|
||||
# print("✅ Услышала 'Люся'!")
|
||||
# if lusya_score > WAKE_THRESHOLD:
|
||||
# stop_speaking()
|
||||
# stream.stop_stream()
|
||||
# _conversation_loop("lusya", "Люся")
|
||||
# lusya_model.reset()
|
||||
|
||||
131
satellite/tts.py
131
satellite/tts.py
@@ -1,10 +1,12 @@
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import threading
|
||||
from elevenlabs import VoiceSettings
|
||||
|
||||
from .config import AUDIO_SINK, AGENTS, SILENCE_THRESHOLD, log
|
||||
from .config import (
|
||||
AUDIO_SINK, AGENTS, log,
|
||||
BARGE_IN_ENABLED, BARGE_IN_THRESHOLD, BARGE_IN_WARMUP,
|
||||
)
|
||||
|
||||
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
|
||||
ELEVENLABS_MODEL = os.getenv("ELEVENLABS_MODEL", "eleven_flash_v2_5")
|
||||
@@ -40,45 +42,7 @@ def is_speaking() -> bool:
|
||||
return _current_process is not None and _current_process.poll() is None
|
||||
|
||||
|
||||
_barge_in_flag = threading.Event()
|
||||
|
||||
def start_barge_in_listener():
|
||||
"""Запускает фоновый поток VAD — если услышал голос во время TTS, ставит флаг barge-in."""
|
||||
_barge_in_flag.clear()
|
||||
|
||||
def _listen():
|
||||
import pyaudio
|
||||
import numpy as np
|
||||
try:
|
||||
audio = pyaudio.PyAudio()
|
||||
stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
|
||||
input=True, frames_per_buffer=1024)
|
||||
warmup = 8 # ~0.5s прогрев чтобы не словить эхо начала TTS
|
||||
i = 0
|
||||
while is_speaking():
|
||||
data = stream.read(1024, exception_on_overflow=False)
|
||||
i += 1
|
||||
if i < warmup:
|
||||
continue
|
||||
amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
|
||||
if amplitude > SILENCE_THRESHOLD * 1.5: # порог чуть выше чем для записи
|
||||
_barge_in_flag.set()
|
||||
stop_speaking()
|
||||
break
|
||||
stream.stop_stream()
|
||||
audio.terminate()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
t = threading.Thread(target=_listen, daemon=True)
|
||||
t.start()
|
||||
return t
|
||||
|
||||
def was_barge_in() -> bool:
|
||||
return _barge_in_flag.is_set()
|
||||
|
||||
def _mpv_cmd() -> list[str]:
|
||||
"""Команда mpv для воспроизведения из stdin"""
|
||||
mpv_bin = os.getenv("MPV_PATH", "mpv")
|
||||
cmd = [mpv_bin, "--no-video", "--really-quiet", "--no-terminal"]
|
||||
if AUDIO_SINK:
|
||||
@@ -87,13 +51,19 @@ def _mpv_cmd() -> list[str]:
|
||||
return cmd
|
||||
|
||||
|
||||
def speak(text: str, agent_id: str = "cosmo"):
|
||||
def speak(text: str, agent_id: str = "cosmo") -> bool:
    """Speak text aloud, optionally allowing the user to interrupt by voice.

    With BARGE_IN_ENABLED the microphone is monitored during playback and the
    speech may be cut short. Any exception is logged, reported on stdout and
    followed by the error sound.

    Returns:
        True if playback was interrupted by voice; False otherwise, including
        when playback failed.
    """
    interrupted = False
    try:
        if BARGE_IN_ENABLED:
            interrupted = _speak_with_barge_in(text, agent_id)
        else:
            _speak_elevenlabs(text, agent_id)
    except Exception as e:
        log.exception("TTS ошибка")
        print(f"⚠️ Ошибка воспроизведения: {e}")
        play_error_sound()
    return interrupted
|
||||
|
||||
|
||||
def _speak_elevenlabs(text: str, agent_id: str):
|
||||
@@ -107,11 +77,11 @@ def _speak_elevenlabs(text: str, agent_id: str):
|
||||
return
|
||||
|
||||
voice_settings = VoiceSettings(
|
||||
stability=0.4, # ниже = живее интонация (для multilingual_v2)
|
||||
stability=0.4,
|
||||
similarity_boost=0.8,
|
||||
style=0.1, # выше = эмоциональнее
|
||||
style=0.1,
|
||||
use_speaker_boost=True,
|
||||
speed=1.1
|
||||
speed=1.1,
|
||||
)
|
||||
|
||||
audio_stream = client.text_to_speech.convert(
|
||||
@@ -120,7 +90,7 @@ def _speak_elevenlabs(text: str, agent_id: str):
|
||||
model_id=ELEVENLABS_MODEL,
|
||||
output_format="mp3_22050_32",
|
||||
voice_settings=voice_settings,
|
||||
optimize_streaming_latency=3
|
||||
optimize_streaming_latency=3,
|
||||
)
|
||||
|
||||
with _process_lock:
|
||||
@@ -148,9 +118,74 @@ def _speak_elevenlabs(text: str, agent_id: str):
|
||||
_current_process = None
|
||||
|
||||
|
||||
def _speak_with_barge_in(text: str, agent_id: str) -> bool:
    """Play TTS on a background thread while listening to the microphone.

    The listener runs on the calling thread for as long as the playback thread
    is alive, then the playback thread is joined.

    Returns:
        True if the listener interrupted playback, False otherwise.
    """
    playback = threading.Thread(
        target=_speak_elevenlabs,
        args=(text, agent_id),
        daemon=True,
    )
    playback.start()
    was_interrupted = _listen_for_barge_in(playback.is_alive)
    playback.join()
    return was_interrupted
|
||||
|
||||
|
||||
def _listen_for_barge_in(still_alive) -> bool:
    """Watch the microphone for speech while still_alive() returns True.

    After a warm-up period (BARGE_IN_WARMUP seconds) a frame counts as speech
    when its mean absolute amplitude reaches BARGE_IN_THRESHOLD and, if
    webrtcvad is importable, the VAD agrees. Eight consecutive speech frames
    (about 240 ms) stop the TTS.

    Args:
        still_alive: zero-argument callable; listening continues while it
            returns a truthy value.

    Returns:
        True if TTS was interrupted, False otherwise (including when the
        microphone could not be opened or reading failed).
    """
    import pyaudio
    import numpy as np
    try:
        import webrtcvad
        vad = webrtcvad.Vad(3)  # most aggressive mode: fewer false hits on echo
    except Exception:
        vad = None

    sample_rate = 16000
    frame_ms = 30
    frame_samples = int(sample_rate * frame_ms / 1000)
    warmup_frames = int(BARGE_IN_WARMUP * 1000 / frame_ms)
    required_speech_frames = 8  # ~240 ms in a row

    audio = None
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=sample_rate,
                            input=True, frames_per_buffer=frame_samples)
    except Exception as e:
        log.warning(f"Barge-in: не открылся мик: {e}")
        # PyAudio() may have succeeded even though open() failed; release it
        # instead of leaking the instance.
        if audio is not None:
            try:
                audio.terminate()
            except Exception:
                log.debug("Barge-in: terminate after failed open raised", exc_info=True)
        return False

    interrupted = False
    speech_streak = 0
    i = 0
    try:
        while still_alive():
            data = stream.read(frame_samples, exception_on_overflow=False)
            i += 1
            if i < warmup_frames:
                continue
            amplitude = float(np.abs(np.frombuffer(data, dtype=np.int16)).mean())
            if amplitude < BARGE_IN_THRESHOLD:
                speech_streak = 0
                continue
            if vad is None or vad.is_speech(data, sample_rate):
                speech_streak += 1
                if speech_streak >= required_speech_frames:
                    print(f"✋ Barge-in: слышу речь ({amplitude:.0f}), прерываю TTS")
                    stop_speaking()
                    interrupted = True
                    break
            else:
                speech_streak = 0
    except Exception:
        log.exception("Barge-in ошибка")
    finally:
        # Run every cleanup step independently so that a failure in one
        # (e.g. stop_stream) cannot skip terminate().
        for cleanup in (stream.stop_stream, stream.close, audio.terminate):
            try:
                cleanup()
            except Exception:
                log.debug("Barge-in: cleanup step failed", exc_info=True)
    return interrupted
|
||||
|
||||
|
||||
def _play_sound_file(filename: str, wait: bool = False):
|
||||
"""Воспроизводит файл из папки sounds/ через mpv.
|
||||
wait=True — блокирует до конца воспроизведения."""
|
||||
sounds_dir = os.path.join(os.path.dirname(__file__), "..", "sounds")
|
||||
path = os.path.normpath(os.path.join(sounds_dir, filename))
|
||||
mpv_bin = os.getenv("MPV_PATH", "mpv")
|
||||
@@ -162,7 +197,6 @@ def _play_sound_file(filename: str, wait: bool = False):
|
||||
|
||||
|
||||
def play_activation_sound():
|
||||
"""Звук активации — неблокирующий"""
|
||||
try:
|
||||
_play_sound_file("Success_Cosmo.mp3", wait=False)
|
||||
except Exception as e:
|
||||
@@ -170,7 +204,6 @@ def play_activation_sound():
|
||||
|
||||
|
||||
def play_error_sound():
|
||||
"""Звук ошибки — 'не получилось'"""
|
||||
try:
|
||||
_play_sound_file("Error_Cosmo.mp3")
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user