Unify the dialogue flow into a single conversation loop and make the echo warm-up configurable

2026-04-12 21:58:40 +03:00
parent 128cc70ab9
commit 0a89bf5105
8 changed files with 111 additions and 101 deletions
--- a/satellite/audio.py
+++ b/satellite/audio.py
@@ -1,12 +1,15 @@
+import os
 import pyaudio
 import numpy as np

 from .config import SILENCE_THRESHOLD, SILENCE_DURATION, MAX_DURATION, log
 from .stt import transcribe

+ECHO_WARMUP = float(os.getenv("ECHO_WARMUP", "0.5"))  # seconds of audio skipped at the start of a recording to suppress TTS echo
+

 def record() -> str:
-    """Запись до тишины (VAD) + STT"""
+    """Запись до тишины (VAD) + STT. Игнорирует ECHO_WARMUP в начале."""
    try:
        audio = pyaudio.PyAudio()
        stream = audio.open(
@@ -27,13 +30,13 @@ def record() -> str:
    speaking_started = False
    max_chunks = int(16000 / 1024 * MAX_DURATION)
    silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
-    warmup_chunks = int(16000 / 1024 * 0.3)  # 0.3 сек — эхо звука активации
+    warmup_chunks = int(16000 / 1024 * ECHO_WARMUP)

    try:
        for i in range(max_chunks):
            data = stream.read(1024, exception_on_overflow=False)
            if i < warmup_chunks:
-                continue  # пропускаем эхо от звука активации
+                continue  # гасим эхо от TTS / звука активации
            frames.append(data)

            amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
@@ -56,52 +59,8 @@ def record() -> str:
    if not speaking_started:
        return ""

-    return transcribe(frames)
-
-
-def record_with_timeout(timeout: float = 8.0) -> str:
-    """Слушает timeout секунд, возвращает пусто если речи не было"""
-    try:
-        audio = pyaudio.PyAudio()
-        stream = audio.open(
-            format=pyaudio.paInt16,
-            channels=1,
-            rate=16000,
-            input=True,
-            frames_per_buffer=1024,
-        )
-    except Exception as e:
-        log.exception("Не удалось открыть микрофон (followup)")
-        print(f"⚠️  Ошибка микрофона: {e}")
+    text = transcribe(frames)
+    # отсекаем мусор от эха (одиночные знаки препинания, пробелы)
+    if not text or not text.strip() or len(text.strip()) < 2:
        return ""
-
-    frames = []
-    silent_chunks = 0
-    speaking_started = False
-    max_chunks = int(16000 / 1024 * timeout)
-    silence_chunks_needed = int(16000 / 1024 * SILENCE_DURATION)
-
-    try:
-        for _ in range(max_chunks):
-            data = stream.read(1024, exception_on_overflow=False)
-            frames.append(data)
-            amplitude = np.abs(np.frombuffer(data, dtype=np.int16)).mean()
-
-            if amplitude > SILENCE_THRESHOLD:
-                speaking_started = True
-                silent_chunks = 0
-            elif speaking_started:
-                silent_chunks += 1
-                if silent_chunks >= silence_chunks_needed:
-                    break
-    except Exception as e:
-        log.exception("Ошибка при записи аудио (followup)")
-        print(f"⚠️  Ошибка записи: {e}")
-    finally:
-        stream.stop_stream()
-        audio.terminate()
-
-    if not speaking_started:
-        return ""
-
-    return transcribe(frames)
+    return text
--- a/satellite/llm.py
+++ b/satellite/llm.py
@@ -79,21 +79,21 @@ def ask_agent_stream(text: str, conv: "Conversation | None" = None, agent_id: st
        log.exception("Gateway недоступен")
        msg = "Не могу связаться с сервером, попробуй ещё раз."
        print(f"⚠️  {msg}")
-        #play_error_sound()
+        play_error_sound()
        speak(msg, agent_id)
        return msg
    except requests.Timeout:
        log.exception("Gateway таймаут")
        msg = "Сервер не ответил вовремя, попробуй ещё раз."
        print(f"⚠️  {msg}")
-        #play_error_sound()
+        play_error_sound()
        speak(msg, agent_id)
        return msg
    except requests.HTTPError:
        log.exception(f"Gateway HTTP ошибка {resp.status_code}")
        msg = "Ошибка сервера, попробуй ещё раз."
        print(f"⚠️  Gateway {resp.status_code}: {resp.text}")
-        #play_error_sound()
+        play_error_sound()
        speak(msg, agent_id)
        return msg

--- a/satellite/modes.py
+++ b/satellite/modes.py
@@ -1,9 +1,9 @@
 import os
 import sys

-from .config import GATEWAY_URL, AGENT, FOLLOWUP_TIMEOUT, log
-from .audio import record, record_with_timeout
-from .tts import play_activation_sound, speak, stop_speaking
+from .config import GATEWAY_URL, AGENT, log
+from .audio import record
+from .tts import speak, stop_speaking
 from .llm import ask_agent_stream, Conversation, is_reset_command

 # Персистентные сессии — одна на день для каждого агента
@@ -31,6 +31,29 @@ def _handle_reset(text: str, agent_id: str) -> bool:
    return False


+def _conversation_loop(agent_id: str, agent_name: str = "Cosmo"):
+    """Run the dialogue loop: record an utterance, send it to the agent, print the reply.
+    Returns as soon as record() yields an empty string (nothing was recognized)."""
+    conv = _get_session(agent_id)
+
+    while True:
+        text = record()
+        if not text:  # empty result from record() ends the dialogue
+            print(f"😴 Тишина, жду активации...\n")
+            return
+
+        print(f"📝 Ты → {agent_name}: {text}")
+
+        if _handle_reset(text, agent_id):
+            conv = _get_session(agent_id)  # fetch the session again after a reset
+            continue
+
+        response = ask_agent_stream(text, conv=conv, agent_id=agent_id)
+        print(f"🤖 {agent_name}: {response}\n")
+        # after the reply, the next iteration starts with a fresh record() call
+        # record() itself skips the first ECHO_WARMUP seconds to suppress echo
+
+
 def run_with_enter():
    print("\n🦞 Cosmo Satellite запущен (режим: Enter для активации)")
    print(f"   Gateway : {GATEWAY_URL}")
@@ -40,34 +63,8 @@ def run_with_enter():
    while True:
        try:
            input("⏎  Нажми Enter и говори...")
-            stop_speaking()  # barge-in: прервать если ещё говорит
-            play_activation_sound()
-
-            conv = _get_session("cosmo")
-
-            while True:
-                text = record()
-                if not text:
-                    print("⚠️  Ничего не распознано")
-                    break
-
-                print(f"📝 Ты: {text}")
-
-                if _handle_reset(text, "cosmo"):
-                    conv = _get_session("cosmo")
-                    break
-
-                response = ask_agent_stream(text, conv=conv)
-                print(f"🤖 Cosmo: {response}\n")
-
-                print(f"👂 Слушаю продолжение ({int(FOLLOWUP_TIMEOUT)} сек)...")
-                followup = record_with_timeout(timeout=FOLLOWUP_TIMEOUT)
-
-                if not followup:
-                    print("😴 Нет продолжения, жду активации...\n")
-                    break
-
-                text = followup
+            stop_speaking()  # barge-in
+            _conversation_loop("cosmo", "Cosmo")

        except KeyboardInterrupt:
            print("\n👋 Выход")
@@ -138,23 +135,13 @@ def run_with_porcupine():
                if keyword_index >= 0:
                    agent_id = wake_word_map[keyword_index]
                    agent_name = AGENTS[agent_id]["name"]
-                    stop_speaking()  # barge-in: прервать если ещё говорит
+                    stop_speaking()  # barge-in
                    print(f"✅ Услышал '{agent_name}'!")
-                    play_activation_sound()

-                    conv = _get_session(agent_id)
-
-                    text = record()
-                    if not text:
-                        continue
-
-                    print(f"📝 Ты → {agent_name}: {text}")
-
-                    if _handle_reset(text, agent_id):
-                        continue
-
-                    response = ask_agent_stream(text, conv=conv, agent_id=agent_id)
-                    print(f"🤖 {agent_name}: {response}\n")
+                    # отпускаем микрофон на время диалога
+                    stream.stop_stream()
+                    _conversation_loop(agent_id, agent_name)
+                    stream.start_stream()

            except KeyboardInterrupt:
                raise
--- a/satellite/tts.py
+++ b/satellite/tts.py
@@ -2,6 +2,7 @@ import os
 import sys
 import subprocess
 import threading
+from elevenlabs import VoiceSettings

 from .config import AUDIO_SINK, AGENTS, log

@@ -68,11 +69,20 @@ def _speak_elevenlabs(text: str, agent_id: str):
        print(f"⚠️  tts_voice не задан для {agent_id}")
        return

+    voice_settings = VoiceSettings(
+        stability=0.5,
+        similarity_boost=0.75,
+        style=0.0,
+        use_speaker_boost=True,
+        speed=1.1  # Значение от 0.7 до 1.2. 1.1 — это ускорение на 10%
+    )
+
    audio_stream = client.text_to_speech.convert(
        text=text,
        voice_id=voice_id,
        model_id=ELEVENLABS_MODEL,
        output_format="mp3_44100_128",
+        voice_settings=voice_settings
    )

    with _process_lock: