feat(llm): direct Claude Haiku 4.5 backend with prompt caching

Adds a parallel LLM backend that bypasses OpenClaw and talks to Anthropic Messages API directly. Selected via LLM_BACKEND=claude in .env; default remains openclaw so nothing breaks for existing setup. Why: OpenClaw gateway adds 500-1000ms overhead on every turn (auth, memory fetch, routing). Direct Haiku 4.5 + prompt caching = faster first token and -90% cost on cached chunks. - satellite/llm_claude.py — Anthropic SDK streaming client, prompt caching on system prompt and all-but-last-2 history messages, per agent+date JSON history in HISTORY_DIR, reset_history() for the 'сбрось' command, per-agent system prompts (Cosmo / Люся), fallback to error event if SDK/key missing. - satellite/llm.py — dispatches to ask_claude_stream when backend=claude, exports LLM_BACKEND so modes.py can route reset too. - satellite/modes.py — _handle_reset calls reset_history when backend is claude, keeps /new POST for openclaw. - requirements.txt — anthropic >= 0.50.0 - .env.example — LLM_BACKEND, ANTHROPIC_API_KEY, ANTHROPIC_MODEL, HISTORY_DIR, MAX_HISTORY, HTTPS_PROXY block for non-RU egress. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 13:12:39 +00:00
parent 584e21923c
commit 05de9c284b
5 changed files with 300 additions and 20 deletions
--- a/.env.example
+++ b/.env.example
@@ -59,3 +59,22 @@ VOICE_API_KEY=your_voice_api_key_here
 # голос ассистента проигрывается на планшете через ElevenLabs proxy,
 # локальный mpv/speak пропускается. false = говорим локально как раньше.
 TABLET_TTS_ENABLED=true
 # ——————————————————————————————————————————————
 # LLM backend
 # openclaw (дефолт) — существующий путь через gateway с памятью на сервере
 # claude             — прямой вызов Anthropic Haiku 4.5 с локальной историей
 #                       и prompt caching (быстрее + дешевле, но без tools)
 LLM_BACKEND=openclaw
 # Для LLM_BACKEND=claude:
 ANTHROPIC_API_KEY=your_anthropic_key_here
 ANTHROPIC_MODEL=claude-haiku-4-5
 # куда сохранять JSON истории per-agent per-date
 HISTORY_DIR=data/history
 # лимит сообщений в истории (целое число)
 MAX_HISTORY=40
 # Egress proxy для non-RU сервисов (Anthropic, Groq, OpenAI).
 # httpx и requests подхватывают автоматически. Пусто = прямой выход.
 HTTPS_PROXY=http://192.168.31.103:8888
 HTTP_PROXY=http://192.168.31.103:8888
 NO_PROXY=localhost,127.0.0.1,192.168.31.0/24
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,9 @@ webrtcvad-wheels
 # STT через облако
 groq
 # LLM — прямой Claude (альтернатива OpenClaw, активируется LLM_BACKEND=claude)
 anthropic>=0.50.0
 # TTS
 elevenlabs
--- a/satellite/llm.py
+++ b/satellite/llm.py
@@ -11,6 +11,9 @@ from . import notifier
 VOICE_SESSION_KEY = os.getenv("VOICE_SESSION_KEY", "agent:main:voice:home")
 # Feature flag — выбор LLM backend. openclaw (дефолт) или claude (прямой Anthropic).
 LLM_BACKEND = os.getenv("LLM_BACKEND", "openclaw").lower()
 # "stream" — режем по предложениям (быстро, но рваная интонация)
 # "full"   — собираем весь ответ, потом TTS (естественно, но пауза перед началом)
 TTS_MODE = os.getenv("TTS_MODE", "full")
@@ -65,7 +68,12 @@ def _post_with_retry(session, url, headers, payload):
 def ask_agent_stream(text: str, agent_id: str = "cosmo") -> str:
-    """Отправляет запрос к OpenClaw gateway и озвучивает ответ."""
+    """Отправляет запрос к выбранному LLM backend и озвучивает ответ."""
    if LLM_BACKEND == "claude":
        from .llm_claude import ask_claude_stream
        return ask_claude_stream(text, agent_id)
    # Иначе — путь через OpenClaw (старый behaviour)
    def _maybe_speak(t: str):
        # Если TTS на планшете — пропускаем локальный звук, планшет зачитает по response event.
        if t.strip() and notifier.speak_locally():
--- a/satellite/llm_claude.py
+++ b/satellite/llm_claude.py
@@ -0,0 +1,242 @@
 """
 Прямой клиент Claude Haiku 4.5 (Anthropic SDK) — альтернатива OpenClaw gateway.
 Отличия от `llm.ask_agent_stream`:
  * Сессия и история живут **локально** на клиенте (JSON в HISTORY_DIR/{agent}-{date}.json).
    Смена даты = автосброс.
  * Prompt caching через Anthropic cache_control: system prompt и старая часть истории
    кешируются на 5 минут → latency first-token ниже, стоимость -90% на cached-tokens.
  * Используется когда LLM_BACKEND=claude.
 Если HTTPS_PROXY задан (напр. http://192.168.31.103:8888) — httpx подхватит автоматически,
 Anthropic SDK пойдёт через прокси.
 """
 import json
 import os
 import time
 from datetime import date
 from pathlib import Path
 try:
    import anthropic
 except ImportError:
    anthropic = None  # SDK опциональный, активируется только при LLM_BACKEND=claude
 from .config import log
 from .text import clean_for_speech
 from .tts import speak, play_error_sound
 from . import notifier
 from .llm import strip_fillers  # переиспользуем чистку филлеров
 # Runtime configuration, read from the environment once at import time.
 # An empty API key is tolerated here; _get_client() rejects it on first use.
 ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
 ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-haiku-4-5")
 # Directory holding the per-agent, per-date JSON history files.
 HISTORY_DIR = Path(os.getenv("HISTORY_DIR", "data/history"))
 # Passed as max_tokens to the API call (VOICE_MAX_TOKENS, default 300).
 MAX_TOKENS = int(os.getenv("VOICE_MAX_TOKENS", "300"))
 # Maximum number of messages kept in the history (MAX_HISTORY, default 40).
 MAX_HISTORY_MESSAGES = int(os.getenv("MAX_HISTORY", "40"))
 # Cache boundary: every message except the last N goes into the cached block,
 # which is intended to produce a prompt-cache hit on every turn.
 CACHE_TAIL_UNCACHED = 2
 # System prompt template for the default agent. The {today} placeholder is
 # filled in by _system_prompt() via str.format().
 COSMO_SYSTEM_PROMPT = """Ты — Cosmo, домашний голосовой ассистент Даниила (Санкт-Петербург).
 Стиль:
 - Короткие ответы: 1-2 предложения, редко 3. Это голосовой канал — многословность утомляет.
 - Разговорный русский, без канцелярита, без формальных оборотов («здравствуйте», «уважаемый»).
 - Обращение на «ты».
 - Не предваряй ответ фразами-заполнителями («сейчас посмотрю», «минутку», «проверяю») — сразу отвечай.
 - Без эмодзи, маркированных списков, код-блоков — всё будет зачитано.
 - Если не знаешь — скажи коротко, не оправдывайся.
 Контекст: Даниил — разработчик, живёт в СПб с женой Светой. Сегодня {today}."""
 # System prompt template used when agent_id == "lusya"; same {today} placeholder.
 LUSYA_SYSTEM_PROMPT = """Ты — Люся, домашний голосовой ассистент Светы (Санкт-Петербург).
 Стиль:
 - Тёплый, заботливый, чуть эмоциональный, но лаконичный. 1-2 предложения.
 - Обращение на «ты».
 - Без эмодзи, списков, код-блоков — это голос.
 - Если не знаешь — скажи коротко.
 Сегодня {today}."""
 # Module-wide client instance; stays None until _get_client() creates it.
 _client: "anthropic.Anthropic | None" = None
 def _get_client() -> "anthropic.Anthropic":
    """Return the shared Anthropic client, creating it on first use.

    Raises:
        RuntimeError: if the anthropic SDK could not be imported, or if
            ANTHROPIC_API_KEY is empty.
    """
    global _client
    # Work out the configuration problem (if any) first, then raise once.
    if anthropic is None:
        problem = (
            "anthropic SDK не установлен. Запусти `pip install anthropic` "
            "или оставь LLM_BACKEND=openclaw."
        )
    elif not ANTHROPIC_API_KEY:
        problem = "ANTHROPIC_API_KEY не задан в .env"
    else:
        problem = None
    if problem is not None:
        raise RuntimeError(problem)
    # The client is built once and reused by every later call.
    if _client is None:
        _client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    return _client
 def _system_prompt(agent_id: str) -> str:
    """Return the system prompt for *agent_id* with today's ISO date filled in.

    "lusya" selects LUSYA_SYSTEM_PROMPT; any other id falls back to
    COSMO_SYSTEM_PROMPT.
    """
    today_iso = date.today().isoformat()
    if agent_id == "lusya":
        return LUSYA_SYSTEM_PROMPT.format(today=today_iso)
    return COSMO_SYSTEM_PROMPT.format(today=today_iso)
 def _history_path(agent_id: str) -> Path:
    """Return the history file path for *agent_id* and today's date.

    Creates HISTORY_DIR (including parents) if it does not exist yet. The
    date is part of the file name, so a new day maps to a new file.
    """
    HISTORY_DIR.mkdir(parents=True, exist_ok=True)
    stamp = date.today().isoformat()
    return HISTORY_DIR.joinpath(f"{agent_id}-{stamp}.json")
 def load_history(agent_id: str) -> list[dict]:
    """Load today's stored conversation for *agent_id*.

    Args:
        agent_id: agent whose history file should be read.

    Returns:
        A list of ``{"role": str, "content": str}`` dicts. An empty list is
        returned when the file is missing, unreadable, not valid JSON, or
        does not contain a JSON list. Entries that are not well-formed
        messages are dropped.
    """
    path = _history_path(agent_id)
    if not path.exists():
        return []
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        log.exception(f"Не смог прочитать историю {path}")
        return []
    # Valid JSON is not necessarily a message list: a hand-edited or damaged
    # file may hold an object, null or a string, and the caller appends to
    # the result right away.
    if not isinstance(data, list):
        log.error(f"История {path} имеет неверный формат, начинаю с пустой")
        return []
    # Keep only entries that can safely be indexed by "role"/"content" later.
    return [
        item for item in data
        if isinstance(item, dict)
        and isinstance(item.get("role"), str)
        and isinstance(item.get("content"), str)
    ]
 def save_history(agent_id: str, history: list[dict]):
    """Persist *history* as today's conversation for *agent_id*.

    The JSON is written to a sibling temporary file and then renamed over the
    target, so an interrupted write cannot leave a truncated history file
    behind. Failures are logged and swallowed: losing history must not break
    the voice turn that is in progress.

    Args:
        agent_id: agent whose history file should be written.
        history: JSON-serialisable list of message dicts.
    """
    path = _history_path(agent_id)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        payload = json.dumps(history, ensure_ascii=False, indent=2)
        tmp_path.write_text(payload, encoding="utf-8")
        # Same-directory rename: the target is swapped in one step.
        tmp_path.replace(path)
    except Exception:
        log.exception(f"Не смог сохранить историю {path}")
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            # Best-effort cleanup only; the original failure is already logged.
            pass
 def reset_history(agent_id: str):
    """Delete today's history file for *agent_id*, if there is one."""
    history_file = _history_path(agent_id)
    # Nothing stored for today: nothing to delete and nothing to log.
    if not history_file.exists():
        return
    history_file.unlink()
    log.info(f"История сброшена: {history_file}")
 def _build_messages(history: list[dict]) -> list[dict]:
    """Convert stored history into the ``messages`` array for the API call.

    Every entry is copied as ``{"role", "content"}``. When the history is
    longer than CACHE_TAIL_UNCACHED, the last message before the uncached
    tail has its content wrapped in a text block carrying an ephemeral
    ``cache_control`` marker; all other messages keep plain string content.
    """
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    # Index of the last "old" message, i.e. the one right before the tail.
    marked = len(history) - CACHE_TAIL_UNCACHED - 1
    if 0 <= marked < len(messages):
        boundary_msg = messages[marked]
        boundary_msg["content"] = [{
            "type": "text",
            "text": boundary_msg["content"],
            "cache_control": {"type": "ephemeral"},
        }]
    return messages
 def ask_claude_stream(text: str, agent_id: str = "cosmo") -> str:
    """Send *text* to Claude directly and return the speech-ready reply.

    Loads today's history for the agent, appends the user message, trims the
    history to MAX_HISTORY_MESSAGES, streams the completion, stores the raw
    reply in history and returns the reply after strip_fillers() and
    clean_for_speech(). The returned text is also spoken locally when
    notifier.speak_locally() is truthy.

    Args:
        text: the user's utterance.
        agent_id: agent whose prompt, history and voice are used.

    Returns:
        The cleaned reply, or a short error phrase when the client is not
        configured, the API call fails, or the reply is empty. In every
        error case the phrase is also reported through notifier.error();
        history is only saved after a successful, non-empty reply.
    """
    def _speak_if_local(t: str):
        if t.strip() and notifier.speak_locally():
            speak(t, agent_id)

    def _fail(msg: str, *, beep: bool = True) -> str:
        # Shared tail of every error path: optional error sound, notify,
        # speak the phrase, and hand it back as the function result.
        if beep:
            play_error_sound()
        notifier.error(msg, agent_id)
        _speak_if_local(msg)
        return msg

    try:
        client = _get_client()
    except RuntimeError as e:
        log.error(str(e))
        return _fail("Клод не настроен, попробуй OpenClaw.")

    history = load_history(agent_id)
    history.append({"role": "user", "content": text})
    # Trim an over-long history to the most recent messages.
    if len(history) > MAX_HISTORY_MESSAGES:
        history = history[-MAX_HISTORY_MESSAGES:]
    # The Messages API expects the conversation to open with a user turn.
    # Slicing an alternating history by an even limit leaves an assistant
    # message first, which would make this and every later request fail.
    # The message appended above guarantees a user turn exists.
    first_user = next(i for i, m in enumerate(history) if m["role"] == "user")
    history = history[first_user:]

    system_blocks = [{
        "type": "text",
        "text": _system_prompt(agent_id),
        "cache_control": {"type": "ephemeral"},
    }]
    messages = _build_messages(history)
    start = time.time()
    try:
        with client.messages.stream(
            model=ANTHROPIC_MODEL,
            max_tokens=MAX_TOKENS,
            system=system_blocks,
            messages=messages,
        ) as stream:
            full_text = "".join(stream.text_stream)
            final = stream.get_final_message()
            usage = final.usage
            cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
            cache_write = getattr(usage, "cache_creation_input_tokens", 0) or 0
            elapsed = time.time() - start
            print(
                f"🧠 Claude {ANTHROPIC_MODEL} {elapsed:.2f}s · "
                f"in={usage.input_tokens} out={usage.output_tokens} "
                f"cache_r={cache_read} cache_w={cache_write}"
            )
    # APITimeoutError is a subclass of APIConnectionError in the SDK, so it
    # has to be handled first or the timeout phrase is never reached.
    except anthropic.APITimeoutError:
        log.exception("Anthropic timeout")
        return _fail("Клод не ответил вовремя.")
    except anthropic.APIConnectionError:
        log.exception("Anthropic API connection error")
        return _fail("Не могу связаться с Клодом.")
    except anthropic.APIStatusError as e:
        status = getattr(e, "status_code", "?")
        log.exception(f"Anthropic API status {status}")
        return _fail("Ошибка Клода.")
    except Exception as e:
        log.exception(f"Неожиданная ошибка Claude: {e}")
        return _fail("Что-то сломалось.")

    if not full_text:
        # An empty reply is reported without the error sound.
        return _fail("Не получил ответ.", beep=False)

    # Store the assistant reply before filler stripping and cleaning, so the
    # history holds the raw model output.
    history.append({"role": "assistant", "content": full_text})
    save_history(agent_id, history)
    result = clean_for_speech(strip_fillers(full_text))
    _speak_if_local(result)
    return result
--- a/satellite/modes.py
+++ b/satellite/modes.py
@@ -3,34 +3,42 @@ import os
 from .config import GATEWAY_URL, AGENTS, FOLLOWUP_TIMEOUT, MAX_DURATION, log
 from .audio import record
 from .tts import speak, stop_speaking
-from .llm import ask_agent_stream, is_reset_command, VOICE_SESSION_KEY
+from .llm import ask_agent_stream, is_reset_command, VOICE_SESSION_KEY, LLM_BACKEND
 from . import notifier
 WAKE_THRESHOLD = float(os.getenv("WAKE_THRESHOLD", "0.5"))
 def _handle_reset(text: str, agent_id: str) -> bool:
-    """Команда сброса — отправляет slash-команду /new в OpenClaw (без озвучки ответа)."""
+    """Команда сброса. В зависимости от backend:
    - claude:    удаляет локальный файл истории
    - openclaw:  шлёт /new в gateway
    """
    if not is_reset_command(text):
        return False
-    cfg = AGENTS.get(agent_id, AGENTS["cosmo"])
+    if LLM_BACKEND == "claude":
-    print("🔄 Отправляю /new в OpenClaw")
+        from .llm_claude import reset_history
-    try:
+        print("🔄 Сбрасываю локальную историю (Claude)")
-        cfg["session"].post(
+        reset_history(agent_id)
-            f"{cfg['gateway_url']}/v1/chat/completions",
+    else:
-            headers={
+        cfg = AGENTS.get(agent_id, AGENTS["cosmo"])
-                "x-ocplatform-model": cfg["voice_model"],
+        print("🔄 Отправляю /new в OpenClaw")
-                "x-openclaw-session-key": cfg.get("session_key", VOICE_SESSION_KEY),
+        try:
-            },
+            cfg["session"].post(
-            json={
+                f"{cfg['gateway_url']}/v1/chat/completions",
-                "stream": False,
+                headers={
-                "messages": [{"role": "user", "content": "/new"}],
+                    "x-ocplatform-model": cfg["voice_model"],
-            },
+                    "x-openclaw-session-key": cfg.get("session_key", VOICE_SESSION_KEY),
-            timeout=30,
+                },
-        )
+                json={
-    except Exception:
+                    "stream": False,
-        log.exception("Не удалось отправить /new")
+                    "messages": [{"role": "user", "content": "/new"}],
                },
                timeout=30,
            )
        except Exception:
            log.exception("Не удалось отправить /new")
    msg = "Начинаю новую сессию."
    print(f"🔄 {msg}")