Files
home-voice-assistant/satellite/text.py
Daniil Klimov 7ca8268b78 Initial commit: Cosmo Voice Satellite
Two-agent voice assistant (Cosmo + Люся) via OpenClaw Gateway.
Streaming STT (Groq) + LLM + TTS (ElevenLabs) pipeline with
keep-alive sessions, barge-in, and daily conversation sessions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 13:34:08 +03:00

68 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
def clean_for_speech(text: str) -> str:
    """Strip Markdown markup from *text* so it reads naturally when spoken.

    Removes emphasis asterisks, heading markers, list dashes and Markdown
    links, drops the period after a number that is followed by whitespace
    (e.g. the "1." of a numbered list), joins lines into sentences and
    collapses runs of whitespace.

    Args:
        text: Raw text, possibly containing Markdown and line breaks.

    Returns:
        The cleaned single-line string with surrounding whitespace removed.
    """
    # Emphasis markers: **bold**, *italic*.
    text = re.sub(r'\*+', '', text)
    # Heading markers. Anchored to the start of a line so that a "#" inside
    # running text ("C# is ...") is left alone.
    text = re.sub(r'^[ \t]*#+\s', '', text, flags=re.MULTILINE)
    # List dashes. Anchored to the start of a line so that a dash used as
    # punctuation ("a - b") survives.
    text = re.sub(r'^[ \t]*- ', '', text, flags=re.MULTILINE)
    # Markdown links are removed entirely.
    # NOTE(review): the link text is dropped together with the URL - confirm
    # that this is intended.
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # Period after a number followed by whitespace ("1. item" -> "1 item").
    # Done before lines are joined so it cannot swallow the sentence
    # separator inserted below.
    text = re.sub(r'(\d+)\.(\s)', r'\1\2', text)

    # Leading blank lines must not turn into a leading ". ".
    text = text.lstrip()
    # A line that already ends with punctuation only needs a space ...
    text = re.sub(r'(?<=[.!?:;,…])\s*\n\s*', ' ', text)
    # ... any other line break becomes a sentence boundary.
    text = re.sub(r'\s*\n\s*', '. ', text)
    # Collapse the remaining whitespace runs.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
# Abbreviations (lower-case) whose trailing period must not be treated as the
# end of a sentence.
_ABBREVIATIONS = frozenset({
    "гб", "мб", "кб", "тб", "км", "см", "мм", "шт",
    "руб", "млн", "млрд", "тыс", "кг", "гр", "мл",
    "gb", "mb", "kb", "tb", "km", "ms",
})
_MAX_ABBREV_LEN = max(map(len, _ABBREVIATIONS))
_TERMINATORS = (".", "!", "?")


def _trailing_word(chunk: str) -> str:
    """Return the run of letters at the very end of *chunk* (may be empty)."""
    start = len(chunk)
    while start > 0 and chunk[start - 1].isalpha():
        start -= 1
    return chunk[start:]


def find_sentence_end(text: str, min_len: int = 60) -> int:
    """Find the end of a sentence in *text*, ignoring false terminators.

    Scans for ".", "!" and "?" at index ``min_len`` or later and returns the
    first one that is not rejected by one of the heuristics below.

    Args:
        text: The text to scan.
        min_len: Minimum index a terminator must have to be accepted; text
            shorter than this never yields a result.

    Returns:
        Index of the terminating punctuation character, or -1 if none of the
        candidates qualifies.
    """
    if len(text) < min_len:
        return -1

    for match in re.finditer(r'[.!?]', text):
        pos = match.start()
        if pos < min_len:
            continue

        prev_char = text[max(0, pos - 1):pos]   # 1 character before
        before_3 = text[max(0, pos - 3):pos]    # 3 characters before
        after_2 = text[pos + 1:pos + 3]         # 2 characters after
        after_stripped = after_2.lstrip()

        # Inside a punctuation run ("...", "?!"): only the last mark of the
        # run may end the sentence, so the run is never split in the middle.
        if after_2[:1] in _TERMINATORS:
            continue

        # Any period right after a digit: decimals ("0.76"), "1. 2 GB",
        # IP addresses ("192.168.1.1").
        if prev_char.isdigit():
            continue

        # Digit after the mark with another period shortly before it
        # (dotted sequences).
        if after_2[:1].isdigit() and "." in before_3:
            continue

        # The word directly before the mark. The window holds one character
        # more than the longest abbreviation, so a longer word can never be
        # mistaken for an abbreviation by its suffix ("items" vs "ms").
        word = _trailing_word(text[max(0, pos - _MAX_ABBREV_LEN - 1):pos])

        # Unit / quantity abbreviations: "ГБ.", "км.", "руб.", "млрд."
        if word.lower() in _ABBREVIATIONS:
            continue

        # A single capital letter (an initial such as "А.").
        if len(word) == 1 and word.isupper():
            continue

        # Lower-case letter after the mark: "load avg. нормально"
        if after_stripped and after_stripped[0].islower():
            continue

        # Percent sign right after the mark: "95.5%"
        if "%" in after_2:
            continue

        return pos

    return -1