"""Voice-in/voice-out server for the Shop Bob machine shop assistant.

STT (faster-whisper), LLM (Ollama), TTS (Piper) with sentence-level audio
streaming over WebSocket for low-latency responses.
"""
import asyncio
|
|
import json
|
|
import logging
|
|
from collections.abc import AsyncGenerator
|
|
|
|
import httpx
|
|
|
|
from .config import settings
|
|
|
|
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Caps the number of simultaneous in-flight LLM requests so a burst of
# clients cannot overload the Ollama backend; limit comes from config.
_semaphore = asyncio.Semaphore(settings.max_concurrent_llm)
|
|
async def check_ollama() -> bool:
    """Return True if the Ollama API answers at the configured URL.

    Performs a quick GET against ``/api/tags`` with a short timeout. Any
    failure (connection refused, timeout, non-2xx status) is logged and
    reported as False rather than raised, so callers can use this as a
    simple health probe.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{settings.ollama_url}/api/tags", timeout=5)
            response.raise_for_status()
    except Exception as exc:
        # Broad catch is deliberate: this is a best-effort reachability probe.
        logger.error("Ollama not reachable at %s: %s", settings.ollama_url, exc)
        return False
    return True
|
|
|
|
|
|
async def generate_response(
    transcript: str,
    system_prompt: str | None = None,
) -> AsyncGenerator[str, None]:
    """Stream text tokens from Ollama for the given user transcript.

    Args:
        transcript: The user's transcribed speech to answer.
        system_prompt: Optional override for the configured system prompt;
            falls back to ``settings.system_prompt`` when None or empty.

    Yields:
        Text tokens as they arrive from the Ollama ``/api/chat`` NDJSON stream.

    Raises:
        RuntimeError: If Ollama reports an in-stream ``{"error": ...}`` object.
        httpx.HTTPStatusError: If the HTTP response status indicates failure.
    """
    prompt = system_prompt or settings.system_prompt

    payload = {
        "model": settings.llm_model,
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": transcript},
        ],
        "stream": True,
    }

    # Bound concurrent LLM calls so a burst of clients can't overload Ollama.
    async with _semaphore:
        # Long read timeout for slow generations; short connect timeout so an
        # unreachable backend fails fast.
        async with httpx.AsyncClient(timeout=httpx.Timeout(120.0, connect=10.0)) as client:
            async with client.stream(
                "POST",
                f"{settings.ollama_url}/api/chat",
                json=payload,
            ) as resp:
                resp.raise_for_status()
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        # Skip a malformed/partial NDJSON line rather than
                        # aborting the whole stream mid-response.
                        logger.warning("Skipping malformed Ollama stream line: %r", line)
                        continue
                    # Ollama reports failures (e.g. unknown model) as an
                    # in-stream error object while still returning HTTP 200,
                    # so raise_for_status() alone does not catch them.
                    if "error" in data:
                        raise RuntimeError(f"Ollama error: {data['error']}")
                    token = data.get("message", {}).get("content", "")
                    if token:
                        yield token
                    if data.get("done"):
                        break