shop-bob/server/llm.py
dan 98310bf062 Add server component: FastAPI + WebSocket speech pipeline
Voice-in/voice-out server for the Shop Bob machine shop assistant.
STT (faster-whisper), LLM (Ollama), TTS (Piper) with sentence-level
audio streaming over WebSocket for low-latency responses.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 13:23:01 -08:00

59 lines
1.8 KiB
Python

import asyncio
import json
import logging
from collections.abc import AsyncGenerator
import httpx
from .config import settings
# Module-level logger, named after this module for hierarchical filtering.
logger = logging.getLogger(__name__)
# Bounds the number of simultaneous in-flight LLM generations so the local
# Ollama host is not oversubscribed; limit comes from config.
_semaphore = asyncio.Semaphore(settings.max_concurrent_llm)
async def check_ollama() -> bool:
    """Return True if the Ollama HTTP API is reachable, False otherwise.

    Performs a quick GET against ``/api/tags`` with a short timeout. Any
    failure — connection error, timeout, or non-2xx status — is logged
    and reported as False rather than raised, so callers can use this as
    a simple health probe.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{settings.ollama_url}/api/tags", timeout=5
            )
            response.raise_for_status()
    except Exception as exc:
        logger.error("Ollama not reachable at %s: %s", settings.ollama_url, exc)
        return False
    return True
async def generate_response(
    transcript: str,
    system_prompt: str | None = None,
) -> AsyncGenerator[str, None]:
    """Stream text tokens from Ollama for the given user transcript.

    Args:
        transcript: The user's transcribed utterance.
        system_prompt: Optional override for the configured system prompt;
            falls back to ``settings.system_prompt`` when None.

    Yields:
        Incremental text tokens as the model produces them.

    Raises:
        httpx.HTTPStatusError: If Ollama answers the request with a
            non-2xx status.
    """
    prompt = system_prompt or settings.system_prompt
    payload = {
        "model": settings.llm_model,
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": transcript},
        ],
        "stream": True,
    }
    # Cap concurrent generations; see module-level _semaphore.
    async with _semaphore:
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(120.0, connect=10.0)
        ) as client:
            async with client.stream(
                "POST",
                f"{settings.ollama_url}/api/chat",
                json=payload,
            ) as resp:
                resp.raise_for_status()
                # Ollama streams one JSON object per line (NDJSON).
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        # A truncated or garbled line previously raised and
                        # killed the whole stream mid-response; skip it and
                        # keep reading instead.
                        logger.warning("Skipping malformed stream line: %r", line)
                        continue
                    if "error" in data:
                        # Ollama reports mid-stream failures as {"error": ...};
                        # previously these were silently dropped and the
                        # caller just saw an empty/partial response.
                        logger.error("Ollama stream error: %s", data["error"])
                        break
                    token = data.get("message", {}).get("content", "")
                    if token:
                        yield token
                    if data.get("done"):
                        break