shop-bob/server/llm.py
dan 98310bf062 Add server component: FastAPI + WebSocket speech pipeline
Voice-in/voice-out server for the Shop Bob machine shop assistant.
STT (faster-whisper), LLM (Ollama), TTS (Piper) with sentence-level
audio streaming over WebSocket for low-latency responses.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 13:23:01 -08:00

59 lines
1.8 KiB
Python

import asyncio
import json
import logging
from collections.abc import AsyncGenerator
import httpx
from .config import settings
# Module-level logger, named after this module for hierarchical filtering.
logger = logging.getLogger(__name__)
# Bounds the number of simultaneous in-flight LLM generations so the local
# Ollama host is not oversubscribed; limit comes from config.
_semaphore = asyncio.Semaphore(settings.max_concurrent_llm)
async def check_ollama() -> bool:
    """Return True if the Ollama HTTP API is reachable, False otherwise.

    Performs a quick GET against ``/api/tags`` with a short timeout. Any
    failure — connection error, timeout, or non-2xx status — is logged
    and reported as False rather than raised, so callers can use this as
    a simple health probe.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                f"{settings.ollama_url}/api/tags", timeout=5
            )
            response.raise_for_status()
    except Exception as exc:
        logger.error("Ollama not reachable at %s: %s", settings.ollama_url, exc)
        return False
    return True
async def generate_response(
    transcript: str,
    system_prompt: str | None = None,
) -> AsyncGenerator[str, None]:
    """Stream text tokens from Ollama for the given user transcript.

    Args:
        transcript: The user's transcribed utterance.
        system_prompt: Optional override for the configured system prompt;
            falls back to ``settings.system_prompt`` when None.

    Yields:
        Incremental text tokens as the model produces them.

    Raises:
        httpx.HTTPStatusError: If Ollama answers the request with a
            non-2xx status.
    """
    prompt = system_prompt or settings.system_prompt
    payload = {
        "model": settings.llm_model,
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content": transcript},
        ],
        "stream": True,
    }
    # Cap concurrent generations; see module-level _semaphore.
    async with _semaphore:
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(120.0, connect=10.0)
        ) as client:
            async with client.stream(
                "POST",
                f"{settings.ollama_url}/api/chat",
                json=payload,
            ) as resp:
                resp.raise_for_status()
                # Ollama streams one JSON object per line (NDJSON).
                async for line in resp.aiter_lines():
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        # A truncated or garbled line previously raised and
                        # killed the whole stream mid-response; skip it and
                        # keep reading instead.
                        logger.warning("Skipping malformed stream line: %r", line)
                        continue
                    if "error" in data:
                        # Ollama reports mid-stream failures as {"error": ...};
                        # previously these were silently dropped and the
                        # caller just saw an empty/partial response.
                        logger.error("Ollama stream error: %s", data["error"])
                        break
                    token = data.get("message", {}).get("content", "")
                    if token:
                        yield token
                    if data.get("done"):
                        break