Voice-in/voice-out server for the Shop Bob machine shop assistant. STT (faster-whisper), LLM (Ollama), TTS (Piper) with sentence-level audio streaming over WebSocket for low-latency responses. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
85 lines · 2.8 KiB · Python
import json
|
|
import logging
|
|
import re
|
|
|
|
from fastapi import WebSocket
|
|
|
|
from . import llm, stt, tts
|
|
|
|
logger = logging.getLogger(__name__)

# Regex that splits text at sentence boundaries. The lookbehind keeps the
# terminating punctuation ([.!?]) attached to the preceding sentence; the
# whitespace separator itself is consumed by re.split (it is the pattern,
# not a capture group, so it does not appear in the result list).
_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+")
|
async def _send_status(ws: WebSocket, state: str) -> None:
    """Notify the client of a pipeline state change.

    Sends a JSON text frame of the form
    ``{"type": "status", "state": <state>}`` over the WebSocket.
    """
    await ws.send_text(json.dumps({"type": "status", "state": state}))
|
async def process_request(
    audio_bytes: bytes,
    sample_rate: int,
    websocket: WebSocket,
) -> None:
    """Run the full speech-in → text-out → speech-out pipeline.

    Stages, each announced to the client with a ``status`` frame:

    1. ``transcribing`` — STT on ``audio_bytes`` (PCM at ``sample_rate``).
    2. ``thinking`` — stream an LLM response for the transcript.
    3. ``speaking`` — synthesize and send TTS audio per complete sentence,
       so playback can start before the LLM has finished generating.

    Frames sent (JSON text unless noted):
      * ``transcript`` — recognized user text (empty string if nothing heard).
      * binary frames — synthesized audio, one chunk per sentence.
      * ``response_text`` — the complete LLM response text.
      * ``error`` — emitted if any stage raises.
      * ``response_end`` — always the final frame, on every path.
    """
    try:
        # --- STT ---
        await _send_status(websocket, "transcribing")
        transcript = await stt.transcribe(audio_bytes, sample_rate)

        if not transcript.strip():
            # Nothing recognizable was said: report the empty transcript
            # and finish early so the client UI doesn't wait on an LLM turn.
            await websocket.send_text(
                json.dumps({"type": "transcript", "text": ""})
            )
            await websocket.send_text(json.dumps({"type": "response_end"}))
            return

        await websocket.send_text(
            json.dumps({"type": "transcript", "text": transcript})
        )

        # --- LLM ---
        await _send_status(websocket, "thinking")
        # Accumulate tokens in a list and join once at the end; repeated
        # string += over a long response is quadratic.
        response_parts: list[str] = []
        sentence_buffer = ""

        # --- Sentence-level TTS streaming ---
        await _send_status(websocket, "speaking")

        async for token in llm.generate_response(transcript):
            response_parts.append(token)
            sentence_buffer += token

            # A boundary match needs whitespace after the punctuation, and
            # any such whitespace arrives in a whitespace-containing token
            # (the punctuation is already buffered by then). So tokens with
            # no whitespace cannot complete a new boundary — skip the regex.
            if not any(ch.isspace() for ch in token):
                continue

            parts = _SENTENCE_RE.split(sentence_buffer)
            if len(parts) > 1:
                # All parts except the last are complete sentences.
                for sentence in parts[:-1]:
                    sentence = sentence.strip()
                    if sentence:
                        audio_chunk = await tts.synthesize(sentence)
                        await websocket.send_bytes(audio_chunk)
                # Keep the incomplete remainder for the next tokens.
                sentence_buffer = parts[-1]

        # Flush trailing text that never hit a sentence boundary.
        sentence_buffer = sentence_buffer.strip()
        if sentence_buffer:
            audio_chunk = await tts.synthesize(sentence_buffer)
            await websocket.send_bytes(audio_chunk)

        # Send the full text response and signal completion.
        full_response = "".join(response_parts)
        await websocket.send_text(
            json.dumps({"type": "response_text", "text": full_response})
        )
        await websocket.send_text(json.dumps({"type": "response_end"}))

    except Exception:
        # Top-level pipeline boundary: log the traceback, then best-effort
        # notify the client so its UI isn't left waiting for response_end.
        logger.exception("Pipeline error")
        try:
            await websocket.send_text(
                json.dumps({"type": "error", "text": "Internal processing error"})
            )
            await websocket.send_text(json.dumps({"type": "response_end"}))
        except Exception:
            pass  # Client already disconnected; nothing more we can send.