shop-bob/server/stt.py
dan 98310bf062 Add server component: FastAPI + WebSocket speech pipeline
Voice-in/voice-out server for the Shop Bob machine shop assistant.
STT (faster-whisper), LLM (Ollama), TTS (Piper) with sentence-level
audio streaming over WebSocket for low-latency responses.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 13:23:01 -08:00

62 lines
2 KiB
Python

import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from faster_whisper import WhisperModel
from .config import settings
# Module-wide logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Single concurrency cap, read once and shared by the worker pool and the
# semaphore below so the two limits cannot drift apart.
_MAX_CONCURRENT = settings.max_concurrent_transcriptions

# Whisper model singleton; remains None until load_model() assigns it.
_model: WhisperModel | None = None

# Thread pool on which the blocking transcription calls are executed.
_executor = ThreadPoolExecutor(max_workers=_MAX_CONCURRENT)

# Bounds how many transcribe() coroutines may be submitting work at once.
_semaphore = asyncio.Semaphore(_MAX_CONCURRENT)
def load_model() -> None:
    """Build the Whisper model from ``settings`` and publish it module-wide.

    Reads ``whisper_model``, ``whisper_device`` and ``whisper_compute_type``
    from ``settings``, logs before and after construction, and rebinds the
    module-level ``_model``. Every call constructs a fresh ``WhisperModel``.
    """
    global _model

    model_name = settings.whisper_model
    device = settings.whisper_device
    compute_type = settings.whisper_compute_type

    logger.info(
        "Loading Whisper model %s on %s (%s)...", model_name, device, compute_type
    )
    _model = WhisperModel(model_name, device=device, compute_type=compute_type)
    logger.info("Whisper model loaded.")
# Sample rate (Hz) of the audio handed to the Whisper model.
_WHISPER_SAMPLE_RATE = 16000


def _pcm16_to_whisper_audio(audio_bytes: bytes, sample_rate: int) -> np.ndarray:
    """Decode raw 16-bit PCM bytes into float32 samples at 16 kHz.

    Args:
        audio_bytes: Raw PCM data, read as ``np.int16`` samples (expected to
            be mono, as in the original implementation).
        sample_rate: Sample rate of ``audio_bytes`` in Hz; must be positive.

    Returns:
        A 1-D float32 array scaled by 1/32768. The array is empty when the
        input holds no complete sample or is too short to yield a single
        sample at 16 kHz.
    """
    if len(audio_bytes) % 2:
        # A dangling byte cannot form an int16 sample and would make
        # np.frombuffer raise; drop it rather than lose the whole utterance.
        logger.warning(
            "PCM buffer has odd length (%d bytes); dropping trailing byte",
            len(audio_bytes),
        )
        audio_bytes = audio_bytes[:-1]

    if not audio_bytes:
        return np.empty(0, dtype=np.float32)

    samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    if sample_rate == _WHISPER_SAMPLE_RATE:
        return samples

    # Integer arithmetic avoids float rounding shaving a sample off the
    # target length (e.g. 15999 instead of 16000).
    target_len = int(samples.size * _WHISPER_SAMPLE_RATE // sample_rate)
    if target_len == 0:
        return np.empty(0, dtype=np.float32)

    # Simple linear-interpolation resampling; no anti-alias filter is applied.
    positions = np.linspace(0, samples.size - 1, target_len)
    return np.interp(positions, np.arange(samples.size), samples).astype(np.float32)


def _transcribe_sync(audio_bytes: bytes, sample_rate: int) -> str:
    """Transcribe raw 16-bit PCM audio with the loaded Whisper model (blocking).

    Args:
        audio_bytes: Raw PCM 16-bit audio bytes.
        sample_rate: Sample rate of ``audio_bytes`` in Hz.

    Returns:
        The stripped, non-empty segment texts joined by single spaces, or
        ``""`` when the input contains no usable audio.

    Raises:
        RuntimeError: If ``load_model()`` has not populated ``_model`` yet.
        ValueError: If ``sample_rate`` is not positive.
    """
    # An explicit raise (not ``assert``) so the check survives ``python -O``.
    if _model is None:
        raise RuntimeError("Whisper model not loaded — call load_model() first")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    audio = _pcm16_to_whisper_audio(audio_bytes, sample_rate)
    if audio.size == 0:
        # Nothing to transcribe; skip the model call entirely.
        return ""

    segments, info = _model.transcribe(audio, beam_size=5)
    # ``segments`` is consumed exactly once here; empty pieces are skipped so
    # the joined transcript never contains doubled spaces.
    text = " ".join(piece for seg in segments if (piece := seg.text.strip()))
    logger.info("Transcribed %.1fs audio → %d chars", info.duration, len(text))
    return text
async def transcribe(audio_bytes: bytes, sample_rate: int) -> str:
    """Transcribe PCM audio without blocking the event loop.

    Acquires the module semaphore, submits ``_transcribe_sync`` to the
    module's thread pool, and returns the resulting text once the worker
    thread finishes.
    """
    async with _semaphore:
        running_loop = asyncio.get_running_loop()
        # run_in_executor forwards the positional arguments to the callable.
        pending = running_loop.run_in_executor(
            _executor, _transcribe_sync, audio_bytes, sample_rate
        )
        return await pending