Voice-in/voice-out server for the Shop Bob machine shop assistant. STT (faster-whisper), LLM (Ollama), TTS (Piper) with sentence-level audio streaming over WebSocket for low-latency responses. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
62 lines
2 KiB
Python
62 lines
2 KiB
Python
import asyncio
|
|
import logging
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from functools import partial
|
|
|
|
import numpy as np
|
|
from faster_whisper import WhisperModel
|
|
|
|
from .config import settings
|
|
|
|
logger = logging.getLogger(__name__)

# Whisper model instance; stays None until load_model() assigns it.
_model: WhisperModel | None = None

# Worker threads that run the blocking transcription calls off the event loop.
_executor = ThreadPoolExecutor(max_workers=settings.max_concurrent_transcriptions)

# Caps how many transcribe() coroutines may hold an executor slot at once;
# sized with the same setting as the thread pool above.
_semaphore = asyncio.Semaphore(settings.max_concurrent_transcriptions)
|
|
|
|
|
|
def load_model() -> None:
    """Create the Whisper model from ``settings`` and store it in ``_model``.

    Reads ``whisper_model``, ``whisper_device`` and ``whisper_compute_type``
    from ``settings``, logs them, constructs a ``WhisperModel`` and rebinds
    the module-level ``_model`` to it. Each call builds a fresh model.
    """
    global _model

    model_name = settings.whisper_model
    device = settings.whisper_device
    compute_type = settings.whisper_compute_type

    logger.info(
        "Loading Whisper model %s on %s (%s)...", model_name, device, compute_type
    )
    _model = WhisperModel(model_name, device=device, compute_type=compute_type)
    logger.info("Whisper model loaded.")
|
|
|
|
|
|
def _transcribe_sync(audio_bytes: bytes, sample_rate: int) -> str:
    """Transcribe raw PCM audio to text with the loaded Whisper model (blocking).

    Args:
        audio_bytes: Raw 16-bit signed PCM samples, treated as mono and decoded
            with ``np.int16`` (native byte order). A trailing odd byte, which
            cannot form a full sample, is dropped with a warning.
        sample_rate: Sample rate of ``audio_bytes`` in Hz. Anything other than
            16000 is resampled to 16000 by linear interpolation.

    Returns:
        The stripped text of every segment joined by single spaces, or ``""``
        when the buffer holds no usable audio.

    Raises:
        RuntimeError: If ``load_model()`` has not been called yet.
        ValueError: If ``sample_rate`` is not positive.
    """
    # Explicit raise instead of ``assert`` so the guard survives ``python -O``.
    if _model is None:
        raise RuntimeError("Whisper model not loaded — call load_model() first")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    target_rate = 16000  # faster-whisper expects 16kHz input

    # np.frombuffer rejects buffers that are not a whole number of int16
    # samples, so discard a dangling byte rather than failing the request.
    if len(audio_bytes) % 2:
        logger.warning(
            "Audio buffer has odd length (%d bytes); dropping trailing byte",
            len(audio_bytes),
        )
        audio_bytes = audio_bytes[:-1]

    if not audio_bytes:
        # Nothing to decode; np.interp below would raise on empty input.
        return ""

    # Convert raw PCM 16-bit mono bytes to a float32 array in [-1.0, 1.0).
    audio = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0

    if sample_rate != target_rate:
        # Integer arithmetic avoids losing a sample to float rounding.
        target_len = int(len(audio) * target_rate // sample_rate)
        if target_len == 0:
            # Clip is shorter than one output sample at the target rate.
            return ""
        # Simple linear interpolation onto an evenly spaced 16kHz grid.
        audio = np.interp(
            np.linspace(0, len(audio) - 1, target_len),
            np.arange(len(audio)),
            audio,
        ).astype(np.float32)

    segments, info = _model.transcribe(audio, beam_size=5)
    # NOTE(review): faster-whisper documents ``segments`` as a generator, so
    # the decoding work presumably happens while this join consumes it.
    text = " ".join(seg.text.strip() for seg in segments)
    logger.info("Transcribed %.1fs audio → %d chars", info.duration, len(text))
    return text
|
|
|
|
|
|
async def transcribe(audio_bytes: bytes, sample_rate: int) -> str:
    """Transcribe audio without blocking the event loop.

    Waits for a slot on the module-level ``_semaphore``, then runs
    ``_transcribe_sync(audio_bytes, sample_rate)`` on ``_executor`` and
    returns its result. Exceptions raised by the worker propagate to the
    awaiting caller.
    """
    event_loop = asyncio.get_running_loop()
    async with _semaphore:
        # run_in_executor forwards extra positional args to the callable.
        pending = event_loop.run_in_executor(
            _executor, _transcribe_sync, audio_bytes, sample_rate
        )
        return await pending
|