Thread Registry: - Root thread initialization at boot - Thread chain tracking for message flow - register_thread() for external message UUIDs LLM Router: - Multi-backend support with failover strategy - Token bucket rate limiting per backend - Async completion API with retries Console Handler: - Message-driven REPL (not separate async loop) - ConsolePrompt/ConsoleInput payloads - Handler returns None to disconnect Boot System: - System primitives module - Boot message injected at startup - Initializes root thread context Documentation: - Updated v2.1 docs for new architecture - LLM router documentation - Gap analysis cross-check Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
115 lines
3.7 KiB
Python
115 lines
3.7 KiB
Python
"""
|
|
Token bucket for rate limiting LLM API calls.
|
|
|
|
Implements a classic token bucket algorithm:
|
|
- Bucket fills at a steady rate (tokens_per_minute / 60 per second)
|
|
- Requests consume tokens from the bucket
|
|
- If bucket is empty, request waits or fails
|
|
"""
|
|
|
|
import asyncio
import time
from dataclasses import dataclass, field
from typing import Optional
|
|
|
|
|
|
@dataclass
class TokenBucket:
    """
    Async-safe token bucket for rate limiting.

    The bucket starts full (``burst_capacity`` tokens) and refills
    continuously at ``tokens_per_minute / 60`` tokens per second, never
    exceeding ``burst_capacity``.

    Args:
        tokens_per_minute: Refill rate (TPM)
        burst_capacity: Max tokens the bucket can hold (defaults to TPM)
    """

    tokens_per_minute: int
    burst_capacity: Optional[int] = None

    # Runtime state. These stay in the generated __init__ signature for
    # backward compatibility, but __post_init__ always overwrites them.
    _tokens: float = field(default=None, repr=False)
    _last_refill: float = field(default=None, repr=False)
    _lock: asyncio.Lock = field(default=None, repr=False)

    def __post_init__(self) -> None:
        """Apply the burst-capacity default and start with a full bucket."""
        if self.burst_capacity is None:
            self.burst_capacity = self.tokens_per_minute
        self._tokens = float(self.burst_capacity)
        self._last_refill = time.monotonic()
        self._lock = asyncio.Lock()

    def _refill(self) -> None:
        """Add tokens based on elapsed time, capped at ``burst_capacity``."""
        now = time.monotonic()
        elapsed = now - self._last_refill

        # tokens_per_second = tokens_per_minute / 60
        tokens_to_add = elapsed * (self.tokens_per_minute / 60.0)
        self._tokens = min(self.burst_capacity, self._tokens + tokens_to_add)
        self._last_refill = now

    async def acquire(self, tokens: int, timeout: Optional[float] = None) -> bool:
        """
        Try to consume tokens from the bucket, waiting for a refill if needed.

        Args:
            tokens: Number of tokens to consume
            timeout: Max seconds to wait (None = wait forever, 0 = don't wait)

        Returns:
            True if tokens acquired, False if timed out. Also False, without
            waiting, when a timeout is given and ``tokens`` exceeds
            ``burst_capacity``.

        Raises:
            ValueError: If ``tokens`` exceeds ``burst_capacity`` and
                ``timeout`` is None. The bucket can never hold that many
                tokens, so the call would otherwise never return.
        """
        # _refill() caps the bucket at burst_capacity, so a larger request
        # can never succeed: fail fast instead of sleeping pointlessly.
        if tokens > self.burst_capacity:
            if timeout is None:
                raise ValueError(
                    f"cannot acquire {tokens} tokens: exceeds burst_capacity "
                    f"of {self.burst_capacity}"
                )
            return False

        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            # Hold the lock only for the check-and-consume step; the sleep
            # below happens outside it so waiting never blocks other callers.
            async with self._lock:
                self._refill()

                if self._tokens >= tokens:
                    self._tokens -= tokens
                    return True

                if timeout == 0:
                    return False

                tokens_needed = tokens - self._tokens

            # Time for the refill rate to produce the missing tokens.
            wait_seconds = tokens_needed / (self.tokens_per_minute / 60.0)

            # Respect deadline
            if deadline is not None:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    return False
                wait_seconds = min(wait_seconds, remaining)

            await asyncio.sleep(wait_seconds)

    def try_acquire(self, tokens: int) -> bool:
        """Non-blocking acquire. Returns False if not enough tokens."""
        self._refill()
        if self._tokens >= tokens:
            self._tokens -= tokens
            return True
        return False

    @property
    def available(self) -> float:
        """Current tokens available (approximate, doesn't lock)."""
        self._refill()
        return self._tokens

    def report(self, actual_tokens: int) -> None:
        """
        Placeholder for adjusting after learning actual token usage.

        Intended to be called after an LLM response, once the real token
        count is known (e.g. estimated 1000 but used 1200 -> consume 200
        more; estimated 1000 but used 800 -> give back 200).

        Currently a no-op: the bucket is not modified and ``actual_tokens``
        is ignored.
        """
        # This is called after the fact, so only the delta would need
        # tracking; the initial acquire already happened.
        pass  # For now, we'll handle this at the router level
|