Add token budget enforcement and usage tracking
Token Budget System: - ThreadBudgetRegistry tracks per-thread token usage with configurable limits - BudgetExhaustedError raised when thread exceeds max_tokens_per_thread - Integrates with LLMRouter to block LLM calls when budget exhausted - Automatic cleanup when threads are pruned Usage Tracking (for production billing): - UsageTracker emits events after each LLM completion - Subscribers receive UsageEvent with tokens, latency, estimated cost - Cost estimation for common models (Grok, Claude, GPT, etc.) - Aggregate stats by agent, model, and totals Configuration: - max_tokens_per_thread in organism.yaml (default 100k) - LLMRouter.complete() accepts thread_id and metadata parameters Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
4530c06835
commit
8b11323a8b
7 changed files with 1341 additions and 6 deletions
573
tests/test_token_budget.py
Normal file
573
tests/test_token_budget.py
Normal file
|
|
@ -0,0 +1,573 @@
|
|||
"""
|
||||
test_token_budget.py — Tests for token budget and usage tracking.
|
||||
|
||||
Tests:
|
||||
1. ThreadBudgetRegistry - per-thread token limits
|
||||
2. UsageTracker - billing/gas usage events
|
||||
3. LLMRouter integration - budget enforcement
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, AsyncMock, patch
|
||||
|
||||
from xml_pipeline.message_bus.budget_registry import (
|
||||
ThreadBudget,
|
||||
ThreadBudgetRegistry,
|
||||
BudgetExhaustedError,
|
||||
get_budget_registry,
|
||||
configure_budget_registry,
|
||||
reset_budget_registry,
|
||||
)
|
||||
from xml_pipeline.llm.usage_tracker import (
|
||||
UsageEvent,
|
||||
UsageTracker,
|
||||
UsageTotals,
|
||||
estimate_cost,
|
||||
get_usage_tracker,
|
||||
reset_usage_tracker,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ThreadBudget Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestThreadBudget:
    """Unit tests for the ThreadBudget dataclass."""

    def test_initial_state(self):
        """New budget should have zero usage."""
        fresh = ThreadBudget(max_tokens=10000)

        assert fresh.total_tokens == 0
        assert fresh.remaining == 10000
        assert fresh.is_exhausted is False

    def test_consume_tokens(self):
        """Consuming tokens should update totals."""
        tracked = ThreadBudget(max_tokens=10000)
        tracked.consume(prompt_tokens=500, completion_tokens=300)

        assert tracked.request_count == 1
        assert tracked.prompt_tokens == 500
        assert tracked.completion_tokens == 300
        assert tracked.total_tokens == 800
        assert tracked.remaining == 9200

    def test_can_consume_within_budget(self):
        """can_consume should return True if within budget."""
        tracked = ThreadBudget(max_tokens=1000)
        tracked.consume(prompt_tokens=400)

        # Exactly 600 tokens of headroom remain after the 400-token spend.
        assert tracked.can_consume(500) is True
        assert tracked.can_consume(600) is True
        assert tracked.can_consume(601) is False

    def test_is_exhausted(self):
        """is_exhausted should return True when budget exceeded."""
        tracked = ThreadBudget(max_tokens=1000)
        tracked.consume(prompt_tokens=1000)

        assert tracked.is_exhausted is True
        assert tracked.remaining == 0

    def test_remaining_never_negative(self):
        """remaining should never go negative."""
        tracked = ThreadBudget(max_tokens=100)
        # Overshoot the cap: usage keeps counting but remaining clamps at 0.
        tracked.consume(prompt_tokens=200)

        assert tracked.remaining == 0
        assert tracked.total_tokens == 200
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ThreadBudgetRegistry Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestThreadBudgetRegistry:
    """Test ThreadBudgetRegistry."""

    @pytest.fixture(autouse=True)
    def reset(self):
        """Reset global registry before each test."""
        # Reset on both sides of the test so a failing test cannot leak
        # state into its neighbours through the module-level singleton.
        reset_budget_registry()
        yield
        reset_budget_registry()

    def test_default_budget_creation(self):
        """Getting budget for new thread should create one."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=50000)
        budget = registry.get_budget("thread-1")

        # Fresh budget inherits the registry-wide default cap.
        assert budget.max_tokens == 50000
        assert budget.total_tokens == 0

    def test_configure_max_tokens(self):
        """configure() should update default for new threads."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)
        budget1 = registry.get_budget("thread-1")

        registry.configure(max_tokens_per_thread=20000)
        budget2 = registry.get_budget("thread-2")

        assert budget1.max_tokens == 10000  # Original unchanged
        assert budget2.max_tokens == 20000  # New default

    def test_check_budget_success(self):
        """check_budget should pass when within budget."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)

        result = registry.check_budget("thread-1", estimated_tokens=5000)
        assert result is True

    def test_check_budget_exhausted(self):
        """check_budget should raise when budget exhausted."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=1000)
        registry.consume("thread-1", prompt_tokens=1000)

        with pytest.raises(BudgetExhaustedError) as exc_info:
            registry.check_budget("thread-1", estimated_tokens=100)

        # The error carries enough context for callers to report the limit.
        assert "budget exhausted" in str(exc_info.value)
        assert exc_info.value.thread_id == "thread-1"
        assert exc_info.value.used == 1000
        assert exc_info.value.max_tokens == 1000

    def test_check_budget_would_exceed(self):
        """check_budget should raise when estimate would exceed."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=1000)
        registry.consume("thread-1", prompt_tokens=600)

        # 600 used + 500 estimated > 1000 cap, so the check must fail
        # even though the budget is not yet exhausted.
        with pytest.raises(BudgetExhaustedError):
            registry.check_budget("thread-1", estimated_tokens=500)

    def test_consume_returns_budget(self):
        """consume() should return updated budget."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)

        budget = registry.consume("thread-1", prompt_tokens=100, completion_tokens=50)

        assert budget.total_tokens == 150
        assert budget.request_count == 1

    def test_get_usage(self):
        """get_usage should return dict with all stats."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)
        registry.consume("thread-1", prompt_tokens=500, completion_tokens=200)
        registry.consume("thread-1", prompt_tokens=300, completion_tokens=100)

        usage = registry.get_usage("thread-1")

        # Stats accumulate across both consume() calls.
        assert usage["prompt_tokens"] == 800
        assert usage["completion_tokens"] == 300
        assert usage["total_tokens"] == 1100
        assert usage["remaining"] == 8900
        assert usage["request_count"] == 2

    def test_get_all_usage(self):
        """get_all_usage should return all threads."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)
        registry.consume("thread-1", prompt_tokens=100)
        registry.consume("thread-2", prompt_tokens=200)

        all_usage = registry.get_all_usage()

        assert len(all_usage) == 2
        assert "thread-1" in all_usage
        assert "thread-2" in all_usage

    def test_reset_thread(self):
        """reset_thread should remove budget for thread."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)
        registry.consume("thread-1", prompt_tokens=500)
        registry.reset_thread("thread-1")

        # Getting budget should create new one with zero usage
        budget = registry.get_budget("thread-1")
        assert budget.total_tokens == 0

    def test_cleanup_thread(self):
        """cleanup_thread should return and remove budget."""
        registry = ThreadBudgetRegistry(max_tokens_per_thread=10000)
        registry.consume("thread-1", prompt_tokens=500)

        final_budget = registry.cleanup_thread("thread-1")

        # Final snapshot is returned once; a second cleanup finds nothing.
        assert final_budget.total_tokens == 500
        assert registry.cleanup_thread("thread-1") is None  # Already cleaned

    def test_global_registry(self):
        """Global registry should be singleton."""
        registry1 = get_budget_registry()
        registry2 = get_budget_registry()

        assert registry1 is registry2

    def test_global_configure(self):
        """configure_budget_registry should update global."""
        configure_budget_registry(max_tokens_per_thread=75000)
        registry = get_budget_registry()

        budget = registry.get_budget("new-thread")
        assert budget.max_tokens == 75000
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# UsageTracker Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestUsageTracker:
    """Test UsageTracker for billing/metering."""

    @pytest.fixture(autouse=True)
    def reset(self):
        """Reset global tracker before each test."""
        # Reset before and after so singleton state never crosses tests.
        reset_usage_tracker()
        yield
        reset_usage_tracker()

    def test_record_creates_event(self):
        """record() should create and return UsageEvent."""
        tracker = UsageTracker()

        event = tracker.record(
            thread_id="thread-1",
            agent_id="greeter",
            model="grok-4.1",
            provider="xai",
            prompt_tokens=500,
            completion_tokens=200,
            latency_ms=150.5,
        )

        assert event.thread_id == "thread-1"
        assert event.agent_id == "greeter"
        assert event.model == "grok-4.1"
        # total_tokens is derived from prompt + completion (500 + 200).
        assert event.total_tokens == 700
        assert event.timestamp is not None

    def test_record_estimates_cost(self):
        """record() should estimate cost for known models."""
        tracker = UsageTracker()

        event = tracker.record(
            thread_id="thread-1",
            agent_id="agent",
            model="grok-4.1",
            provider="xai",
            prompt_tokens=1_000_000,  # 1M prompt
            completion_tokens=1_000_000,  # 1M completion
            latency_ms=1000,
        )

        # grok-4.1: $3/1M prompt + $15/1M completion = $18
        assert event.estimated_cost == 18.0

    def test_subscriber_receives_events(self):
        """Subscribers should receive events on record."""
        tracker = UsageTracker()
        received = []

        tracker.subscribe(lambda e: received.append(e))

        tracker.record(
            thread_id="t1",
            agent_id="agent",
            model="gpt-4o",
            provider="openai",
            prompt_tokens=100,
            completion_tokens=50,
            latency_ms=50,
        )

        assert len(received) == 1
        assert received[0].thread_id == "t1"

    def test_unsubscribe(self):
        """unsubscribe should stop receiving events."""
        tracker = UsageTracker()
        received = []
        callback = lambda e: received.append(e)

        tracker.subscribe(callback)
        tracker.record(thread_id="t1", agent_id=None, model="m", provider="p",
                       prompt_tokens=10, completion_tokens=10, latency_ms=10)

        tracker.unsubscribe(callback)
        tracker.record(thread_id="t2", agent_id=None, model="m", provider="p",
                       prompt_tokens=10, completion_tokens=10, latency_ms=10)

        # Only the event recorded before unsubscribe should have arrived.
        assert len(received) == 1

    def test_get_totals(self):
        """get_totals should return aggregate stats."""
        tracker = UsageTracker()

        tracker.record(thread_id="t1", agent_id="a1", model="m1", provider="p",
                       prompt_tokens=100, completion_tokens=50, latency_ms=100)
        tracker.record(thread_id="t2", agent_id="a2", model="m2", provider="p",
                       prompt_tokens=200, completion_tokens=100, latency_ms=200)

        totals = tracker.get_totals()

        assert totals["prompt_tokens"] == 300
        assert totals["completion_tokens"] == 150
        assert totals["total_tokens"] == 450
        assert totals["request_count"] == 2
        # Mean of the two latencies: (100 + 200) / 2.
        assert totals["avg_latency_ms"] == 150.0

    def test_get_agent_totals(self):
        """get_agent_totals should return per-agent stats."""
        tracker = UsageTracker()

        tracker.record(thread_id="t1", agent_id="greeter", model="m", provider="p",
                       prompt_tokens=100, completion_tokens=50, latency_ms=100)
        tracker.record(thread_id="t2", agent_id="greeter", model="m", provider="p",
                       prompt_tokens=100, completion_tokens=50, latency_ms=100)
        tracker.record(thread_id="t3", agent_id="shouter", model="m", provider="p",
                       prompt_tokens=200, completion_tokens=100, latency_ms=200)

        greeter = tracker.get_agent_totals("greeter")
        shouter = tracker.get_agent_totals("shouter")

        # Both agents total 300 tokens, but via different request counts.
        assert greeter["total_tokens"] == 300
        assert greeter["request_count"] == 2
        assert shouter["total_tokens"] == 300
        assert shouter["request_count"] == 1

    def test_get_model_totals(self):
        """get_model_totals should return per-model stats."""
        tracker = UsageTracker()

        tracker.record(thread_id="t1", agent_id="a", model="grok-4.1", provider="xai",
                       prompt_tokens=1000, completion_tokens=500, latency_ms=100)
        tracker.record(thread_id="t2", agent_id="a", model="claude-sonnet-4", provider="anthropic",
                       prompt_tokens=500, completion_tokens=250, latency_ms=100)

        grok = tracker.get_model_totals("grok-4.1")
        claude = tracker.get_model_totals("claude-sonnet-4")

        assert grok["total_tokens"] == 1500
        assert claude["total_tokens"] == 750

    def test_metadata_passed_through(self):
        """Metadata should be included in events."""
        tracker = UsageTracker()
        received = []
        tracker.subscribe(lambda e: received.append(e))

        tracker.record(
            thread_id="t1",
            agent_id="a",
            model="m",
            provider="p",
            prompt_tokens=10,
            completion_tokens=10,
            latency_ms=10,
            metadata={"org_id": "org-123", "user_id": "user-456"},
        )

        # Arbitrary billing metadata must survive unchanged on the event.
        assert received[0].metadata["org_id"] == "org-123"
        assert received[0].metadata["user_id"] == "user-456"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Cost Estimation Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestCostEstimation:
    """Tests for estimate_cost() across known and unknown models."""

    def test_grok_cost(self):
        """Grok models should use correct pricing."""
        # $3/1M prompt + $15/1M completion = $18
        dollars = estimate_cost(
            "grok-4.1",
            prompt_tokens=1_000_000,
            completion_tokens=1_000_000,
        )
        assert dollars == 18.0

    def test_claude_opus_cost(self):
        """Claude Opus should use correct pricing."""
        # $15/1M prompt + $75/1M completion = $90
        dollars = estimate_cost(
            "claude-opus-4",
            prompt_tokens=1_000_000,
            completion_tokens=1_000_000,
        )
        assert dollars == 90.0

    def test_gpt4o_cost(self):
        """GPT-4o should use correct pricing."""
        # $2.5/1M prompt + $10/1M completion = $12.5
        dollars = estimate_cost(
            "gpt-4o",
            prompt_tokens=1_000_000,
            completion_tokens=1_000_000,
        )
        assert dollars == 12.5

    def test_unknown_model_returns_none(self):
        """Unknown model should return None."""
        assert estimate_cost("unknown-model", prompt_tokens=1000, completion_tokens=500) is None

    def test_small_usage_cost(self):
        """Small token counts should produce fractional costs."""
        # 1000 tokens * $0.15/1M = $0.00015
        # 500 tokens * $0.6/1M = $0.0003
        # Total = $0.00045
        dollars = estimate_cost("gpt-4o-mini", prompt_tokens=1000, completion_tokens=500)
        assert dollars == pytest.approx(0.00045, rel=1e-4)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# LLMRouter Integration Tests (Mocked)
|
||||
# ============================================================================
|
||||
|
||||
class TestLLMRouterBudgetIntegration:
    """Test LLMRouter budget enforcement."""

    @pytest.fixture(autouse=True)
    def reset_all(self):
        """Reset all global registries."""
        # Both singletons are shared process-wide; reset around every test.
        reset_budget_registry()
        reset_usage_tracker()
        yield
        reset_budget_registry()
        reset_usage_tracker()

    @pytest.mark.asyncio
    async def test_complete_consumes_budget(self):
        """LLM complete should consume from thread budget."""
        from xml_pipeline.llm.router import LLMRouter
        from xml_pipeline.llm.backend import LLMResponse

        # Create mock backend
        # The mock mirrors the backend interface the router touches:
        # name/provider/priority/load for selection, serves_model for
        # routing, and an async complete() returning a canned response.
        mock_backend = Mock()
        mock_backend.name = "mock"
        mock_backend.provider = "test"
        mock_backend.serves_model = Mock(return_value=True)
        mock_backend.priority = 1
        mock_backend.load = 0
        mock_backend.complete = AsyncMock(return_value=LLMResponse(
            content="Hello!",
            model="test-model",
            usage={"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150},
            finish_reason="stop",
        ))

        # Configure budget
        configure_budget_registry(max_tokens_per_thread=10000)
        budget_registry = get_budget_registry()

        # Create router with mock backend
        router = LLMRouter()
        router.backends.append(mock_backend)

        # Make request
        response = await router.complete(
            model="test-model",
            messages=[{"role": "user", "content": "Hi"}],
            thread_id="test-thread-123",
        )

        assert response.content == "Hello!"

        # Verify budget consumed
        # Numbers come straight from the mocked response's usage dict.
        usage = budget_registry.get_usage("test-thread-123")
        assert usage["prompt_tokens"] == 100
        assert usage["completion_tokens"] == 50
        assert usage["total_tokens"] == 150

    @pytest.mark.asyncio
    async def test_complete_emits_usage_event(self):
        """LLM complete should emit usage event."""
        from xml_pipeline.llm.router import LLMRouter
        from xml_pipeline.llm.backend import LLMResponse

        mock_backend = Mock()
        mock_backend.name = "mock"
        mock_backend.provider = "test"
        mock_backend.serves_model = Mock(return_value=True)
        mock_backend.priority = 1
        mock_backend.load = 0
        mock_backend.complete = AsyncMock(return_value=LLMResponse(
            content="Hello!",
            model="test-model",
            usage={"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150},
            finish_reason="stop",
        ))

        # Subscribe to usage events
        tracker = get_usage_tracker()
        received_events = []
        tracker.subscribe(lambda e: received_events.append(e))

        # Create router and make request
        router = LLMRouter()
        router.backends.append(mock_backend)

        await router.complete(
            model="test-model",
            messages=[{"role": "user", "content": "Hi"}],
            thread_id="test-thread",
            agent_id="greeter",
            metadata={"org_id": "test-org"},
        )

        # Verify event emitted
        # thread/agent/metadata must pass through to the billing event.
        assert len(received_events) == 1
        event = received_events[0]
        assert event.thread_id == "test-thread"
        assert event.agent_id == "greeter"
        assert event.total_tokens == 150
        assert event.metadata["org_id"] == "test-org"

    @pytest.mark.asyncio
    async def test_complete_raises_when_budget_exhausted(self):
        """LLM complete should raise when budget exhausted."""
        from xml_pipeline.llm.router import LLMRouter

        # Configure small budget and exhaust it
        configure_budget_registry(max_tokens_per_thread=100)
        budget_registry = get_budget_registry()
        budget_registry.consume("test-thread", prompt_tokens=100)

        mock_backend = Mock()
        mock_backend.name = "mock"
        mock_backend.serves_model = Mock(return_value=True)
        mock_backend.priority = 1

        router = LLMRouter()
        router.backends.append(mock_backend)

        with pytest.raises(BudgetExhaustedError) as exc_info:
            await router.complete(
                model="test-model",
                messages=[{"role": "user", "content": "Hi"}],
                thread_id="test-thread",
            )

        assert "budget exhausted" in str(exc_info.value)
        # Backend should NOT have been called
        # (budget check must happen before any backend request).
        mock_backend.complete.assert_not_called()

    @pytest.mark.asyncio
    async def test_complete_without_thread_id_skips_budget(self):
        """LLM complete without thread_id should skip budget check."""
        from xml_pipeline.llm.router import LLMRouter
        from xml_pipeline.llm.backend import LLMResponse

        mock_backend = Mock()
        mock_backend.name = "mock"
        mock_backend.provider = "test"
        mock_backend.serves_model = Mock(return_value=True)
        mock_backend.priority = 1
        mock_backend.load = 0
        mock_backend.complete = AsyncMock(return_value=LLMResponse(
            content="Hello!",
            model="test-model",
            usage={"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150},
            finish_reason="stop",
        ))

        router = LLMRouter()
        router.backends.append(mock_backend)

        # Should not raise - no budget checking
        response = await router.complete(
            model="test-model",
            messages=[{"role": "user", "content": "Hi"}],
            # No thread_id
        )

        assert response.content == "Hello!"
|
||||
|
|
@ -16,7 +16,20 @@ Usage:
|
|||
response = await router.complete(
|
||||
model="grok-4.1",
|
||||
messages=[{"role": "user", "content": "Hello"}],
|
||||
thread_id=metadata.thread_id, # For budget enforcement
|
||||
agent_id=metadata.own_name, # For usage tracking
|
||||
)
|
||||
|
||||
Usage Tracking:
|
||||
from xml_pipeline.llm import get_usage_tracker
|
||||
|
||||
tracker = get_usage_tracker()
|
||||
|
||||
# Subscribe to events for billing
|
||||
tracker.subscribe(lambda event: billing_api.record(event))
|
||||
|
||||
# Query totals
|
||||
totals = tracker.get_totals()
|
||||
"""
|
||||
|
||||
from xml_pipeline.llm.router import (
|
||||
|
|
@ -27,14 +40,27 @@ from xml_pipeline.llm.router import (
|
|||
Strategy,
|
||||
)
|
||||
from xml_pipeline.llm.backend import LLMRequest, LLMResponse, BackendError
|
||||
from xml_pipeline.llm.usage_tracker import (
|
||||
UsageTracker,
|
||||
UsageEvent,
|
||||
get_usage_tracker,
|
||||
reset_usage_tracker,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Router
|
||||
"LLMRouter",
|
||||
"get_router",
|
||||
"configure_router",
|
||||
"complete",
|
||||
"Strategy",
|
||||
# Backend
|
||||
"LLMRequest",
|
||||
"LLMResponse",
|
||||
"BackendError",
|
||||
# Usage tracking
|
||||
"UsageTracker",
|
||||
"UsageEvent",
|
||||
"get_usage_tracker",
|
||||
"reset_usage_tracker",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -9,6 +9,8 @@ The router handles:
|
|||
- Load balancing (failover, round-robin, least-loaded)
|
||||
- Retries with exponential backoff
|
||||
- Token tracking per agent
|
||||
- Thread budget enforcement
|
||||
- Usage event emission for billing
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -16,6 +18,7 @@ from __future__ import annotations
|
|||
import asyncio
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
|
@ -125,6 +128,8 @@ class LLMRouter:
|
|||
max_tokens: int = None,
|
||||
tools: List[Dict] = None,
|
||||
agent_id: str = None,
|
||||
thread_id: str = None,
|
||||
metadata: Dict[str, Any] = None,
|
||||
) -> LLMResponse:
|
||||
"""
|
||||
Execute a completion request.
|
||||
|
|
@ -136,10 +141,27 @@ class LLMRouter:
|
|||
max_tokens: Max tokens in response
|
||||
tools: Tool definitions for function calling
|
||||
agent_id: Optional agent ID for usage tracking
|
||||
thread_id: Optional thread ID for budget enforcement
|
||||
metadata: Optional metadata for usage events (org_id, user_id, etc.)
|
||||
|
||||
Returns:
|
||||
LLMResponse with content and usage stats
|
||||
|
||||
Raises:
|
||||
BudgetExhaustedError: If thread has no remaining budget
|
||||
BackendError: If all backends fail
|
||||
"""
|
||||
# Estimate tokens for budget check (rough: 4 chars per token)
|
||||
estimated_tokens = sum(len(m.get("content", "")) for m in messages) // 4
|
||||
estimated_tokens = max(estimated_tokens, 100) # minimum estimate
|
||||
|
||||
# Check thread budget before proceeding
|
||||
if thread_id:
|
||||
from xml_pipeline.message_bus.budget_registry import get_budget_registry
|
||||
budget_registry = get_budget_registry()
|
||||
# This raises BudgetExhaustedError if over budget
|
||||
budget_registry.check_budget(thread_id, estimated_tokens)
|
||||
|
||||
candidates = self._find_backends(model)
|
||||
request = LLMRequest(
|
||||
model=model,
|
||||
|
|
@ -151,6 +173,7 @@ class LLMRouter:
|
|||
|
||||
last_error = None
|
||||
tried_backends = set()
|
||||
start_time = time.monotonic()
|
||||
|
||||
for attempt in range(self.retries + 1):
|
||||
# Select backend (different selection on retry for failover)
|
||||
|
|
@ -170,14 +193,46 @@ class LLMRouter:
|
|||
logger.debug(f"Attempting {model} on {backend.name} (attempt {attempt + 1})")
|
||||
response = await backend.complete(request)
|
||||
|
||||
# Track usage
|
||||
# Calculate latency
|
||||
latency_ms = (time.monotonic() - start_time) * 1000
|
||||
|
||||
# Extract usage
|
||||
prompt_tokens = response.usage.get("prompt_tokens", 0)
|
||||
completion_tokens = response.usage.get("completion_tokens", 0)
|
||||
total_tokens = response.usage.get("total_tokens", 0)
|
||||
|
||||
# Track per-agent usage (internal)
|
||||
if agent_id:
|
||||
usage = self._agent_usage.setdefault(agent_id, AgentUsage())
|
||||
usage.total_tokens += response.usage.get("total_tokens", 0)
|
||||
usage.prompt_tokens += response.usage.get("prompt_tokens", 0)
|
||||
usage.completion_tokens += response.usage.get("completion_tokens", 0)
|
||||
usage.total_tokens += total_tokens
|
||||
usage.prompt_tokens += prompt_tokens
|
||||
usage.completion_tokens += completion_tokens
|
||||
usage.request_count += 1
|
||||
|
||||
# Record to thread budget (enforcement)
|
||||
if thread_id:
|
||||
from xml_pipeline.message_bus.budget_registry import get_budget_registry
|
||||
budget_registry = get_budget_registry()
|
||||
budget_registry.consume(
|
||||
thread_id,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
)
|
||||
|
||||
# Emit usage event (for billing)
|
||||
from xml_pipeline.llm.usage_tracker import get_usage_tracker
|
||||
tracker = get_usage_tracker()
|
||||
tracker.record(
|
||||
thread_id=thread_id or "",
|
||||
agent_id=agent_id,
|
||||
model=response.model,
|
||||
provider=backend.provider,
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
latency_ms=latency_ms,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
except RateLimitError as e:
|
||||
|
|
@ -286,6 +341,10 @@ def configure_router(config: Dict[str, Any]) -> LLMRouter:
|
|||
async def complete(
|
||||
model: str,
|
||||
messages: List[Dict[str, str]],
|
||||
*,
|
||||
thread_id: str = None,
|
||||
agent_id: str = None,
|
||||
metadata: Dict[str, Any] = None,
|
||||
**kwargs,
|
||||
) -> LLMResponse:
|
||||
"""
|
||||
|
|
@ -293,6 +352,32 @@ async def complete(
|
|||
|
||||
Usage:
|
||||
from xml_pipeline.llm import router
|
||||
response = await router.complete("grok-4.1", messages)
|
||||
response = await router.complete(
|
||||
"grok-4.1",
|
||||
messages,
|
||||
thread_id=metadata.thread_id,
|
||||
agent_id=metadata.own_name,
|
||||
)
|
||||
|
||||
Args:
|
||||
model: Model name
|
||||
messages: Chat messages
|
||||
thread_id: Thread UUID for budget enforcement
|
||||
agent_id: Agent name for usage tracking
|
||||
metadata: Extra metadata for billing events
|
||||
**kwargs: Additional arguments (temperature, max_tokens, tools)
|
||||
|
||||
Returns:
|
||||
LLMResponse with content and usage stats
|
||||
|
||||
Raises:
|
||||
BudgetExhaustedError: If thread budget exhausted
|
||||
"""
|
||||
return await get_router().complete(model, messages, **kwargs)
|
||||
return await get_router().complete(
|
||||
model,
|
||||
messages,
|
||||
thread_id=thread_id,
|
||||
agent_id=agent_id,
|
||||
metadata=metadata,
|
||||
**kwargs,
|
||||
)
|
||||
|
|
|
|||
346
xml_pipeline/llm/usage_tracker.py
Normal file
346
xml_pipeline/llm/usage_tracker.py
Normal file
|
|
@ -0,0 +1,346 @@
|
|||
"""
|
||||
Usage Tracker — Production billing and gas usage metering.
|
||||
|
||||
This module provides hooks for tracking LLM usage at the platform level.
|
||||
External billing systems can subscribe to usage events for metering.
|
||||
|
||||
Usage Tracking Layers:
|
||||
1. Per-agent (LLMRouter._agent_usage) — Internal token tracking
|
||||
2. Per-thread (ThreadBudgetRegistry) — Enforcement limits
|
||||
3. Platform (UsageTracker) — Production billing/metering
|
||||
|
||||
Example:
|
||||
from xml_pipeline.llm.usage_tracker import get_usage_tracker
|
||||
|
||||
tracker = get_usage_tracker()
|
||||
|
||||
# Subscribe to usage events (for billing webhook, database, etc.)
|
||||
def record_usage(event: UsageEvent):
|
||||
billing_db.record(
|
||||
org_id=event.metadata.get("org_id"),
|
||||
tokens=event.total_tokens,
|
||||
cost=event.estimated_cost,
|
||||
)
|
||||
|
||||
tracker.subscribe(record_usage)
|
||||
|
||||
# Query aggregate usage
|
||||
totals = tracker.get_totals()
|
||||
print(f"Total tokens: {totals['total_tokens']}")
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class UsageEvent:
    """
    Usage event emitted after each LLM completion.

    This is the main interface for billing systems: subscribers registered
    on a UsageTracker receive one of these per recorded completion.
    """

    # Request identification
    thread_id: str          # conversation thread that triggered the call
    agent_id: Optional[str]  # emitting agent, if known
    model: str              # model name used for the completion
    provider: str           # backend provider (e.g. "xai", "openai")

    # Token usage
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int       # prompt + completion

    # Timing
    timestamp: str  # ISO 8601
    latency_ms: float  # Request duration

    # Cost estimation (if available)
    # None when the model has no entry in MODEL_COSTS.
    estimated_cost: Optional[float] = None

    # Extensible metadata (org_id, user_id, etc.)
    metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
# Cost per 1M tokens for common models (approximate, update as needed)
MODEL_COSTS: Dict[str, Dict[str, float]] = {
    # xAI Grok
    "grok-4.1": {"prompt": 3.0, "completion": 15.0},
    "grok-3": {"prompt": 3.0, "completion": 15.0},
    # Anthropic Claude
    "claude-opus-4": {"prompt": 15.0, "completion": 75.0},
    "claude-sonnet-4": {"prompt": 3.0, "completion": 15.0},
    "claude-sonnet-3-5": {"prompt": 3.0, "completion": 15.0},
    # OpenAI
    "gpt-4o": {"prompt": 2.5, "completion": 10.0},
    "gpt-4o-mini": {"prompt": 0.15, "completion": 0.6},
    "o1": {"prompt": 15.0, "completion": 60.0},
    "o3-mini": {"prompt": 1.1, "completion": 4.4},
}


def estimate_cost(
    model: str,
    prompt_tokens: int,
    completion_tokens: int,
) -> Optional[float]:
    """
    Estimate cost in USD for a completion.

    Pricing is looked up by case-insensitive prefix match against
    MODEL_COSTS, preferring the longest matching prefix (so a
    "gpt-4o-mini-..." model string resolves to gpt-4o-mini pricing,
    not gpt-4o).

    Returns None if model pricing is unknown.
    """
    name = model.lower()

    # Gather every pricing entry whose prefix matches, then keep the
    # longest one; the first maximal entry wins on equal lengths.
    candidates = [
        (prefix, costs)
        for prefix, costs in MODEL_COSTS.items()
        if name.startswith(prefix.lower())
    ]
    if not candidates:
        return None
    _, pricing = max(candidates, key=lambda entry: len(entry[0]))

    # Rates are quoted per 1M tokens; round to micro-dollar precision.
    dollars = (
        (prompt_tokens / 1_000_000) * pricing["prompt"]
        + (completion_tokens / 1_000_000) * pricing["completion"]
    )
    return round(dollars, 6)
|
||||
|
||||
|
||||
# Signature for usage-event subscribers; invoked synchronously per completion.
UsageCallback = Callable[[UsageEvent], None]


@dataclass
class UsageTotals:
    """Aggregate usage statistics.

    One instance per aggregation bucket (global, per-agent, per-model);
    mutated in place by ``UsageTracker._update_totals``.
    """

    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    request_count: int = 0
    total_cost: float = 0.0  # USD; only events with known pricing contribute
    total_latency_ms: float = 0.0  # summed latency; divide by request_count for the mean
|
||||
|
||||
|
||||
class UsageTracker:
    """
    Platform-level usage tracking for billing and metering.

    Thread-safe. Supports multiple subscribers for real-time event streaming.

    Integration points:
      - Webhook to billing API
      - Database for usage records
      - Metrics/observability (Prometheus, DataDog)
      - Real-time dashboard (WebSocket)
    """

    def __init__(self):
        # Subscriber callbacks, invoked synchronously after each record().
        self._callbacks: List[UsageCallback] = []
        self._lock = threading.Lock()

        # Aggregate tracking: global, per-agent, and per-model buckets.
        self._totals = UsageTotals()
        self._per_agent: Dict[str, UsageTotals] = {}
        self._per_model: Dict[str, UsageTotals] = {}

    def subscribe(self, callback: UsageCallback) -> None:
        """
        Subscribe to usage events.

        Callbacks are invoked synchronously after each LLM completion.
        For async processing, use a queue in your callback.
        """
        with self._lock:
            self._callbacks.append(callback)

    def unsubscribe(self, callback: UsageCallback) -> None:
        """Unsubscribe from usage events (no-op if not subscribed)."""
        with self._lock:
            if callback in self._callbacks:
                self._callbacks.remove(callback)

    def record(
        self,
        thread_id: str,
        agent_id: Optional[str],
        model: str,
        provider: str,
        prompt_tokens: int,
        completion_tokens: int,
        latency_ms: float,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> UsageEvent:
        """
        Record a usage event and notify subscribers.

        Called by LLMRouter after each completion.

        Args:
            thread_id: Thread UUID the request belongs to.
            agent_id: Originating agent, or None if unknown.
            model: Model name (also used for cost estimation).
            provider: Backend provider identifier.
            prompt_tokens: Prompt tokens used.
            completion_tokens: Completion tokens generated.
            latency_ms: Request duration in milliseconds.
            metadata: Optional extra fields (org_id, user_id, ...).

        Returns:
            The created UsageEvent (for chaining/logging)
        """
        total_tokens = prompt_tokens + completion_tokens

        event = UsageEvent(
            thread_id=thread_id,
            agent_id=agent_id,
            model=model,
            provider=provider,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            timestamp=datetime.now(timezone.utc).isoformat(),
            latency_ms=latency_ms,
            estimated_cost=estimate_cost(model, prompt_tokens, completion_tokens),
            metadata=metadata or {},
        )

        # Update aggregates under the lock, then release before invoking
        # callbacks so a slow subscriber cannot block other recorders.
        with self._lock:
            self._update_totals(self._totals, event)

            if agent_id:
                self._update_totals(
                    self._per_agent.setdefault(agent_id, UsageTotals()), event
                )

            self._update_totals(
                self._per_model.setdefault(model, UsageTotals()), event
            )

            # Copy callbacks to avoid holding lock during invocation
            callbacks = list(self._callbacks)

        # Notify subscribers (outside lock)
        for callback in callbacks:
            try:
                callback(event)
            except Exception:
                # Don't let subscriber errors break tracking
                pass

        return event

    def _update_totals(self, totals: UsageTotals, event: UsageEvent) -> None:
        """Fold one event into an aggregate bucket (caller holds the lock)."""
        totals.total_tokens += event.total_tokens
        totals.prompt_tokens += event.prompt_tokens
        totals.completion_tokens += event.completion_tokens
        totals.request_count += 1
        totals.total_latency_ms += event.latency_ms
        if event.estimated_cost is not None:
            totals.total_cost += event.estimated_cost

    @staticmethod
    def _snapshot(totals: UsageTotals) -> Dict[str, Any]:
        """Serialize a bucket to the public dict shape (caller holds the lock)."""
        return {
            "total_tokens": totals.total_tokens,
            "prompt_tokens": totals.prompt_tokens,
            "completion_tokens": totals.completion_tokens,
            "request_count": totals.request_count,
            "total_cost": round(totals.total_cost, 4),
        }

    def get_totals(self) -> Dict[str, Any]:
        """Get aggregate usage totals, including mean request latency."""
        with self._lock:
            result = self._snapshot(self._totals)
            result["avg_latency_ms"] = (
                self._totals.total_latency_ms / self._totals.request_count
                if self._totals.request_count > 0
                else 0
            )
            return result

    def get_agent_totals(self, agent_id: str) -> Dict[str, Any]:
        """Get usage totals for a specific agent (all zeros if unknown)."""
        with self._lock:
            return self._snapshot(self._per_agent.get(agent_id, UsageTotals()))

    def get_model_totals(self, model: str) -> Dict[str, Any]:
        """Get usage totals for a specific model (all zeros if unknown)."""
        with self._lock:
            return self._snapshot(self._per_model.get(model, UsageTotals()))

    def get_all_agent_totals(self) -> Dict[str, Dict[str, Any]]:
        """Get usage totals for all agents."""
        with self._lock:
            return {
                agent_id: self._snapshot(t)
                for agent_id, t in self._per_agent.items()
            }

    def get_all_model_totals(self) -> Dict[str, Dict[str, Any]]:
        """Get usage totals for all models."""
        with self._lock:
            return {
                model: self._snapshot(t)
                for model, t in self._per_model.items()
            }

    def reset(self) -> None:
        """Reset all tracking (for testing)."""
        with self._lock:
            self._totals = UsageTotals()
            self._per_agent.clear()
            self._per_model.clear()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Global Instance
|
||||
# =============================================================================
|
||||
|
||||
_tracker: Optional[UsageTracker] = None
_tracker_lock = threading.Lock()


def get_usage_tracker() -> UsageTracker:
    """Return the process-wide usage tracker, creating it on first use."""
    global _tracker
    if _tracker is not None:  # fast path: already initialized
        return _tracker
    with _tracker_lock:
        # Re-check under the lock so only one thread constructs the tracker.
        if _tracker is None:
            _tracker = UsageTracker()
        return _tracker


def reset_usage_tracker() -> None:
    """Drop the global tracker and clear its aggregates (testing helper)."""
    global _tracker
    with _tracker_lock:
        tracker, _tracker = _tracker, None
        if tracker is not None:
            tracker.reset()
|
||||
|
|
@ -67,6 +67,15 @@ from xml_pipeline.message_bus.buffer_registry import (
|
|||
reset_buffer_registry,
|
||||
)
|
||||
|
||||
from xml_pipeline.message_bus.budget_registry import (
|
||||
ThreadBudget,
|
||||
ThreadBudgetRegistry,
|
||||
BudgetExhaustedError,
|
||||
get_budget_registry,
|
||||
configure_budget_registry,
|
||||
reset_budget_registry,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Pump
|
||||
"StreamPump",
|
||||
|
|
@ -102,4 +111,11 @@ __all__ = [
|
|||
"BufferRegistry",
|
||||
"get_buffer_registry",
|
||||
"reset_buffer_registry",
|
||||
# Budget registry
|
||||
"ThreadBudget",
|
||||
"ThreadBudgetRegistry",
|
||||
"BudgetExhaustedError",
|
||||
"get_budget_registry",
|
||||
"configure_budget_registry",
|
||||
"reset_budget_registry",
|
||||
]
|
||||
|
|
|
|||
280
xml_pipeline/message_bus/budget_registry.py
Normal file
280
xml_pipeline/message_bus/budget_registry.py
Normal file
|
|
@ -0,0 +1,280 @@
|
|||
"""
|
||||
Thread Budget Registry — Enforces per-thread token limits.
|
||||
|
||||
Each thread has a token budget that tracks:
|
||||
- Total tokens consumed (prompt + completion)
|
||||
- Requests made
|
||||
- Remaining budget
|
||||
|
||||
When a thread exhausts its budget, LLM calls are blocked.
|
||||
|
||||
Example config:
|
||||
organism:
|
||||
max_tokens_per_thread: 100000 # 100k tokens per thread
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
@dataclass
class ThreadBudget:
    """Per-thread token accounting against a fixed ceiling."""

    max_tokens: int
    prompt_tokens: int = 0
    completion_tokens: int = 0
    request_count: int = 0

    @property
    def total_tokens(self) -> int:
        """Total tokens consumed so far (prompt + completion)."""
        return self.completion_tokens + self.prompt_tokens

    @property
    def remaining(self) -> int:
        """Remaining token budget; clamped so it never goes negative."""
        leftover = self.max_tokens - self.total_tokens
        return leftover if leftover > 0 else 0

    @property
    def is_exhausted(self) -> bool:
        """True once consumption has reached (or passed) the ceiling."""
        return not self.total_tokens < self.max_tokens

    def can_consume(self, estimated_tokens: int) -> bool:
        """Whether ``estimated_tokens`` more would still fit under the ceiling."""
        return estimated_tokens <= self.max_tokens - self.total_tokens

    def consume(
        self,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
    ) -> None:
        """Add one request's token counts to the running totals."""
        self.request_count += 1
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens
|
||||
|
||||
|
||||
class BudgetExhaustedError(Exception):
    """Raised when a thread's token budget is exhausted."""

    def __init__(self, thread_id: str, used: int, max_tokens: int):
        # Build the message first, then stash the fields for programmatic access.
        message = (
            f"Thread {thread_id[:8]}... budget exhausted: "
            f"{used}/{max_tokens} tokens used"
        )
        super().__init__(message)
        self.thread_id = thread_id
        self.used = used
        self.max_tokens = max_tokens
|
||||
|
||||
|
||||
class ThreadBudgetRegistry:
    """
    Manages token budgets per thread.

    Thread-safe for concurrent access: get-or-create, consumption, and
    usage snapshots each happen atomically under a single lock.

    Usage:
        registry = get_budget_registry()
        registry.configure(max_tokens_per_thread=100000)

        # Before LLM call
        registry.check_budget(thread_id, estimated_tokens=1000)

        # After LLM call
        registry.consume(thread_id, prompt=500, completion=300)

        # Get usage
        budget = registry.get_budget(thread_id)
        print(f"Used: {budget.total_tokens}, Remaining: {budget.remaining}")
    """

    def __init__(self, max_tokens_per_thread: int = 100_000):
        """
        Initialize budget registry.

        Args:
            max_tokens_per_thread: Default budget for new threads.
        """
        self._max_tokens_per_thread = max_tokens_per_thread
        self._budgets: Dict[str, ThreadBudget] = {}
        self._lock = threading.Lock()

    def configure(self, max_tokens_per_thread: int) -> None:
        """
        Update default max tokens for new threads.

        Existing threads keep their current budgets.
        """
        with self._lock:
            self._max_tokens_per_thread = max_tokens_per_thread

    @property
    def max_tokens_per_thread(self) -> int:
        """Get the default max tokens per thread."""
        return self._max_tokens_per_thread

    def _get_or_create(self, thread_id: str) -> ThreadBudget:
        """Return the thread's budget, creating it if absent.

        Caller MUST hold ``self._lock``.
        """
        budget = self._budgets.get(thread_id)
        if budget is None:
            budget = ThreadBudget(max_tokens=self._max_tokens_per_thread)
            self._budgets[thread_id] = budget
        return budget

    def get_budget(self, thread_id: str) -> ThreadBudget:
        """
        Get or create budget for a thread.

        Args:
            thread_id: Thread UUID

        Returns:
            ThreadBudget instance
        """
        with self._lock:
            return self._get_or_create(thread_id)

    def check_budget(
        self,
        thread_id: str,
        estimated_tokens: int = 0,
    ) -> bool:
        """
        Check if thread has budget for the estimated tokens.

        Args:
            thread_id: Thread UUID
            estimated_tokens: Estimated tokens for the request

        Returns:
            True if budget available

        Raises:
            BudgetExhaustedError if budget is exhausted
        """
        with self._lock:
            budget = self._get_or_create(thread_id)
            # A single combined check: already over the ceiling, or the
            # estimated request would push us over it.
            if budget.is_exhausted or not budget.can_consume(estimated_tokens):
                raise BudgetExhaustedError(
                    thread_id=thread_id,
                    used=budget.total_tokens,
                    max_tokens=budget.max_tokens,
                )

        return True

    def consume(
        self,
        thread_id: str,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
    ) -> ThreadBudget:
        """
        Record token consumption for a thread.

        Lookup/creation and mutation happen under one lock acquisition so
        concurrent consumers cannot interleave between the two steps.

        Args:
            thread_id: Thread UUID
            prompt_tokens: Prompt tokens used
            completion_tokens: Completion tokens used

        Returns:
            Updated ThreadBudget
        """
        with self._lock:
            budget = self._get_or_create(thread_id)
            budget.consume(prompt_tokens, completion_tokens)
        return budget

    @staticmethod
    def _usage_snapshot(budget: ThreadBudget) -> Dict[str, int]:
        """Serialize one budget to the public usage dict shape."""
        return {
            "prompt_tokens": budget.prompt_tokens,
            "completion_tokens": budget.completion_tokens,
            "total_tokens": budget.total_tokens,
            "remaining": budget.remaining,
            "max_tokens": budget.max_tokens,
            "request_count": budget.request_count,
        }

    def get_usage(self, thread_id: str) -> Dict[str, int]:
        """
        Get usage stats for a thread.

        Returns:
            Dict with prompt_tokens, completion_tokens, total_tokens,
            remaining, max_tokens, request_count
        """
        with self._lock:
            # Snapshot under the lock for a consistent view.
            return self._usage_snapshot(self._get_or_create(thread_id))

    def get_all_usage(self) -> Dict[str, Dict[str, int]]:
        """Get usage stats for all threads."""
        with self._lock:
            return {
                thread_id: self._usage_snapshot(b)
                for thread_id, b in self._budgets.items()
            }

    def reset_thread(self, thread_id: str) -> None:
        """Reset budget for a specific thread."""
        with self._lock:
            self._budgets.pop(thread_id, None)

    def cleanup_thread(self, thread_id: str) -> Optional[ThreadBudget]:
        """
        Remove budget when thread is pruned/completed.

        Returns the final budget for logging/billing, or None if not found.
        """
        with self._lock:
            return self._budgets.pop(thread_id, None)

    def clear(self) -> None:
        """Clear all budgets (for testing)."""
        with self._lock:
            self._budgets.clear()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Global Instance
|
||||
# =============================================================================
|
||||
|
||||
_registry: Optional[ThreadBudgetRegistry] = None
_registry_lock = threading.Lock()


def get_budget_registry() -> ThreadBudgetRegistry:
    """Return the process-wide budget registry, creating it on first use."""
    global _registry
    if _registry is not None:  # fast path: already initialized
        return _registry
    with _registry_lock:
        # Re-check under the lock so only one thread constructs the registry.
        if _registry is None:
            _registry = ThreadBudgetRegistry()
        return _registry


def configure_budget_registry(max_tokens_per_thread: int) -> ThreadBudgetRegistry:
    """Configure the global budget registry's default per-thread limit."""
    registry = get_budget_registry()
    registry.configure(max_tokens_per_thread)
    return registry


def reset_budget_registry() -> None:
    """Drop the global registry and clear all budgets (testing helper)."""
    global _registry
    with _registry_lock:
        registry, _registry = _registry, None
        if registry is not None:
            registry.clear()
|
||||
|
|
@ -141,6 +141,9 @@ class OrganismConfig:
|
|||
max_concurrent_handlers: int = 20 # Concurrent handler invocations
|
||||
max_concurrent_per_agent: int = 5 # Per-agent rate limit
|
||||
|
||||
# Token budget enforcement
|
||||
max_tokens_per_thread: int = 100_000 # Max tokens per conversation thread
|
||||
|
||||
# LLM configuration (optional)
|
||||
llm_config: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
|
@ -1271,6 +1274,7 @@ class ConfigLoader:
|
|||
max_concurrent_pipelines=raw.get("max_concurrent_pipelines", 50),
|
||||
max_concurrent_handlers=raw.get("max_concurrent_handlers", 20),
|
||||
max_concurrent_per_agent=raw.get("max_concurrent_per_agent", 5),
|
||||
max_tokens_per_thread=raw.get("max_tokens_per_thread", 100_000),
|
||||
llm_config=raw.get("llm", {}),
|
||||
process_pool_enabled=process_pool_enabled,
|
||||
process_pool_workers=process_pool_workers,
|
||||
|
|
@ -1430,6 +1434,11 @@ async def bootstrap(config_path: str = "config/organism.yaml") -> StreamPump:
|
|||
configure_router(config.llm_config)
|
||||
print(f"LLM backends: {len(config.llm_config.get('backends', []))}")
|
||||
|
||||
# Configure thread budget registry
|
||||
from xml_pipeline.message_bus.budget_registry import configure_budget_registry
|
||||
configure_budget_registry(config.max_tokens_per_thread)
|
||||
print(f"Token budget: {config.max_tokens_per_thread:,} per thread")
|
||||
|
||||
# Initialize root thread in registry
|
||||
registry = get_registry()
|
||||
root_uuid = registry.initialize_root(config.name)
|
||||
|
|
|
|||
Loading…
Reference in a new issue