Three workstreams implemented: W1 (Repo Split): Remove proprietary BloxServer files and docs, update pyproject.toml URLs to public GitHub, clean doc references, add CI workflow (.github/workflows/ci.yml) and CONTRIBUTING.md. W2 (Message Journal): Add DispatchHook protocol for dispatch lifecycle events, SQLite-backed MessageJournal with WAL mode for certified-mail delivery guarantees (PENDING→DISPATCHED→ACKED/FAILED), integrate hooks into StreamPump._dispatch_to_handlers(), add journal REST endpoints, and aiosqlite dependency. W3 (Hot Deployment): Add RestartOrchestrator for graceful restart with queue drain and journal stats collection, SIGHUP signal handler in CLI, POST /organism/restart endpoint, restart-aware app lifespan with journal recovery on boot, and os.execv/subprocess re-exec for Unix/Windows. All 439 tests pass (37 new tests for W2/W3). Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
137 lines
4.1 KiB
Python
137 lines
4.1 KiB
Python
"""
|
|
restart.py — Graceful restart orchestrator for AgentServer.
|
|
|
|
Handles the restart protocol:
|
|
1. SIGNAL → SIGHUP or POST /organism/restart
|
|
2. DRAIN → pump._running = False; wait for queue to drain
|
|
3. PERSIST → Journal already has all in-flight state
|
|
4. STOP → Shutdown pump, server, process pool
|
|
5. EXEC → os.execv() (Unix) or subprocess (Windows)
|
|
6. BOOT → bootstrap() runs, journal replays unacknowledged
|
|
7. VERIFY → Compare journal stats pre/post restart
|
|
|
|
The journal (W2) provides the safety net: unacknowledged entries are
|
|
replayed on boot, ensuring no messages are lost during restart.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
import logging
import os
import subprocess
import sys
from dataclasses import dataclass, field
from typing import Any, Dict, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    # Needed for type annotations only; this import never runs at runtime.
    from xml_pipeline.message_bus.stream_pump import StreamPump

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class RestartResult:
    """Result of a restart drain operation.

    Attributes:
        success: Whether the restart request was accepted and carried out.
        drained: Whether the message queue emptied before the timeout.
        journal_stats: Journal statistics captured before draining; empty
            when no stats were collected.
        error: Human-readable failure reason, or None when there is none.
    """

    success: bool
    drained: bool
    # default_factory gives every instance its own dict (no shared mutable default).
    journal_stats: Dict[str, Any] = field(default_factory=dict)
    error: Optional[str] = None
|
|
|
|
|
|
class RestartOrchestrator:
    """
    Orchestrates graceful restart of the organism.

    The restart protocol:
        1. Drain the message queue (stop accepting new messages)
        2. Wait for in-flight handlers to complete (with timeout)
        3. Collect journal stats for post-restart verification
        4. Re-exec the process (or signal the caller to do so)
    """

    def __init__(self, pump: "StreamPump") -> None:
        """
        Args:
            pump: The pump whose queue, dispatch hooks and process pool
                are drained and shut down on restart.
        """
        self._pump = pump
        self._restarting = False

    @property
    def is_restarting(self) -> bool:
        """True while a restart initiated through this object is in effect."""
        return self._restarting

    async def initiate_restart(self, timeout: float = 30.0) -> "RestartResult":
        """
        Drain the pump and prepare for restart.

        Args:
            timeout: Maximum seconds to wait for drain

        Returns:
            RestartResult with drain status and journal stats. If a restart
            is already in progress, a result with ``success=False`` is
            returned and nothing else is done. A drain timeout is not a
            failure: the result has ``success=True`` and ``drained=False``.

        Raises:
            Any exception raised while collecting journal stats, draining
            or shutting down the process pool is re-raised after the
            in-progress flag has been cleared.
        """
        if self._restarting:
            return RestartResult(
                success=False,
                drained=False,
                error="Restart already in progress",
            )

        self._restarting = True
        try:
            logger.info("Restart initiated — draining message queue...")

            # Collect pre-restart journal stats
            journal_stats = await self._collect_journal_stats()

            # Stop accepting new messages
            self._pump._running = False

            # Wait for queue to drain with timeout
            drained = await self._drain_queue(timeout)

            # Shutdown process pool if active
            # NOTE(review): shutdown(wait=True) runs synchronously inside this
            # coroutine; if the pool's shutdown blocks, the event loop is
            # stalled for that long — confirm this is acceptable.
            pool = self._pump._process_pool
            if pool:
                pool.shutdown(wait=True)
                logger.info("ProcessPool shutdown complete")
        except BaseException:
            # Includes task cancellation. Without clearing the flag, one failed
            # attempt would make every later request report "already in
            # progress" even though no restart is running.
            self._restarting = False
            raise

        return RestartResult(
            success=True,
            drained=drained,
            journal_stats=journal_stats,
        )

    async def _collect_journal_stats(self) -> Dict[str, Any]:
        """Return stats from the first MessageJournal dispatch hook, or {} if none."""
        hooks = self._pump.dispatch_hooks
        if not hooks:
            # Nothing registered: skip the journal import entirely.
            return {}

        # Imported lazily, once, and only when there are hooks to inspect.
        from xml_pipeline.message_bus.journal import MessageJournal

        for hook in hooks:
            if isinstance(hook, MessageJournal):
                return await hook.get_stats()
        return {}

    async def _drain_queue(self, timeout: float) -> bool:
        """Wait up to ``timeout`` seconds for the pump queue to empty; True if it did."""
        try:
            await asyncio.wait_for(
                self._pump.queue.join(),
                timeout=timeout,
            )
        except asyncio.TimeoutError:
            logger.warning(
                f"Queue drain timed out after {timeout}s — "
                f"{self._pump.queue.qsize()} messages remaining"
            )
            return False

        logger.info("Message queue drained successfully")
        return True

    @staticmethod
    def exec_restart() -> None:
        """
        Re-exec the current process.

        On Unix, uses os.execv() for in-place replacement.
        On Windows, starts a new process and exits.

        The new process is started as ``sys.executable`` followed by a copy
        of ``sys.argv``. This function does not return normally: on Windows
        it raises SystemExit(0); elsewhere the process image is replaced.
        """
        python = sys.executable
        args = sys.argv[:]

        logger.info(f"Re-executing: {python} {' '.join(args)}")

        if sys.platform == "win32":
            # Windows: start new process and exit
            subprocess.Popen([python] + args)
            sys.exit(0)
        else:
            # os.execv() replaces the process immediately without flushing
            # open file objects, so push out buffered stdio first or the
            # last output before the restart can be lost.
            for stream in (sys.stdout, sys.stderr):
                if stream is None:
                    continue
                try:
                    stream.flush()
                except (OSError, ValueError):
                    # Best effort: a closed or broken stream must not
                    # prevent the restart itself.
                    pass

            # Unix: in-place replacement
            os.execv(python, [python] + args)
|