diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..0988c18
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,49 @@
+# Version control
+.git
+.gitignore
+
+# Python artifacts
+__pycache__
+*.pyc
+*.pyo
+*.egg-info
+*.egg
+dist/
+build/
+.eggs/
+
+# Virtual environments
+.venv
+venv
+env
+
+# Tests and docs (not needed in runtime)
+tests/
+docs/
+*.md
+!README.md
+
+# IDE and editor files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Environment and secrets
+.env
+.env.*
+!.env.example
+
+# OS files
+.DS_Store
+Thumbs.db
+
+# Development artifacts
+bloxserver/
+*.db
+*.sqlite
+*.sqlite3
+
+# CI/CD
+.github/
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5019cbc..cf5d35a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -68,3 +68,17 @@ jobs:
- name: MyPy
run: mypy xml_pipeline/ --ignore-missing-imports
+
+ docker:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Build Docker image
+ run: docker build -t agentos:test .
+
+ - name: Dry-run config validation
+ run: |
+ docker run --rm \
+ -v ${{ github.workspace }}/config/organism.yaml:/config/organism.yaml:ro \
+ agentos:test --dry-run /config/organism.yaml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d1dc164
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,99 @@
+# AgentOS — Sealed organism runtime
+#
+# Multi-stage build: python:3.12-slim builder + minimal runtime.
+# Not Alpine: musl breaks lxml C extensions.
+#
+# Usage:
+# docker build -t agentos .
+#   docker run -v $(pwd)/organism.yaml:/config/organism.yaml \
+# -e XAI_API_KEY=xai-... \
+# -p 8080:8080 -p 9090:9090 \
+# agentos
+
+# =============================================================================
+# Stage 1: Builder — install dependencies and build wheels
+# =============================================================================
+FROM python:3.12-slim AS builder
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ build-essential \
+ libxml2-dev \
+ libxslt1-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build
+
+# Copy dependency specification first (layer caching)
+COPY pyproject.toml .
+COPY README.md .
+
+# Install into a virtual environment for clean copy
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install deps first for layer caching; non-editable (-e would bake /build paths into the venv)
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir ".[all]"
+
+# Copy source code
+COPY xml_pipeline/ xml_pipeline/
+COPY third_party/ third_party/
+COPY handlers/ handlers/
+COPY examples/ examples/
+COPY config/ config/
+COPY deploy/ deploy/
+
+# Re-install now that the real source is present (runtime image has no /build)
+RUN pip install --no-cache-dir ".[all]"
+
+# =============================================================================
+# Stage 2: Runtime — minimal image with only what's needed
+# =============================================================================
+FROM python:3.12-slim AS runtime
+
+# Runtime dependencies for lxml
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ libxml2 \
+ libxslt1.1 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user
+RUN groupadd -r organism && useradd -r -g organism -d /home/organism -s /bin/false organism
+
+# Copy virtual environment from builder
+COPY --from=builder /opt/venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Copy application source
+WORKDIR /app
+COPY --from=builder /build/xml_pipeline/ xml_pipeline/
+COPY --from=builder /build/third_party/ third_party/
+COPY --from=builder /build/handlers/ handlers/
+COPY --from=builder /build/examples/ examples/
+COPY --from=builder /build/config/ config/
+COPY --from=builder /build/deploy/ deploy/
+COPY --from=builder /build/pyproject.toml .
+COPY --from=builder /build/README.md .
+
+# Create writable data directory and config mount point
+RUN mkdir -p /data /config && chown -R organism:organism /data /config
+
+# Dashboard static files
+COPY dashboard/ /app/dashboard/
+
+# Volume for persistent data
+VOLUME ["/data"]
+
+# Expose agent bus (8080) and management plane (9090)
+EXPOSE 8080 9090
+
+# Switch to non-root user
+USER organism
+
+# Health check against agent port
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1
+
+# Default: boot the organism via entrypoint
+ENTRYPOINT ["python", "-m", "deploy.entrypoint"]
+CMD ["/config/organism.yaml"]
diff --git a/dashboard/dashboard.js b/dashboard/dashboard.js
new file mode 100644
index 0000000..3877bb4
--- /dev/null
+++ b/dashboard/dashboard.js
@@ -0,0 +1,342 @@
+/**
+ * AgentOS Dashboard — WebSocket-driven real-time updates.
+ *
+ * No framework, no build step. Pure vanilla JS.
+ * Connects to the management port WebSocket at /ws.
+ */
+
+// State
+let ws = null;
+let reconnectTimeout = null;
+let messageCount = 0;
+const API_BASE = ''; // Same origin as management server
+
+// =========================================================================
+// WebSocket Connection
+// =========================================================================
+
+function connect() {
+ const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
+ const wsUrl = `${protocol}//${location.host}/ws`;
+
+ ws = new WebSocket(wsUrl);
+
+ ws.onopen = () => {
+ setConnectionStatus(true);
+ // Request full state on connect
+ fetchInitialState();
+ };
+
+ ws.onmessage = (event) => {
+ try {
+ const data = JSON.parse(event.data);
+ handleEvent(data);
+ } catch (e) {
+ console.error('Failed to parse WebSocket message:', e);
+ }
+ };
+
+ ws.onclose = () => {
+ setConnectionStatus(false);
+ // Reconnect after delay
+ reconnectTimeout = setTimeout(connect, 3000);
+ };
+
+ ws.onerror = () => {
+ ws.close();
+ };
+}
+
+function setConnectionStatus(connected) {
+ const el = document.getElementById('connection-status');
+ if (connected) {
+ el.textContent = 'Connected';
+ el.className = 'status-indicator connected';
+ } else {
+ el.textContent = 'Disconnected';
+ el.className = 'status-indicator disconnected';
+ }
+}
+
+// =========================================================================
+// Event Handling
+// =========================================================================
+
+function handleEvent(data) {
+ const event = data.event;
+
+ switch (event) {
+ case 'connected':
+ if (data.state) {
+ updateFullState(data.state);
+ }
+ break;
+ case 'agent_state':
+ updateAgentState(data);
+ break;
+ case 'message':
+ addMessage(data);
+ messageCount++;
+ updateMessageCount();
+ break;
+ case 'thread_created':
+ case 'thread_completed':
+ fetchThreads();
+ break;
+ default:
+ break;
+ }
+}
+
+// =========================================================================
+// API Fetching
+// =========================================================================
+
+async function fetchJSON(path) {
+ try {
+ const resp = await fetch(`${API_BASE}${path}`);
+ if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
+ return await resp.json();
+ } catch (e) {
+ console.error(`Fetch ${path} failed:`, e);
+ return null;
+ }
+}
+
+async function fetchInitialState() {
+ const [organism, agents, threads, usage] = await Promise.all([
+ fetchJSON('/api/v1/organism'),
+ fetchJSON('/api/v1/agents'),
+ fetchJSON('/api/v1/threads'),
+ fetchJSON('/api/v1/usage'),
+ ]);
+
+ if (organism) {
+ document.getElementById('organism-name').textContent = organism.name;
+ updateUptime(organism.uptimeSeconds || 0);
+ }
+
+ if (agents) {
+ renderAgents(agents.agents || []);
+ const count = (agents.agents || []).length;
+ const active = (agents.agents || []).filter(a => a.state === 'processing').length;
+ document.getElementById('agent-count').textContent = count;
+ document.getElementById('agent-detail').textContent = `${active} active`;
+ }
+
+ if (threads) {
+ renderThreads(threads.threads || []);
+ document.getElementById('thread-count').textContent = (threads.threads || []).length;
+ }
+
+ if (usage) {
+ updateUsageCards(usage);
+ }
+
+ // Fetch audit
+ refreshAudit();
+}
+
+async function fetchThreads() {
+ const threads = await fetchJSON('/api/v1/threads');
+ if (threads) {
+ renderThreads(threads.threads || []);
+ document.getElementById('thread-count').textContent = (threads.threads || []).length;
+ }
+}
+
+// =========================================================================
+// Rendering
+// =========================================================================
+
+function renderAgents(agents) {
+ const tbody = document.getElementById('agents-table');
+ if (!agents.length) {
+        tbody.innerHTML = '<tr><td colspan="5" class="empty">No agents registered</td></tr>';
+ return;
+ }
+
+    tbody.innerHTML = agents.map(a => `
+        <tr>
+            <td>${escapeHtml(a.name)}</td>
+            <td>${a.isAgent ? 'Agent' : 'Tool'}</td>
+            <td><span class="state-badge state-${a.state || 'idle'}">${a.state || 'idle'}</span></td>
+            <td>${(a.peers || []).map(p => `${escapeHtml(p)}`).join(', ') || '--'}</td>
+            <td>${a.messageCount || 0}</td>
+        </tr>
+    `).join('');
+}
+
+function renderThreads(threads) {
+ const tbody = document.getElementById('threads-table');
+ if (!threads.length) {
+        tbody.innerHTML = '<tr><td colspan="5" class="empty">No active threads</td></tr>';
+ return;
+ }
+
+    tbody.innerHTML = threads.map(t => `
+        <tr>
+            <td>${escapeHtml((t.threadId || t.id || '').substring(0, 8))}...</td>
+            <td><span class="state-badge state-${t.status || 'active'}">${t.status || 'active'}</span></td>
+            <td>${(t.participants || []).map(p => `${escapeHtml(p)}`).join(', ') || '--'}</td>
+            <td>${t.messageCount || 0}</td>
+            <td>${t.createdAt ? formatTime(t.createdAt) : '--'}</td>
+        </tr>
+    `).join('');
+}
+
+function addMessage(data) {
+ const log = document.getElementById('message-log');
+ const empty = log.querySelector('.empty');
+ if (empty) empty.remove();
+
+ const entry = document.createElement('div');
+ entry.className = 'log-entry';
+    entry.innerHTML = `
+        <span class="log-time">${formatTime(Date.now() / 1000)}</span>
+        <span class="log-from">${escapeHtml(data.from || data.fromId || '?')}</span>
+        <span class="log-content">${escapeHtml(data.payloadType || data.payload_type || JSON.stringify(data).substring(0, 200))}</span>
+    `;
+
+ log.insertBefore(entry, log.firstChild);
+
+ // Limit log entries
+ while (log.children.length > 500) {
+ log.removeChild(log.lastChild);
+ }
+}
+
+function updateAgentState(data) {
+ // Re-fetch agents on state change
+ fetchJSON('/api/v1/agents').then(agents => {
+ if (agents) {
+ renderAgents(agents.agents || []);
+ const count = (agents.agents || []).length;
+ const active = (agents.agents || []).filter(a => a.state === 'processing').length;
+ document.getElementById('agent-count').textContent = count;
+ document.getElementById('agent-detail').textContent = `${active} active`;
+ }
+ });
+}
+
+function updateUsageCards(usage) {
+ if (usage.totals) {
+ const tokens = usage.totals.totalTokens || 0;
+ const cost = usage.totals.totalCost || usage.totals.estimatedCost || 0;
+ document.getElementById('token-count').textContent = formatNumber(tokens);
+ document.getElementById('token-cost').textContent = `$${cost.toFixed(4)}`;
+ }
+}
+
+function updateMessageCount() {
+ document.getElementById('message-count').textContent = formatNumber(messageCount);
+}
+
+function updateUptime(seconds) {
+ const h = Math.floor(seconds / 3600);
+ const m = Math.floor((seconds % 3600) / 60);
+ const s = Math.floor(seconds % 60);
+ document.getElementById('uptime').textContent =
+ `${h.toString().padStart(2, '0')}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
+}
+
+// =========================================================================
+// Audit Log
+// =========================================================================
+
+async function refreshAudit() {
+ const severity = document.getElementById('audit-severity').value;
+ const type = document.getElementById('audit-type').value;
+
+ let url = '/api/v1/audit/events?limit=200';
+ if (severity) url += `&severity=${severity}`;
+ if (type) url += `&event_type=${type}`;
+
+ const data = await fetchJSON(url);
+ if (!data) return;
+
+ const log = document.getElementById('audit-log');
+ if (!data.events || !data.events.length) {
+        log.innerHTML = '<div class="empty">No audit events matching filters</div>';
+ return;
+ }
+
+    log.innerHTML = data.events.map(e => `
+        <div class="log-entry">
+            <span class="log-time">${formatTime(e.timestamp)}</span>
+            <span class="severity-${e.severity}">[${e.severity.toUpperCase()}]</span>
+            <span class="log-from">${escapeHtml(e.listener_name)}</span>
+            <span class="log-content">${escapeHtml(e.event_type)}: ${escapeHtml(JSON.stringify(e.details).substring(0, 300))}</span>
+        </div>
+    `).join('');
+}
+
+// Make refreshAudit available globally for the onclick handler
+window.refreshAudit = refreshAudit;
+
+// =========================================================================
+// Tab Navigation
+// =========================================================================
+
+document.querySelectorAll('.tab').forEach(tab => {
+ tab.addEventListener('click', () => {
+ document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+ document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+
+ tab.classList.add('active');
+ const target = document.getElementById(`tab-${tab.dataset.tab}`);
+ if (target) target.classList.add('active');
+
+ // Refresh data for active tab
+ if (tab.dataset.tab === 'audit') refreshAudit();
+ if (tab.dataset.tab === 'threads') fetchThreads();
+ });
+});
+
+// =========================================================================
+// Utilities
+// =========================================================================
+
+function escapeHtml(str) {
+ if (!str) return '';
+ const div = document.createElement('div');
+ div.textContent = String(str);
+ return div.innerHTML;
+}
+
+function formatTime(timestamp) {
+ const d = new Date(timestamp * 1000);
+ return d.toLocaleTimeString();
+}
+
+function formatNumber(n) {
+ if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M';
+ if (n >= 1_000) return (n / 1_000).toFixed(1) + 'K';
+ return String(n);
+}
+
+// =========================================================================
+// Periodic Refresh
+// =========================================================================
+
+setInterval(() => {
+ // Refresh uptime
+ fetchJSON('/health').then(data => {
+ if (data && data.uptime_seconds !== undefined) {
+ updateUptime(data.uptime_seconds);
+ }
+ });
+}, 10000);
+
+setInterval(() => {
+ // Refresh usage
+ fetchJSON('/api/v1/usage').then(data => {
+ if (data) updateUsageCards(data);
+ });
+}, 30000);
+
+// =========================================================================
+// Boot
+// =========================================================================
+
+connect();
diff --git a/dashboard/index.html b/dashboard/index.html
new file mode 100644
index 0000000..b24fdfa
--- /dev/null
+++ b/dashboard/index.html
@@ -0,0 +1,124 @@
+<!DOCTYPE html>
+<!-- NOTE(review): reconstructed from a garbled extraction (tags were stripped).
+     Element IDs and classes were recovered from dashboard.js / style.css;
+     verify against the original file before merging. -->
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>AgentOS Dashboard</title>
+    <link rel="stylesheet" href="style.css">
+</head>
+<body>
+    <header>
+        <div class="header-left">
+            <h1>AgentOS</h1>
+            <span id="organism-name" class="organism-name">--</span>
+        </div>
+        <div class="header-right">
+            <span id="uptime" class="uptime">00:00:00</span>
+            <span id="connection-status" class="status-indicator disconnected">Disconnected</span>
+        </div>
+    </header>
+
+    <main>
+        <section class="cards">
+            <div class="card">
+                <div class="card-label">Agents</div>
+                <div id="agent-count" class="card-value">--</div>
+                <div id="agent-detail" class="card-detail">--</div>
+            </div>
+            <div class="card">
+                <div class="card-label">Active Threads</div>
+                <div id="thread-count" class="card-value">--</div>
+                <div class="card-detail">--</div>
+            </div>
+            <div class="card">
+                <div class="card-label">Messages</div>
+                <div id="message-count" class="card-value">--</div>
+                <div class="card-detail">--</div>
+            </div>
+            <div class="card">
+                <div class="card-label">Token Usage</div>
+                <div id="token-count" class="card-value">--</div>
+                <div id="token-cost" class="card-detail">--</div>
+            </div>
+        </section>
+
+        <nav class="tabs">
+            <button class="tab active" data-tab="agents">Agents</button>
+            <button class="tab" data-tab="threads">Threads</button>
+            <button class="tab" data-tab="messages">Messages</button>
+            <button class="tab" data-tab="audit">Audit</button>
+        </nav>
+
+        <section id="tab-agents" class="tab-content active">
+            <table>
+                <thead>
+                    <tr>
+                        <th>Name</th>
+                        <th>Type</th>
+                        <th>State</th>
+                        <th>Peers</th>
+                        <th>Messages</th>
+                    </tr>
+                </thead>
+                <tbody id="agents-table">
+                    <tr><td colspan="5" class="empty">Loading...</td></tr>
+                </tbody>
+            </table>
+        </section>
+
+        <section id="tab-threads" class="tab-content">
+            <table>
+                <thead>
+                    <tr>
+                        <th>Thread ID</th>
+                        <th>Status</th>
+                        <th>Participants</th>
+                        <th>Messages</th>
+                        <th>Created</th>
+                    </tr>
+                </thead>
+                <tbody id="threads-table">
+                    <tr><td colspan="5" class="empty">Loading...</td></tr>
+                </tbody>
+            </table>
+        </section>
+
+        <section id="tab-messages" class="tab-content">
+            <div id="message-log" class="log">
+                <div class="empty">Waiting for messages...</div>
+            </div>
+        </section>
+
+        <section id="tab-audit" class="tab-content">
+            <div class="audit-filters">
+                <select id="audit-severity">
+                    <option value="">All severities</option>
+                    <option value="info">Info</option>
+                    <option value="warning">Warning</option>
+                    <option value="error">Error</option>
+                    <option value="critical">Critical</option>
+                </select>
+                <select id="audit-type">
+                    <option value="">All event types</option>
+                </select>
+                <button onclick="refreshAudit()">Refresh</button>
+            </div>
+            <div id="audit-log" class="log">
+                <div class="empty">No audit events</div>
+            </div>
+        </section>
+    </main>
+
+    <script src="dashboard.js"></script>
+</body>
+</html>
diff --git a/dashboard/style.css b/dashboard/style.css
new file mode 100644
index 0000000..f4b208b
--- /dev/null
+++ b/dashboard/style.css
@@ -0,0 +1,328 @@
+/* AgentOS Dashboard — Minimal, no-framework styling */
+
+:root {
+ --bg: #0d1117;
+ --surface: #161b22;
+ --border: #30363d;
+ --text: #c9d1d9;
+ --text-muted: #8b949e;
+ --accent: #58a6ff;
+ --green: #3fb950;
+ --yellow: #d29922;
+ --red: #f85149;
+ --orange: #db6d28;
+}
+
+* {
+ margin: 0;
+ padding: 0;
+ box-sizing: border-box;
+}
+
+body {
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
+ background: var(--bg);
+ color: var(--text);
+ line-height: 1.5;
+}
+
+/* Header */
+header {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 16px 24px;
+ border-bottom: 1px solid var(--border);
+ background: var(--surface);
+}
+
+.header-left {
+ display: flex;
+ align-items: center;
+ gap: 16px;
+}
+
+header h1 {
+ font-size: 20px;
+ font-weight: 600;
+ color: var(--accent);
+}
+
+.organism-name {
+ font-size: 14px;
+ color: var(--text-muted);
+ font-family: monospace;
+}
+
+.header-right {
+ display: flex;
+ align-items: center;
+ gap: 16px;
+ font-size: 13px;
+}
+
+.status-indicator {
+ padding: 4px 12px;
+ border-radius: 12px;
+ font-size: 12px;
+ font-weight: 500;
+}
+
+.status-indicator.connected {
+ background: rgba(63, 185, 80, 0.15);
+ color: var(--green);
+}
+
+.status-indicator.disconnected {
+ background: rgba(248, 81, 73, 0.15);
+ color: var(--red);
+}
+
+.uptime {
+ color: var(--text-muted);
+ font-family: monospace;
+}
+
+/* Main */
+main {
+ max-width: 1200px;
+ margin: 0 auto;
+ padding: 24px;
+}
+
+/* Status Cards */
+.cards {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+ gap: 16px;
+ margin-bottom: 24px;
+}
+
+.card {
+ background: var(--surface);
+ border: 1px solid var(--border);
+ border-radius: 8px;
+ padding: 20px;
+}
+
+.card-label {
+ font-size: 12px;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ color: var(--text-muted);
+ margin-bottom: 4px;
+}
+
+.card-value {
+ font-size: 28px;
+ font-weight: 600;
+ font-family: monospace;
+}
+
+.card-detail {
+ font-size: 12px;
+ color: var(--text-muted);
+ margin-top: 4px;
+}
+
+/* Tabs */
+.tabs {
+ display: flex;
+ gap: 0;
+ border-bottom: 1px solid var(--border);
+ margin-bottom: 16px;
+}
+
+.tab {
+ background: none;
+ border: none;
+ color: var(--text-muted);
+ padding: 10px 20px;
+ cursor: pointer;
+ font-size: 14px;
+ border-bottom: 2px solid transparent;
+ transition: all 0.2s;
+}
+
+.tab:hover {
+ color: var(--text);
+}
+
+.tab.active {
+ color: var(--accent);
+ border-bottom-color: var(--accent);
+}
+
+.tab-content {
+ display: none;
+}
+
+.tab-content.active {
+ display: block;
+}
+
+/* Tables */
+table {
+ width: 100%;
+ border-collapse: collapse;
+ background: var(--surface);
+ border: 1px solid var(--border);
+ border-radius: 8px;
+ overflow: hidden;
+}
+
+thead {
+ background: rgba(255, 255, 255, 0.03);
+}
+
+th {
+ text-align: left;
+ padding: 10px 16px;
+ font-size: 12px;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ color: var(--text-muted);
+ border-bottom: 1px solid var(--border);
+}
+
+td {
+ padding: 10px 16px;
+ font-size: 14px;
+ border-bottom: 1px solid var(--border);
+}
+
+tr:last-child td {
+ border-bottom: none;
+}
+
+tr:hover {
+ background: rgba(255, 255, 255, 0.02);
+}
+
+td.empty {
+ text-align: center;
+ color: var(--text-muted);
+ padding: 40px;
+}
+
+/* State badges */
+.state-badge {
+ display: inline-block;
+ padding: 2px 8px;
+ border-radius: 10px;
+ font-size: 12px;
+ font-weight: 500;
+}
+
+.state-idle {
+ background: rgba(139, 148, 158, 0.15);
+ color: var(--text-muted);
+}
+
+.state-processing {
+ background: rgba(88, 166, 255, 0.15);
+ color: var(--accent);
+}
+
+.state-error {
+ background: rgba(248, 81, 73, 0.15);
+ color: var(--red);
+}
+
+.state-active {
+ background: rgba(63, 185, 80, 0.15);
+ color: var(--green);
+}
+
+/* Log */
+.log {
+ background: var(--surface);
+ border: 1px solid var(--border);
+ border-radius: 8px;
+ max-height: 600px;
+ overflow-y: auto;
+ font-family: monospace;
+ font-size: 13px;
+}
+
+.log .empty {
+ text-align: center;
+ color: var(--text-muted);
+ padding: 40px;
+}
+
+.log-entry {
+ padding: 8px 16px;
+ border-bottom: 1px solid var(--border);
+ display: flex;
+ gap: 12px;
+ align-items: flex-start;
+}
+
+.log-entry:last-child {
+ border-bottom: none;
+}
+
+.log-time {
+ color: var(--text-muted);
+ white-space: nowrap;
+ flex-shrink: 0;
+}
+
+.log-from {
+ color: var(--accent);
+ font-weight: 500;
+ flex-shrink: 0;
+ min-width: 120px;
+}
+
+.log-content {
+ flex: 1;
+ word-break: break-word;
+}
+
+/* Severity colors for audit */
+.severity-info { color: var(--text-muted); }
+.severity-warning { color: var(--yellow); }
+.severity-error { color: var(--red); }
+.severity-critical { color: var(--red); font-weight: 700; }
+
+/* Audit filters */
+.audit-filters {
+ display: flex;
+ gap: 8px;
+ margin-bottom: 12px;
+}
+
+.audit-filters select,
+.audit-filters button {
+ background: var(--surface);
+ border: 1px solid var(--border);
+ color: var(--text);
+ padding: 6px 12px;
+ border-radius: 6px;
+ font-size: 13px;
+ cursor: pointer;
+}
+
+.audit-filters button:hover {
+ background: rgba(255, 255, 255, 0.05);
+}
+
+/* Scrollbar */
+::-webkit-scrollbar {
+ width: 8px;
+}
+
+::-webkit-scrollbar-track {
+ background: transparent;
+}
+
+::-webkit-scrollbar-thumb {
+ background: var(--border);
+ border-radius: 4px;
+}
+
+::-webkit-scrollbar-thumb:hover {
+ background: var(--text-muted);
+}
diff --git a/deploy/__init__.py b/deploy/__init__.py
new file mode 100644
index 0000000..4a33635
--- /dev/null
+++ b/deploy/__init__.py
@@ -0,0 +1 @@
+"""Deploy module — container entrypoint and configuration."""
diff --git a/deploy/docker-compose.dev.yml b/deploy/docker-compose.dev.yml
new file mode 100644
index 0000000..1671409
--- /dev/null
+++ b/deploy/docker-compose.dev.yml
@@ -0,0 +1,36 @@
+# AgentOS development overlay
+#
+# Usage:
+# docker compose -f deploy/docker-compose.yml -f deploy/docker-compose.dev.yml up
+#
+# Mounts source code for hot-reload, relaxes security for development.
+
+services:
+ organism:
+ build:
+ context: ..
+ dockerfile: Dockerfile
+
+ environment:
+ - ORGANISM_MODE=development
+ - AGENT_PORT=8080
+ - MANAGEMENT_PORT=9090
+
+ volumes:
+ # Mount source for development iteration
+ - ../xml_pipeline:/app/xml_pipeline:ro
+ - ../handlers:/app/handlers:ro
+ - ../third_party:/app/third_party:ro
+ - ../examples:/app/examples:ro
+ - ../config:/config:ro
+ - ../dashboard:/app/dashboard:ro
+ - organism-data:/data
+
+ # Relax security for development
+ read_only: false
+ security_opt: []
+ cap_drop: []
+
+ # No resource limits in dev
+ deploy:
+ resources: {}
diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml
new file mode 100644
index 0000000..9754f57
--- /dev/null
+++ b/deploy/docker-compose.yml
@@ -0,0 +1,90 @@
+# AgentOS production deployment
+#
+# Usage:
+# docker compose -f deploy/docker-compose.yml up
+#
+# Requires:
+# - organism.yaml mounted at /config/organism.yaml
+# - API keys passed as environment variables
+
+services:
+ organism:
+ build:
+ context: ..
+ dockerfile: Dockerfile
+ container_name: agentos
+ restart: unless-stopped
+
+    ports:
+      - "8080:8080"             # Agent bus (public-facing)
+      - "127.0.0.1:9090:9090"   # Management plane: host-loopback only
+
+ volumes:
+ - ./organism.yaml:/config/organism.yaml:ro
+ - organism-data:/data
+
+ environment:
+ - ORGANISM_MODE=container
+ - AGENT_PORT=8080
+ - MANAGEMENT_PORT=9090
+
+ env_file:
+ - .env
+
+ # Security hardening
+ security_opt:
+ - no-new-privileges:true
+ cap_drop:
+ - ALL
+ read_only: true
+ tmpfs:
+ - /tmp:size=64M
+
+ # Resource limits
+ deploy:
+ resources:
+ limits:
+ memory: 2G
+ cpus: "2.0"
+ reservations:
+ memory: 512M
+ cpus: "0.5"
+
+ healthcheck:
+ test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"]
+ interval: 30s
+ timeout: 5s
+ retries: 3
+ start_period: 15s
+
+ # Optional: Redis for distributed key-value store
+ redis:
+ image: redis:7-alpine
+ container_name: agentos-redis
+ restart: unless-stopped
+ profiles: ["redis"]
+
+ ports:
+ - "6379:6379"
+
+ volumes:
+ - redis-data:/data
+
+ security_opt:
+ - no-new-privileges:true
+ cap_drop:
+ - ALL
+ cap_add:
+ - SETUID
+ - SETGID
+ read_only: true
+
+ healthcheck:
+ test: ["CMD", "redis-cli", "ping"]
+ interval: 10s
+ timeout: 3s
+ retries: 3
+
+volumes:
+ organism-data:
+ redis-data:
diff --git a/deploy/entrypoint.py b/deploy/entrypoint.py
new file mode 100644
index 0000000..7b05f62
--- /dev/null
+++ b/deploy/entrypoint.py
@@ -0,0 +1,216 @@
+"""
+AgentOS container entrypoint.
+
+Validates config, generates keys if needed, applies security lockdowns,
+and boots the organism with dual-port servers (agent + management).
+
+Usage:
+ python -m deploy.entrypoint /config/organism.yaml
+ python -m deploy.entrypoint --dry-run /config/organism.yaml
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+import sys
+from pathlib import Path
+
+logger = logging.getLogger("agentos.entrypoint")
+
+
+def detect_mode() -> str:
+ """Detect organism mode from config or environment."""
+ return os.environ.get("ORGANISM_MODE", "container")
+
+
+def validate_config(config_path: Path) -> bool:
+ """Validate organism config file exists and is parseable."""
+ if not config_path.exists():
+ logger.error(f"Config not found: {config_path}")
+ return False
+
+ try:
+ import yaml
+
+ with open(config_path) as f:
+ raw = yaml.safe_load(f)
+
+ if not isinstance(raw, dict):
+ logger.error("Config must be a YAML mapping")
+ return False
+
+ org = raw.get("organism", {})
+ if not org.get("name"):
+ logger.error("organism.name is required")
+ return False
+
+ logger.info(f"Config valid: {org['name']}")
+
+ # Count listeners
+ listeners = raw.get("listeners", [])
+ if isinstance(listeners, list):
+ logger.info(f" Listeners: {len(listeners)}")
+
+ return True
+ except Exception as e:
+ logger.error(f"Config parse error: {e}")
+ return False
+
+
+def ensure_identity_key(config_path: Path) -> None:
+ """Generate Ed25519 identity key if not present."""
+ import yaml
+
+ with open(config_path) as f:
+ raw = yaml.safe_load(f)
+
+ identity_path = raw.get("organism", {}).get("identity")
+ if not identity_path:
+ return
+
+ identity_file = Path(identity_path)
+ if identity_file.exists():
+ logger.info(f"Identity key found: {identity_file}")
+ return
+
+ try:
+ from xml_pipeline.crypto import generate_identity
+
+ identity_file.parent.mkdir(parents=True, exist_ok=True)
+ identity = generate_identity()
+ public_path = identity_file.with_suffix(".pub")
+ identity.save(identity_file, public_path)
+ logger.info(f"Generated identity key: {identity_file}")
+ except Exception as e:
+ logger.warning(f"Could not generate identity key: {e}")
+
+
+def apply_container_lockdowns(mode: str) -> None:
+ """Apply security lockdowns based on organism mode."""
+ if mode != "container":
+ logger.info(f"Mode '{mode}' — skipping container lockdowns")
+ return
+
+ logger.info("Applying container security lockdowns")
+
+ from xml_pipeline.security.defaults import apply_container_defaults
+
+ apply_container_defaults()
+
+
+async def boot_organism(config_path: Path, mode: str) -> None:
+ """Bootstrap and run the organism with dual-port servers."""
+ from xml_pipeline.message_bus import bootstrap
+
+ # Bootstrap the pump
+ pump = await bootstrap(str(config_path))
+
+ # Determine ports from environment
+ agent_port = int(os.environ.get("AGENT_PORT", "8080"))
+ management_port = int(os.environ.get("MANAGEMENT_PORT", "9090"))
+ host = os.environ.get("BIND_HOST", "0.0.0.0")
+
+ try:
+ import uvicorn
+ except ImportError:
+ logger.error("uvicorn not installed. Install with: pip install xml-pipeline[server]")
+ sys.exit(1)
+
+ # Create agent-facing app (minimal: /health, /inject, /ws)
+ from xml_pipeline.server.agent_app import create_agent_app
+
+ agent_app = create_agent_app(pump)
+
+ # Create management app (full API, dashboard, audit)
+ from xml_pipeline.server.management import create_management_app
+
+ management_app = create_management_app(pump)
+
+ # Configure uvicorn servers
+ agent_config = uvicorn.Config(
+ agent_app,
+ host=host,
+ port=agent_port,
+ log_level="info",
+ access_log=False,
+ )
+ management_config = uvicorn.Config(
+ management_app,
+ host="127.0.0.1" if mode == "container" else host,
+ port=management_port,
+ log_level="info",
+ )
+
+ agent_server = uvicorn.Server(agent_config)
+ management_server = uvicorn.Server(management_config)
+
+ # Run pump + both servers concurrently
+ pump_task = asyncio.create_task(pump.run())
+
+    logger.info(f"Agent bus: http://{host}:{agent_port}")
+    logger.info(f"Management: http://{host}:{management_port}")
+    logger.info(f"Dashboard: http://{host}:{management_port}/dashboard/")
+
+ try:
+ await asyncio.gather(
+ agent_server.serve(),
+ management_server.serve(),
+ )
+ finally:
+ await pump.shutdown()
+ pump_task.cancel()
+ try:
+ await pump_task
+ except asyncio.CancelledError:
+ pass
+
+
+def main() -> int:
+ """Entrypoint for container boot."""
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+ )
+
+ parser = argparse.ArgumentParser(description="AgentOS entrypoint")
+ parser.add_argument("config", nargs="?", default="/config/organism.yaml", help="Config path")
+ parser.add_argument("--dry-run", action="store_true", help="Validate config and exit")
+ parser.add_argument("--mode", help="Override organism mode (container/development)")
+ args = parser.parse_args()
+
+ config_path = Path(args.config)
+ mode = args.mode or detect_mode()
+
+ logger.info(f"AgentOS starting (mode={mode})")
+
+ # Validate config
+ if not validate_config(config_path):
+ return 1
+
+ if args.dry_run:
+ logger.info("Dry run complete — config is valid")
+ return 0
+
+ # Generate identity key if needed
+ ensure_identity_key(config_path)
+
+ # Apply security lockdowns
+ apply_container_lockdowns(mode)
+
+ # Boot the organism
+ try:
+ asyncio.run(boot_organism(config_path, mode))
+ return 0
+ except KeyboardInterrupt:
+ logger.info("Shutdown requested")
+ return 0
+ except Exception as e:
+ logger.error(f"Boot failed: {e}")
+ return 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/xml_pipeline/cli.py b/xml_pipeline/cli.py
index 9ceb1e4..d5391da 100644
--- a/xml_pipeline/cli.py
+++ b/xml_pipeline/cli.py
@@ -54,27 +54,16 @@ def cmd_serve(args: argparse.Namespace) -> int:
print(f"Error: Config file not found: {config_path}", file=sys.stderr)
return 1
+ split_mode = getattr(args, "split", False)
+
async def run_with_server():
"""Bootstrap pump and run with server."""
import signal
- from xml_pipeline.server import create_app
from xml_pipeline.server.restart import RestartOrchestrator
# Bootstrap the pump
pump = await bootstrap(str(config_path))
- # Create FastAPI app
- app = create_app(pump)
-
- # Run uvicorn
- config = uvicorn.Config(
- app,
- host=args.host,
- port=args.port,
- log_level="info",
- )
- server = uvicorn.Server(config)
-
# Set up SIGHUP handler for graceful restart (Unix only)
restart_requested = asyncio.Event()
@@ -86,9 +75,53 @@ def cmd_serve(args: argparse.Namespace) -> int:
)
print("SIGHUP handler registered for graceful restart")
- # Run pump and server concurrently
+ # Run pump
pump_task = asyncio.create_task(pump.run())
+ servers = []
+
+ if split_mode:
+ # Dual-port mode: agent app + management app
+ from xml_pipeline.server.agent_app import create_agent_app
+ from xml_pipeline.server.management import create_management_app
+
+ agent_app = create_agent_app(pump)
+ mgmt_app = create_management_app(pump)
+
+ mgmt_port = getattr(args, "management_port", 9090)
+ mgmt_host = "127.0.0.1" # Management always localhost
+
+ agent_config = uvicorn.Config(
+ agent_app,
+ host=args.host,
+ port=args.port,
+ log_level="info",
+ access_log=False,
+ )
+ mgmt_config = uvicorn.Config(
+ mgmt_app,
+ host=mgmt_host,
+ port=mgmt_port,
+ log_level="info",
+ )
+
+ agent_server = uvicorn.Server(agent_config)
+ management_server = uvicorn.Server(mgmt_config)
+ servers = [agent_server, management_server]
+ else:
+ # Single-port mode (backwards compatible)
+ from xml_pipeline.server import create_app
+
+ app = create_app(pump)
+ config = uvicorn.Config(
+ app,
+ host=args.host,
+ port=args.port,
+ log_level="info",
+ )
+ server = uvicorn.Server(config)
+ servers = [server]
+
async def restart_watcher():
"""Watch for restart signal and initiate graceful restart."""
await restart_requested.wait()
@@ -101,12 +134,13 @@ def cmd_serve(args: argparse.Namespace) -> int:
print(f"Drain complete (drained={result.drained})")
if result.journal_stats:
print(f"Journal stats: {result.journal_stats}")
- server.should_exit = True
+ for s in servers:
+ s.should_exit = True
restart_task = asyncio.create_task(restart_watcher())
try:
- await server.serve()
+ await asyncio.gather(*(s.serve() for s in servers))
finally:
await pump.shutdown()
pump_task.cancel()
@@ -121,9 +155,16 @@ def cmd_serve(args: argparse.Namespace) -> int:
RestartOrchestrator.exec_restart()
try:
- print(f"Starting AgentServer on http://{args.host}:{args.port}")
- print(f" API docs: http://{args.host}:{args.port}/docs")
- print(f" WebSocket: ws://{args.host}:{args.port}/ws")
+ if split_mode:
+ mgmt_port = getattr(args, "management_port", 9090)
+ print(f"Starting AgentOS in split mode:")
+ print(f" Agent bus: http://{args.host}:{args.port}")
+ print(f" Management: http://127.0.0.1:{mgmt_port}")
+ print(f" Dashboard: http://127.0.0.1:{mgmt_port}/dashboard/")
+ else:
+ print(f"Starting AgentServer on http://{args.host}:{args.port}")
+ print(f" API docs: http://{args.host}:{args.port}/docs")
+ print(f" WebSocket: ws://{args.host}:{args.port}/ws")
asyncio.run(run_with_server())
return 0
except KeyboardInterrupt:
@@ -251,6 +292,14 @@ def main() -> int:
serve_parser.add_argument("config", nargs="?", default="organism.yaml", help="Config file")
serve_parser.add_argument("--host", default="0.0.0.0", help="Host to bind (default: 0.0.0.0)")
serve_parser.add_argument("--port", "-p", type=int, default=8080, help="Port to listen on (default: 8080)")
+ serve_parser.add_argument(
+ "--split", action="store_true",
+ help="Split mode: agent bus on --port, management on --management-port",
+ )
+ serve_parser.add_argument(
+ "--management-port", type=int, default=9090,
+ help="Management port (default: 9090, only used with --split)",
+ )
serve_parser.set_defaults(func=cmd_serve)
# init
diff --git a/xml_pipeline/config/loader.py b/xml_pipeline/config/loader.py
index 312c7a0..746dd19 100644
--- a/xml_pipeline/config/loader.py
+++ b/xml_pipeline/config/loader.py
@@ -122,17 +122,38 @@ class ProcessPoolConfig:
max_tasks_per_child: int = 100 # Restart workers after N tasks
+@dataclass
+class SecurityConfig:
+ """Security configuration for container mode."""
+
+ tool_default: str = "allow" # "allow" (dev) or "deny" (container)
+ shell: str = "restricted" # "disabled" | "restricted" | "full"
+ writable_paths: list[str] = field(default_factory=lambda: ["/data"])
+ read_paths: list[str] = field(default_factory=lambda: ["/config"])
+
+
+@dataclass
+class NetworkConfig:
+ """Network egress policy configuration."""
+
+ egress: str = "allow" # "allow" or "deny"
+ allowed_domains: list[str] = field(default_factory=list)
+
+
@dataclass
class OrganismConfig:
"""Complete organism configuration."""
organism: OrganismMeta
+ mode: str = "development" # "container" or "development"
listeners: list[ListenerConfig] = field(default_factory=list)
llm_backends: list[LLMBackendConfig] = field(default_factory=list)
server: ServerConfig | None = None
auth: AuthConfig | None = None
backend: BackendStorageConfig | None = None
process_pool: ProcessPoolConfig | None = None
+ security: SecurityConfig | None = None
+ network: NetworkConfig | None = None
def load_config(path: Path) -> OrganismConfig:
@@ -239,14 +260,40 @@ def load_config(path: Path) -> OrganismConfig:
max_tasks_per_child=pool_raw.get("max_tasks_per_child", 100),
)
+ # Parse organism mode
+ mode = org_raw.get("mode", "development")
+
+ # Parse optional security config
+ security = None
+ if "security" in raw:
+ sec_raw = raw["security"]
+ security = SecurityConfig(
+ tool_default=sec_raw.get("tool_default", "allow"),
+ shell=sec_raw.get("shell", "restricted"),
+ writable_paths=sec_raw.get("filesystem", {}).get("writable_paths", ["/data"]),
+ read_paths=sec_raw.get("filesystem", {}).get("read_paths", ["/config"]),
+ )
+
+ # Parse optional network config
+ network = None
+ if "network" in raw:
+ net_raw = raw["network"]
+ network = NetworkConfig(
+ egress=net_raw.get("egress", "allow"),
+ allowed_domains=net_raw.get("allowed_domains", []),
+ )
+
return OrganismConfig(
organism=organism,
+ mode=mode,
listeners=listeners,
llm_backends=llm_backends,
server=server,
auth=auth,
backend=backend,
process_pool=process_pool,
+ security=security,
+ network=network,
)
diff --git a/xml_pipeline/security/__init__.py b/xml_pipeline/security/__init__.py
new file mode 100644
index 0000000..3c2e426
--- /dev/null
+++ b/xml_pipeline/security/__init__.py
@@ -0,0 +1,21 @@
+"""
+security — Default-deny posture and container mode enforcement.
+
+Provides:
+- Container mode detection and lockdown application
+- Permission gate for per-listener tool access
+- Network egress policy enforcement
+- Security event logging
+"""
+
+from xml_pipeline.security.defaults import (
+ apply_container_defaults,
+ get_organism_mode,
+ is_container_mode,
+)
+
+__all__ = [
+ "apply_container_defaults",
+ "get_organism_mode",
+ "is_container_mode",
+]
diff --git a/xml_pipeline/security/defaults.py b/xml_pipeline/security/defaults.py
new file mode 100644
index 0000000..fc0b309
--- /dev/null
+++ b/xml_pipeline/security/defaults.py
@@ -0,0 +1,85 @@
+"""
+Default-deny configuration for container mode.
+
+In container mode:
+- Shell tool is disabled entirely
+- File tool is restricted to /data/ and /config/ (read-only)
+- Fetch tool only allows declared LLM backend domains
+- All tools require explicit per-listener allowlist
+
+In development mode:
+- All tools available (current behavior preserved)
+- No filesystem restrictions
+- No network restrictions
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Module-level mode state
+_organism_mode: Optional[str] = None
+
+
+def get_organism_mode() -> str:
+ """Get the current organism mode."""
+ global _organism_mode
+ if _organism_mode is None:
+ _organism_mode = os.environ.get("ORGANISM_MODE", "development")
+ return _organism_mode
+
+
+def set_organism_mode(mode: str) -> None:
+ """Set the organism mode explicitly."""
+ global _organism_mode
+ if mode not in ("container", "development"):
+ raise ValueError(f"Invalid mode: {mode}. Must be 'container' or 'development'.")
+ _organism_mode = mode
+ logger.info(f"Organism mode set to: {mode}")
+
+
+def is_container_mode() -> bool:
+ """Check if running in container (default-deny) mode."""
+ return get_organism_mode() == "container"
+
+
+def apply_container_defaults() -> None:
+ """
+ Apply default-deny security posture for container mode.
+
+ This configures all tool modules with restrictive defaults:
+ - Shell: disabled
+ - Files: restricted to /data (rw) and /config (ro)
+ - Fetch: blocks all non-LLM-backend domains by default
+ """
+ set_organism_mode("container")
+
+ # Lock down shell tool
+ from xml_pipeline.tools.shell import set_container_mode as shell_lock
+
+ shell_lock(True)
+ logger.info("Shell tool: DISABLED (container mode)")
+
+ # Lock down file tool to /data and /config
+ from xml_pipeline.tools.files import configure_allowed_paths
+
+ configure_allowed_paths(["/data", "/config"])
+ logger.info("File tool: restricted to /data (rw), /config (ro)")
+
+    # Lock down fetch tool — NOTE(review): set_container_mode(True) only routes requests through check_egress; network_policy.enable_egress_control() must also be called (with allowed domains registered) or egress stays allow-all — confirm this happens at startup
+ from xml_pipeline.tools.fetch import set_container_mode as fetch_lock
+
+ fetch_lock(True)
+ logger.info("Fetch tool: egress restricted to allowlisted domains")
+
+ # Enable permission gate
+ from xml_pipeline.tools.permission_gate import enable_permission_gate
+
+ enable_permission_gate()
+ logger.info("Permission gate: ENABLED (tools require explicit allowlist)")
+
+ logger.info("Container security lockdowns applied")
diff --git a/xml_pipeline/server/__init__.py b/xml_pipeline/server/__init__.py
index 62dfc58..c17b0b8 100644
--- a/xml_pipeline/server/__init__.py
+++ b/xml_pipeline/server/__init__.py
@@ -5,14 +5,21 @@ Provides:
- REST API for querying organism state (agents, threads, messages)
- WebSocket for real-time events
- Message injection endpoint
+- Split architecture: agent app (port 8080) + management app (port 9090)
Usage:
from xml_pipeline.server import create_app, run_server
- # With existing pump
+ # Combined app (backwards compatible)
app = create_app(pump)
uvicorn.run(app, host="0.0.0.0", port=8080)
+ # Split apps (AgentOS mode)
+ from xml_pipeline.server.agent_app import create_agent_app
+ from xml_pipeline.server.management import create_management_app
+ agent_app = create_agent_app(pump) # port 8080
+ mgmt_app = create_management_app(pump) # port 9090
+
# Or use CLI
xml-pipeline serve config/organism.yaml --port 8080
"""
diff --git a/xml_pipeline/server/agent_app.py b/xml_pipeline/server/agent_app.py
new file mode 100644
index 0000000..f69c863
--- /dev/null
+++ b/xml_pipeline/server/agent_app.py
@@ -0,0 +1,128 @@
+"""
+agent_app.py — Minimal agent-facing FastAPI application (port 8080).
+
+Exposes only:
+- GET /health — Health check
+- POST /inject — Message injection
+- WS /ws — Message bus WebSocket
+- WS /ws/messages — Message stream WebSocket
+
+Agents cannot query usage, read config, see other agents, or access audit logs.
+"""
+
+from __future__ import annotations
+
+import uuid
+from contextlib import asynccontextmanager
+from typing import TYPE_CHECKING, Any, AsyncGenerator
+
+from fastapi import APIRouter, FastAPI, HTTPException, WebSocket
+
+from xml_pipeline.server.models import InjectRequest, InjectResponse
+from xml_pipeline.server.state import ServerState
+from xml_pipeline.server.websocket import create_websocket_router
+
+if TYPE_CHECKING:
+ from xml_pipeline.message_bus.stream_pump import StreamPump
+
+
+def create_agent_app(
+ pump: "StreamPump",
+ *,
+ title: str = "AgentOS Agent Bus",
+ version: str = "1.0.0",
+) -> FastAPI:
+ """
+ Create the agent-facing FastAPI app.
+
+ This app is intentionally minimal — only health, inject, and WebSocket.
+ All monitoring, config, and management endpoints are on the management port.
+ """
+ state = ServerState(pump)
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
+ state.set_running()
+
+ # Journal recovery
+ for hook in pump.dispatch_hooks:
+ from xml_pipeline.message_bus.journal import MessageJournal
+
+ if isinstance(hook, MessageJournal):
+ entries = await hook.get_unacknowledged(older_than_seconds=0)
+ if entries:
+ import logging
+
+ logger = logging.getLogger(__name__)
+ logger.info(
+ f"Journal recovery: replaying {len(entries)} unacknowledged entries"
+ )
+ for entry in entries:
+ await pump.inject(
+ entry["payload_bytes"],
+ thread_id=entry["thread_id"],
+ from_id=entry["from_id"],
+ )
+ logger.info("Journal recovery complete")
+ break
+
+ yield
+ state.set_stopping()
+
+ app = FastAPI(
+ title=title,
+ version=version,
+ description="Agent-facing message bus. No management or monitoring endpoints.",
+ lifespan=lifespan,
+ # No OpenAPI docs on agent port (agents shouldn't see API structure)
+ docs_url=None,
+ redoc_url=None,
+ openapi_url=None,
+ )
+
+ # Health check
+ @app.get("/health")
+ async def health_check() -> dict[str, Any]:
+ info = state.get_organism_info()
+ return {
+ "status": "healthy",
+ "organism": info.name,
+ "uptime_seconds": info.uptime_seconds,
+ }
+
+ # Inject endpoint
+ router = APIRouter()
+
+ @router.post("/inject", response_model=InjectResponse)
+ async def inject_message(request: InjectRequest) -> InjectResponse:
+ """Inject a message to an agent."""
+ agent = state.get_agent(request.to)
+ if agent is None:
+ raise HTTPException(
+ status_code=400,
+ detail=f"Unknown target agent: {request.to}",
+ )
+
+ thread_id = request.thread_id or str(uuid.uuid4())
+ payload_type = next(iter(request.payload.keys()), "Payload")
+
+ msg_id = await state.record_message(
+ thread_id=thread_id,
+ from_id="api",
+ to_id=request.to,
+ payload_type=payload_type,
+ payload=request.payload,
+ )
+
+ return InjectResponse(thread_id=thread_id, message_id=msg_id)
+
+ app.include_router(router)
+
+ # WebSocket endpoints (message bus)
+ app.include_router(create_websocket_router(state))
+
+ # Store state for access
+ app.state.server_state = state
+ app.state.pump = pump
+
+ return app
diff --git a/xml_pipeline/server/audit.py b/xml_pipeline/server/audit.py
new file mode 100644
index 0000000..e801cbb
--- /dev/null
+++ b/xml_pipeline/server/audit.py
@@ -0,0 +1,205 @@
+"""
+audit.py — In-memory audit log for security events (SQLite backing planned, not yet implemented).
+
+Records:
+- Tool invocations (who called what tool with what params)
+- Peer constraint violations (blocked routing attempts)
+- Security events (unauthorized access, egress blocks, etc.)
+- Config changes (hot-reload events)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+logger = logging.getLogger(__name__)
+
+# In-memory audit log only — no SQLite persistence is implemented in this module yet
+_audit_entries: list[dict[str, Any]] = []
+_max_memory_entries: int = 10000
+
+
+@dataclass
+class AuditEntry:
+ """A single audit log entry."""
+
+ timestamp: float
+ event_type: str # "tool_invocation", "peer_violation", "security_event", "config_change"
+ listener_name: str
+ thread_id: Optional[str]
+ details: dict[str, Any]
+ severity: str = "info" # "info", "warning", "error", "critical"
+
+
+def record_event(
+ event_type: str,
+ listener_name: str,
+ details: dict[str, Any],
+ *,
+ thread_id: Optional[str] = None,
+ severity: str = "info",
+) -> None:
+ """
+ Record an audit event.
+
+ Args:
+ event_type: Category of event
+ listener_name: Which listener triggered this
+ details: Event-specific data
+ thread_id: Associated thread UUID (if any)
+ severity: Event severity level
+ """
+ entry = {
+ "timestamp": time.time(),
+ "event_type": event_type,
+ "listener_name": listener_name,
+ "thread_id": thread_id,
+ "details": details,
+ "severity": severity,
+ }
+ _audit_entries.append(entry)
+
+ # Trim old entries if over limit
+ if len(_audit_entries) > _max_memory_entries:
+ _audit_entries[:] = _audit_entries[-_max_memory_entries:]
+
+ # Log security events at appropriate level
+ if severity == "critical":
+ logger.critical(f"AUDIT [{event_type}] {listener_name}: {details}")
+ elif severity == "error":
+ logger.error(f"AUDIT [{event_type}] {listener_name}: {details}")
+ elif severity == "warning":
+ logger.warning(f"AUDIT [{event_type}] {listener_name}: {details}")
+ else:
+ logger.debug(f"AUDIT [{event_type}] {listener_name}: {details}")
+
+
+def record_tool_invocation(
+ listener_name: str,
+ tool_name: str,
+ params: dict[str, Any],
+ success: bool,
+ *,
+ thread_id: Optional[str] = None,
+ error: Optional[str] = None,
+) -> None:
+ """Record a tool invocation."""
+ record_event(
+ "tool_invocation",
+ listener_name,
+ {
+ "tool": tool_name,
+ "params": _sanitize_params(params),
+ "success": success,
+ "error": error,
+ },
+ thread_id=thread_id,
+ )
+
+
+def record_peer_violation(
+ listener_name: str,
+ target: str,
+ *,
+ thread_id: Optional[str] = None,
+) -> None:
+ """Record a peer constraint violation."""
+ record_event(
+ "peer_violation",
+ listener_name,
+ {"attempted_target": target},
+ thread_id=thread_id,
+ severity="warning",
+ )
+
+
+def record_security_event(
+ listener_name: str,
+ description: str,
+ details: Optional[dict[str, Any]] = None,
+ *,
+ thread_id: Optional[str] = None,
+ severity: str = "warning",
+) -> None:
+ """Record a security event."""
+ record_event(
+ "security_event",
+ listener_name,
+ {"description": description, **(details or {})},
+ thread_id=thread_id,
+ severity=severity,
+ )
+
+
+def get_entries(
+ *,
+ event_type: Optional[str] = None,
+ listener_name: Optional[str] = None,
+ severity: Optional[str] = None,
+ since: Optional[float] = None,
+ limit: int = 100,
+ offset: int = 0,
+) -> list[dict[str, Any]]:
+ """
+ Query audit log entries with optional filtering.
+
+ Returns entries in reverse chronological order (newest first).
+ """
+ filtered = _audit_entries
+
+ if event_type:
+ filtered = [e for e in filtered if e["event_type"] == event_type]
+ if listener_name:
+ filtered = [e for e in filtered if e["listener_name"] == listener_name]
+ if severity:
+ filtered = [e for e in filtered if e["severity"] == severity]
+ if since:
+ filtered = [e for e in filtered if e["timestamp"] >= since]
+
+ # Reverse chronological
+ filtered = list(reversed(filtered))
+
+ return filtered[offset : offset + limit]
+
+
+def get_stats() -> dict[str, Any]:
+ """Get audit log statistics."""
+ total = len(_audit_entries)
+ by_type: dict[str, int] = {}
+ by_severity: dict[str, int] = {}
+
+ for entry in _audit_entries:
+ by_type[entry["event_type"]] = by_type.get(entry["event_type"], 0) + 1
+ by_severity[entry["severity"]] = by_severity.get(entry["severity"], 0) + 1
+
+ return {
+ "total_entries": total,
+ "by_type": by_type,
+ "by_severity": by_severity,
+ "oldest": _audit_entries[0]["timestamp"] if _audit_entries else None,
+ "newest": _audit_entries[-1]["timestamp"] if _audit_entries else None,
+ }
+
+
+def clear() -> None:
+ """Clear the audit log (for testing)."""
+ _audit_entries.clear()
+
+
+def _sanitize_params(params: dict[str, Any]) -> dict[str, Any]:
+ """Remove sensitive values from tool parameters before logging."""
+ sanitized = {}
+ sensitive_keys = {"api_key", "password", "secret", "token", "credential"}
+ for key, value in params.items():
+ if any(s in key.lower() for s in sensitive_keys):
+ sanitized[key] = "***"
+ elif isinstance(value, str) and len(value) > 500:
+ sanitized[key] = value[:500] + "...(truncated)"
+ else:
+ sanitized[key] = value
+ return sanitized
diff --git a/xml_pipeline/server/management.py b/xml_pipeline/server/management.py
new file mode 100644
index 0000000..5a8d366
--- /dev/null
+++ b/xml_pipeline/server/management.py
@@ -0,0 +1,171 @@
+"""
+management.py — Management plane FastAPI application (port 9090).
+
+Full operator visibility and control:
+- All REST API endpoints (organism, agents, threads, usage, journal)
+- Audit log viewer
+- Configuration management
+- Static dashboard
+- WebSocket for real-time monitoring
+
+This app should only be accessible to operators, never to agents.
+"""
+
+from __future__ import annotations
+
+import time
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
+
+from fastapi import APIRouter, FastAPI, Query
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+
+from xml_pipeline.server.api import create_router
+from xml_pipeline.server.state import ServerState
+from xml_pipeline.server.websocket import create_websocket_router
+
+if TYPE_CHECKING:
+ from xml_pipeline.message_bus.stream_pump import StreamPump
+
+
+def create_management_app(
+ pump: "StreamPump",
+ *,
+ title: str = "AgentOS Management",
+ version: str = "1.0.0",
+ cors_origins: Optional[list[str]] = None,
+) -> FastAPI:
+ """
+ Create the management FastAPI app (full operator access).
+
+ Includes all existing API endpoints plus:
+ - Audit log endpoints
+ - Dashboard static file serving
+ """
+ state = ServerState(pump)
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
+ state.set_running()
+ yield
+ state.set_stopping()
+
+ app = FastAPI(
+ title=title,
+ version=version,
+ description=(
+ "Management plane for AgentOS operators. "
+ "Full monitoring, control, audit, and configuration access."
+ ),
+ lifespan=lifespan,
+ )
+
+    # CORS for dashboard — NOTE(review): wildcard origins combined with allow_credentials=True is not honored for credentialed requests (Starlette/browser CORS rules); pass explicit operator origins in production
+ if cors_origins is None:
+ cors_origins = ["*"]
+
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=cors_origins,
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+
+ # Health check
+ @app.get("/health")
+ async def health_check() -> dict[str, Any]:
+ info = state.get_organism_info()
+ return {
+ "status": "healthy",
+ "organism": info.name,
+ "uptime_seconds": info.uptime_seconds,
+ "management": True,
+ }
+
+ # Include full API router (all endpoints)
+ app.include_router(create_router(state))
+
+ # Include WebSocket endpoints (for dashboard real-time updates)
+ app.include_router(create_websocket_router(state))
+
+ # Audit log endpoints
+ audit_router = _create_audit_router()
+ app.include_router(audit_router)
+
+ # Store state
+ app.state.server_state = state
+ app.state.pump = pump
+
+ # Mount dashboard static files (if directory exists)
+ dashboard_paths = [
+ Path(__file__).parent.parent.parent / "dashboard", # repo root
+ Path("/app/dashboard"), # container path
+ ]
+ for dashboard_dir in dashboard_paths:
+ if dashboard_dir.is_dir():
+ app.mount(
+ "/dashboard",
+ StaticFiles(directory=str(dashboard_dir), html=True),
+ name="dashboard",
+ )
+ break
+
+ return app
+
+
+def _create_audit_router() -> APIRouter:
+ """Create the audit log API router."""
+ router = APIRouter(prefix="/api/v1/audit", tags=["audit"])
+
+ @router.get("/events")
+ async def get_audit_events(
+ event_type: Optional[str] = Query(None, description="Filter by event type"),
+ listener: Optional[str] = Query(None, description="Filter by listener name"),
+ severity: Optional[str] = Query(None, description="Filter by severity"),
+ since: Optional[float] = Query(None, description="Events since timestamp"),
+ limit: int = Query(100, ge=1, le=1000),
+ offset: int = Query(0, ge=0),
+ ) -> dict[str, Any]:
+ """Query audit log events."""
+ from xml_pipeline.server.audit import get_entries
+
+ entries = get_entries(
+ event_type=event_type,
+ listener_name=listener,
+ severity=severity,
+ since=since,
+ limit=limit,
+ offset=offset,
+ )
+ return {"events": entries, "count": len(entries)}
+
+ @router.get("/stats")
+ async def get_audit_stats() -> dict[str, Any]:
+ """Get audit log statistics."""
+ from xml_pipeline.server.audit import get_stats
+
+ return get_stats()
+
+ @router.get("/security")
+ async def get_security_events(
+ limit: int = Query(50, ge=1, le=500),
+ ) -> dict[str, Any]:
+ """Get recent security-related events (warnings and above)."""
+ from xml_pipeline.server.audit import get_entries
+
+ warnings = get_entries(severity="warning", limit=limit)
+ errors = get_entries(severity="error", limit=limit)
+ critical = get_entries(severity="critical", limit=limit)
+
+ all_events = sorted(
+ warnings + errors + critical,
+ key=lambda e: e["timestamp"],
+ reverse=True,
+ )[:limit]
+
+ return {"events": all_events, "count": len(all_events)}
+
+ return router
diff --git a/xml_pipeline/tools/base.py b/xml_pipeline/tools/base.py
index a048069..dc9b8b1 100644
--- a/xml_pipeline/tools/base.py
+++ b/xml_pipeline/tools/base.py
@@ -26,8 +26,22 @@ class Tool:
func: Callable
parameters: Dict[str, Any] = field(default_factory=dict)
- async def invoke(self, **kwargs) -> ToolResult:
- """Invoke the tool with given parameters."""
+ async def invoke(self, *, _listener_name: Optional[str] = None, **kwargs) -> ToolResult:
+ """
+ Invoke the tool with given parameters.
+
+ Args:
+ _listener_name: The invoking listener's name (for permission checks).
+ Prefixed with _ to avoid collision with tool parameters.
+ **kwargs: Tool-specific parameters.
+ """
+ # Permission gate check (container mode)
+ if _listener_name is not None:
+ from xml_pipeline.tools.permission_gate import check_permission
+ denied = check_permission(_listener_name, self.name)
+ if denied is not None:
+ return denied
+
try:
result = await self.func(**kwargs)
if isinstance(result, ToolResult):
diff --git a/xml_pipeline/tools/fetch.py b/xml_pipeline/tools/fetch.py
index afed8cb..44365b2 100644
--- a/xml_pipeline/tools/fetch.py
+++ b/xml_pipeline/tools/fetch.py
@@ -2,6 +2,7 @@
Fetch tool - HTTP requests with security controls.
Uses aiohttp for async HTTP operations.
+In container mode, egress is restricted to allowlisted domains only.
"""
from __future__ import annotations
@@ -34,6 +35,20 @@ BLOCKED_HOSTS = {
"169.254.169.254", # AWS/Azure/GCP metadata
}
+# Container mode flag — when True, egress is restricted via network policy
+_container_mode: bool = False
+
+
+def set_container_mode(enabled: bool) -> None:
+ """Enable or disable container mode (restricts egress to allowlisted domains)."""
+ global _container_mode
+ _container_mode = enabled
+
+
+def is_container_mode() -> bool:
+ """Check if fetch is in container mode."""
+ return _container_mode
+
def _is_private_ip(hostname: str) -> bool:
"""Check if hostname resolves to a private/internal IP."""
@@ -117,6 +132,13 @@ async def fetch_url(
if error := _validate_url(url, allow_internal):
return ToolResult(success=False, error=error)
+ # Container mode: enforce network egress policy
+ if _container_mode:
+ from xml_pipeline.tools.network_policy import check_egress
+
+ if egress_error := check_egress(url):
+ return ToolResult(success=False, error=egress_error)
+
# Validate method
method = method.upper()
allowed_methods = {"GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"}
diff --git a/xml_pipeline/tools/network_policy.py b/xml_pipeline/tools/network_policy.py
new file mode 100644
index 0000000..02cc6bd
--- /dev/null
+++ b/xml_pipeline/tools/network_policy.py
@@ -0,0 +1,109 @@
+"""
+Network policy — Egress control for container mode.
+
+Default-deny: only declared LLM backend domains and explicitly
+allowlisted domains can be reached.
+
+In development mode, no restrictions apply.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Optional
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+
+# Module state
+_egress_enabled: bool = False
+_allowed_domains: set[str] = set()
+
+# LLM provider domains that are auto-allowlisted based on configured backends
+LLM_PROVIDER_DOMAINS: dict[str, list[str]] = {
+ "xai": ["api.x.ai"],
+ "anthropic": ["api.anthropic.com"],
+ "openai": ["api.openai.com"],
+ "ollama": ["localhost", "127.0.0.1"],
+}
+
+
+def enable_egress_control() -> None:
+ """Enable egress control (deny by default)."""
+ global _egress_enabled
+ _egress_enabled = True
+
+
+def disable_egress_control() -> None:
+ """Disable egress control (allow all)."""
+ global _egress_enabled
+ _egress_enabled = False
+
+
+def is_egress_controlled() -> bool:
+ """Check if egress control is active."""
+ return _egress_enabled
+
+
+def allow_domain(domain: str) -> None:
+ """Add a domain to the egress allowlist."""
+ _allowed_domains.add(domain.lower())
+
+
+def allow_domains(domains: list[str]) -> None:
+ """Add multiple domains to the egress allowlist."""
+ for domain in domains:
+ _allowed_domains.add(domain.lower())
+
+
+def allow_llm_provider(provider: str) -> None:
+ """Auto-allowlist domains for an LLM provider."""
+ domains = LLM_PROVIDER_DOMAINS.get(provider, [])
+ for domain in domains:
+ _allowed_domains.add(domain.lower())
+ logger.debug(f"Auto-allowlisted domain for {provider}: {domain}")
+
+
+def check_egress(url: str) -> Optional[str]:
+ """
+ Check if a URL is allowed by the egress policy.
+
+ Returns None if allowed, or an error message if blocked.
+ """
+ if not _egress_enabled:
+ return None
+
+ try:
+ parsed = urlparse(url)
+ hostname = (parsed.hostname or "").lower()
+ except Exception:
+ return "Invalid URL"
+
+ if not hostname:
+ return "URL must have a host"
+
+ # Check exact domain match
+ if hostname in _allowed_domains:
+ return None
+
+ # Check wildcard subdomain match (e.g., *.example.com)
+ parts = hostname.split(".")
+ for i in range(1, len(parts)):
+ parent = ".".join(parts[i:])
+ if f"*.{parent}" in _allowed_domains or parent in _allowed_domains:
+ return None
+
+ logger.warning(f"Egress blocked: {hostname} not in allowlist")
+ return f"Egress blocked: domain '{hostname}' is not in the allowed domains list"
+
+
+def get_allowed_domains() -> set[str]:
+ """Get the current set of allowed domains."""
+ return set(_allowed_domains)
+
+
+def reset() -> None:
+ """Reset network policy state (for testing)."""
+ global _egress_enabled
+ _egress_enabled = False
+ _allowed_domains.clear()
diff --git a/xml_pipeline/tools/permission_gate.py b/xml_pipeline/tools/permission_gate.py
new file mode 100644
index 0000000..75c0028
--- /dev/null
+++ b/xml_pipeline/tools/permission_gate.py
@@ -0,0 +1,92 @@
+"""
+Permission gate — Per-listener tool allowlist enforcement.
+
+In container mode, handlers get NO tools unless explicitly declared
+in their listener config via `allowed_tools`.
+
+In development mode, all tools are available to all handlers (current behavior).
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+from xml_pipeline.tools.base import ToolResult
+
+logger = logging.getLogger(__name__)
+
+# Module state
+_gate_enabled: bool = False
+_listener_allowlists: dict[str, set[str]] = {}
+
+
+def enable_permission_gate() -> None:
+ """Enable permission gate (tools require explicit allowlist)."""
+ global _gate_enabled
+ _gate_enabled = True
+
+
+def disable_permission_gate() -> None:
+ """Disable permission gate (all tools available)."""
+ global _gate_enabled
+ _gate_enabled = False
+
+
+def is_gate_enabled() -> bool:
+ """Check if permission gate is active."""
+ return _gate_enabled
+
+
+def register_listener_tools(listener_name: str, allowed_tools: list[str]) -> None:
+ """
+ Register the tool allowlist for a listener.
+
+ Args:
+ listener_name: The listener's registered name
+ allowed_tools: List of tool names this listener may invoke
+ """
+ _listener_allowlists[listener_name] = set(allowed_tools)
+ if allowed_tools:
+ logger.debug(f"Listener '{listener_name}' tools: {allowed_tools}")
+
+
+def check_permission(listener_name: str, tool_name: str) -> Optional[ToolResult]:
+ """
+ Check if a listener is allowed to invoke a tool.
+
+ Returns None if allowed, or a ToolResult error if denied.
+ """
+ if not _gate_enabled:
+ return None
+
+ allowed = _listener_allowlists.get(listener_name)
+ if allowed is None:
+ # No allowlist registered — deny by default in container mode
+ logger.warning(
+ f"Permission denied: listener '{listener_name}' has no tool allowlist, "
+ f"attempted to use '{tool_name}'"
+ )
+ return ToolResult(
+ success=False,
+ error="Tool access denied. No tools are configured for this listener.",
+ )
+
+ if tool_name not in allowed:
+ logger.warning(
+ f"Permission denied: listener '{listener_name}' "
+ f"not allowed to use tool '{tool_name}'"
+ )
+ return ToolResult(
+ success=False,
+ error=f"Tool '{tool_name}' is not in the allowed tools for this listener.",
+ )
+
+ return None
+
+
+def reset() -> None:
+ """Reset permission gate state (for testing)."""
+ global _gate_enabled
+ _gate_enabled = False
+ _listener_allowlists.clear()
diff --git a/xml_pipeline/tools/shell.py b/xml_pipeline/tools/shell.py
index 39bc21c..b41d4cc 100644
--- a/xml_pipeline/tools/shell.py
+++ b/xml_pipeline/tools/shell.py
@@ -2,6 +2,7 @@
Shell tool - sandboxed command execution.
Provides controlled command execution with security restrictions.
+In container mode, shell is disabled entirely.
"""
from __future__ import annotations
@@ -13,6 +14,21 @@ from typing import Optional, List
from .base import tool, ToolResult
+# Container mode flag — when True, all shell commands are rejected
+_container_mode: bool = False
+
+
+def set_container_mode(enabled: bool) -> None:
+ """Enable or disable container mode (disables shell entirely)."""
+ global _container_mode
+ _container_mode = enabled
+
+
+def is_container_mode() -> bool:
+ """Check if shell is in container mode (disabled)."""
+ return _container_mode
+
+
# Security configuration
ALLOWED_COMMANDS: List[str] = [] # Empty = check blocklist only
BLOCKED_COMMANDS: List[str] = [
@@ -106,6 +122,13 @@ async def run_command(
- Timeout enforced
- Output size limited to 1 MB
"""
+ # Container mode: shell disabled entirely
+ if _container_mode:
+ return ToolResult(
+ success=False,
+ error="Shell access is disabled in container mode.",
+ )
+
# Validate command
if error := _validate_command(command):
return ToolResult(success=False, error=error)