Add Premium Librarian MVP for codebase intelligence
Implements an RLM-powered codebase intelligence system that: - Ingests git repositories and chunks code intelligently - Stores chunks in eXist-db for RAG retrieval - Answers natural language queries using LLM synthesis New package xml_pipeline/librarian/ with: - chunker.py: AST-based code chunking (Python, JS/TS, C++) - ingest.py: Git clone + file walking + chunk storage - index.py: Structural index building (files, functions, classes) - query.py: RAG search + LLM synthesis with source citations - primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.) - handler.py: Message handlers for organism integration Also adds GitPython and aiohttp as optional [librarian] dependencies. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d0d78a9f70
commit
ce8a9ae0e7
10 changed files with 3021 additions and 1 deletions
|
|
@ -77,6 +77,9 @@ openai = ["openai>=1.0"]
|
||||||
redis = ["redis>=5.0"] # Distributed key-value store
|
redis = ["redis>=5.0"] # Distributed key-value store
|
||||||
search = ["duckduckgo-search>=6.0"] # Web search tool
|
search = ["duckduckgo-search>=6.0"] # Web search tool
|
||||||
|
|
||||||
|
# Premium Librarian (codebase intelligence)
|
||||||
|
librarian = ["gitpython>=3.1", "aiohttp>=3.9"]
|
||||||
|
|
||||||
# Console example (optional, for interactive use)
|
# Console example (optional, for interactive use)
|
||||||
console = ["prompt_toolkit>=3.0"]
|
console = ["prompt_toolkit>=3.0"]
|
||||||
|
|
||||||
|
|
@ -91,7 +94,7 @@ server = [
|
||||||
llm = ["xml-pipeline[anthropic,openai]"]
|
llm = ["xml-pipeline[anthropic,openai]"]
|
||||||
|
|
||||||
# All tools
|
# All tools
|
||||||
tools = ["xml-pipeline[redis,search]"]
|
tools = ["xml-pipeline[redis,search,librarian]"]
|
||||||
|
|
||||||
# Everything (for local development)
|
# Everything (for local development)
|
||||||
all = ["xml-pipeline[llm,tools,console,server]"]
|
all = ["xml-pipeline[llm,tools,console,server]"]
|
||||||
|
|
|
||||||
375
tests/test_librarian_chunker.py
Normal file
375
tests/test_librarian_chunker.py
Normal file
|
|
@ -0,0 +1,375 @@
|
||||||
|
"""
|
||||||
|
Tests for the Premium Librarian code chunker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.chunker import (
|
||||||
|
Chunk,
|
||||||
|
chunk_file,
|
||||||
|
chunk_python,
|
||||||
|
chunk_javascript,
|
||||||
|
chunk_cpp,
|
||||||
|
chunk_prose,
|
||||||
|
chunk_generic,
|
||||||
|
detect_language,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLanguageDetection:
    """Tests for language detection from file paths."""

    def test_python_detection(self) -> None:
        """Source and stub files both map to "python"."""
        for path in ("foo.py", "path/to/module.py", "types.pyi"):
            assert detect_language(path) == "python"

    def test_javascript_detection(self) -> None:
        """Classic, JSX, and ES-module extensions map to "javascript"."""
        for path in ("app.js", "component.jsx", "index.mjs"):
            assert detect_language(path) == "javascript"

    def test_typescript_detection(self) -> None:
        """TS and TSX files map to "typescript"."""
        for path in ("app.ts", "component.tsx"):
            assert detect_language(path) == "typescript"

    def test_cpp_detection(self) -> None:
        """Common C++ source and header extensions map to "cpp"."""
        for path in ("main.cpp", "header.hpp", "source.cc"):
            assert detect_language(path) == "cpp"

    def test_c_detection(self) -> None:
        """Plain C files and headers map to "c"."""
        for path in ("main.c", "header.h"):
            assert detect_language(path) == "c"

    def test_unknown_language(self) -> None:
        """Unrecognized or missing extensions report "unknown"."""
        for path in ("data.xyz", "noextension"):
            assert detect_language(path) == "unknown"

    def test_case_insensitive(self) -> None:
        """Extension matching ignores case."""
        assert detect_language("Module.PY") == "python"
        assert detect_language("APP.JS") == "javascript"
||||||
|
|
||||||
|
class TestPythonChunker:
    """Tests for Python AST-based chunking."""

    def test_simple_function(self) -> None:
        """A single top-level function becomes one 'function' chunk."""
        code = '''
def hello(name: str) -> str:
    """Say hello."""
    return f"Hello, {name}!"
'''
        chunks = chunk_python(code, "test.py")
        assert len(chunks) == 1
        assert chunks[0].name == "hello"
        assert chunks[0].chunk_type == "function"
        assert chunks[0].docstring == "Say hello."
        # The extracted signature should carry the type annotations.
        assert "str" in chunks[0].signature

    def test_async_function(self) -> None:
        """Async defs chunk like functions, with 'async' in the signature."""
        code = '''
async def fetch_data(url: str) -> dict:
    """Fetch data from URL."""
    pass
'''
        chunks = chunk_python(code, "test.py")
        assert len(chunks) == 1
        assert chunks[0].name == "fetch_data"
        assert chunks[0].chunk_type == "function"
        assert "async" in chunks[0].signature

    def test_class_with_methods(self) -> None:
        """A small class is kept together as a single 'class' chunk."""
        code = '''
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        """Subtract two numbers."""
        return a - b
'''
        chunks = chunk_python(code, "test.py")
        # Should create a class chunk (small enough to keep together)
        assert len(chunks) >= 1
        class_chunk = [c for c in chunks if c.chunk_type == "class"]
        assert len(class_chunk) == 1
        assert class_chunk[0].name == "Calculator"
        assert class_chunk[0].docstring == "A simple calculator."

    def test_imports_extracted(self) -> None:
        """Module-level imports are attached to each chunk as context."""
        code = '''
import os
from typing import Optional, List

def process():
    pass
'''
        chunks = chunk_python(code, "test.py")
        assert len(chunks) == 1
        assert "import os" in chunks[0].imports
        assert any("from typing import" in imp for imp in chunks[0].imports)

    def test_empty_file(self) -> None:
        """An empty file yields no chunks."""
        chunks = chunk_python("", "test.py")
        assert len(chunks) == 0

    def test_module_with_only_imports(self) -> None:
        """Files without defs yield either nothing or one module chunk."""
        code = '''
import os
import sys
'''
        chunks = chunk_python(code, "test.py")
        # Should create a module chunk for files with no functions/classes
        assert len(chunks) == 0 or chunks[0].chunk_type == "module"

    def test_syntax_error_fallback(self) -> None:
        """Unparseable source must not raise; it falls back to generic chunking."""
        code = '''
def broken(
    # Missing closing paren
'''
        chunks = chunk_python(code, "test.py")
        # Should fall back to generic chunking
        assert len(chunks) >= 0  # May or may not produce chunks
|
||||||
|
class TestJavaScriptChunker:
    """Tests for JavaScript regex-based chunking."""

    def test_function_declaration(self) -> None:
        """A plain function declaration becomes one 'function' chunk."""
        code = '''
function greet(name) {
    return `Hello, ${name}!`;
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "greet"
        assert chunks[0].chunk_type == "function"

    def test_async_function(self) -> None:
        """Async function declarations are detected and named."""
        code = '''
async function fetchData(url) {
    const response = await fetch(url);
    return response.json();
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "fetchData"

    def test_arrow_function(self) -> None:
        """A const-bound arrow function counts as a named function."""
        code = '''
const multiply = (a, b) => {
    return a * b;
};
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "multiply"
        assert chunks[0].chunk_type == "function"

    def test_class_definition(self) -> None:
        """A class body with methods yields exactly one 'class' chunk."""
        code = '''
class Calculator {
    constructor() {
        this.result = 0;
    }

    add(value) {
        this.result += value;
        return this;
    }
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) >= 1
        class_chunks = [c for c in chunks if c.chunk_type == "class"]
        assert len(class_chunks) == 1
        assert class_chunks[0].name == "Calculator"

    def test_export_function(self) -> None:
        """The 'export' keyword does not hide the function name."""
        code = '''
export function exportedFunc() {
    return 42;
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "exportedFunc"

    def test_imports_extracted(self) -> None:
        """ES imports (and require calls) are captured as chunk context."""
        code = '''
import React from 'react';
import { useState } from 'react';
const lodash = require('lodash');

function Component() {
    return null;
}
'''
        chunks = chunk_javascript(code, "test.jsx")
        assert len(chunks) >= 1
        assert any("import React" in imp for imp in chunks[0].imports)
|
||||||
|
class TestCppChunker:
    """Tests for C++ regex-based chunking."""

    def test_function_definition(self) -> None:
        """A free function definition yields one 'function' chunk."""
        code = '''
int add(int a, int b) {
    return a + b;
}
'''
        chunks = chunk_cpp(code, "test.cpp")
        assert len(chunks) >= 1
        func_chunks = [c for c in chunks if c.chunk_type == "function"]
        assert len(func_chunks) == 1
        assert func_chunks[0].name == "add"

    def test_class_definition(self) -> None:
        """A class declaration yields one 'class' chunk."""
        code = '''
class Calculator {
public:
    int add(int a, int b);
    int subtract(int a, int b);
};
'''
        chunks = chunk_cpp(code, "test.cpp")
        assert len(chunks) >= 1
        class_chunks = [c for c in chunks if c.chunk_type == "class"]
        assert len(class_chunks) == 1
        assert class_chunks[0].name == "Calculator"

    def test_includes_extracted(self) -> None:
        """#include directives (angle and quoted forms) become chunk context."""
        code = '''
#include <iostream>
#include "myheader.h"

int main() {
    return 0;
}
'''
        chunks = chunk_cpp(code, "test.cpp")
        assert len(chunks) >= 1
        assert any("#include <iostream>" in imp for imp in chunks[0].imports)
|
||||||
|
class TestProseChunker:
    """Tests for prose document chunking."""

    def test_markdown_headings(self) -> None:
        """Markdown splits at headings; chunks are named after their heading."""
        content = '''# Introduction

This is the introduction section.

## Getting Started

Follow these steps to get started.

## Advanced Topics

More advanced content here.
'''
        chunks = chunk_prose(content, "readme.md", "markdown")
        assert len(chunks) >= 2
        # First chunk should be introduction
        assert chunks[0].name == "Introduction"

    def test_empty_document(self) -> None:
        """An empty document yields no chunks."""
        chunks = chunk_prose("", "empty.md", "markdown")
        assert len(chunks) == 0
|
||||||
|
class TestGenericChunker:
    """Tests for generic line-based chunking."""

    def test_small_file(self) -> None:
        """A short file is returned whole as exactly one chunk."""
        text = "line1\nline2\nline3"
        result = chunk_generic(text, "test.txt", "text")
        assert len(result) == 1
        assert result[0].content == text

    def test_empty_file(self) -> None:
        """An empty file yields no chunks at all."""
        result = chunk_generic("", "empty.txt", "text")
        assert len(result) == 0
|
||||||
|
class TestChunkFile:
    """Tests for the main chunk_file dispatcher."""

    def test_dispatches_to_python(self) -> None:
        """.py files go through the Python chunker."""
        for piece in chunk_file("def foo(): pass", "test.py"):
            assert piece.language == "python"

    def test_dispatches_to_javascript(self) -> None:
        """.js files go through the JavaScript chunker."""
        for piece in chunk_file("function foo() {}", "test.js"):
            assert piece.language == "javascript"

    def test_dispatches_to_cpp(self) -> None:
        """.cpp files go through the C++ chunker."""
        for piece in chunk_file("int main() { return 0; }", "test.cpp"):
            assert piece.language == "cpp"

    def test_unknown_language_uses_generic(self) -> None:
        """Unrecognized extensions fall back to the generic chunker."""
        for piece in chunk_file("some content", "test.xyz"):
            assert piece.language == "unknown"
|
||||||
|
class TestChunkProperties:
    """Tests for Chunk dataclass properties."""

    def test_chunk_id_generation(self) -> None:
        """chunk_id is non-empty and embeds the file path and chunk name."""
        chunk = Chunk(
            content="def foo(): pass",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        assert chunk.chunk_id
        assert "test.py" in chunk.chunk_id
        assert "foo" in chunk.chunk_id

    def test_chunk_id_uniqueness(self) -> None:
        """Same location and name, different content -> distinct IDs."""
        chunk1 = Chunk(
            content="def foo(): pass",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        chunk2 = Chunk(
            content="def foo(): return 1",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        # Different content should produce different IDs
        assert chunk1.chunk_id != chunk2.chunk_id

    def test_line_count(self) -> None:
        """line_count is inclusive of both the start and the end line."""
        chunk = Chunk(
            content="line1\nline2\nline3",
            file_path="test.py",
            start_line=10,
            end_line=12,
            chunk_type="block",
            name="test",
            language="python",
        )
        assert chunk.line_count == 3
||||||
292
tests/test_librarian_query.py
Normal file
292
tests/test_librarian_query.py
Normal file
|
|
@ -0,0 +1,292 @@
|
||||||
|
"""
|
||||||
|
Integration tests for Premium Librarian query system.
|
||||||
|
|
||||||
|
These tests require:
|
||||||
|
- eXist-db running (for storage)
|
||||||
|
- LLM router configured (for synthesis)
|
||||||
|
|
||||||
|
Mark with @pytest.mark.integration to skip in CI without dependencies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.query import (
|
||||||
|
QueryResult,
|
||||||
|
Source,
|
||||||
|
RetrievedChunk,
|
||||||
|
_build_rag_prompt,
|
||||||
|
format_sources_xml,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.index import LibraryIndex
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildRagPrompt:
    """Tests for RAG prompt construction."""

    def test_builds_prompt_with_context(self) -> None:
        """The prompt names the library, question, files, numbered refs, and code fences."""
        chunks = [
            RetrievedChunk(
                chunk_id="test:foo:abc123",
                file_path="src/utils.py",
                name="calculate",
                chunk_type="function",
                language="python",
                start_line=10,
                end_line=20,
                content="def calculate(x): return x * 2",
                docstring="Calculate double.",
                signature="def calculate(x) -> int",
                score=0.9,
            ),
            RetrievedChunk(
                chunk_id="test:bar:def456",
                file_path="src/main.py",
                name="main",
                chunk_type="function",
                language="python",
                start_line=1,
                end_line=5,
                content="def main(): print('hello')",
                docstring="",
                signature="def main()",
                score=0.7,
            ),
        ]

        prompt = _build_rag_prompt(
            question="How does the calculate function work?",
            chunks=chunks,
            library_name="test-lib",
        )

        # Verify prompt structure
        assert "test-lib" in prompt
        assert "calculate function" in prompt
        assert "src/utils.py" in prompt
        assert "src/main.py" in prompt
        # Chunks are cited with 1-based bracket indices.
        assert "[1]" in prompt
        assert "[2]" in prompt
        assert "```python" in prompt

    def test_truncates_long_content(self) -> None:
        """Chunk bodies over the limit are cut and marked '(truncated)'."""
        long_content = "x" * 3000  # Longer than 2000 char limit
        chunks = [
            RetrievedChunk(
                chunk_id="test:long:123",
                file_path="long.py",
                name="long_func",
                chunk_type="function",
                language="python",
                start_line=1,
                end_line=100,
                content=long_content,
                docstring="",
                signature="",
                score=0.5,
            ),
        ]

        prompt = _build_rag_prompt("What?", chunks, "lib")

        # Content should be truncated
        assert "(truncated)" in prompt
        # Should not contain full content
        assert long_content not in prompt

    def test_empty_chunks_list(self) -> None:
        """An empty retrieval still produces a prompt with library and question."""
        prompt = _build_rag_prompt("What?", [], "lib")
        assert "lib" in prompt
        assert "Question" in prompt
||||||
|
|
||||||
|
class TestFormatSourcesXml:
    """Tests for XML source formatting."""

    def test_formats_sources_as_xml(self) -> None:
        """Each source becomes an indexed <source> element with its fields."""
        sources = [
            Source(
                file_path="src/app.py",
                name="process",
                chunk_type="function",
                start_line=10,
                end_line=25,
                relevance_score=0.95,
                snippet="def process(data): ...",
            ),
        ]

        xml = format_sources_xml(sources)

        assert "<sources>" in xml
        assert "</sources>" in xml
        assert "<source index=\"1\">" in xml
        assert "<file-path>src/app.py</file-path>" in xml
        assert "<name>process</name>" in xml
        assert "<type>function</type>" in xml
        assert "<lines>10-25</lines>" in xml
        assert "<score>0.95</score>" in xml

    def test_escapes_special_characters(self) -> None:
        """Sources containing markup characters still format without error."""
        sources = [
            Source(
                file_path="src/<special>.py",
                name="func&name",
                chunk_type="function",
                start_line=1,
                end_line=1,
                relevance_score=0.5,
                snippet="code with <tags> & entities",
            ),
        ]

        xml = format_sources_xml(sources)

        # XML entities should be escaped
        # NOTE(review): these assertions check for the RAW "<special>" and
        # "func&name" text, but an escaping formatter would emit
        # "&lt;special&gt;" / "func&amp;name" instead — the assertions
        # contradict the comment above. Confirm against format_sources_xml
        # whether escaping is actually intended here.
        assert "<special>" in xml
        assert "func&name" in xml

    def test_empty_sources_list(self) -> None:
        """No sources still yields a well-formed empty <sources> wrapper."""
        xml = format_sources_xml([])

        assert "<sources>" in xml
        assert "</sources>" in xml
||||||
|
class TestQueryResultDataclass:
    """Tests for QueryResult dataclass."""

    def test_default_values(self) -> None:
        """Only 'answer' is required; all other fields default to neutral values."""
        result = QueryResult(answer="Test answer")

        assert result.answer == "Test answer"
        assert result.sources == []
        assert result.tokens_used == 0
        assert result.chunks_examined == 0
        assert result.error == ""

    def test_with_sources(self) -> None:
        """Explicitly supplied sources and counters are stored as given."""
        sources = [
            Source(
                file_path="test.py",
                name="test",
                chunk_type="function",
                start_line=1,
                end_line=10,
                relevance_score=0.9,
            ),
        ]

        result = QueryResult(
            answer="Test answer",
            sources=sources,
            tokens_used=100,
            chunks_examined=5,
        )

        assert len(result.sources) == 1
        assert result.tokens_used == 100
        assert result.chunks_examined == 5
|
||||||
|
|
||||||
|
class TestRetrievedChunk:
    """Tests for RetrievedChunk dataclass."""

    def test_all_fields(self) -> None:
        """A fully populated chunk round-trips every constructor argument."""
        chunk = RetrievedChunk(
            chunk_id="lib:file:hash",
            file_path="src/module.py",
            name="my_function",
            chunk_type="function",
            language="python",
            start_line=10,
            end_line=20,
            content="def my_function(): pass",
            docstring="Does something.",
            signature="def my_function() -> None",
            score=0.85,
        )

        assert chunk.chunk_id == "lib:file:hash"
        assert chunk.file_path == "src/module.py"
        assert chunk.name == "my_function"
        assert chunk.language == "python"
        assert chunk.score == 0.85
|
||||||
|
|
||||||
|
@pytest.mark.integration
class TestQueryLibraryIntegration:
    """Integration tests requiring eXist-db and LLM."""

    async def test_query_nonexistent_library(self) -> None:
        """Query should return error for non-existent library."""
        from xml_pipeline.librarian.query import query_library

        # Mock get_index to return None - patch at index module level
        with patch("xml_pipeline.librarian.index.get_index", new_callable=AsyncMock) as mock_get_index:
            mock_get_index.return_value = None

            result = await query_library(
                library_id="nonexistent-lib-xyz",
                question="What does this do?",
            )

            # The failure is reported via the error field, not an exception.
            assert result.error
            assert "not found" in result.error.lower()

    async def test_query_with_no_relevant_chunks(self) -> None:
        """Query should handle case where search returns no results."""
        from xml_pipeline.librarian.query import query_library

        mock_index = LibraryIndex(
            library_id="test-lib",
            name="Test Library",
            source_url="https://example.com/repo",
            created_at="2024-01-01T00:00:00Z",
        )

        # Patch get_index at the index module level (where it's defined)
        # and _search_chunks at query module level
        with patch("xml_pipeline.librarian.index.get_index", new_callable=AsyncMock) as mock_get_index:
            mock_get_index.return_value = mock_index

            with patch("xml_pipeline.librarian.query._search_chunks", new_callable=AsyncMock) as mock_search:
                mock_search.return_value = []

                result = await query_library(
                    library_id="test-lib",
                    question="What does foo do?",
                )

                # Empty retrieval must degrade gracefully, not error out.
                assert "No relevant code found" in result.answer
                assert result.chunks_examined == 0
||||||
|
|
||||||
|
class TestLibraryIndex:
    """Tests for LibraryIndex dataclass."""

    def test_properties(self) -> None:
        """Totals are read out of the 'stats' mapping."""
        index = LibraryIndex(
            library_id="test-id",
            name="Test Lib",
            source_url="https://github.com/test/repo",
            created_at="2024-01-01",
            files=["a.py", "b.py", "c.py"],
            functions={"func1": "a.py", "func2": "b.py"},
            classes={"MyClass": "c.py"},
            stats={"chunks": 10, "files": 3},
        )

        assert index.total_chunks == 10
        assert index.total_files == 3

    def test_empty_stats(self) -> None:
        """Missing stats entries default the totals to zero."""
        index = LibraryIndex(
            library_id="test",
            name="Test",
            source_url="",
            created_at="",
        )

        assert index.total_chunks == 0
        assert index.total_files == 0
|
||||||
103
xml_pipeline/librarian/__init__.py
Normal file
103
xml_pipeline/librarian/__init__.py
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
"""
|
||||||
|
Premium Librarian — RLM-powered codebase intelligence.
|
||||||
|
|
||||||
|
Ingests codebases, chunks them intelligently, stores in eXist-db,
|
||||||
|
and answers natural language queries using Online LLM + RAG.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from xml_pipeline.librarian import ingest_git_repo, query_library
|
||||||
|
|
||||||
|
# Ingest a codebase
|
||||||
|
result = await ingest_git_repo(
|
||||||
|
url="https://github.com/example/repo.git",
|
||||||
|
library_name="my-lib",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Query it
|
||||||
|
answer = await query_library(
|
||||||
|
library_id=result.library_id,
|
||||||
|
question="What does this codebase do?",
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.chunker import (
|
||||||
|
Chunk,
|
||||||
|
chunk_file,
|
||||||
|
chunk_python,
|
||||||
|
chunk_javascript,
|
||||||
|
chunk_cpp,
|
||||||
|
detect_language,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.ingest import (
|
||||||
|
IngestResult,
|
||||||
|
ingest_git_repo,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.index import (
|
||||||
|
LibraryIndex,
|
||||||
|
build_index,
|
||||||
|
get_index,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.query import (
|
||||||
|
Source,
|
||||||
|
QueryResult,
|
||||||
|
query_library,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.primitives import (
|
||||||
|
LibrarianIngest,
|
||||||
|
LibrarianIngested,
|
||||||
|
LibrarianQuery,
|
||||||
|
LibrarianAnswer,
|
||||||
|
LibrarianList,
|
||||||
|
LibrarianLibraries,
|
||||||
|
LibrarianDelete,
|
||||||
|
LibrarianDeleted,
|
||||||
|
LibrarianGetChunk,
|
||||||
|
LibrarianChunk,
|
||||||
|
LibraryInfo,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.handler import (
|
||||||
|
handle_librarian_ingest,
|
||||||
|
handle_librarian_query,
|
||||||
|
handle_librarian_list,
|
||||||
|
handle_librarian_delete,
|
||||||
|
handle_librarian_get_chunk,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Chunker
|
||||||
|
"Chunk",
|
||||||
|
"chunk_file",
|
||||||
|
"chunk_python",
|
||||||
|
"chunk_javascript",
|
||||||
|
"chunk_cpp",
|
||||||
|
"detect_language",
|
||||||
|
# Ingest
|
||||||
|
"IngestResult",
|
||||||
|
"ingest_git_repo",
|
||||||
|
# Index
|
||||||
|
"LibraryIndex",
|
||||||
|
"build_index",
|
||||||
|
"get_index",
|
||||||
|
# Query
|
||||||
|
"Source",
|
||||||
|
"QueryResult",
|
||||||
|
"query_library",
|
||||||
|
# Primitives
|
||||||
|
"LibrarianIngest",
|
||||||
|
"LibrarianIngested",
|
||||||
|
"LibrarianQuery",
|
||||||
|
"LibrarianAnswer",
|
||||||
|
"LibrarianList",
|
||||||
|
"LibrarianLibraries",
|
||||||
|
"LibrarianDelete",
|
||||||
|
"LibrarianDeleted",
|
||||||
|
"LibrarianGetChunk",
|
||||||
|
"LibrarianChunk",
|
||||||
|
"LibraryInfo",
|
||||||
|
# Handlers
|
||||||
|
"handle_librarian_ingest",
|
||||||
|
"handle_librarian_query",
|
||||||
|
"handle_librarian_list",
|
||||||
|
"handle_librarian_delete",
|
||||||
|
"handle_librarian_get_chunk",
|
||||||
|
]
|
||||||
677
xml_pipeline/librarian/chunker.py
Normal file
677
xml_pipeline/librarian/chunker.py
Normal file
|
|
@ -0,0 +1,677 @@
|
||||||
|
"""
|
||||||
|
chunker.py — AST-based code chunking for intelligent RAG retrieval.
|
||||||
|
|
||||||
|
Chunks source files into semantically meaningful units (functions, classes, modules)
|
||||||
|
preserving context like docstrings, signatures, and imports.
|
||||||
|
|
||||||
|
Supported languages:
|
||||||
|
- Python (ast.parse)
|
||||||
|
- JavaScript/TypeScript (regex-based)
|
||||||
|
- C++ (regex-based)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import re
|
||||||
|
import hashlib
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Chunk:
    """One semantically meaningful unit extracted from a source file.

    Carries the raw text plus enough surrounding metadata (imports,
    docstring, signature, enclosing class) for retrieval context.
    """

    content: str
    file_path: str
    start_line: int
    end_line: int
    chunk_type: str  # "function", "class", "method", "module", "block"
    name: str  # function/class name, or the file name for module chunks
    language: str
    imports: list[str] = field(default_factory=list)
    docstring: str = ""
    signature: str = ""  # callable signature, kept as retrieval context
    parent_class: str = ""  # enclosing class name when this is a method

    @property
    def chunk_id(self) -> str:
        """Stable identifier: "<file_path>:<name>:<12-hex content digest>"."""
        digest = hashlib.sha256(self.content.encode()).hexdigest()
        return ":".join((self.file_path, self.name, digest[:12]))

    @property
    def line_count(self) -> int:
        """Inclusive count of source lines spanned by this chunk."""
        return 1 + self.end_line - self.start_line
|
||||||
|
|
||||||
|
|
||||||
|
# File-extension -> language identifier lookup table.
LANGUAGE_MAP = {
    ".py": "python",
    ".pyi": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".c": "c",
    ".h": "c",
    ".cpp": "cpp",
    ".cxx": "cpp",
    ".cc": "cpp",
    ".hpp": "cpp",
    ".hxx": "cpp",
    ".rs": "rust",
    ".go": "go",
    ".java": "java",
    ".kt": "kotlin",
    ".rb": "ruby",
    ".php": "php",
    ".cs": "csharp",
    ".swift": "swift",
    ".scala": "scala",
    ".md": "markdown",
    ".rst": "restructuredtext",
    ".txt": "text",
}

# Chunks longer than this get split by the language-specific chunkers.
MAX_CHUNK_LINES = 500


def detect_language(file_path: str) -> str:
    """Return the language name for *file_path*'s extension.

    Matching is case-insensitive on the suffix; paths with no extension
    or an unrecognized one yield "unknown".
    """
    return LANGUAGE_MAP.get(Path(file_path).suffix.lower(), "unknown")
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_file(content: str, file_path: str) -> list[Chunk]:
|
||||||
|
"""
|
||||||
|
Chunk a file based on detected language.
|
||||||
|
|
||||||
|
Dispatches to language-specific chunker or falls back to
|
||||||
|
line-based chunking for unknown languages.
|
||||||
|
"""
|
||||||
|
language = detect_language(file_path)
|
||||||
|
|
||||||
|
if language == "python":
|
||||||
|
return chunk_python(content, file_path)
|
||||||
|
elif language in ("javascript", "typescript"):
|
||||||
|
return chunk_javascript(content, file_path)
|
||||||
|
elif language in ("c", "cpp"):
|
||||||
|
return chunk_cpp(content, file_path)
|
||||||
|
elif language in ("markdown", "restructuredtext", "text"):
|
||||||
|
return chunk_prose(content, file_path, language)
|
||||||
|
else:
|
||||||
|
# Generic line-based chunking
|
||||||
|
return chunk_generic(content, file_path, language)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_python(content: str, file_path: str) -> list[Chunk]:
    """Chunk Python source using the ast module.

    Each top-level function becomes one chunk and each top-level class
    becomes one or more chunks (see _extract_python_class). Every import
    found anywhere in the file is collected and attached to each chunk as
    context. Files that fail to parse degrade to generic line chunking;
    a file with no top-level definitions becomes a single "module" chunk.
    """
    try:
        tree = ast.parse(content)
    except SyntaxError:
        # Unparsable source: degrade gracefully to line-based chunks.
        return chunk_generic(content, file_path, "python")

    lines = content.splitlines()

    # Gather import statements (including nested ones) as shared context.
    imports: list[str] = []
    for n in ast.walk(tree):
        if isinstance(n, ast.Import):
            imports.extend(f"import {alias.name}" for alias in n.names)
        elif isinstance(n, ast.ImportFrom):
            names = ", ".join(alias.name for alias in n.names)
            imports.append(f"from {n.module or ''} import {names}")

    chunks: list[Chunk] = []
    for top in ast.iter_child_nodes(tree):
        if isinstance(top, (ast.FunctionDef, ast.AsyncFunctionDef)):
            chunks.append(_extract_python_function(top, lines, file_path, imports))
        elif isinstance(top, ast.ClassDef):
            chunks.extend(_extract_python_class(top, lines, file_path, imports))

    # Nothing extracted but the file is non-empty: keep it whole.
    if not chunks and content.strip():
        chunks.append(
            Chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(lines),
                chunk_type="module",
                name=Path(file_path).stem,
                language="python",
                imports=imports,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_python_function(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
    lines: list[str],
    file_path: str,
    imports: list[str],
    parent_class: str = "",
) -> Chunk:
    """Extract a Python function or method as a single Chunk.

    Args:
        node: The function's AST node.
        lines: Full file source, split into lines (1-indexed via lineno).
        file_path: Path of the source file (stored on the chunk).
        imports: Module-level import context to attach.
        parent_class: Enclosing class name, if this is a method.

    Returns:
        A Chunk of type "method" (when parent_class is set) or "function",
        carrying the source slice, docstring, and a rendered signature.
    """
    start_line = node.lineno
    end_line = node.end_lineno or start_line

    # Slice raw source; AST line numbers are 1-indexed.
    content = "\n".join(lines[start_line - 1 : end_line])

    docstring = ast.get_docstring(node) or ""

    def _fmt(arg: ast.arg) -> str:
        # Render "name: annotation", silently dropping annotations that
        # cannot be unparsed (e.g. exotic nodes on older interpreters).
        text = arg.arg
        if arg.annotation is not None:
            try:
                text += f": {ast.unparse(arg.annotation)}"
            except Exception:
                pass
        return text

    # Build the complete parameter list. The previous version only
    # rendered node.args.args, dropping positional-only parameters,
    # *args, keyword-only parameters, and **kwargs from the signature.
    a = node.args
    parts: list[str] = [_fmt(p) for p in a.posonlyargs]
    if a.posonlyargs:
        parts.append("/")
    parts.extend(_fmt(p) for p in a.args)
    if a.vararg is not None:
        parts.append(f"*{_fmt(a.vararg)}")
    elif a.kwonlyargs:
        # Bare "*" separator when there are kw-only params but no *args.
        parts.append("*")
    parts.extend(_fmt(p) for p in a.kwonlyargs)
    if a.kwarg is not None:
        parts.append(f"**{_fmt(a.kwarg)}")

    returns = ""
    if node.returns is not None:
        try:
            returns = f" -> {ast.unparse(node.returns)}"
        except Exception:
            pass

    async_prefix = "async " if isinstance(node, ast.AsyncFunctionDef) else ""
    signature = f"{async_prefix}def {node.name}({', '.join(parts)}){returns}"

    chunk_type = "method" if parent_class else "function"

    return Chunk(
        content=content,
        file_path=file_path,
        start_line=start_line,
        end_line=end_line,
        chunk_type=chunk_type,
        name=node.name,
        language="python",
        imports=imports,
        docstring=docstring,
        signature=signature,
        parent_class=parent_class,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_python_class(
    node: ast.ClassDef,
    lines: list[str],
    file_path: str,
    imports: list[str],
) -> list[Chunk]:
    """Extract a Python class and its methods as chunks.

    Small classes (<= MAX_CHUNK_LINES source lines) become one "class"
    chunk. Larger classes become a "class" header chunk plus one
    "method" chunk per direct method (via _extract_python_function).
    """
    chunks: list[Chunk] = []

    start_line = node.lineno
    end_line = node.end_lineno or start_line

    # Full class source; AST line numbers are 1-indexed.
    class_lines = lines[start_line - 1 : end_line]
    class_content = "\n".join(class_lines)

    # Class docstring (empty string when absent).
    docstring = ast.get_docstring(node) or ""

    # Build class signature with base classes; unparse failures are
    # silently skipped so one odd base doesn't lose the whole signature.
    bases = []
    for base in node.bases:
        try:
            bases.append(ast.unparse(base))
        except Exception:
            pass

    base_str = f"({', '.join(bases)})" if bases else ""
    signature = f"class {node.name}{base_str}"

    # If class is small enough, keep as single chunk
    if len(class_lines) <= MAX_CHUNK_LINES:
        chunks.append(
            Chunk(
                content=class_content,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                chunk_type="class",
                name=node.name,
                language="python",
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )
    else:
        # Large class: chunk into class header + individual methods.
        # Default header span is ~50 lines; shortened to end just before
        # the first method when one exists.
        header_end = start_line + min(50, len(class_lines) - 1)

        for child in node.body:
            if isinstance(child, ast.FunctionDef | ast.AsyncFunctionDef):
                header_end = child.lineno - 1
                break

        # Header chunk: class statement, docstring, class-level attributes.
        header_lines = lines[start_line - 1 : header_end]
        chunks.append(
            Chunk(
                content="\n".join(header_lines),
                file_path=file_path,
                start_line=start_line,
                end_line=header_end,
                chunk_type="class",
                name=node.name,
                language="python",
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )

        # One chunk per direct method; parent_class marks them as methods.
        # NOTE(review): non-method statements between methods (nested
        # classes, assignments) are not captured by any chunk here.
        for child in node.body:
            if isinstance(child, ast.FunctionDef | ast.AsyncFunctionDef):
                method_chunk = _extract_python_function(
                    child, lines, file_path, imports, parent_class=node.name
                )
                chunks.append(method_chunk)

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_javascript(content: str, file_path: str) -> list[Chunk]:
    """
    Regex-based JavaScript/TypeScript chunking.

    Extracts:
    - Function declarations
    - Arrow functions assigned to const/let/var
    - Class definitions

    Each definition's chunk runs from its own first line to the line
    before the next definition (so nested/local helpers are folded into
    the enclosing chunk). A preceding JSDoc block, when found, is stored
    as the chunk's docstring.
    """
    chunks: list[Chunk] = []
    lines = content.splitlines()
    language = detect_language(file_path)

    # Collect ES-module imports and CommonJS require() assignments
    # (line-anchored via MULTILINE) as shared context for every chunk.
    imports: list[str] = []
    import_pattern = re.compile(
        r'^(?:import\s+.*?from\s+[\'"].*?[\'"]|import\s+[\'"].*?[\'"]|'
        r'const\s+\w+\s*=\s*require\([\'"].*?[\'"]\))',
        re.MULTILINE,
    )
    for match in import_pattern.finditer(content):
        imports.append(match.group(0))

    # Function pattern: function name(...) or async function name(...),
    # optionally exported. Anchored at line start, so indented (nested or
    # method) functions are intentionally not matched.
    func_pattern = re.compile(
        r"^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\([^)]*\)",
        re.MULTILINE,
    )

    # Arrow function pattern: const name = (...) => or const name = async (...) =>
    # NOTE(review): requires parenthesized params — `const f = x => ...`
    # (single bare parameter) is not matched.
    arrow_pattern = re.compile(
        r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>",
        re.MULTILINE,
    )

    # Class pattern (optionally export/default).
    class_pattern = re.compile(
        r"^(?:export\s+)?(?:default\s+)?class\s+(\w+)",
        re.MULTILINE,
    )

    # Find all definitions and their positions
    definitions: list[tuple[int, str, str, str]] = []  # (line, type, name, signature)

    for match in func_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "function", match.group(1), match.group(0)))

    for match in arrow_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "function", match.group(1), match.group(0)))

    for match in class_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "class", match.group(1), match.group(0)))

    # Sort by line number so chunks appear in source order.
    definitions.sort(key=lambda x: x[0])

    # Create chunks
    for i, (start_line, chunk_type, name, signature) in enumerate(definitions):
        # End line is start of next definition - 1, or end of file
        if i + 1 < len(definitions):
            end_line = definitions[i + 1][0] - 1
        else:
            end_line = len(lines)

        # Trim trailing empty lines
        while end_line > start_line and not lines[end_line - 1].strip():
            end_line -= 1

        chunk_lines = lines[start_line - 1 : end_line]
        chunk_content = "\n".join(chunk_lines)

        # Extract JSDoc comment if present: scan backwards (up to ~20
        # lines) from the line just above the definition for the "/**"
        # opener, prepending lines as we go to preserve order.
        docstring = ""
        if start_line > 1:
            prev_line = lines[start_line - 2].strip()
            if prev_line.endswith("*/"):
                # Look back for JSDoc start
                doc_lines = []
                for j in range(start_line - 2, max(0, start_line - 20), -1):
                    doc_lines.insert(0, lines[j])
                    if "/**" in lines[j]:
                        break
                docstring = "\n".join(doc_lines)

        chunks.append(
            Chunk(
                content=chunk_content,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                chunk_type=chunk_type,
                name=name,
                language=language,
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )

    # If no definitions were found, fall back to one whole-file chunk.
    if not chunks and content.strip():
        chunks.append(
            Chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(lines),
                chunk_type="module",
                name=Path(file_path).stem,
                language=language,
                imports=imports,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_cpp(content: str, file_path: str) -> list[Chunk]:
    """
    Regex-based C/C++ chunking.

    Extracts:
    - Function definitions
    - Class definitions
    - Struct definitions

    Function chunks are tightened to their closing brace when a matched
    brace pair can be found; otherwise a chunk runs to the line before
    the next detected definition. A preceding Doxygen comment block is
    stored as the chunk's docstring.
    """
    chunks: list[Chunk] = []
    lines = content.splitlines()
    language = detect_language(file_path)

    # Collect #include directives as shared context for every chunk.
    imports: list[str] = []
    include_pattern = re.compile(r'^#include\s+[<"].*?[>"]', re.MULTILINE)
    for match in include_pattern.finditer(content):
        imports.append(match.group(0))

    # Function pattern (simplified): return_type name(params) {
    # This is a simplified pattern that won't catch all cases
    # (e.g. multi-line parameter lists, operators, pointer returns
    # with unusual spacing).
    func_pattern = re.compile(
        r"^(?:(?:static|inline|virtual|explicit|constexpr|template\s*<[^>]*>\s*)*"
        r"(?:\w+(?:::\w+)*\s+)+)"  # Return type
        r"(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:override\s*)?(?:noexcept\s*)?[{;]",
        re.MULTILINE,
    )

    # Class/struct pattern, with optional leading template<...>.
    class_pattern = re.compile(
        r"^(?:template\s*<[^>]*>\s*)?(?:class|struct)\s+(\w+)",
        re.MULTILINE,
    )

    # (line, type, name, signature) tuples, later sorted by line.
    definitions: list[tuple[int, str, str, str]] = []

    for match in func_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        name = match.group(1)
        # Skip common false positives: control-flow keywords that the
        # loose function regex can mistake for a function name.
        if name not in ("if", "while", "for", "switch", "return"):
            definitions.append((line_num, "function", name, match.group(0).strip()))

    for match in class_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "class", match.group(1), match.group(0)))

    definitions.sort(key=lambda x: x[0])

    # Create chunks (similar to JS)
    for i, (start_line, chunk_type, name, signature) in enumerate(definitions):
        if i + 1 < len(definitions):
            end_line = definitions[i + 1][0] - 1
        else:
            end_line = len(lines)

        # Trim trailing empty lines.
        while end_line > start_line and not lines[end_line - 1].strip():
            end_line -= 1

        # For functions, try to find matching brace so the chunk ends at
        # the function body's closing "}" rather than the next definition.
        # NOTE(review): counts braces inside string literals and comments
        # too, so unbalanced braces in those can skew the end line.
        if chunk_type == "function":
            brace_count = 0
            found_open = False
            for j in range(start_line - 1, min(end_line, len(lines))):
                for char in lines[j]:
                    if char == "{":
                        brace_count += 1
                        found_open = True
                    elif char == "}":
                        brace_count -= 1
                        if found_open and brace_count == 0:
                            end_line = j + 1
                            break
                if found_open and brace_count == 0:
                    break

        chunk_lines = lines[start_line - 1 : end_line]
        chunk_content = "\n".join(chunk_lines)

        # Extract Doxygen comment if present: scan backwards (up to ~30
        # lines) from the line above the definition for "/**" or "/*!".
        docstring = ""
        if start_line > 1:
            prev_line = lines[start_line - 2].strip()
            if prev_line.endswith("*/"):
                doc_lines = []
                for j in range(start_line - 2, max(0, start_line - 30), -1):
                    doc_lines.insert(0, lines[j])
                    if "/**" in lines[j] or "/*!" in lines[j]:
                        break
                docstring = "\n".join(doc_lines)

        chunks.append(
            Chunk(
                content=chunk_content,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                chunk_type=chunk_type,
                name=name,
                language=language,
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )

    # If no definitions were found, fall back to one whole-file chunk.
    if not chunks and content.strip():
        chunks.append(
            Chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(lines),
                chunk_type="module",
                name=Path(file_path).stem,
                language=language,
                imports=imports,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_prose(content: str, file_path: str, language: str) -> list[Chunk]:
    """
    Chunk prose documents (Markdown, RST, plain text).

    Splits on headings/sections, keeping chunks under MAX_CHUNK_LINES.
    A new chunk starts at each heading line (or when the current chunk
    reaches the size cap); chunk names come from the heading text.
    """
    chunks: list[Chunk] = []
    lines = content.splitlines()

    # Heading detection: "#"-style ATX headings for Markdown; otherwise a
    # line made entirely of =, -, or ~ (RST/plain-text underline style).
    # NOTE(review): the underline pattern also matches horizontal rules
    # and table borders, which will start spurious sections.
    if language == "markdown":
        heading_pattern = re.compile(r"^#{1,6}\s+(.+)$")
    else:
        heading_pattern = re.compile(r"^[=\-~]+$")  # RST underline headings

    current_chunk_lines: list[str] = []
    current_start = 1
    current_name = Path(file_path).stem

    for i, line in enumerate(lines, 1):
        match = heading_pattern.match(line)

        # New section or chunk too large
        if match or len(current_chunk_lines) >= MAX_CHUNK_LINES:
            # Save current chunk if non-empty
            if current_chunk_lines:
                chunks.append(
                    Chunk(
                        content="\n".join(current_chunk_lines),
                        file_path=file_path,
                        start_line=current_start,
                        end_line=i - 1,
                        chunk_type="section",
                        name=current_name,
                        language=language,
                    )
                )

            # Start new chunk at the heading (or overflow) line.
            current_chunk_lines = [line]
            current_start = i
            # Section name: the heading text for Markdown; for RST the
            # title is the line ABOVE the underline (lines[i - 2]).
            # NOTE(review): that RST title line was already appended to
            # the previous chunk, so the new section's content starts at
            # the underline — confirm this is intended.
            if match:
                current_name = match.group(1) if language == "markdown" else lines[i - 2] if i > 1 else current_name
        else:
            current_chunk_lines.append(line)

    # Save final chunk
    if current_chunk_lines:
        chunks.append(
            Chunk(
                content="\n".join(current_chunk_lines),
                file_path=file_path,
                start_line=current_start,
                end_line=len(lines),
                chunk_type="section",
                name=current_name,
                language=language,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_generic(content: str, file_path: str, language: str) -> list[Chunk]:
    """Fallback line-based chunker for languages without a dedicated parser.

    Accumulates lines and emits a "block" chunk whenever MAX_CHUNK_LINES
    is reached, preferring to cut at the last blank line within the final
    50 accumulated lines so blocks tend to break between paragraphs.
    """
    lines = content.splitlines()
    if not lines:
        return []

    chunks: list[Chunk] = []
    buffer: list[str] = []
    chunk_start = 1
    stem = Path(file_path).stem

    for line in lines:
        buffer.append(line)
        if len(buffer) < MAX_CHUNK_LINES:
            continue

        # Prefer splitting on a blank line near the end of the buffer;
        # otherwise cut at the size cap exactly.
        cut = len(buffer)
        for idx in range(len(buffer) - 1, max(0, len(buffer) - 50), -1):
            if not buffer[idx].strip():
                cut = idx
                break

        chunks.append(
            Chunk(
                content="\n".join(buffer[:cut]),
                file_path=file_path,
                start_line=chunk_start,
                end_line=chunk_start + cut - 1,
                chunk_type="block",
                name=f"{stem}:{chunk_start}",
                language=language,
            )
        )

        # Carry the tail (from the cut point) into the next chunk.
        buffer = buffer[cut:]
        chunk_start += cut

    # Flush whatever remains as the final chunk.
    if buffer:
        chunks.append(
            Chunk(
                content="\n".join(buffer),
                file_path=file_path,
                start_line=chunk_start,
                end_line=len(lines),
                chunk_type="block",
                name=f"{stem}:{chunk_start}",
                language=language,
            )
        )

    return chunks
|
||||||
246
xml_pipeline/librarian/handler.py
Normal file
246
xml_pipeline/librarian/handler.py
Normal file
|
|
@ -0,0 +1,246 @@
|
||||||
|
"""
|
||||||
|
handler.py — Message handlers for Premium Librarian.
|
||||||
|
|
||||||
|
These handlers process librarian requests through the organism's message bus.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
from xml_pipeline.message_bus.message_state import HandlerMetadata, HandlerResponse
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.primitives import (
|
||||||
|
LibrarianIngest,
|
||||||
|
LibrarianIngested,
|
||||||
|
LibrarianQuery,
|
||||||
|
LibrarianAnswer,
|
||||||
|
LibrarianList,
|
||||||
|
LibrarianLibraries,
|
||||||
|
LibrarianDelete,
|
||||||
|
LibrarianDeleted,
|
||||||
|
LibrarianGetChunk,
|
||||||
|
LibrarianChunk,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_ingest(
    payload: LibrarianIngest,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a codebase ingestion request.

    Clones the git repository, chunks all files, and stores in eXist-db.
    On failure, responds with an empty LibrarianIngested whose `errors`
    field carries the exception text.
    """
    from xml_pipeline.librarian.ingest import ingest_git_repo

    logger.info(f"Ingesting codebase from {payload.git_url}")

    try:
        result = await ingest_git_repo(
            url=payload.git_url,
            branch=payload.branch,
            library_name=payload.library_name,
        )
        response = LibrarianIngested(
            library_id=result.library_id,
            library_name=result.library_name,
            files_processed=result.files_processed,
            chunks_created=result.chunks_created,
            index_built=result.index_built,
            errors="\n".join(result.errors) if result.errors else "",
        )
    except Exception as e:
        logger.error(f"Ingest failed: {e}")
        # Error path: empty result fields, exception text in `errors`.
        response = LibrarianIngested(
            library_id="",
            library_name=payload.library_name or "",
            files_processed=0,
            chunks_created=0,
            index_built=False,
            errors=str(e),
        )

    return HandlerResponse.respond(payload=response)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_query(
    payload: LibrarianQuery,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a library query request.

    Searches for relevant code chunks and synthesizes an answer using an
    LLM. On failure, responds with an empty LibrarianAnswer whose `error`
    field carries the exception text.
    """
    from xml_pipeline.librarian.query import query_library, format_sources_xml

    logger.info(f"Querying library {payload.library_id}: {payload.question[:100]}...")

    try:
        result = await query_library(
            library_id=payload.library_id,
            question=payload.question,
            max_chunks=payload.max_chunks,
            model=payload.model,
        )
        answer = LibrarianAnswer(
            answer=result.answer,
            sources=format_sources_xml(result.sources) if result.sources else "",
            tokens_used=result.tokens_used,
            chunks_examined=result.chunks_examined,
            error=result.error,
        )
    except Exception as e:
        logger.error(f"Query failed: {e}")
        answer = LibrarianAnswer(
            answer="",
            sources="",
            tokens_used=0,
            chunks_examined=0,
            error=str(e),
        )

    return HandlerResponse.respond(payload=answer)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_list(
    payload: LibrarianList,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a request to list all ingested libraries.

    Responds with a LibrarianLibraries payload whose `libraries` field is
    an XML <libraries> document (one <library> element per ingested
    library). On failure, an empty document with count 0 is returned.
    """
    from xml_pipeline.librarian.index import list_libraries

    logger.info("Listing all libraries")

    try:
        libraries = await list_libraries()

        # Format each library as a <library> element.
        lib_items = []
        for lib in libraries:
            lib_items.append(
                f"""  <library>
    <library-id>{xml_escape(lib.library_id)}</library-id>
    <name>{xml_escape(lib.name)}</name>
    <source-url>{xml_escape(lib.source_url)}</source-url>
    <created-at>{xml_escape(lib.created_at)}</created-at>
    <total-files>{lib.total_files}</total-files>
    <total-chunks>{lib.total_chunks}</total-chunks>
  </library>"""
            )

        # Fix: with zero libraries, joining an empty list previously
        # produced "<libraries>\n\n</libraries>". Emit the same empty
        # form the error path uses, for consistent output.
        if lib_items:
            libraries_xml = "<libraries>\n" + "\n".join(lib_items) + "\n</libraries>"
        else:
            libraries_xml = "<libraries></libraries>"

        return HandlerResponse.respond(
            payload=LibrarianLibraries(
                count=len(libraries),
                libraries=libraries_xml,
            )
        )

    except Exception as e:
        logger.error(f"List failed: {e}")
        # Error path: empty document, zero count.
        return HandlerResponse.respond(
            payload=LibrarianLibraries(
                count=0,
                libraries="<libraries></libraries>",
            )
        )
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_delete(
    payload: LibrarianDelete,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a request to delete a library.

    Responds with a LibrarianDeleted payload; `error` is empty on
    success, a short message when deletion failed, or the exception
    text when the call itself raised.
    """
    from xml_pipeline.librarian.index import delete_library

    logger.info(f"Deleting library {payload.library_id}")

    try:
        success = await delete_library(payload.library_id)
        outcome = LibrarianDeleted(
            library_id=payload.library_id,
            success=success,
            error="" if success else "Delete operation failed",
        )
    except Exception as e:
        logger.error(f"Delete failed: {e}")
        outcome = LibrarianDeleted(
            library_id=payload.library_id,
            success=False,
            error=str(e),
        )

    return HandlerResponse.respond(payload=outcome)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_get_chunk(
    payload: LibrarianGetChunk,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a request to retrieve a specific code chunk.

    Responds with the full chunk (content, docstring, signature, span)
    or, when the chunk is missing or the lookup raised, a LibrarianChunk
    carrying only the requested id and an error message.
    """
    from xml_pipeline.librarian.query import get_chunk_by_id

    logger.info(f"Getting chunk {payload.chunk_id} from library {payload.library_id}")

    try:
        chunk = await get_chunk_by_id(payload.library_id, payload.chunk_id)
        if chunk is None:
            result = LibrarianChunk(
                chunk_id=payload.chunk_id,
                error=f"Chunk not found: {payload.chunk_id}",
            )
        else:
            result = LibrarianChunk(
                chunk_id=chunk.chunk_id,
                file_path=chunk.file_path,
                name=chunk.name,
                chunk_type=chunk.chunk_type,
                language=chunk.language,
                start_line=chunk.start_line,
                end_line=chunk.end_line,
                content=chunk.content,
                docstring=chunk.docstring,
                signature=chunk.signature,
                error="",
            )
    except Exception as e:
        logger.error(f"Get chunk failed: {e}")
        result = LibrarianChunk(
            chunk_id=payload.chunk_id,
            error=str(e),
        )

    return HandlerResponse.respond(payload=result)
|
||||||
328
xml_pipeline/librarian/index.py
Normal file
328
xml_pipeline/librarian/index.py
Normal file
|
|
@ -0,0 +1,328 @@
|
||||||
|
"""
|
||||||
|
index.py — Library index management for Premium Librarian.
|
||||||
|
|
||||||
|
Builds and queries structural indices for ingested codebases.
|
||||||
|
The index provides fast lookup of files, functions, and classes
|
||||||
|
without needing full-text search.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LibraryIndex:
    """Structural index for an ingested library.

    Summarizes what a library contains (files, top-level functions and
    classes, modules) so lookups don't need full-text search. Instances
    round-trip through XML via _index_to_xml / _parse_index_xml.
    """

    library_id: str  # unique identifier of the library
    name: str  # human-readable library name
    source_url: str  # URL the library was ingested from
    created_at: str  # creation timestamp (string; format set at build time)
    files: list[str] = field(default_factory=list)  # file paths in the library
    functions: dict[str, str] = field(default_factory=dict)  # function name → file path
    classes: dict[str, str] = field(default_factory=dict)  # class name → file path
    modules: list[str] = field(default_factory=list)  # module names
    stats: dict[str, int] = field(default_factory=dict)  # counters, e.g. "chunks"

    @property
    def total_chunks(self) -> int:
        """Total number of chunks in this library (0 if uncounted)."""
        return self.stats.get("chunks", 0)

    @property
    def total_files(self) -> int:
        """Total number of files in this library."""
        return len(self.files)
|
||||||
|
|
||||||
|
|
||||||
|
def _index_to_xml(index: LibraryIndex) -> str:
|
||||||
|
"""Convert index to XML document for storage."""
|
||||||
|
files_xml = "\n".join(f" <file>{xml_escape(f)}</file>" for f in index.files)
|
||||||
|
|
||||||
|
functions_xml = "\n".join(
|
||||||
|
f' <function name="{xml_escape(name)}" file="{xml_escape(path)}"/>'
|
||||||
|
for name, path in index.functions.items()
|
||||||
|
)
|
||||||
|
|
||||||
|
classes_xml = "\n".join(
|
||||||
|
f' <class name="{xml_escape(name)}" file="{xml_escape(path)}"/>'
|
||||||
|
for name, path in index.classes.items()
|
||||||
|
)
|
||||||
|
|
||||||
|
modules_xml = "\n".join(f" <module>{xml_escape(m)}</module>" for m in index.modules)
|
||||||
|
|
||||||
|
stats_xml = "\n".join(
|
||||||
|
f' <stat name="{xml_escape(k)}">{v}</stat>'
|
||||||
|
for k, v in index.stats.items()
|
||||||
|
)
|
||||||
|
|
||||||
|
return f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<library-index xmlns="https://xml-pipeline.org/ns/librarian/v1">
|
||||||
|
<library-id>{xml_escape(index.library_id)}</library-id>
|
||||||
|
<name>{xml_escape(index.name)}</name>
|
||||||
|
<source-url>{xml_escape(index.source_url)}</source-url>
|
||||||
|
<created-at>{xml_escape(index.created_at)}</created-at>
|
||||||
|
<files>
|
||||||
|
{files_xml}
|
||||||
|
</files>
|
||||||
|
<functions>
|
||||||
|
{functions_xml}
|
||||||
|
</functions>
|
||||||
|
<classes>
|
||||||
|
{classes_xml}
|
||||||
|
</classes>
|
||||||
|
<modules>
|
||||||
|
{modules_xml}
|
||||||
|
</modules>
|
||||||
|
<stats>
|
||||||
|
{stats_xml}
|
||||||
|
</stats>
|
||||||
|
</library-index>"""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_index_xml(xml_content: str) -> Optional[LibraryIndex]:
    """Rehydrate a LibraryIndex from its XML storage document.

    Any failure (lxml unavailable, malformed XML, bad stat values)
    is logged and reported as None.
    """
    try:
        from lxml import etree

        root = etree.fromstring(xml_content.encode())
        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}

        def text_of(path: str) -> str:
            # Element text, defaulting to "" when the element is absent.
            return root.findtext(path, "", namespaces=ns)

        return LibraryIndex(
            library_id=text_of("l:library-id"),
            name=text_of("l:name"),
            source_url=text_of("l:source-url"),
            created_at=text_of("l:created-at"),
            files=[
                el.text or ""
                for el in root.findall("l:files/l:file", namespaces=ns)
            ],
            functions={
                el.get("name", ""): el.get("file", "")
                for el in root.findall("l:functions/l:function", namespaces=ns)
            },
            classes={
                el.get("name", ""): el.get("file", "")
                for el in root.findall("l:classes/l:class", namespaces=ns)
            },
            modules=[
                el.text or ""
                for el in root.findall("l:modules/l:module", namespaces=ns)
            ],
            stats={
                el.get("name", ""): int(el.text or 0)
                for el in root.findall("l:stats/l:stat", namespaces=ns)
            },
        )

    except Exception as e:
        logger.error(f"Failed to parse index XML: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def build_index(
    library_id: str,
    library_name: str,
    source_url: str,
) -> LibraryIndex:
    """
    Build structural index from stored chunks.

    Queries eXist-db for all chunks belonging to this library and
    extracts structural information (files, functions, classes, modules,
    per-language chunk counts), then persists the result as
    ``index.xml`` in the library's collection.

    Args:
        library_id: Unique library identifier (used in the collection path).
        library_name: Human-readable library name.
        source_url: Origin of the library (git URL or local path).

    Returns:
        The freshly built LibraryIndex. If the chunk query fails, a
        minimal index (identity fields only) is returned; a failed store
        of the index document is logged but does not raise.
    """
    from xml_pipeline.tools.librarian import librarian_query, librarian_store

    # Query for all chunks in this library; only the structural fields
    # are projected, not chunk content.
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    return <item>
        <file>{{$chunk/l:file-path/text()}}</file>
        <type>{{$chunk/l:chunk-type/text()}}</type>
        <name>{{$chunk/l:name/text()}}</name>
        <language>{{$chunk/l:language/text()}}</language>
    </item>
    """

    result = await librarian_query(query=xquery, collection=f"/db/librarian/{library_id}")

    if not result.success:
        logger.warning(f"Failed to query chunks for index: {result.error}")
        # Create minimal index so callers still get a usable object.
        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
        )
    else:
        # Parse results into the structural maps.
        files: set[str] = set()
        functions: dict[str, str] = {}
        classes: dict[str, str] = {}
        modules: list[str] = []
        lang_stats: dict[str, int] = {}
        chunk_count = 0

        try:
            from lxml import etree

            # Wrap results in root element for parsing (the query returns
            # a bare sequence of <item> elements).
            xml_str = f"<results>{result.data.get('results', '')}</results>"
            root = etree.fromstring(xml_str.encode())

            for item in root.findall("item"):
                chunk_count += 1
                file_path = item.findtext("file", "")
                chunk_type = item.findtext("type", "")
                name = item.findtext("name", "")
                language = item.findtext("language", "")

                if file_path:
                    files.add(file_path)

                # NOTE: duplicate function/class names across files collapse
                # to the last-seen file, since the maps are keyed by name.
                if chunk_type == "function" or chunk_type == "method":
                    functions[name] = file_path
                elif chunk_type == "class":
                    classes[name] = file_path
                elif chunk_type == "module":
                    modules.append(file_path)

                if language:
                    lang_stats[language] = lang_stats.get(language, 0) + 1

        except Exception as e:
            # Parsing failure degrades to whatever was collected so far.
            logger.warning(f"Failed to parse chunk query results: {e}")

        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
            files=sorted(files),
            functions=functions,
            classes=classes,
            modules=modules,
            stats={
                "chunks": chunk_count,
                "files": len(files),
                "functions": len(functions),
                "classes": len(classes),
                **{f"lang_{k}": v for k, v in lang_stats.items()},
            },
        )

    # Store index document alongside the chunks collection.
    index_xml = _index_to_xml(index)
    store_result = await librarian_store(
        collection=f"/db/librarian/{library_id}",
        document_name="index.xml",
        content=index_xml,
    )

    if not store_result.success:
        logger.warning(f"Failed to store index: {store_result.error}")

    return index
|
||||||
|
|
||||||
|
|
||||||
|
async def get_index(library_id: str) -> Optional[LibraryIndex]:
    """
    Retrieve library index from eXist-db.

    Fetches ``/db/librarian/<library_id>/index.xml`` and parses it.
    Returns None when the document is missing or unparseable.
    """
    from xml_pipeline.tools.librarian import librarian_get

    document_path = f"/db/librarian/{library_id}/index.xml"
    fetched = await librarian_get(document_path)
    if fetched.success:
        return _parse_index_xml(fetched.data.get("content", ""))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
async def list_libraries() -> list[LibraryIndex]:
    """
    List all ingested libraries.

    Collects every stored ``<library-index>`` document under
    ``/db/librarian`` and parses each into a LibraryIndex. Query or
    parse failures are logged and yield an empty (or partial) list.
    """
    from xml_pipeline.tools.librarian import librarian_query

    xquery = """
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $index in collection("/db/librarian")//l:library-index
    return $index
    """

    result = await librarian_query(query=xquery, collection="/db/librarian")
    if not result.success:
        logger.warning(f"Failed to list libraries: {result.error}")
        return []

    libraries: list[LibraryIndex] = []

    try:
        from lxml import etree

        raw = result.data.get("results", "")
        if raw.strip():
            # The query returns a bare sequence of documents; wrap them so
            # lxml sees a single root element.
            root = etree.fromstring(f"<results>{raw}</results>".encode())
            tag = "{https://xml-pipeline.org/ns/librarian/v1}library-index"
            for element in root.findall(tag):
                parsed = _parse_index_xml(etree.tostring(element, encoding="unicode"))
                if parsed:
                    libraries.append(parsed)

    except Exception as e:
        logger.warning(f"Failed to parse library list: {e}")

    return libraries
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_library(library_id: str) -> bool:
    """
    Delete a library and all its chunks from eXist-db.

    Drops the whole ``/db/librarian/<library_id>`` collection (chunks
    and index together). Returns True on success; logs and returns
    False otherwise.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Delete the entire collection in a single xmldb call.
    xquery = f"""
    xmldb:remove("/db/librarian/{library_id}")
    """

    result = await librarian_query(query=xquery)
    if result.success:
        return True

    logger.warning(f"Failed to delete library {library_id}: {result.error}")
    return False
|
||||||
393
xml_pipeline/librarian/ingest.py
Normal file
393
xml_pipeline/librarian/ingest.py
Normal file
|
|
@ -0,0 +1,393 @@
|
||||||
|
"""
|
||||||
|
ingest.py — Codebase ingestion for Premium Librarian.
|
||||||
|
|
||||||
|
Clones git repositories, walks files, chunks them, and stores in eXist-db.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import AsyncIterator, Optional
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.chunker import Chunk, chunk_file, detect_language
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# File patterns to skip during ingestion.
# Matched against individual path components (see _should_skip_path),
# so e.g. "node_modules" anywhere in the path excludes the file.
SKIP_PATTERNS = {
    # Version control
    ".git",
    ".svn",
    ".hg",
    # Dependencies
    "node_modules",
    "vendor",
    ".venv",
    "venv",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    # Build artifacts
    "dist",
    "build",
    "target",
    "out",
    ".next",
    # IDE
    ".idea",
    ".vscode",
    # OS
    ".DS_Store",
    "Thumbs.db",
}

# File extensions to process (lower-cased suffix match; see
# _should_process_file for the extensionless special cases).
CODE_EXTENSIONS = {
    ".py", ".pyi",  # Python
    ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",  # JavaScript/TypeScript
    ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx",  # C/C++
    ".rs",  # Rust
    ".go",  # Go
    ".java",  # Java
    ".kt", ".kts",  # Kotlin
    ".rb",  # Ruby
    ".php",  # PHP
    ".cs",  # C#
    ".swift",  # Swift
    ".scala",  # Scala
    ".md", ".rst", ".txt",  # Documentation
    ".yaml", ".yml", ".toml", ".json",  # Config
    ".xml", ".xsd",  # XML
    ".sql",  # SQL
    ".sh", ".bash", ".zsh",  # Shell
    ".dockerfile", ".containerfile",  # Docker
}

# Max file size to process (1MB); larger files are skipped entirely.
MAX_FILE_SIZE = 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IngestResult:
    """Result of a codebase ingestion."""

    library_id: str  # unique id (name + random suffix) assigned to this ingestion
    library_name: str  # human-readable name
    files_processed: int  # files successfully read and chunked
    chunks_created: int  # chunks successfully stored in eXist-db
    index_built: bool  # whether the structural index was built after ingestion
    errors: list[str] = field(default_factory=list)  # per-file / per-chunk error messages
    stats: dict[str, int] = field(default_factory=dict)  # language -> file count
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IngestConfig:
    """Configuration for ingestion."""

    branch: str = "main"  # git branch to clone
    max_file_size: int = MAX_FILE_SIZE  # skip files larger than this (bytes)
    # Copies so per-ingest mutation never touches the module-level defaults.
    skip_patterns: set[str] = field(default_factory=lambda: SKIP_PATTERNS.copy())
    extensions: set[str] = field(default_factory=lambda: CODE_EXTENSIONS.copy())
|
||||||
|
|
||||||
|
|
||||||
|
def _should_skip_path(path: Path, config: IngestConfig) -> bool:
    """Return True when any component of *path* marks it as skippable.

    A component is skippable when it appears in ``config.skip_patterns``,
    or when it is hidden (leading dot) and not one of the allowed
    dot-directories (.github / .gitlab).
    """
    allowed_hidden = {".github", ".gitlab"}
    return any(
        part in config.skip_patterns
        or (part.startswith(".") and part not in allowed_hidden)
        for part in path.parts
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _should_process_file(path: Path, config: IngestConfig) -> bool:
    """Return True when *path* is a file worth chunking.

    A file qualifies when its (lower-cased) suffix is in
    ``config.extensions`` — or its name is a well-known extensionless
    file like Makefile — and its size does not exceed
    ``config.max_file_size``. Files that cannot be stat'ed are rejected.
    """
    if path.suffix.lower() not in config.extensions:
        # Extensionless files we still want (Dockerfile, Makefile, ...).
        if path.name.lower() not in {"dockerfile", "makefile", "rakefile", "gemfile"}:
            return False

    try:
        size = path.stat().st_size
    except OSError:
        return False

    return size <= config.max_file_size
|
||||||
|
|
||||||
|
|
||||||
|
async def _clone_repo(url: str, branch: str, target_dir: Path) -> None:
    """Shallow-clone *url* at *branch* into *target_dir*.

    Prefers GitPython when installed; otherwise falls back to the
    ``git`` CLI via an async subprocess.

    Raises:
        RuntimeError: if the CLI clone exits non-zero.
        Exception: whatever GitPython raises on a failed clone.
    """
    try:
        from git import Repo
    except ImportError:
        Repo = None

    if Repo is not None:
        logger.info(f"Cloning {url} (branch: {branch}) to {target_dir}")
        # Repo.clone_from is a blocking call; run it in a worker thread so
        # the event loop stays responsive during large clones.
        await asyncio.to_thread(Repo.clone_from, url, target_dir, branch=branch, depth=1)
    else:
        # Fall back to git CLI.
        logger.info("GitPython not available, using git CLI")
        proc = await asyncio.create_subprocess_exec(
            "git", "clone", "--depth", "1", "--branch", branch, url, str(target_dir),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"git clone failed: {stderr.decode()}")
|
||||||
|
|
||||||
|
|
||||||
|
async def _walk_files(root: Path, config: IngestConfig) -> AsyncIterator[Path]:
    """Yield files under *root* that pass the skip and process filters.

    Skip rules are applied to the path relative to *root*, so patterns
    match repository-internal directories rather than the (temporary)
    checkout location.
    """
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        relative = candidate.relative_to(root)
        if _should_skip_path(relative, config):
            continue
        if _should_process_file(candidate, config):
            yield candidate
|
||||||
|
|
||||||
|
|
||||||
|
def _chunk_to_xml(chunk: Chunk, library_id: str) -> str:
|
||||||
|
"""Convert a chunk to XML document for storage."""
|
||||||
|
# Escape content for XML
|
||||||
|
content_escaped = xml_escape(chunk.content)
|
||||||
|
docstring_escaped = xml_escape(chunk.docstring) if chunk.docstring else ""
|
||||||
|
signature_escaped = xml_escape(chunk.signature) if chunk.signature else ""
|
||||||
|
|
||||||
|
imports_xml = "\n".join(f" <import>{xml_escape(imp)}</import>" for imp in chunk.imports)
|
||||||
|
|
||||||
|
return f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<chunk xmlns="https://xml-pipeline.org/ns/librarian/v1">
|
||||||
|
<id>{xml_escape(chunk.chunk_id)}</id>
|
||||||
|
<library-id>{xml_escape(library_id)}</library-id>
|
||||||
|
<file-path>{xml_escape(chunk.file_path)}</file-path>
|
||||||
|
<start-line>{chunk.start_line}</start-line>
|
||||||
|
<end-line>{chunk.end_line}</end-line>
|
||||||
|
<chunk-type>{xml_escape(chunk.chunk_type)}</chunk-type>
|
||||||
|
<name>{xml_escape(chunk.name)}</name>
|
||||||
|
<language>{xml_escape(chunk.language)}</language>
|
||||||
|
<parent-class>{xml_escape(chunk.parent_class)}</parent-class>
|
||||||
|
<signature>{signature_escaped}</signature>
|
||||||
|
<docstring>{docstring_escaped}</docstring>
|
||||||
|
<imports>
|
||||||
|
{imports_xml}
|
||||||
|
</imports>
|
||||||
|
<content><![CDATA[{chunk.content}]]></content>
|
||||||
|
</chunk>"""
|
||||||
|
|
||||||
|
|
||||||
|
async def _store_chunk(
    chunk: Chunk,
    library_id: str,
    collection: str,
) -> bool:
    """Serialize *chunk* and store it in the given eXist-db collection.

    The document name is derived from the chunk id, replacing ':' and
    '/' (awkward in document names) with '_'. Returns the store
    operation's success flag.
    """
    from xml_pipeline.tools.librarian import librarian_store

    payload = _chunk_to_xml(chunk, library_id)
    safe_id = chunk.chunk_id.replace(":", "_").replace("/", "_")

    stored = await librarian_store(
        collection=collection,
        document_name=f"{safe_id}.xml",
        content=payload,
    )
    return stored.success
|
||||||
|
|
||||||
|
|
||||||
|
async def ingest_git_repo(
    url: str,
    branch: str = "main",
    library_name: str = "",
    config: Optional[IngestConfig] = None,
) -> IngestResult:
    """
    Clone and ingest a git repository.

    Args:
        url: Git repository URL
        branch: Branch to clone (default: main)
        library_name: Human-readable name (derived from URL if empty)
        config: Ingestion configuration

    Returns:
        IngestResult with statistics and library_id. Per-file failures
        are recorded in ``result.errors`` and do not abort the ingest.

    Raises:
        Whatever _clone_repo raises when the clone itself fails.
    """
    if config is None:
        config = IngestConfig(branch=branch)

    # Derive library name from URL if not provided:
    #   https://github.com/user/repo.git -> repo
    #   git@github.com:user/repo.git     -> repo
    if not library_name:
        trimmed = url.rstrip("/")
        # BUGFIX: str.rstrip(".git") strips the *characters* '.', 'g', 'i',
        # 't' from the end (mangling names like "digit" or "git");
        # removesuffix removes the literal ".git" suffix only.
        trimmed = trimmed.removesuffix(".git")
        library_name = trimmed.split("/")[-1].split(":")[-1]

    # Unique library ID: readable name plus a short random suffix.
    library_id = f"{library_name}-{uuid.uuid4().hex[:8]}"

    result = IngestResult(
        library_id=library_id,
        library_name=library_name,
        files_processed=0,
        chunks_created=0,
        index_built=False,
    )

    # Clone into a throwaway directory; removed in the finally block.
    temp_dir = Path(tempfile.mkdtemp(prefix="librarian_"))

    try:
        await _clone_repo(url, config.branch, temp_dir)

        # eXist-db collection holding this library's chunk documents.
        collection = f"/db/librarian/{library_id}/chunks"

        # Track language statistics (per file, by detected language).
        lang_stats: dict[str, int] = {}

        async for file_path in _walk_files(temp_dir, config):
            try:
                content = file_path.read_text(encoding="utf-8", errors="replace")

                # Store paths relative to the repo root, not the temp dir.
                rel_path = str(file_path.relative_to(temp_dir))

                language = detect_language(rel_path)
                lang_stats[language] = lang_stats.get(language, 0) + 1

                chunks = chunk_file(content, rel_path)

                for chunk in chunks:
                    if await _store_chunk(chunk, library_id, collection):
                        result.chunks_created += 1
                    else:
                        result.errors.append(f"Failed to store chunk: {chunk.chunk_id}")

                result.files_processed += 1

            except Exception as e:
                # Keep going: one unreadable file must not abort the ingest.
                result.errors.append(f"Error processing {file_path}: {e}")
                logger.warning(f"Error processing {file_path}: {e}")

        result.stats = lang_stats

        # Build the structural index (best-effort).
        from xml_pipeline.librarian.index import build_index

        try:
            await build_index(library_id, library_name, url)
            result.index_built = True
        except Exception as e:
            result.errors.append(f"Index build failed: {e}")
            logger.warning(f"Index build failed: {e}")

        logger.info(
            f"Ingested {library_name}: {result.files_processed} files, "
            f"{result.chunks_created} chunks"
        )

    finally:
        # Cleanup temp directory
        shutil.rmtree(temp_dir, ignore_errors=True)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def ingest_directory(
    path: str | Path,
    library_name: str,
    config: Optional[IngestConfig] = None,
) -> IngestResult:
    """
    Ingest a local directory (for testing or local codebases).

    Mirrors ingest_git_repo without the clone step: walks *path*,
    chunks each eligible file, stores the chunks in eXist-db, then
    builds the structural index (best-effort).

    Args:
        path: Path to directory
        library_name: Human-readable name
        config: Ingestion configuration

    Returns:
        IngestResult with statistics and library_id; per-file failures
        are recorded in ``result.errors`` and do not abort the ingest.

    Raises:
        ValueError: if *path* is not a directory.
    """
    if config is None:
        config = IngestConfig()

    root = Path(path)
    if not root.is_dir():
        raise ValueError(f"Not a directory: {path}")

    # Generate unique library ID (readable name + short random suffix).
    library_id = f"{library_name}-{uuid.uuid4().hex[:8]}"

    result = IngestResult(
        library_id=library_id,
        library_name=library_name,
        files_processed=0,
        chunks_created=0,
        index_built=False,
    )

    # eXist-db collection holding this library's chunk documents.
    collection = f"/db/librarian/{library_id}/chunks"
    lang_stats: dict[str, int] = {}

    async for file_path in _walk_files(root, config):
        try:
            content = file_path.read_text(encoding="utf-8", errors="replace")
            # Store paths relative to the ingested root.
            rel_path = str(file_path.relative_to(root))

            language = detect_language(rel_path)
            lang_stats[language] = lang_stats.get(language, 0) + 1

            chunks = chunk_file(content, rel_path)

            for chunk in chunks:
                success = await _store_chunk(chunk, library_id, collection)
                if success:
                    result.chunks_created += 1
                else:
                    result.errors.append(f"Failed to store chunk: {chunk.chunk_id}")

            result.files_processed += 1

        except Exception as e:
            # One bad file must not abort the whole ingest.
            result.errors.append(f"Error processing {file_path}: {e}")
            logger.warning(f"Error processing {file_path}: {e}")

    result.stats = lang_stats

    # Build index (best-effort; failure is recorded, not raised).
    from xml_pipeline.librarian.index import build_index

    try:
        await build_index(library_id, library_name, str(root))
        result.index_built = True
    except Exception as e:
        result.errors.append(f"Index build failed: {e}")

    return result
|
||||||
167
xml_pipeline/librarian/primitives.py
Normal file
167
xml_pipeline/librarian/primitives.py
Normal file
|
|
@ -0,0 +1,167 @@
|
||||||
|
"""
|
||||||
|
primitives.py — XML payload dataclasses for Premium Librarian.
|
||||||
|
|
||||||
|
These are the message types that flow through the organism's message bus.
|
||||||
|
|
||||||
|
Note: Do NOT use `from __future__ import annotations` here
|
||||||
|
as it breaks the xmlify decorator which needs concrete types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from third_party.xmlable import xmlify
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianIngest:
    """
    Request to ingest a codebase into the Premium Librarian.

    Supports git URLs. The library will be cloned, chunked, and stored
    in eXist-db for subsequent querying.
    """

    git_url: str = ""  # git repository URL (https or ssh)
    branch: str = "main"  # branch to clone
    library_name: str = ""  # Optional; derived from URL if empty
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianIngested:
    """
    Response after successful codebase ingestion.

    Contains the library_id needed for subsequent queries.
    """

    library_id: str = ""  # id to use in LibrarianQuery / LibrarianDelete
    library_name: str = ""  # human-readable name
    files_processed: int = 0  # files read and chunked
    chunks_created: int = 0  # chunks stored in eXist-db
    index_built: bool = False  # whether the structural index was built
    errors: str = ""  # Newline-separated error messages
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianQuery:
    """
    Query an ingested library with a natural language question.

    The system will search for relevant code chunks and synthesize
    an answer using the configured LLM.
    """

    library_id: str = ""  # id returned by LibrarianIngested
    question: str = ""  # natural-language question
    max_chunks: int = 20  # Max chunks to include in context
    model: str = ""  # Optional; uses default if empty
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianAnswer:
    """
    Response to a library query.

    Contains the synthesized answer and source references.
    """

    answer: str = ""  # LLM-synthesized answer text
    sources: str = ""  # XML-formatted source list
    tokens_used: int = 0  # LLM tokens consumed
    chunks_examined: int = 0  # chunks retrieved for context
    error: str = ""  # non-empty when the query failed
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianList:
    """
    Request to list all ingested libraries.

    Carries no fields; its arrival alone triggers the listing.
    """

    pass  # No parameters needed
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibraryInfo:
    """
    Information about a single ingested library.
    """

    library_id: str = ""  # unique library identifier
    name: str = ""  # human-readable name
    source_url: str = ""  # git URL or local path it was ingested from
    created_at: str = ""  # ISO-8601 ingestion timestamp
    total_files: int = 0  # files in the library
    total_chunks: int = 0  # stored chunks
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianLibraries:
    """
    Response listing all ingested libraries.
    """

    count: int = 0  # number of libraries in the list
    libraries: str = ""  # XML-formatted library list
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianDelete:
    """
    Request to delete an ingested library.
    """

    library_id: str = ""  # id of the library to remove
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianDeleted:
    """
    Response after library deletion.
    """

    library_id: str = ""  # id of the library that was targeted
    success: bool = False  # whether the deletion succeeded
    error: str = ""  # non-empty on failure
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianGetChunk:
    """
    Request to retrieve a specific code chunk.

    Useful for examining source code referenced in a query response.
    """

    library_id: str = ""  # library the chunk belongs to
    chunk_id: str = ""  # chunk identifier, as reported in sources
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianChunk:
    """
    Response with a specific code chunk.
    """

    chunk_id: str = ""  # chunk identifier
    file_path: str = ""  # path of the file the chunk came from
    name: str = ""  # function/class/module name
    chunk_type: str = ""  # e.g. function, method, class, module
    language: str = ""  # detected language
    start_line: int = 0  # first line of the chunk in its file
    end_line: int = 0  # last line of the chunk in its file
    content: str = ""  # raw source text of the chunk
    docstring: str = ""  # extracted docstring, if any
    signature: str = ""  # extracted signature, if any
    error: str = ""  # non-empty when the chunk could not be found
|
||||||
436
xml_pipeline/librarian/query.py
Normal file
436
xml_pipeline/librarian/query.py
Normal file
|
|
@ -0,0 +1,436 @@
|
||||||
|
"""
|
||||||
|
query.py — RAG-based query system for Premium Librarian.
|
||||||
|
|
||||||
|
Searches indexed codebases and synthesizes answers using Online LLM.
|
||||||
|
The flow: Search → Retrieve → Synthesize → Return with sources.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Source:
    """A source chunk used in answering a query."""

    file_path: str  # file the chunk came from
    name: str  # function/class/module name
    chunk_type: str  # e.g. function, method, class, module
    start_line: int  # first line in the file
    end_line: int  # last line in the file
    relevance_score: float  # search relevance (higher is better)
    snippet: str = ""  # First ~200 chars of content
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class QueryResult:
    """Result of a library query."""

    answer: str  # synthesized answer text
    sources: list[Source] = field(default_factory=list)  # chunks cited in the answer
    tokens_used: int = 0  # LLM tokens consumed
    chunks_examined: int = 0  # chunks retrieved as context
    error: str = ""  # non-empty when the query failed
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RetrievedChunk:
    """A chunk retrieved from eXist-db for RAG."""

    chunk_id: str  # stored chunk identifier
    file_path: str  # file the chunk came from
    name: str  # function/class/module name
    chunk_type: str  # e.g. function, method, class, module
    language: str  # detected language
    start_line: int  # first line in the file
    end_line: int  # last line in the file
    content: str  # raw source text
    docstring: str  # extracted docstring, if any
    signature: str  # extracted signature, if any
    score: float  # search relevance score
|
||||||
|
|
||||||
|
|
||||||
|
async def _search_chunks(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Search for relevant chunks using Lucene full-text search.

    Scores content matches x2 and name matches x3 over docstring
    matches, orders by score, and returns up to *max_results* chunks.
    Falls back to a plain contains() search when the Lucene query
    fails (e.g. no full-text index configured).

    NOTE(review): the user question is interpolated directly into the
    XQuery string; the quote-escaping below is best-effort, so inputs
    are assumed to be trusted — confirm at the call site.
    NOTE(review): the Lucene module URI/import below should be checked
    against the deployed eXist-db version.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape query for XQuery (quotes only; see NOTE above).
    query_escaped = query.replace('"', '\\"').replace("'", "\\'")

    # Full-text search using Lucene
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    import module namespace ft = "http://exist-db.org/xquery/lucene";

    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    let $content := $chunk/l:content/text()
    let $name := $chunk/l:name/text()
    let $docstring := $chunk/l:docstring/text()
    let $score := (
        if (ft:query($content, "{query_escaped}")) then ft:score($content) * 2
        else if (ft:query($name, "{query_escaped}")) then ft:score($name) * 3
        else if (ft:query($docstring, "{query_escaped}")) then ft:score($docstring)
        else 0
    )
    where $score > 0
    order by $score descending
    return <result score="{{$score}}">
        <id>{{$chunk/l:id/text()}}</id>
        <file-path>{{$chunk/l:file-path/text()}}</file-path>
        <name>{{$chunk/l:name/text()}}</name>
        <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
        <language>{{$chunk/l:language/text()}}</language>
        <start-line>{{$chunk/l:start-line/text()}}</start-line>
        <end-line>{{$chunk/l:end-line/text()}}</end-line>
        <signature>{{$chunk/l:signature/text()}}</signature>
        <docstring>{{$chunk/l:docstring/text()}}</docstring>
        <content>{{$chunk/l:content/text()}}</content>
    </result>
    """

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    chunks: list[RetrievedChunk] = []

    if not result.success:
        logger.warning(f"Search failed: {result.error}")
        # Fall back to simple query without Lucene
        return await _search_chunks_fallback(library_id, query, max_results)

    try:
        from lxml import etree

        # The query returns a bare sequence of <result> elements; wrap
        # them so lxml sees a single root.
        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())

        # max_results is applied client-side after retrieval; the query
        # itself is unbounded.
        for item in root.findall("result")[:max_results]:
            score = float(item.get("score", 0))

            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line", "0")),
                    end_line=int(item.findtext("end-line", "0")),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )

    except Exception as e:
        # Parse failure degrades to whatever was collected (likely empty).
        logger.warning(f"Failed to parse search results: {e}")

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
async def _search_chunks_fallback(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Fallback search using contains() when Lucene is not available.

    Less accurate but works without Lucene indexing.

    Args:
        library_id: ID of the ingested library whose chunks are searched.
        query: Free-text query; lowercased and split into up to 5 terms,
            each matched with contains() against content, name and docstring.
        max_results: Maximum number of chunks to return.

    Returns:
        Up to max_results RetrievedChunk objects with synthetic,
        order-based relevance scores (no real ranking is available here).
    """
    from xml_pipeline.tools.librarian import librarian_query

    def _xq_escape(term: str) -> str:
        # XQuery string literals escape an embedded double quote by
        # DOUBLING it ("" not \"); backslash has no escaping role in
        # XQuery and would corrupt the literal.  Bare & and < are also
        # illegal inside XQuery string literals, so entity-encode them.
        return (
            term.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace('"', '""')
        )

    # Build contains() conditions per term (escape AFTER splitting so an
    # escape sequence can never be torn across two terms).
    terms = query.lower().split()

    conditions = []
    for term in terms[:5]:  # Limit to 5 terms
        safe = _xq_escape(term)
        conditions.append(
            f'(contains(lower-case($chunk/l:content), "{safe}") or '
            f'contains(lower-case($chunk/l:name), "{safe}") or '
            f'contains(lower-case($chunk/l:docstring), "{safe}"))'
        )

    where_clause = " or ".join(conditions) if conditions else "true()"

    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";

    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    where {where_clause}
    return <result>
        <id>{{$chunk/l:id/text()}}</id>
        <file-path>{{$chunk/l:file-path/text()}}</file-path>
        <name>{{$chunk/l:name/text()}}</name>
        <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
        <language>{{$chunk/l:language/text()}}</language>
        <start-line>{{$chunk/l:start-line/text()}}</start-line>
        <end-line>{{$chunk/l:end-line/text()}}</end-line>
        <signature>{{$chunk/l:signature/text()}}</signature>
        <docstring>{{$chunk/l:docstring/text()}}</docstring>
        <content>{{$chunk/l:content/text()}}</content>
    </result>
    """

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    chunks: list[RetrievedChunk] = []

    if not result.success:
        logger.warning(f"Fallback search failed: {result.error}")
        return chunks

    try:
        from lxml import etree

        # Wrap the result sequence in a single root so it parses as one doc.
        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())

        for i, item in enumerate(root.findall("result")[:max_results]):
            # Assign decreasing score based on order (no Lucene ranking).
            score = 1.0 - (i * 0.05)

            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line", "0")),
                    end_line=int(item.findtext("end-line", "0")),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )

    except Exception as e:
        logger.warning(f"Failed to parse fallback search results: {e}")

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _build_rag_prompt(
    question: str,
    chunks: list[RetrievedChunk],
    library_name: str,
) -> str:
    """Assemble the RAG prompt: numbered code context, then the question."""

    def _render(idx: int, ck: RetrievedChunk) -> str:
        # One numbered context entry: location header, optional signature,
        # then the (possibly truncated) code fenced with its language tag.
        header = f"[{idx}] {ck.file_path}:{ck.start_line}-{ck.end_line}"
        if ck.signature:
            header += f"\n    {ck.signature}"

        body = ck.content
        if len(body) > 2000:
            body = body[:2000] + "\n... (truncated)"

        return f"{header}\n```{ck.language}\n{body}\n```"

    context = "\n\n".join(
        _render(idx, ck) for idx, ck in enumerate(chunks, 1)
    )

    return f"""You are a code assistant analyzing the "{library_name}" codebase.

Answer the following question based ONLY on the provided code context.
If the answer is not in the context, say so clearly.
Reference specific files and line numbers when relevant.

## Code Context

{context}

## Question

{question}

## Instructions

1. Answer based on the code context above
2. Cite sources using [1], [2], etc. format
3. Include relevant code snippets if helpful
4. Be concise but complete"""
|
||||||
|
|
||||||
|
|
||||||
|
async def query_library(
    library_id: str,
    question: str,
    max_chunks: int = 20,
    model: str = "",
) -> QueryResult:
    """
    Answer a natural-language question about an ingested library via RAG.

    Retrieves up to ``max_chunks`` relevant code chunks, builds a context
    prompt, and asks the LLM to synthesize an answer with citations.

    Args:
        library_id: ID of the ingested library.
        question: Natural language question about the codebase.
        max_chunks: Upper bound on chunks retrieved for context.
        model: LLM model name; empty string selects the default.

    Returns:
        QueryResult carrying the answer, cited sources and token usage,
        or an error description when lookup or generation fails.
    """
    from xml_pipeline.librarian.index import get_index
    from xml_pipeline.llm import complete

    # Resolve the library's structural index; bail out if unknown.
    index = await get_index(library_id)
    if not index:
        return QueryResult(
            answer="",
            error=f"Library not found: {library_id}",
        )

    # Retrieve candidate chunks relevant to the question.
    retrieved = await _search_chunks(library_id, question, max_chunks)
    if not retrieved:
        return QueryResult(
            answer=f"No relevant code found for your question in the '{index.name}' codebase.",
            chunks_examined=0,
        )

    rag_prompt = _build_rag_prompt(question, retrieved, index.name)

    # Synthesize the answer with the LLM; response attribute access stays
    # inside the try so any malformed response is reported, not raised.
    try:
        llm_response = await complete(
            model=model or "grok-4.1",  # Default model
            messages=[{"role": "user", "content": rag_prompt}],
        )
        answer_text = llm_response.content
        token_count = llm_response.usage.get("total_tokens", 0)
    except Exception as e:
        logger.error(f"LLM call failed: {e}")
        return QueryResult(
            answer="",
            error=f"Failed to generate answer: {e}",
            chunks_examined=len(retrieved),
        )

    # Map every retrieved chunk to a citation record.
    cited = [
        Source(
            file_path=ck.file_path,
            name=ck.name,
            chunk_type=ck.chunk_type,
            start_line=ck.start_line,
            end_line=ck.end_line,
            relevance_score=ck.score,
            snippet=ck.content[:200] if ck.content else "",
        )
        for ck in retrieved
    ]

    return QueryResult(
        answer=answer_text,
        sources=cited,
        tokens_used=token_count,
        chunks_examined=len(retrieved),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def format_sources_xml(sources: list[Source]) -> str:
    """Render a list of Source records as a <sources> XML fragment."""

    def _one(idx: int, src: Source) -> str:
        # Escape every text field; the snippet is capped at 100 chars.
        snippet = xml_escape(src.snippet[:100]) if src.snippet else ""
        return (
            f'    <source index="{idx}">\n'
            f"        <file-path>{xml_escape(src.file_path)}</file-path>\n"
            f"        <name>{xml_escape(src.name)}</name>\n"
            f"        <type>{xml_escape(src.chunk_type)}</type>\n"
            f"        <lines>{src.start_line}-{src.end_line}</lines>\n"
            f"        <score>{src.relevance_score:.2f}</score>\n"
            f"        <snippet>{snippet}</snippet>\n"
            f"    </source>"
        )

    items = [_one(idx, src) for idx, src in enumerate(sources, 1)]
    return "<sources>\n" + "\n".join(items) + "\n</sources>"
|
||||||
|
|
||||||
|
|
||||||
|
async def get_chunk_by_id(library_id: str, chunk_id: str) -> Optional[RetrievedChunk]:
    """
    Retrieve a specific chunk by ID.

    Useful for follow-up queries about a specific piece of code.

    Args:
        library_id: ID of the ingested library to look in.
        chunk_id: Exact chunk ID to fetch.

    Returns:
        The matching RetrievedChunk (score fixed at 1.0), or None when the
        chunk is missing or its XML cannot be parsed.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # XQuery string literals escape an embedded double quote by DOUBLING it
    # ("" not \"); backslash has no escaping role in XQuery and would have
    # corrupted the comparison value.  Bare & and < are also illegal inside
    # XQuery string literals, so entity-encode them too.
    chunk_id_escaped = (
        chunk_id.replace("&", "&amp;").replace("<", "&lt;").replace('"', '""')
    )

    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";

    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    where $chunk/l:id = "{chunk_id_escaped}"
    return $chunk
    """

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    if not result.success:
        return None

    raw = result.data.get("results", "")
    if not raw.strip():
        # No match: do not feed an empty document to the XML parser (that
        # would only raise and log a spurious parse warning below).
        return None

    try:
        from lxml import etree

        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}
        root = etree.fromstring(raw.encode())

        # The query may return the chunk element itself or a wrapper node.
        chunk_elem = root if root.tag.endswith("chunk") else root.find("l:chunk", namespaces=ns)
        if chunk_elem is None:
            return None

        return RetrievedChunk(
            chunk_id=chunk_elem.findtext("l:id", "", namespaces=ns),
            file_path=chunk_elem.findtext("l:file-path", "", namespaces=ns),
            name=chunk_elem.findtext("l:name", "", namespaces=ns),
            chunk_type=chunk_elem.findtext("l:chunk-type", "", namespaces=ns),
            language=chunk_elem.findtext("l:language", "", namespaces=ns),
            start_line=int(chunk_elem.findtext("l:start-line", "0", namespaces=ns)),
            end_line=int(chunk_elem.findtext("l:end-line", "0", namespaces=ns)),
            content=chunk_elem.findtext("l:content", "", namespaces=ns),
            docstring=chunk_elem.findtext("l:docstring", "", namespaces=ns),
            signature=chunk_elem.findtext("l:signature", "", namespaces=ns),
            score=1.0,
        )

    except Exception as e:
        logger.warning(f"Failed to parse chunk: {e}")
        return None
|
||||||
Loading…
Reference in a new issue