xml-pipeline/xml_pipeline/librarian/index.py
dullfig ce8a9ae0e7 Add Premium Librarian MVP for codebase intelligence
Implements an RLM-powered codebase intelligence system that:
- Ingests git repositories and chunks code intelligently
- Stores chunks in eXist-db for RAG retrieval
- Answers natural language queries using LLM synthesis

New package xml_pipeline/librarian/ with:
- chunker.py: AST-based code chunking (Python, JS/TS, C++)
- ingest.py: Git clone + file walking + chunk storage
- index.py: Structural index building (files, functions, classes)
- query.py: RAG search + LLM synthesis with source citations
- primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.)
- handler.py: Message handlers for organism integration

Also adds GitPython and aiohttp as optional [librarian] dependencies.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 23:07:19 -08:00

328 lines
9.7 KiB
Python

"""
index.py — Library index management for Premium Librarian.
Builds and queries structural indices for ingested codebases.
The index provides fast lookup of files, functions, and classes
without needing full-text search.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
from xml.sax.saxutils import escape as xml_escape
logger = logging.getLogger(__name__)
@dataclass
class LibraryIndex:
    """In-memory structural index describing one ingested library."""

    library_id: str
    name: str
    source_url: str
    created_at: str
    files: list[str] = field(default_factory=list)
    functions: dict[str, str] = field(default_factory=dict)  # function name -> containing file path
    classes: dict[str, str] = field(default_factory=dict)  # class name -> containing file path
    modules: list[str] = field(default_factory=list)
    stats: dict[str, int] = field(default_factory=dict)

    @property
    def total_chunks(self) -> int:
        """Number of stored chunks, read from the "chunks" stat (0 when absent)."""
        count = self.stats.get("chunks")
        return count if count is not None else 0

    @property
    def total_files(self) -> int:
        """Number of file paths recorded for this library."""
        return len(self.files)
def _index_to_xml(index: LibraryIndex) -> str:
    """Convert index to XML document for storage.

    Element text is escaped with xml_escape. Attribute values are
    additionally escaped for double quotes: xml.sax.saxutils.escape
    only handles & < > by default, so a name or path containing '"'
    would otherwise break the double-quoted attributes emitted below.
    """
    # Extra entity map for values placed inside double-quoted attributes.
    attr_entities = {'"': "&quot;"}

    def esc_attr(value: str) -> str:
        """Escape *value* for use inside a double-quoted XML attribute."""
        return xml_escape(value, attr_entities)

    files_xml = "\n".join(f"    <file>{xml_escape(f)}</file>" for f in index.files)
    functions_xml = "\n".join(
        f'    <function name="{esc_attr(name)}" file="{esc_attr(path)}"/>'
        for name, path in index.functions.items()
    )
    classes_xml = "\n".join(
        f'    <class name="{esc_attr(name)}" file="{esc_attr(path)}"/>'
        for name, path in index.classes.items()
    )
    modules_xml = "\n".join(f"    <module>{xml_escape(m)}</module>" for m in index.modules)
    stats_xml = "\n".join(
        f'    <stat name="{esc_attr(k)}">{v}</stat>'
        for k, v in index.stats.items()
    )
    return f"""<?xml version="1.0" encoding="UTF-8"?>
<library-index xmlns="https://xml-pipeline.org/ns/librarian/v1">
  <library-id>{xml_escape(index.library_id)}</library-id>
  <name>{xml_escape(index.name)}</name>
  <source-url>{xml_escape(index.source_url)}</source-url>
  <created-at>{xml_escape(index.created_at)}</created-at>
  <files>
{files_xml}
  </files>
  <functions>
{functions_xml}
  </functions>
  <classes>
{classes_xml}
  </classes>
  <modules>
{modules_xml}
  </modules>
  <stats>
{stats_xml}
  </stats>
</library-index>"""
def _parse_index_xml(xml_content: str) -> Optional[LibraryIndex]:
    """Reconstruct a LibraryIndex from its stored XML representation.

    Returns None (after logging) when the document cannot be parsed.
    """
    try:
        from lxml import etree
        doc = etree.fromstring(xml_content.encode())
        nsmap = {"l": "https://xml-pipeline.org/ns/librarian/v1"}

        def text_of(path: str) -> str:
            # Simple-element text lookup; empty string when missing.
            return doc.findtext(path, "", namespaces=nsmap)

        file_list = [el.text or "" for el in doc.findall("l:files/l:file", namespaces=nsmap)]
        fn_map: dict[str, str] = {}
        for el in doc.findall("l:functions/l:function", namespaces=nsmap):
            fn_map[el.get("name", "")] = el.get("file", "")
        cls_map: dict[str, str] = {}
        for el in doc.findall("l:classes/l:class", namespaces=nsmap):
            cls_map[el.get("name", "")] = el.get("file", "")
        module_list = [el.text or "" for el in doc.findall("l:modules/l:module", namespaces=nsmap)]
        stat_map: dict[str, int] = {}
        for el in doc.findall("l:stats/l:stat", namespaces=nsmap):
            stat_map[el.get("name", "")] = int(el.text or 0)

        return LibraryIndex(
            library_id=text_of("l:library-id"),
            name=text_of("l:name"),
            source_url=text_of("l:source-url"),
            created_at=text_of("l:created-at"),
            files=file_list,
            functions=fn_map,
            classes=cls_map,
            modules=module_list,
            stats=stat_map,
        )
    except Exception as e:
        logger.error(f"Failed to parse index XML: {e}")
        return None
async def build_index(
    library_id: str,
    library_name: str,
    source_url: str,
) -> LibraryIndex:
    """
    Build structural index from stored chunks.

    Queries eXist-db for all chunks belonging to this library
    and extracts structural information.

    Args:
        library_id: Collection id under /db/librarian holding the chunks.
        library_name: Human-readable name recorded in the index.
        source_url: Origin URL the library was ingested from.

    Returns:
        The freshly built LibraryIndex. The index is also stored as
        index.xml in the library's collection; a storage failure is
        logged but does not prevent the index from being returned.
    """
    # Imported lazily so this module can load without the optional
    # [librarian] dependencies installed.
    from xml_pipeline.tools.librarian import librarian_query, librarian_store
    # Query for all chunks in this library
    # NOTE(review): library_id is interpolated directly into the XQuery —
    # assumed to be generated internally and safe; confirm at call sites.
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    return <item>
        <file>{{$chunk/l:file-path/text()}}</file>
        <type>{{$chunk/l:chunk-type/text()}}</type>
        <name>{{$chunk/l:name/text()}}</name>
        <language>{{$chunk/l:language/text()}}</language>
    </item>
    """
    result = await librarian_query(query=xquery, collection=f"/db/librarian/{library_id}")
    if not result.success:
        logger.warning(f"Failed to query chunks for index: {result.error}")
        # Create minimal index
        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
        )
    else:
        # Parse results
        files: set[str] = set()
        functions: dict[str, str] = {}
        classes: dict[str, str] = {}
        modules: list[str] = []
        lang_stats: dict[str, int] = {}
        chunk_count = 0
        try:
            from lxml import etree
            # Wrap results in root element for parsing
            xml_str = f"<results>{result.data.get('results', '')}</results>"
            root = etree.fromstring(xml_str.encode())
            for item in root.findall("item"):
                chunk_count += 1
                file_path = item.findtext("file", "")
                chunk_type = item.findtext("type", "")
                name = item.findtext("name", "")
                language = item.findtext("language", "")
                if file_path:
                    files.add(file_path)
                # Methods are indexed alongside plain functions.
                if chunk_type == "function" or chunk_type == "method":
                    functions[name] = file_path
                elif chunk_type == "class":
                    classes[name] = file_path
                elif chunk_type == "module":
                    # Module chunks are recorded by file path, not name.
                    modules.append(file_path)
                if language:
                    lang_stats[language] = lang_stats.get(language, 0) + 1
        except Exception as e:
            # Best effort: a malformed result set yields a partial index
            # built from whatever items parsed before the failure.
            logger.warning(f"Failed to parse chunk query results: {e}")
        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
            files=sorted(files),
            functions=functions,
            classes=classes,
            modules=modules,
            stats={
                "chunks": chunk_count,
                "files": len(files),
                "functions": len(functions),
                "classes": len(classes),
                # Per-language chunk counts, keyed e.g. "lang_python".
                **{f"lang_{k}": v for k, v in lang_stats.items()},
            },
        )
    # Store index document
    index_xml = _index_to_xml(index)
    store_result = await librarian_store(
        collection=f"/db/librarian/{library_id}",
        document_name="index.xml",
        content=index_xml,
    )
    if not store_result.success:
        # Non-fatal: the caller still receives the in-memory index.
        logger.warning(f"Failed to store index: {store_result.error}")
    return index
async def get_index(library_id: str) -> Optional[LibraryIndex]:
    """
    Fetch the stored index document for a library from eXist-db.

    Returns None when the index document does not exist (or the
    retrieved content cannot be parsed).
    """
    from xml_pipeline.tools.librarian import librarian_get

    result = await librarian_get(f"/db/librarian/{library_id}/index.xml")
    if result.success:
        return _parse_index_xml(result.data.get("content", ""))
    return None
async def list_libraries() -> list[LibraryIndex]:
    """
    List all ingested libraries.

    Returns a LibraryIndex for every library-index document found
    under /db/librarian in eXist-db; an empty list on query failure.
    """
    from xml_pipeline.tools.librarian import librarian_query

    xquery = """
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $index in collection("/db/librarian")//l:library-index
    return $index
    """
    result = await librarian_query(query=xquery, collection="/db/librarian")
    if not result.success:
        logger.warning(f"Failed to list libraries: {result.error}")
        return []

    found: list[LibraryIndex] = []
    try:
        from lxml import etree

        raw = result.data.get("results", "")
        if raw.strip():
            # The query returns a sequence of documents; wrap them so
            # lxml sees a single root element.
            root = etree.fromstring(f"<results>{raw}</results>".encode())
            tag = "{https://xml-pipeline.org/ns/librarian/v1}library-index"
            for elem in root.findall(tag):
                parsed = _parse_index_xml(etree.tostring(elem, encoding="unicode"))
                if parsed is not None:
                    found.append(parsed)
    except Exception as e:
        logger.warning(f"Failed to parse library list: {e}")
    return found
async def delete_library(library_id: str) -> bool:
    """
    Delete a library and all its chunks from eXist-db.

    Args:
        library_id: Identifier of the library collection to remove.
            Must consist only of letters, digits, ".", "_" or "-";
            any other id is rejected so it cannot be interpreted as
            XQuery syntax or traverse outside /db/librarian.

    Returns:
        True if the collection was removed, False otherwise (including
        when the id is rejected as unsafe).
    """
    import re

    # The id is interpolated into the XQuery below; refuse anything that
    # could change the meaning of the query or escape the collection path.
    if not re.fullmatch(r"[A-Za-z0-9._-]+", library_id):
        logger.warning(f"Refusing to delete library with unsafe id: {library_id!r}")
        return False

    from xml_pipeline.tools.librarian import librarian_query

    # Delete the entire collection
    xquery = f"""
    xmldb:remove("/db/librarian/{library_id}")
    """
    result = await librarian_query(query=xquery)
    if not result.success:
        logger.warning(f"Failed to delete library {library_id}: {result.error}")
        return False
    return True