"""
index.py — Library index management for Premium Librarian.
Builds and queries structural indices for ingested codebases.
The index provides fast lookup of files, functions, and classes
without needing full-text search.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
from xml.sax.saxutils import escape as xml_escape
logger = logging.getLogger(__name__)
@dataclass
class LibraryIndex:
    """Structural index for an ingested library.

    Holds library metadata plus fast-lookup tables (file list,
    function/class name → file path maps) extracted from the
    library's stored chunks.
    """

    # Stable identifier; also used as the eXist-db collection name segment.
    library_id: str
    # Human-readable library name.
    name: str
    # Where the library was ingested from.
    source_url: str
    # ISO-8601 timestamp of when the index was built (UTC; see build_index).
    created_at: str
    # Sorted list of all file paths seen in the library's chunks.
    files: list[str] = field(default_factory=list)
    functions: dict[str, str] = field(default_factory=dict)  # name → file path
    classes: dict[str, str] = field(default_factory=dict)  # name → file path
    # File paths of chunks typed "module" (stored by path, not name — see build_index).
    modules: list[str] = field(default_factory=list)
    # Counters: "chunks", "files", "functions", "classes", plus "lang_<language>".
    stats: dict[str, int] = field(default_factory=dict)

    @property
    def total_chunks(self) -> int:
        """Total number of chunks in this library."""
        return self.stats.get("chunks", 0)

    @property
    def total_files(self) -> int:
        """Total number of files in this library."""
        return len(self.files)
def _index_to_xml(index: LibraryIndex) -> str:
    """Serialize *index* to the library-index XML document stored in eXist-db.

    Emits a ``<library-index>`` root in the librarian namespace whose
    structure matches exactly what ``_parse_index_xml`` reads back:
    metadata children plus ``files``/``functions``/``classes``/
    ``modules``/``stats`` sections.
    """
    files_xml = "\n".join(
        f"    <file>{xml_escape(f)}</file>" for f in index.files
    )
    # Attribute values need the double quote escaped as well; saxutils'
    # escape() only handles &, <, > unless given extra entities.
    attr = {'"': "&quot;"}
    functions_xml = "\n".join(
        f'    <function name="{xml_escape(name, attr)}" file="{xml_escape(path, attr)}"/>'
        for name, path in index.functions.items()
    )
    classes_xml = "\n".join(
        f'    <class name="{xml_escape(name, attr)}" file="{xml_escape(path, attr)}"/>'
        for name, path in index.classes.items()
    )
    modules_xml = "\n".join(
        f"    <module>{xml_escape(m)}</module>" for m in index.modules
    )
    stats_xml = "\n".join(
        f'    <stat name="{xml_escape(k, attr)}">{v}</stat>'
        for k, v in index.stats.items()
    )
    return f"""<library-index xmlns="https://xml-pipeline.org/ns/librarian/v1">
  <library-id>{xml_escape(index.library_id)}</library-id>
  <name>{xml_escape(index.name)}</name>
  <source-url>{xml_escape(index.source_url)}</source-url>
  <created-at>{xml_escape(index.created_at)}</created-at>
  <files>
{files_xml}
  </files>
  <functions>
{functions_xml}
  </functions>
  <classes>
{classes_xml}
  </classes>
  <modules>
{modules_xml}
  </modules>
  <stats>
{stats_xml}
  </stats>
</library-index>"""
def _parse_index_xml(xml_content: str) -> Optional[LibraryIndex]:
    """Parse a stored library-index XML document back into a LibraryIndex.

    Uses the stdlib parser — this read path needs only namespace-aware
    findtext/findall, so it must not fail just because lxml is missing
    (previously an ImportError here was silently reported as a parse
    failure). Returns None when the document is malformed.
    """
    try:
        import xml.etree.ElementTree as etree

        root = etree.fromstring(xml_content)
        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}
        library_id = root.findtext("l:library-id", "", namespaces=ns)
        name = root.findtext("l:name", "", namespaces=ns)
        source_url = root.findtext("l:source-url", "", namespaces=ns)
        created_at = root.findtext("l:created-at", "", namespaces=ns)
        files = [f.text or "" for f in root.findall("l:files/l:file", namespaces=ns)]
        functions = {
            f.get("name", ""): f.get("file", "")
            for f in root.findall("l:functions/l:function", namespaces=ns)
        }
        classes = {
            c.get("name", ""): c.get("file", "")
            for c in root.findall("l:classes/l:class", namespaces=ns)
        }
        modules = [m.text or "" for m in root.findall("l:modules/l:module", namespaces=ns)]
        stats = {
            s.get("name", ""): int(s.text or "0")
            for s in root.findall("l:stats/l:stat", namespaces=ns)
        }
        return LibraryIndex(
            library_id=library_id,
            name=name,
            source_url=source_url,
            created_at=created_at,
            files=files,
            functions=functions,
            classes=classes,
            modules=modules,
            stats=stats,
        )
    except Exception as e:
        # Boundary handler: a bad document must not crash callers.
        logger.error("Failed to parse index XML: %s", e)
        return None
async def build_index(
    library_id: str,
    library_name: str,
    source_url: str,
) -> LibraryIndex:
    """
    Build structural index from stored chunks.

    Queries eXist-db for all chunks belonging to this library, extracts
    structural information (files, functions, classes, modules, and
    per-language chunk counts), stores the resulting document as
    ``index.xml`` in the library's collection, and returns the index.
    If the chunk query fails, a metadata-only index is built instead.
    """
    from xml_pipeline.tools.librarian import librarian_query, librarian_store

    # Query for all chunks in this library. The returned <item> shape
    # must match the parse loop below (file/type/name/language).
    # NOTE(review): library_id is interpolated directly into the XQuery;
    # confirm upstream callers restrict it to a safe identifier alphabet.
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    return <item>
        <file>{{$chunk/l:file-path/text()}}</file>
        <type>{{$chunk/l:chunk-type/text()}}</type>
        <name>{{$chunk/l:name/text()}}</name>
        <language>{{$chunk/l:language/text()}}</language>
    </item>
    """
    result = await librarian_query(query=xquery, collection=f"/db/librarian/{library_id}")
    if not result.success:
        logger.warning(f"Failed to query chunks for index: {result.error}")
        # Create minimal index with metadata only.
        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
        )
    else:
        # Parse results
        files: set[str] = set()
        functions: dict[str, str] = {}
        classes: dict[str, str] = {}
        modules: list[str] = []
        lang_stats: dict[str, int] = {}
        chunk_count = 0
        try:
            from lxml import etree

            # The query returns a sequence of <item> elements; wrap them
            # in a single root so the result parses as one document.
            xml_str = f"<results>{result.data.get('results', '')}</results>"
            root = etree.fromstring(xml_str.encode())
            for item in root.findall("item"):
                chunk_count += 1
                file_path = item.findtext("file", "")
                chunk_type = item.findtext("type", "")
                name = item.findtext("name", "")
                language = item.findtext("language", "")
                if file_path:
                    files.add(file_path)
                    # Skip anonymous chunks: an empty name would collapse
                    # into a single meaningless "" lookup key.
                    if chunk_type in ("function", "method") and name:
                        functions[name] = file_path
                    elif chunk_type == "class" and name:
                        classes[name] = file_path
                    elif chunk_type == "module":
                        modules.append(file_path)
                if language:
                    lang_stats[language] = lang_stats.get(language, 0) + 1
        except Exception as e:
            # Best-effort: a malformed result set degrades to partial stats.
            logger.warning(f"Failed to parse chunk query results: {e}")
        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
            files=sorted(files),
            functions=functions,
            classes=classes,
            modules=modules,
            stats={
                "chunks": chunk_count,
                "files": len(files),
                "functions": len(functions),
                "classes": len(classes),
                **{f"lang_{k}": v for k, v in lang_stats.items()},
            },
        )
    # Store index document alongside the chunks collection.
    index_xml = _index_to_xml(index)
    store_result = await librarian_store(
        collection=f"/db/librarian/{library_id}",
        document_name="index.xml",
        content=index_xml,
    )
    if not store_result.success:
        # Non-fatal: the in-memory index is still returned to the caller.
        logger.warning(f"Failed to store index: {store_result.error}")
    return index
async def get_index(library_id: str) -> Optional[LibraryIndex]:
    """Fetch and parse the stored index document for *library_id*.

    Returns the parsed LibraryIndex, or None when the document does
    not exist in eXist-db or cannot be parsed.
    """
    from xml_pipeline.tools.librarian import librarian_get

    result = await librarian_get(f"/db/librarian/{library_id}/index.xml")
    if result.success:
        return _parse_index_xml(result.data.get("content", ""))
    return None
async def list_libraries() -> list[LibraryIndex]:
    """
    List all ingested libraries.

    Queries eXist-db for every stored library-index document and
    returns the parsed LibraryIndex objects. Documents that fail to
    parse are skipped; query failures yield an empty list.
    """
    from xml_pipeline.tools.librarian import librarian_query

    xquery = """
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $index in collection("/db/librarian")//l:library-index
    return $index
    """
    result = await librarian_query(query=xquery, collection="/db/librarian")
    if not result.success:
        logger.warning(f"Failed to list libraries: {result.error}")
        return []
    libraries: list[LibraryIndex] = []
    try:
        from lxml import etree

        xml_str = result.data.get("results", "")
        if xml_str.strip():
            # The query returns a bare sequence of library-index elements;
            # without an added root element a multi-library result is not
            # well-formed and a single result has no matching *child*,
            # so every lookup below would come back empty.
            wrapped = f"<libraries>{xml_str}</libraries>"
            root = etree.fromstring(wrapped.encode())
            for index_elem in root.findall(
                "{https://xml-pipeline.org/ns/librarian/v1}library-index"
            ):
                # Re-serialize each element and reuse the single parser.
                index_xml = etree.tostring(index_elem, encoding="unicode")
                index = _parse_index_xml(index_xml)
                if index:
                    libraries.append(index)
    except Exception as e:
        logger.warning(f"Failed to parse library list: {e}")
    return libraries
async def delete_library(library_id: str) -> bool:
    """Remove a library's entire collection (index + chunks) from eXist-db.

    Returns True when the removal query succeeds, False otherwise.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # NOTE(review): library_id is interpolated straight into the XQuery;
    # confirm upstream callers sanitize it (an embedded quote would break
    # or alter the query).
    xquery = f"""
    xmldb:remove("/db/librarian/{library_id}")
    """
    result = await librarian_query(query=xquery)
    if result.success:
        return True
    logger.warning(f"Failed to delete library {library_id}: {result.error}")
    return False