xml-pipeline/xml_pipeline/librarian/query.py
dullfig ce8a9ae0e7 Add Premium Librarian MVP for codebase intelligence
Implements an RLM-powered codebase intelligence system that:
- Ingests git repositories and chunks code intelligently
- Stores chunks in eXist-db for RAG retrieval
- Answers natural language queries using LLM synthesis

New package xml_pipeline/librarian/ with:
- chunker.py: AST-based code chunking (Python, JS/TS, C++)
- ingest.py: Git clone + file walking + chunk storage
- index.py: Structural index building (files, functions, classes)
- query.py: RAG search + LLM synthesis with source citations
- primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.)
- handler.py: Message handlers for organism integration

Also adds GitPython and aiohttp as optional [librarian] dependencies.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 23:07:19 -08:00

436 lines
13 KiB
Python

"""
query.py — RAG-based query system for Premium Librarian.
Searches indexed codebases and synthesizes answers using Online LLM.
The flow: Search → Retrieve → Synthesize → Return with sources.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import Optional
from xml.sax.saxutils import escape as xml_escape
logger = logging.getLogger(__name__)
@dataclass
class Source:
    """A source chunk used in answering a query.

    Returned to callers inside QueryResult and rendered to XML by
    format_sources_xml for citation in answers.
    """

    file_path: str  # repo-relative path of the file the chunk came from
    name: str  # identifier of the chunk (e.g. function/class name)
    chunk_type: str  # chunk kind label — presumably values from the chunker; verify
    start_line: int  # first line of the chunk in file_path (1-based per usage)
    end_line: int  # last line of the chunk
    relevance_score: float  # search relevance; higher = more relevant
    snippet: str = "" # First ~200 chars of content
@dataclass
class QueryResult:
    """Result of a library query.

    Exactly one of `answer`/`error` is meaningfully set: on failure
    `answer` is empty and `error` describes the problem.
    """

    answer: str  # synthesized answer text ("" on error)
    sources: list[Source] = field(default_factory=list)  # chunks cited in the answer
    tokens_used: int = 0  # "total_tokens" reported by the LLM call (0 if unknown)
    chunks_examined: int = 0  # how many chunks were retrieved as context
    error: str = ""  # non-empty when the query failed
@dataclass
class RetrievedChunk:
    """A chunk retrieved from eXist-db for RAG.

    Field values mirror the <l:chunk> element stored under
    /db/librarian/{library_id}/chunks (see the XQuery builders below).
    """

    chunk_id: str  # <l:id> — unique id used by get_chunk_by_id
    file_path: str  # <l:file-path>
    name: str  # <l:name>
    chunk_type: str  # <l:chunk-type>
    language: str  # <l:language> — used as the fenced-code-block tag in prompts
    start_line: int  # <l:start-line>
    end_line: int  # <l:end-line>
    content: str  # <l:content> — full chunk source text
    docstring: str  # <l:docstring>
    signature: str  # <l:signature>
    score: float  # relevance score (Lucene score or synthetic fallback rank)
async def _search_chunks(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Search for relevant chunks using Lucene full-text search.

    Content matches are weighted x2 and name matches x3 relative to
    docstring matches; results come back sorted by score (descending).
    Falls back to a contains()-based search when the Lucene query fails,
    and returns an empty list if the result XML cannot be parsed.

    Args:
        library_id: ID of the ingested library (collection path segment).
        query: Free-text search string.
        max_results: Maximum number of chunks to return.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape for embedding in an XQuery double-quoted string literal.
    # XQuery has no backslash escapes: a quote inside a "..." literal is
    # escaped by DOUBLING it, and a bare "&" must be written as an entity
    # reference ("&amp;"). The previous backslash escaping injected stray
    # backslashes and let a quote terminate the literal (query injection).
    query_escaped = query.replace("&", "&amp;").replace('"', '""')

    # Full-text search using Lucene
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    import module namespace ft = "http://exist-db.org/xquery/lucene";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    let $content := $chunk/l:content/text()
    let $name := $chunk/l:name/text()
    let $docstring := $chunk/l:docstring/text()
    let $score := (
        if (ft:query($content, "{query_escaped}")) then ft:score($content) * 2
        else if (ft:query($name, "{query_escaped}")) then ft:score($name) * 3
        else if (ft:query($docstring, "{query_escaped}")) then ft:score($docstring)
        else 0
    )
    where $score > 0
    order by $score descending
    return <result score="{{$score}}">
        <id>{{$chunk/l:id/text()}}</id>
        <file-path>{{$chunk/l:file-path/text()}}</file-path>
        <name>{{$chunk/l:name/text()}}</name>
        <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
        <language>{{$chunk/l:language/text()}}</language>
        <start-line>{{$chunk/l:start-line/text()}}</start-line>
        <end-line>{{$chunk/l:end-line/text()}}</end-line>
        <signature>{{$chunk/l:signature/text()}}</signature>
        <docstring>{{$chunk/l:docstring/text()}}</docstring>
        <content>{{$chunk/l:content/text()}}</content>
    </result>
    """
    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )
    chunks: list[RetrievedChunk] = []
    if not result.success:
        # Lazy %-formatting: the message is only rendered if emitted.
        logger.warning("Search failed: %s", result.error)
        # Fall back to simple query without Lucene
        return await _search_chunks_fallback(library_id, query, max_results)
    try:
        from lxml import etree

        # Wrap the raw result fragments so they parse as one document.
        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())
        for item in root.findall("result")[:max_results]:
            score = float(item.get("score", 0))
            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line", "0")),
                    end_line=int(item.findtext("end-line", "0")),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )
    except Exception as e:
        # Best-effort: a malformed response yields whatever parsed so far.
        logger.warning("Failed to parse search results: %s", e)
    return chunks
async def _search_chunks_fallback(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Fallback search using contains() when Lucene is not available.

    Less accurate but works without Lucene indexing. Since contains()
    gives no relevance signal, scores are synthesized from result order
    (1.0, 0.95, 0.90, ...).
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Split into terms first; each term is escaped individually for an
    # XQuery double-quoted string literal. XQuery has no backslash
    # escapes — quotes are escaped by doubling them and "&" must become
    # an entity reference. The previous backslash escaping produced
    # invalid/injectable XQuery.
    terms = query.lower().split()

    # Build contains conditions
    conditions = []
    for term in terms[:5]:  # Limit to 5 terms
        term_escaped = term.replace("&", "&amp;").replace('"', '""')
        conditions.append(
            f'(contains(lower-case($chunk/l:content), "{term_escaped}") or '
            f'contains(lower-case($chunk/l:name), "{term_escaped}") or '
            f'contains(lower-case($chunk/l:docstring), "{term_escaped}"))'
        )
    where_clause = " or ".join(conditions) if conditions else "true()"
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    where {where_clause}
    return <result>
        <id>{{$chunk/l:id/text()}}</id>
        <file-path>{{$chunk/l:file-path/text()}}</file-path>
        <name>{{$chunk/l:name/text()}}</name>
        <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
        <language>{{$chunk/l:language/text()}}</language>
        <start-line>{{$chunk/l:start-line/text()}}</start-line>
        <end-line>{{$chunk/l:end-line/text()}}</end-line>
        <signature>{{$chunk/l:signature/text()}}</signature>
        <docstring>{{$chunk/l:docstring/text()}}</docstring>
        <content>{{$chunk/l:content/text()}}</content>
    </result>
    """
    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )
    chunks: list[RetrievedChunk] = []
    if not result.success:
        logger.warning("Fallback search failed: %s", result.error)
        return chunks
    try:
        from lxml import etree

        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())
        for i, item in enumerate(root.findall("result")[:max_results]):
            # Assign decreasing score based on order
            score = 1.0 - (i * 0.05)
            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line", "0")),
                    end_line=int(item.findtext("end-line", "0")),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )
    except Exception as e:
        logger.warning("Failed to parse fallback search results: %s", e)
    return chunks
def _build_rag_prompt(
question: str,
chunks: list[RetrievedChunk],
library_name: str,
) -> str:
"""Build the RAG prompt with retrieved context."""
context_parts = []
for i, chunk in enumerate(chunks, 1):
header = f"[{i}] {chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
if chunk.signature:
header += f"\n {chunk.signature}"
# Truncate content if too long
content = chunk.content
if len(content) > 2000:
content = content[:2000] + "\n... (truncated)"
context_parts.append(f"{header}\n```{chunk.language}\n{content}\n```")
context = "\n\n".join(context_parts)
return f"""You are a code assistant analyzing the "{library_name}" codebase.
Answer the following question based ONLY on the provided code context.
If the answer is not in the context, say so clearly.
Reference specific files and line numbers when relevant.
## Code Context
{context}
## Question
{question}
## Instructions
1. Answer based on the code context above
2. Cite sources using [1], [2], etc. format
3. Include relevant code snippets if helpful
4. Be concise but complete"""
async def query_library(
    library_id: str,
    question: str,
    max_chunks: int = 20,
    model: str = "",
) -> QueryResult:
    """
    Query an ingested library using RAG.

    Flow: look up the library index, retrieve relevant chunks, build a
    RAG prompt, synthesize an answer with the LLM, and attach citations.

    Args:
        library_id: ID of the ingested library
        question: Natural language question
        max_chunks: Maximum chunks to retrieve for context
        model: LLM model to use (empty = use default)

    Returns:
        QueryResult with answer and sources. On failure (unknown library
        or LLM error) `answer` is empty and `error` is set.
    """
    from xml_pipeline.librarian.index import get_index
    from xml_pipeline.llm import complete

    # Get library info; bail out early if the library was never ingested.
    index = await get_index(library_id)
    if not index:
        return QueryResult(
            answer="",
            error=f"Library not found: {library_id}",
        )

    # Search for relevant chunks
    chunks = await _search_chunks(library_id, question, max_chunks)
    if not chunks:
        return QueryResult(
            answer=f"No relevant code found for your question in the '{index.name}' codebase.",
            chunks_examined=0,
        )

    # Build RAG prompt
    prompt = _build_rag_prompt(question, chunks, index.name)

    # Call LLM
    try:
        response = await complete(
            model=model or "grok-4.1",  # Default model
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        answer = response.content
        tokens_used = response.usage.get("total_tokens", 0)
    except Exception as e:
        # logger.exception records the traceback; the previous
        # logger.error(f"...") call silently dropped it.
        logger.exception("LLM call failed")
        return QueryResult(
            answer="",
            error=f"Failed to generate answer: {e}",
            chunks_examined=len(chunks),
        )

    # Map retrieved chunks onto citation sources for the caller.
    sources = [
        Source(
            file_path=chunk.file_path,
            name=chunk.name,
            chunk_type=chunk.chunk_type,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            relevance_score=chunk.score,
            snippet=chunk.content[:200] if chunk.content else "",
        )
        for chunk in chunks
    ]
    return QueryResult(
        answer=answer,
        sources=sources,
        tokens_used=tokens_used,
        chunks_examined=len(chunks),
    )
def format_sources_xml(sources: list[Source]) -> str:
    """Render the source list as a <sources> XML fragment for LibrarianAnswer."""
    rendered: list[str] = []
    for idx, src in enumerate(sources, 1):
        # Cap the snippet at 100 chars and escape XML metacharacters.
        snippet_text = xml_escape(src.snippet[:100]) if src.snippet else ""
        rendered.append(
            f'  <source index="{idx}">\n'
            f"    <file-path>{xml_escape(src.file_path)}</file-path>\n"
            f"    <name>{xml_escape(src.name)}</name>\n"
            f"    <type>{xml_escape(src.chunk_type)}</type>\n"
            f"    <lines>{src.start_line}-{src.end_line}</lines>\n"
            f"    <score>{src.relevance_score:.2f}</score>\n"
            f"    <snippet>{snippet_text}</snippet>\n"
            "  </source>"
        )
    return "<sources>\n" + "\n".join(rendered) + "\n</sources>"
async def get_chunk_by_id(library_id: str, chunk_id: str) -> Optional[RetrievedChunk]:
    """
    Retrieve a specific chunk by ID.

    Useful for follow-up queries about a specific piece of code.
    Returns None when the query fails, no chunk matches, or the result
    XML cannot be parsed.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape for an XQuery double-quoted string literal: XQuery has no
    # backslash escapes, so a quote is escaped by doubling it and "&"
    # must be an entity reference. The previous '\\"' escaping produced
    # invalid/injectable XQuery.
    chunk_id_escaped = chunk_id.replace("&", "&amp;").replace('"', '""')
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    where $chunk/l:id = "{chunk_id_escaped}"
    return $chunk
    """
    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )
    if not result.success:
        return None
    try:
        from lxml import etree

        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}
        root = etree.fromstring(result.data.get("results", "").encode())
        # The result may be the chunk element itself or a wrapper around it.
        chunk_elem = root if root.tag.endswith("chunk") else root.find("l:chunk", namespaces=ns)
        if chunk_elem is None:
            return None
        return RetrievedChunk(
            chunk_id=chunk_elem.findtext("l:id", "", namespaces=ns),
            file_path=chunk_elem.findtext("l:file-path", "", namespaces=ns),
            name=chunk_elem.findtext("l:name", "", namespaces=ns),
            chunk_type=chunk_elem.findtext("l:chunk-type", "", namespaces=ns),
            language=chunk_elem.findtext("l:language", "", namespaces=ns),
            start_line=int(chunk_elem.findtext("l:start-line", "0", namespaces=ns)),
            end_line=int(chunk_elem.findtext("l:end-line", "0", namespaces=ns)),
            content=chunk_elem.findtext("l:content", "", namespaces=ns),
            docstring=chunk_elem.findtext("l:docstring", "", namespaces=ns),
            signature=chunk_elem.findtext("l:signature", "", namespaces=ns),
            score=1.0,  # Direct lookup has no search score; use a neutral 1.0.
        )
    except Exception as e:
        logger.warning("Failed to parse chunk: %s", e)
        return None