Implements an RLM-powered codebase intelligence system that: - Ingests git repositories and chunks code intelligently - Stores chunks in eXist-db for RAG retrieval - Answers natural language queries using LLM synthesis New package xml_pipeline/librarian/ with: - chunker.py: AST-based code chunking (Python, JS/TS, C++) - ingest.py: Git clone + file walking + chunk storage - index.py: Structural index building (files, functions, classes) - query.py: RAG search + LLM synthesis with source citations - primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.) - handler.py: Message handlers for organism integration Also adds GitPython and aiohttp as optional [librarian] dependencies. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
436 lines · 13 KiB · Python
"""
|
|
query.py — RAG-based query system for Premium Librarian.
|
|
|
|
Searches indexed codebases and synthesizes answers using Online LLM.
|
|
The flow: Search → Retrieve → Synthesize → Return with sources.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
from xml.sax.saxutils import escape as xml_escape
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class Source:
    """A source chunk used in answering a query.

    Returned inside a QueryResult so callers can cite the exact code
    locations an answer was synthesized from.
    """

    file_path: str  # Path of the file the chunk came from
    name: str  # Chunk identifier (e.g. function or class name) as indexed
    chunk_type: str  # Kind of chunk as recorded at index time — presumably "function", "class", etc.; confirm against chunker
    start_line: int  # First line of the chunk in the source file
    end_line: int  # Last line of the chunk in the source file
    relevance_score: float  # Search relevance; higher means more relevant
    snippet: str = ""  # First ~200 chars of content
|
|
|
|
|
|
@dataclass
class QueryResult:
    """Result of a library query."""

    answer: str  # LLM-synthesized answer text; empty when an error occurred
    sources: list[Source] = field(default_factory=list)  # Chunks used as context for the answer
    tokens_used: int = 0  # Total LLM tokens reported by the completion backend
    chunks_examined: int = 0  # Number of retrieved chunks fed into the prompt
    error: str = ""  # Human-readable failure description; empty on success
|
|
|
|
|
|
@dataclass
class RetrievedChunk:
    """A chunk retrieved from eXist-db for RAG."""

    chunk_id: str  # Stable chunk identifier (<l:id> in the stored document)
    file_path: str  # Path of the originating file
    name: str  # Chunk identifier (function/class name) as indexed
    chunk_type: str  # Kind of chunk as recorded at index time
    language: str  # Source language label, used for the Markdown fence in the prompt
    start_line: int  # First line of the chunk in the source file
    end_line: int  # Last line of the chunk in the source file
    content: str  # Full chunk source text
    docstring: str  # Docstring text extracted at index time (may be empty)
    signature: str  # Declaration/signature line extracted at index time (may be empty)
    score: float  # Search relevance; higher means more relevant
|
|
|
|
|
|
async def _search_chunks(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Search for relevant chunks using Lucene full-text search.

    Args:
        library_id: ID of the ingested library to search.
        query: Free-text search string.
        max_results: Maximum number of chunks to return.

    Returns:
        Chunks sorted by relevance score (name matches weighted highest,
        then content, then docstring). Falls back to a plain contains()
        search when the Lucene query fails.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape for embedding inside a double-quoted XQuery string literal:
    # XQuery escapes an embedded quote by DOUBLING it (backslash escapes
    # are not recognized), and a bare "&" would start an entity reference.
    # Single quotes need no escaping inside a double-quoted literal.
    query_escaped = query.replace("&", "&amp;").replace('"', '""')

    # Full-text search using Lucene
    xquery = f"""
declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
import module namespace ft = "http://exist-db.org/xquery/lucene";

for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
let $content := $chunk/l:content/text()
let $name := $chunk/l:name/text()
let $docstring := $chunk/l:docstring/text()
let $score := (
    if (ft:query($content, "{query_escaped}")) then ft:score($content) * 2
    else if (ft:query($name, "{query_escaped}")) then ft:score($name) * 3
    else if (ft:query($docstring, "{query_escaped}")) then ft:score($docstring)
    else 0
)
where $score > 0
order by $score descending
return <result score="{{$score}}">
    <id>{{$chunk/l:id/text()}}</id>
    <file-path>{{$chunk/l:file-path/text()}}</file-path>
    <name>{{$chunk/l:name/text()}}</name>
    <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
    <language>{{$chunk/l:language/text()}}</language>
    <start-line>{{$chunk/l:start-line/text()}}</start-line>
    <end-line>{{$chunk/l:end-line/text()}}</end-line>
    <signature>{{$chunk/l:signature/text()}}</signature>
    <docstring>{{$chunk/l:docstring/text()}}</docstring>
    <content>{{$chunk/l:content/text()}}</content>
</result>
"""

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    chunks: list[RetrievedChunk] = []

    if not result.success:
        logger.warning(f"Search failed: {result.error}")
        # Fall back to simple query without Lucene
        return await _search_chunks_fallback(library_id, query, max_results)

    try:
        from lxml import etree

        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())

        for item in root.findall("result")[:max_results]:
            # "or" guards tolerate empty elements/attributes: findtext()
            # returns "" for an empty element and int("")/float("") would
            # raise, discarding every result via the broad except below.
            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line") or 0),
                    end_line=int(item.findtext("end-line") or 0),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=float(item.get("score") or 0.0),
                )
            )

    except Exception as e:
        logger.warning(f"Failed to parse search results: {e}")

    return chunks
|
|
|
|
|
|
async def _search_chunks_fallback(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Fallback search using contains() when Lucene is not available.

    Less accurate but works without Lucene indexing. Results carry a
    synthetic, order-based score since contains() provides no relevance.

    Args:
        library_id: ID of the ingested library to search.
        query: Free-text search string; split into up to 5 terms.
        max_results: Maximum number of chunks to return.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape for embedding inside double-quoted XQuery string literals:
    # quotes are escaped by DOUBLING (backslashes are not escapes in
    # XQuery) and a bare "&" would start an entity reference.
    query_lower = query.lower().replace("&", "&amp;").replace('"', '""')
    terms = query_lower.split()

    # Build contains conditions
    conditions = []
    for term in terms[:5]:  # Limit to 5 terms
        conditions.append(
            f'(contains(lower-case($chunk/l:content), "{term}") or '
            f'contains(lower-case($chunk/l:name), "{term}") or '
            f'contains(lower-case($chunk/l:docstring), "{term}"))'
        )

    where_clause = " or ".join(conditions) if conditions else "true()"

    xquery = f"""
declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";

for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
where {where_clause}
return <result>
    <id>{{$chunk/l:id/text()}}</id>
    <file-path>{{$chunk/l:file-path/text()}}</file-path>
    <name>{{$chunk/l:name/text()}}</name>
    <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
    <language>{{$chunk/l:language/text()}}</language>
    <start-line>{{$chunk/l:start-line/text()}}</start-line>
    <end-line>{{$chunk/l:end-line/text()}}</end-line>
    <signature>{{$chunk/l:signature/text()}}</signature>
    <docstring>{{$chunk/l:docstring/text()}}</docstring>
    <content>{{$chunk/l:content/text()}}</content>
</result>
"""

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    chunks: list[RetrievedChunk] = []

    if not result.success:
        logger.warning(f"Fallback search failed: {result.error}")
        return chunks

    try:
        from lxml import etree

        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())

        for i, item in enumerate(root.findall("result")[:max_results]):
            # Assign decreasing score based on order
            score = 1.0 - (i * 0.05)

            # "or 0" tolerates empty elements: findtext() returns "" for
            # them and int("") would raise, discarding every result.
            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line") or 0),
                    end_line=int(item.findtext("end-line") or 0),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )

    except Exception as e:
        logger.warning(f"Failed to parse fallback search results: {e}")

    return chunks
|
|
|
|
|
|
def _build_rag_prompt(
|
|
question: str,
|
|
chunks: list[RetrievedChunk],
|
|
library_name: str,
|
|
) -> str:
|
|
"""Build the RAG prompt with retrieved context."""
|
|
context_parts = []
|
|
|
|
for i, chunk in enumerate(chunks, 1):
|
|
header = f"[{i}] {chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
|
|
if chunk.signature:
|
|
header += f"\n {chunk.signature}"
|
|
|
|
# Truncate content if too long
|
|
content = chunk.content
|
|
if len(content) > 2000:
|
|
content = content[:2000] + "\n... (truncated)"
|
|
|
|
context_parts.append(f"{header}\n```{chunk.language}\n{content}\n```")
|
|
|
|
context = "\n\n".join(context_parts)
|
|
|
|
return f"""You are a code assistant analyzing the "{library_name}" codebase.
|
|
|
|
Answer the following question based ONLY on the provided code context.
|
|
If the answer is not in the context, say so clearly.
|
|
Reference specific files and line numbers when relevant.
|
|
|
|
## Code Context
|
|
|
|
{context}
|
|
|
|
## Question
|
|
|
|
{question}
|
|
|
|
## Instructions
|
|
|
|
1. Answer based on the code context above
|
|
2. Cite sources using [1], [2], etc. format
|
|
3. Include relevant code snippets if helpful
|
|
4. Be concise but complete"""
|
|
|
|
|
|
async def query_library(
    library_id: str,
    question: str,
    max_chunks: int = 20,
    model: str = "",
) -> QueryResult:
    """
    Query an ingested library using RAG.

    Flow: look up the library index, retrieve relevant chunks, build a
    RAG prompt, call the LLM, and return the answer with cited sources.

    Args:
        library_id: ID of the ingested library
        question: Natural language question
        max_chunks: Maximum chunks to retrieve for context
        model: LLM model to use (empty = use default)

    Returns:
        QueryResult with answer and sources
    """
    from xml_pipeline.librarian.index import get_index
    from xml_pipeline.llm import complete

    # Resolve the library; bail out early if it was never ingested.
    index = await get_index(library_id)
    if not index:
        return QueryResult(
            answer="",
            error=f"Library not found: {library_id}",
        )

    # Retrieve candidate context chunks for the question.
    retrieved = await _search_chunks(library_id, question, max_chunks)
    if not retrieved:
        return QueryResult(
            answer=f"No relevant code found for your question in the '{index.name}' codebase.",
            chunks_examined=0,
        )

    rag_prompt = _build_rag_prompt(question, retrieved, index.name)

    # Synthesize the answer; any LLM failure is reported, not raised.
    try:
        response = await complete(
            model=model or "grok-4.1",  # Default model
            messages=[{"role": "user", "content": rag_prompt}],
        )
        answer_text = response.content
        token_count = response.usage.get("total_tokens", 0)
    except Exception as e:
        logger.error(f"LLM call failed: {e}")
        return QueryResult(
            answer="",
            error=f"Failed to generate answer: {e}",
            chunks_examined=len(retrieved),
        )

    # Every retrieved chunk becomes a citable source entry.
    cited: list[Source] = []
    for item in retrieved:
        cited.append(
            Source(
                file_path=item.file_path,
                name=item.name,
                chunk_type=item.chunk_type,
                start_line=item.start_line,
                end_line=item.end_line,
                relevance_score=item.score,
                snippet=item.content[:200] if item.content else "",
            )
        )

    return QueryResult(
        answer=answer_text,
        sources=cited,
        tokens_used=token_count,
        chunks_examined=len(retrieved),
    )
|
|
|
|
|
|
def format_sources_xml(sources: list[Source]) -> str:
    """Format sources as XML for LibrarianAnswer payload."""
    rendered: list[str] = []

    for idx, src in enumerate(sources, 1):
        # Snippets are capped at 100 chars and XML-escaped for safe embedding.
        snippet = xml_escape(src.snippet[:100]) if src.snippet else ""
        rendered.append(
            "\n".join(
                [
                    f'    <source index="{idx}">',
                    f"      <file-path>{xml_escape(src.file_path)}</file-path>",
                    f"      <name>{xml_escape(src.name)}</name>",
                    f"      <type>{xml_escape(src.chunk_type)}</type>",
                    f"      <lines>{src.start_line}-{src.end_line}</lines>",
                    f"      <score>{src.relevance_score:.2f}</score>",
                    f"      <snippet>{snippet}</snippet>",
                    "    </source>",
                ]
            )
        )

    return "<sources>\n" + "\n".join(rendered) + "\n</sources>"
|
|
|
|
|
|
async def get_chunk_by_id(library_id: str, chunk_id: str) -> Optional[RetrievedChunk]:
    """
    Retrieve a specific chunk by ID.

    Useful for follow-up queries about a specific piece of code.

    Args:
        library_id: ID of the ingested library.
        chunk_id: Value of the chunk's <l:id> element.

    Returns:
        The matching chunk (first match, with a neutral score of 1.0),
        or None when no chunk matches or the query/parse fails.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape for embedding inside a double-quoted XQuery string literal:
    # XQuery escapes an embedded quote by DOUBLING it (backslash escapes
    # are not recognized), and a bare "&" would start an entity reference.
    chunk_id_escaped = chunk_id.replace("&", "&amp;").replace('"', '""')

    xquery = f"""
declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";

for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
where $chunk/l:id = "{chunk_id_escaped}"
return $chunk
"""

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    if not result.success:
        return None

    try:
        from lxml import etree

        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}

        payload = result.data.get("results", "")
        if not payload.strip():
            # No match: avoid an fromstring() error on empty input.
            return None

        # Wrap in a container element so zero or multiple sibling <chunk>
        # results still parse — bare fromstring() rejects anything that is
        # not exactly one root element.
        root = etree.fromstring(f"<results>{payload}</results>".encode())

        chunk_elem = root.find("l:chunk", namespaces=ns)
        if chunk_elem is None:
            return None

        # "or 0" tolerates empty line elements (int("") would raise).
        return RetrievedChunk(
            chunk_id=chunk_elem.findtext("l:id", "", namespaces=ns),
            file_path=chunk_elem.findtext("l:file-path", "", namespaces=ns),
            name=chunk_elem.findtext("l:name", "", namespaces=ns),
            chunk_type=chunk_elem.findtext("l:chunk-type", "", namespaces=ns),
            language=chunk_elem.findtext("l:language", "", namespaces=ns),
            start_line=int(chunk_elem.findtext("l:start-line", namespaces=ns) or 0),
            end_line=int(chunk_elem.findtext("l:end-line", namespaces=ns) or 0),
            content=chunk_elem.findtext("l:content", "", namespaces=ns),
            docstring=chunk_elem.findtext("l:docstring", "", namespaces=ns),
            signature=chunk_elem.findtext("l:signature", "", namespaces=ns),
            score=1.0,  # Direct lookup: no search relevance to report
        )

    except Exception as e:
        logger.warning(f"Failed to parse chunk: {e}")
        return None
|