Implements an RLM-powered codebase intelligence system that:
- Ingests git repositories and chunks code intelligently
- Stores chunks in eXist-db for RAG retrieval
- Answers natural language queries using LLM synthesis

New package xml_pipeline/librarian/ with:
- chunker.py: AST-based code chunking (Python, JS/TS, C++)
- ingest.py: Git clone + file walking + chunk storage
- index.py: Structural index building (files, functions, classes)
- query.py: RAG search + LLM synthesis with source citations
- primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.)
- handler.py: Message handlers for organism integration

Also adds GitPython and aiohttp as optional [librarian] dependencies.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
167 lines
3.1 KiB
Python
167 lines
3.1 KiB
Python
"""
|
|
primitives.py — XML payload dataclasses for Premium Librarian.
|
|
|
|
These are the message types that flow through the organism's message bus.
|
|
|
|
Note: Do NOT use `from __future__ import annotations` here
|
|
as it breaks the xmlify decorator which needs concrete types.
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from third_party.xmlable import xmlify
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianIngest:
    """Payload asking the Premium Librarian to ingest a codebase.

    The codebase is addressed by a git URL. The library is cloned,
    chunked, and stored in eXist-db so that it can be queried afterwards.
    """

    # Git URL of the codebase to ingest.
    git_url: str = ""
    # Branch name; "main" unless the sender overrides it.
    branch: str = "main"
    # Optional; derived from the URL when left empty.
    library_name: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianIngested:
    """Reply sent once a codebase has been ingested successfully.

    Carries the ``library_id`` that subsequent queries must supply.
    """

    # Identifier needed for subsequent queries against this library.
    library_id: str = ""
    library_name: str = ""
    # Counters reported for the ingestion run.
    files_processed: int = 0
    chunks_created: int = 0
    index_built: bool = False
    # Error messages, newline-separated.
    errors: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianQuery:
    """Natural language question posed against an ingested library.

    The system searches for relevant code chunks and synthesizes an
    answer with the configured LLM.
    """

    library_id: str = ""
    question: str = ""
    # Upper bound on the number of chunks included in the context.
    max_chunks: int = 20
    # Optional; the default model is used when left empty.
    model: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianAnswer:
    """Reply to a library query.

    Holds the synthesized answer together with its source references.
    """

    answer: str = ""
    # Source list, XML-formatted.
    sources: str = ""
    tokens_used: int = 0
    chunks_examined: int = 0
    error: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianList:
    """Request for the list of all ingested libraries.

    Takes no parameters, so the payload declares no fields.
    """
|
|
|
|
|
|
@xmlify
@dataclass
class LibraryInfo:
    """Description of one ingested library."""

    # Identity of the library.
    library_id: str = ""
    name: str = ""
    source_url: str = ""
    # Creation time, carried as a string.
    created_at: str = ""
    # Size counters for the library.
    total_files: int = 0
    total_chunks: int = 0
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianLibraries:
    """Reply that lists every ingested library."""

    count: int = 0
    # Library list, XML-formatted.
    libraries: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianDelete:
    """Request asking for an ingested library to be deleted."""

    # Identifier of the library to delete.
    library_id: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianDeleted:
    """Reply sent after a library deletion."""

    library_id: str = ""
    # Defaults to False; the sender sets the outcome explicitly.
    success: bool = False
    error: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianGetChunk:
    """Request for one specific code chunk.

    Handy for inspecting source code that a query response referenced.
    """

    library_id: str = ""
    chunk_id: str = ""
|
|
|
|
|
|
@xmlify
@dataclass
class LibrarianChunk:
    """Reply carrying one specific code chunk."""

    # Identification of the chunk.
    chunk_id: str = ""
    file_path: str = ""
    name: str = ""
    chunk_type: str = ""
    language: str = ""
    # Line span of the chunk.
    start_line: int = 0
    end_line: int = 0
    # Chunk text and extracted metadata.
    content: str = ""
    docstring: str = ""
    signature: str = ""
    error: str = ""
|