Add Premium Librarian MVP for codebase intelligence
Implements an RLM-powered codebase intelligence system that: - Ingests git repositories and chunks code intelligently - Stores chunks in eXist-db for RAG retrieval - Answers natural language queries using LLM synthesis New package xml_pipeline/librarian/ with: - chunker.py: AST-based code chunking (Python, JS/TS, C++) - ingest.py: Git clone + file walking + chunk storage - index.py: Structural index building (files, functions, classes) - query.py: RAG search + LLM synthesis with source citations - primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.) - handler.py: Message handlers for organism integration Also adds GitPython and aiohttp as optional [librarian] dependencies. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d0d78a9f70
commit
ce8a9ae0e7
10 changed files with 3021 additions and 1 deletions
|
|
@ -77,6 +77,9 @@ openai = ["openai>=1.0"]
|
||||||
redis = ["redis>=5.0"] # Distributed key-value store
|
redis = ["redis>=5.0"] # Distributed key-value store
|
||||||
search = ["duckduckgo-search>=6.0"] # Web search tool
|
search = ["duckduckgo-search>=6.0"] # Web search tool
|
||||||
|
|
||||||
|
# Premium Librarian (codebase intelligence)
|
||||||
|
librarian = ["gitpython>=3.1", "aiohttp>=3.9"]
|
||||||
|
|
||||||
# Console example (optional, for interactive use)
|
# Console example (optional, for interactive use)
|
||||||
console = ["prompt_toolkit>=3.0"]
|
console = ["prompt_toolkit>=3.0"]
|
||||||
|
|
||||||
|
|
@ -91,7 +94,7 @@ server = [
|
||||||
llm = ["xml-pipeline[anthropic,openai]"]
|
llm = ["xml-pipeline[anthropic,openai]"]
|
||||||
|
|
||||||
# All tools
|
# All tools
|
||||||
tools = ["xml-pipeline[redis,search]"]
|
tools = ["xml-pipeline[redis,search,librarian]"]
|
||||||
|
|
||||||
# Everything (for local development)
|
# Everything (for local development)
|
||||||
all = ["xml-pipeline[llm,tools,console,server]"]
|
all = ["xml-pipeline[llm,tools,console,server]"]
|
||||||
|
|
|
||||||
375
tests/test_librarian_chunker.py
Normal file
375
tests/test_librarian_chunker.py
Normal file
|
|
@ -0,0 +1,375 @@
|
||||||
|
"""
|
||||||
|
Tests for the Premium Librarian code chunker.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.chunker import (
|
||||||
|
Chunk,
|
||||||
|
chunk_file,
|
||||||
|
chunk_python,
|
||||||
|
chunk_javascript,
|
||||||
|
chunk_cpp,
|
||||||
|
chunk_prose,
|
||||||
|
chunk_generic,
|
||||||
|
detect_language,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLanguageDetection:
    """Tests for language detection from file paths."""

    def test_python_detection(self) -> None:
        """Source and stub files both map to "python"."""
        for path in ("foo.py", "path/to/module.py", "types.pyi"):
            assert detect_language(path) == "python"

    def test_javascript_detection(self) -> None:
        """Classic, JSX, and ES-module extensions map to "javascript"."""
        for path in ("app.js", "component.jsx", "index.mjs"):
            assert detect_language(path) == "javascript"

    def test_typescript_detection(self) -> None:
        """TS and TSX files map to "typescript"."""
        for path in ("app.ts", "component.tsx"):
            assert detect_language(path) == "typescript"

    def test_cpp_detection(self) -> None:
        """Common C++ source and header extensions map to "cpp"."""
        for path in ("main.cpp", "header.hpp", "source.cc"):
            assert detect_language(path) == "cpp"

    def test_c_detection(self) -> None:
        """Plain C files and headers map to "c"."""
        for path in ("main.c", "header.h"):
            assert detect_language(path) == "c"

    def test_unknown_language(self) -> None:
        """Unrecognized or missing extensions report "unknown"."""
        for path in ("data.xyz", "noextension"):
            assert detect_language(path) == "unknown"

    def test_case_insensitive(self) -> None:
        """Extension matching ignores case."""
        assert detect_language("Module.PY") == "python"
        assert detect_language("APP.JS") == "javascript"
||||||
|
|
||||||
|
class TestPythonChunker:
    """Tests for Python AST-based chunking."""

    def test_simple_function(self) -> None:
        """A single top-level function becomes one 'function' chunk."""
        code = '''
def hello(name: str) -> str:
    """Say hello."""
    return f"Hello, {name}!"
'''
        chunks = chunk_python(code, "test.py")
        assert len(chunks) == 1
        assert chunks[0].name == "hello"
        assert chunks[0].chunk_type == "function"
        assert chunks[0].docstring == "Say hello."
        # The extracted signature should carry the type annotations.
        assert "str" in chunks[0].signature

    def test_async_function(self) -> None:
        """Async defs chunk like functions, with 'async' in the signature."""
        code = '''
async def fetch_data(url: str) -> dict:
    """Fetch data from URL."""
    pass
'''
        chunks = chunk_python(code, "test.py")
        assert len(chunks) == 1
        assert chunks[0].name == "fetch_data"
        assert chunks[0].chunk_type == "function"
        assert "async" in chunks[0].signature

    def test_class_with_methods(self) -> None:
        """A small class is kept together as a single 'class' chunk."""
        code = '''
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        """Subtract two numbers."""
        return a - b
'''
        chunks = chunk_python(code, "test.py")
        # Should create a class chunk (small enough to keep together)
        assert len(chunks) >= 1
        class_chunk = [c for c in chunks if c.chunk_type == "class"]
        assert len(class_chunk) == 1
        assert class_chunk[0].name == "Calculator"
        assert class_chunk[0].docstring == "A simple calculator."

    def test_imports_extracted(self) -> None:
        """Module-level imports are attached to each chunk as context."""
        code = '''
import os
from typing import Optional, List

def process():
    pass
'''
        chunks = chunk_python(code, "test.py")
        assert len(chunks) == 1
        assert "import os" in chunks[0].imports
        assert any("from typing import" in imp for imp in chunks[0].imports)

    def test_empty_file(self) -> None:
        """An empty file yields no chunks."""
        chunks = chunk_python("", "test.py")
        assert len(chunks) == 0

    def test_module_with_only_imports(self) -> None:
        """Files without defs yield either nothing or one module chunk."""
        code = '''
import os
import sys
'''
        chunks = chunk_python(code, "test.py")
        # Should create a module chunk for files with no functions/classes
        assert len(chunks) == 0 or chunks[0].chunk_type == "module"

    def test_syntax_error_fallback(self) -> None:
        """Unparseable source must not raise; it falls back to generic chunking."""
        code = '''
def broken(
    # Missing closing paren
'''
        chunks = chunk_python(code, "test.py")
        # Should fall back to generic chunking
        assert len(chunks) >= 0  # May or may not produce chunks
|
||||||
|
class TestJavaScriptChunker:
    """Tests for JavaScript regex-based chunking."""

    def test_function_declaration(self) -> None:
        """A plain function declaration becomes one 'function' chunk."""
        code = '''
function greet(name) {
    return `Hello, ${name}!`;
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "greet"
        assert chunks[0].chunk_type == "function"

    def test_async_function(self) -> None:
        """Async function declarations are detected and named."""
        code = '''
async function fetchData(url) {
    const response = await fetch(url);
    return response.json();
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "fetchData"

    def test_arrow_function(self) -> None:
        """A const-bound arrow function counts as a named function."""
        code = '''
const multiply = (a, b) => {
    return a * b;
};
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "multiply"
        assert chunks[0].chunk_type == "function"

    def test_class_definition(self) -> None:
        """A class body with methods yields exactly one 'class' chunk."""
        code = '''
class Calculator {
    constructor() {
        this.result = 0;
    }

    add(value) {
        this.result += value;
        return this;
    }
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) >= 1
        class_chunks = [c for c in chunks if c.chunk_type == "class"]
        assert len(class_chunks) == 1
        assert class_chunks[0].name == "Calculator"

    def test_export_function(self) -> None:
        """The 'export' keyword does not hide the function name."""
        code = '''
export function exportedFunc() {
    return 42;
}
'''
        chunks = chunk_javascript(code, "test.js")
        assert len(chunks) == 1
        assert chunks[0].name == "exportedFunc"

    def test_imports_extracted(self) -> None:
        """ES imports (and require calls) are captured as chunk context."""
        code = '''
import React from 'react';
import { useState } from 'react';
const lodash = require('lodash');

function Component() {
    return null;
}
'''
        chunks = chunk_javascript(code, "test.jsx")
        assert len(chunks) >= 1
        assert any("import React" in imp for imp in chunks[0].imports)
|
||||||
|
class TestCppChunker:
    """Tests for C++ regex-based chunking."""

    def test_function_definition(self) -> None:
        """A free function definition yields one 'function' chunk."""
        code = '''
int add(int a, int b) {
    return a + b;
}
'''
        chunks = chunk_cpp(code, "test.cpp")
        assert len(chunks) >= 1
        func_chunks = [c for c in chunks if c.chunk_type == "function"]
        assert len(func_chunks) == 1
        assert func_chunks[0].name == "add"

    def test_class_definition(self) -> None:
        """A class declaration yields one 'class' chunk."""
        code = '''
class Calculator {
public:
    int add(int a, int b);
    int subtract(int a, int b);
};
'''
        chunks = chunk_cpp(code, "test.cpp")
        assert len(chunks) >= 1
        class_chunks = [c for c in chunks if c.chunk_type == "class"]
        assert len(class_chunks) == 1
        assert class_chunks[0].name == "Calculator"

    def test_includes_extracted(self) -> None:
        """#include directives (angle and quoted forms) become chunk context."""
        code = '''
#include <iostream>
#include "myheader.h"

int main() {
    return 0;
}
'''
        chunks = chunk_cpp(code, "test.cpp")
        assert len(chunks) >= 1
        assert any("#include <iostream>" in imp for imp in chunks[0].imports)
|
||||||
|
class TestProseChunker:
    """Tests for prose document chunking."""

    def test_markdown_headings(self) -> None:
        """Markdown splits at headings; chunks are named after their heading."""
        content = '''# Introduction

This is the introduction section.

## Getting Started

Follow these steps to get started.

## Advanced Topics

More advanced content here.
'''
        chunks = chunk_prose(content, "readme.md", "markdown")
        assert len(chunks) >= 2
        # First chunk should be introduction
        assert chunks[0].name == "Introduction"

    def test_empty_document(self) -> None:
        """An empty document yields no chunks."""
        chunks = chunk_prose("", "empty.md", "markdown")
        assert len(chunks) == 0
|
||||||
|
class TestGenericChunker:
    """Tests for generic line-based chunking."""

    def test_small_file(self) -> None:
        """A short file is returned whole as exactly one chunk."""
        text = "line1\nline2\nline3"
        result = chunk_generic(text, "test.txt", "text")
        assert len(result) == 1
        assert result[0].content == text

    def test_empty_file(self) -> None:
        """An empty file yields no chunks at all."""
        result = chunk_generic("", "empty.txt", "text")
        assert len(result) == 0
|
||||||
|
class TestChunkFile:
    """Tests for the main chunk_file dispatcher."""

    def test_dispatches_to_python(self) -> None:
        """.py files go through the Python chunker."""
        for piece in chunk_file("def foo(): pass", "test.py"):
            assert piece.language == "python"

    def test_dispatches_to_javascript(self) -> None:
        """.js files go through the JavaScript chunker."""
        for piece in chunk_file("function foo() {}", "test.js"):
            assert piece.language == "javascript"

    def test_dispatches_to_cpp(self) -> None:
        """.cpp files go through the C++ chunker."""
        for piece in chunk_file("int main() { return 0; }", "test.cpp"):
            assert piece.language == "cpp"

    def test_unknown_language_uses_generic(self) -> None:
        """Unrecognized extensions fall back to the generic chunker."""
        for piece in chunk_file("some content", "test.xyz"):
            assert piece.language == "unknown"
|
||||||
|
class TestChunkProperties:
    """Tests for Chunk dataclass properties."""

    def test_chunk_id_generation(self) -> None:
        """chunk_id is non-empty and embeds the file path and chunk name."""
        chunk = Chunk(
            content="def foo(): pass",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        assert chunk.chunk_id
        assert "test.py" in chunk.chunk_id
        assert "foo" in chunk.chunk_id

    def test_chunk_id_uniqueness(self) -> None:
        """Same location and name, different content -> distinct IDs."""
        chunk1 = Chunk(
            content="def foo(): pass",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        chunk2 = Chunk(
            content="def foo(): return 1",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        # Different content should produce different IDs
        assert chunk1.chunk_id != chunk2.chunk_id

    def test_line_count(self) -> None:
        """line_count is inclusive of both the start and the end line."""
        chunk = Chunk(
            content="line1\nline2\nline3",
            file_path="test.py",
            start_line=10,
            end_line=12,
            chunk_type="block",
            name="test",
            language="python",
        )
        assert chunk.line_count == 3
||||||
292
tests/test_librarian_query.py
Normal file
292
tests/test_librarian_query.py
Normal file
|
|
@ -0,0 +1,292 @@
|
||||||
|
"""
|
||||||
|
Integration tests for Premium Librarian query system.
|
||||||
|
|
||||||
|
These tests require:
|
||||||
|
- eXist-db running (for storage)
|
||||||
|
- LLM router configured (for synthesis)
|
||||||
|
|
||||||
|
Mark with @pytest.mark.integration to skip in CI without dependencies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, patch, MagicMock
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.query import (
|
||||||
|
QueryResult,
|
||||||
|
Source,
|
||||||
|
RetrievedChunk,
|
||||||
|
_build_rag_prompt,
|
||||||
|
format_sources_xml,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.index import LibraryIndex
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildRagPrompt:
    """Tests for RAG prompt construction."""

    def test_builds_prompt_with_context(self) -> None:
        """The prompt names the library, question, files, numbered refs, and code fences."""
        chunks = [
            RetrievedChunk(
                chunk_id="test:foo:abc123",
                file_path="src/utils.py",
                name="calculate",
                chunk_type="function",
                language="python",
                start_line=10,
                end_line=20,
                content="def calculate(x): return x * 2",
                docstring="Calculate double.",
                signature="def calculate(x) -> int",
                score=0.9,
            ),
            RetrievedChunk(
                chunk_id="test:bar:def456",
                file_path="src/main.py",
                name="main",
                chunk_type="function",
                language="python",
                start_line=1,
                end_line=5,
                content="def main(): print('hello')",
                docstring="",
                signature="def main()",
                score=0.7,
            ),
        ]

        prompt = _build_rag_prompt(
            question="How does the calculate function work?",
            chunks=chunks,
            library_name="test-lib",
        )

        # Verify prompt structure
        assert "test-lib" in prompt
        assert "calculate function" in prompt
        assert "src/utils.py" in prompt
        assert "src/main.py" in prompt
        # Chunks are cited with 1-based bracket indices.
        assert "[1]" in prompt
        assert "[2]" in prompt
        assert "```python" in prompt

    def test_truncates_long_content(self) -> None:
        """Chunk bodies over the limit are cut and marked '(truncated)'."""
        long_content = "x" * 3000  # Longer than 2000 char limit
        chunks = [
            RetrievedChunk(
                chunk_id="test:long:123",
                file_path="long.py",
                name="long_func",
                chunk_type="function",
                language="python",
                start_line=1,
                end_line=100,
                content=long_content,
                docstring="",
                signature="",
                score=0.5,
            ),
        ]

        prompt = _build_rag_prompt("What?", chunks, "lib")

        # Content should be truncated
        assert "(truncated)" in prompt
        # Should not contain full content
        assert long_content not in prompt

    def test_empty_chunks_list(self) -> None:
        """An empty retrieval still produces a prompt with library and question."""
        prompt = _build_rag_prompt("What?", [], "lib")
        assert "lib" in prompt
        assert "Question" in prompt
||||||
|
|
||||||
|
class TestFormatSourcesXml:
    """Tests for XML source formatting."""

    def test_formats_sources_as_xml(self) -> None:
        """Each source becomes an indexed <source> element with its fields."""
        sources = [
            Source(
                file_path="src/app.py",
                name="process",
                chunk_type="function",
                start_line=10,
                end_line=25,
                relevance_score=0.95,
                snippet="def process(data): ...",
            ),
        ]

        xml = format_sources_xml(sources)

        assert "<sources>" in xml
        assert "</sources>" in xml
        assert "<source index=\"1\">" in xml
        assert "<file-path>src/app.py</file-path>" in xml
        assert "<name>process</name>" in xml
        assert "<type>function</type>" in xml
        assert "<lines>10-25</lines>" in xml
        assert "<score>0.95</score>" in xml

    def test_escapes_special_characters(self) -> None:
        """Sources containing markup characters still format without error."""
        sources = [
            Source(
                file_path="src/<special>.py",
                name="func&name",
                chunk_type="function",
                start_line=1,
                end_line=1,
                relevance_score=0.5,
                snippet="code with <tags> & entities",
            ),
        ]

        xml = format_sources_xml(sources)

        # XML entities should be escaped
        # NOTE(review): these assertions check for the RAW "<special>" and
        # "func&name" text, but an escaping formatter would emit
        # "&lt;special&gt;" / "func&amp;name" instead — the assertions
        # contradict the comment above. Confirm against format_sources_xml
        # whether escaping is actually intended here.
        assert "<special>" in xml
        assert "func&name" in xml

    def test_empty_sources_list(self) -> None:
        """No sources still yields a well-formed empty <sources> wrapper."""
        xml = format_sources_xml([])

        assert "<sources>" in xml
        assert "</sources>" in xml
||||||
|
class TestQueryResultDataclass:
    """Tests for QueryResult dataclass."""

    def test_default_values(self) -> None:
        """Only 'answer' is required; all other fields default to neutral values."""
        result = QueryResult(answer="Test answer")

        assert result.answer == "Test answer"
        assert result.sources == []
        assert result.tokens_used == 0
        assert result.chunks_examined == 0
        assert result.error == ""

    def test_with_sources(self) -> None:
        """Explicitly supplied sources and counters are stored as given."""
        sources = [
            Source(
                file_path="test.py",
                name="test",
                chunk_type="function",
                start_line=1,
                end_line=10,
                relevance_score=0.9,
            ),
        ]

        result = QueryResult(
            answer="Test answer",
            sources=sources,
            tokens_used=100,
            chunks_examined=5,
        )

        assert len(result.sources) == 1
        assert result.tokens_used == 100
        assert result.chunks_examined == 5
|
||||||
|
|
||||||
|
class TestRetrievedChunk:
    """Tests for RetrievedChunk dataclass."""

    def test_all_fields(self) -> None:
        """A fully populated chunk round-trips every constructor argument."""
        chunk = RetrievedChunk(
            chunk_id="lib:file:hash",
            file_path="src/module.py",
            name="my_function",
            chunk_type="function",
            language="python",
            start_line=10,
            end_line=20,
            content="def my_function(): pass",
            docstring="Does something.",
            signature="def my_function() -> None",
            score=0.85,
        )

        assert chunk.chunk_id == "lib:file:hash"
        assert chunk.file_path == "src/module.py"
        assert chunk.name == "my_function"
        assert chunk.language == "python"
        assert chunk.score == 0.85
|
||||||
|
|
||||||
|
@pytest.mark.integration
class TestQueryLibraryIntegration:
    """Integration tests requiring eXist-db and LLM."""

    async def test_query_nonexistent_library(self) -> None:
        """Query should return error for non-existent library."""
        from xml_pipeline.librarian.query import query_library

        # Mock get_index to return None - patch at index module level
        with patch("xml_pipeline.librarian.index.get_index", new_callable=AsyncMock) as mock_get_index:
            mock_get_index.return_value = None

            result = await query_library(
                library_id="nonexistent-lib-xyz",
                question="What does this do?",
            )

            # The failure is reported via the error field, not an exception.
            assert result.error
            assert "not found" in result.error.lower()

    async def test_query_with_no_relevant_chunks(self) -> None:
        """Query should handle case where search returns no results."""
        from xml_pipeline.librarian.query import query_library

        mock_index = LibraryIndex(
            library_id="test-lib",
            name="Test Library",
            source_url="https://example.com/repo",
            created_at="2024-01-01T00:00:00Z",
        )

        # Patch get_index at the index module level (where it's defined)
        # and _search_chunks at query module level
        with patch("xml_pipeline.librarian.index.get_index", new_callable=AsyncMock) as mock_get_index:
            mock_get_index.return_value = mock_index

            with patch("xml_pipeline.librarian.query._search_chunks", new_callable=AsyncMock) as mock_search:
                mock_search.return_value = []

                result = await query_library(
                    library_id="test-lib",
                    question="What does foo do?",
                )

                # Empty retrieval must degrade gracefully, not error out.
                assert "No relevant code found" in result.answer
                assert result.chunks_examined == 0
||||||
|
|
||||||
|
class TestLibraryIndex:
    """Tests for LibraryIndex dataclass."""

    def test_properties(self) -> None:
        """Totals are read out of the 'stats' mapping."""
        index = LibraryIndex(
            library_id="test-id",
            name="Test Lib",
            source_url="https://github.com/test/repo",
            created_at="2024-01-01",
            files=["a.py", "b.py", "c.py"],
            functions={"func1": "a.py", "func2": "b.py"},
            classes={"MyClass": "c.py"},
            stats={"chunks": 10, "files": 3},
        )

        assert index.total_chunks == 10
        assert index.total_files == 3

    def test_empty_stats(self) -> None:
        """Missing stats entries default the totals to zero."""
        index = LibraryIndex(
            library_id="test",
            name="Test",
            source_url="",
            created_at="",
        )

        assert index.total_chunks == 0
        assert index.total_files == 0
|
||||||
103
xml_pipeline/librarian/__init__.py
Normal file
103
xml_pipeline/librarian/__init__.py
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
"""
|
||||||
|
Premium Librarian — RLM-powered codebase intelligence.
|
||||||
|
|
||||||
|
Ingests codebases, chunks them intelligently, stores in eXist-db,
|
||||||
|
and answers natural language queries using Online LLM + RAG.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from xml_pipeline.librarian import ingest_git_repo, query_library
|
||||||
|
|
||||||
|
# Ingest a codebase
|
||||||
|
result = await ingest_git_repo(
|
||||||
|
url="https://github.com/example/repo.git",
|
||||||
|
library_name="my-lib",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Query it
|
||||||
|
answer = await query_library(
|
||||||
|
library_id=result.library_id,
|
||||||
|
question="What does this codebase do?",
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.chunker import (
|
||||||
|
Chunk,
|
||||||
|
chunk_file,
|
||||||
|
chunk_python,
|
||||||
|
chunk_javascript,
|
||||||
|
chunk_cpp,
|
||||||
|
detect_language,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.ingest import (
|
||||||
|
IngestResult,
|
||||||
|
ingest_git_repo,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.index import (
|
||||||
|
LibraryIndex,
|
||||||
|
build_index,
|
||||||
|
get_index,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.query import (
|
||||||
|
Source,
|
||||||
|
QueryResult,
|
||||||
|
query_library,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.primitives import (
|
||||||
|
LibrarianIngest,
|
||||||
|
LibrarianIngested,
|
||||||
|
LibrarianQuery,
|
||||||
|
LibrarianAnswer,
|
||||||
|
LibrarianList,
|
||||||
|
LibrarianLibraries,
|
||||||
|
LibrarianDelete,
|
||||||
|
LibrarianDeleted,
|
||||||
|
LibrarianGetChunk,
|
||||||
|
LibrarianChunk,
|
||||||
|
LibraryInfo,
|
||||||
|
)
|
||||||
|
from xml_pipeline.librarian.handler import (
|
||||||
|
handle_librarian_ingest,
|
||||||
|
handle_librarian_query,
|
||||||
|
handle_librarian_list,
|
||||||
|
handle_librarian_delete,
|
||||||
|
handle_librarian_get_chunk,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Chunker
|
||||||
|
"Chunk",
|
||||||
|
"chunk_file",
|
||||||
|
"chunk_python",
|
||||||
|
"chunk_javascript",
|
||||||
|
"chunk_cpp",
|
||||||
|
"detect_language",
|
||||||
|
# Ingest
|
||||||
|
"IngestResult",
|
||||||
|
"ingest_git_repo",
|
||||||
|
# Index
|
||||||
|
"LibraryIndex",
|
||||||
|
"build_index",
|
||||||
|
"get_index",
|
||||||
|
# Query
|
||||||
|
"Source",
|
||||||
|
"QueryResult",
|
||||||
|
"query_library",
|
||||||
|
# Primitives
|
||||||
|
"LibrarianIngest",
|
||||||
|
"LibrarianIngested",
|
||||||
|
"LibrarianQuery",
|
||||||
|
"LibrarianAnswer",
|
||||||
|
"LibrarianList",
|
||||||
|
"LibrarianLibraries",
|
||||||
|
"LibrarianDelete",
|
||||||
|
"LibrarianDeleted",
|
||||||
|
"LibrarianGetChunk",
|
||||||
|
"LibrarianChunk",
|
||||||
|
"LibraryInfo",
|
||||||
|
# Handlers
|
||||||
|
"handle_librarian_ingest",
|
||||||
|
"handle_librarian_query",
|
||||||
|
"handle_librarian_list",
|
||||||
|
"handle_librarian_delete",
|
||||||
|
"handle_librarian_get_chunk",
|
||||||
|
]
|
||||||
677
xml_pipeline/librarian/chunker.py
Normal file
677
xml_pipeline/librarian/chunker.py
Normal file
|
|
@ -0,0 +1,677 @@
|
||||||
|
"""
|
||||||
|
chunker.py — AST-based code chunking for intelligent RAG retrieval.
|
||||||
|
|
||||||
|
Chunks source files into semantically meaningful units (functions, classes, modules)
|
||||||
|
preserving context like docstrings, signatures, and imports.
|
||||||
|
|
||||||
|
Supported languages:
|
||||||
|
- Python (ast.parse)
|
||||||
|
- JavaScript/TypeScript (regex-based)
|
||||||
|
- C++ (regex-based)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import re
|
||||||
|
import hashlib
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Chunk:
    """One semantically meaningful unit extracted from a source file.

    Carries the raw text plus enough surrounding metadata (imports,
    docstring, signature, enclosing class) for retrieval context.
    """

    content: str
    file_path: str
    start_line: int
    end_line: int
    chunk_type: str  # "function", "class", "method", "module", "block"
    name: str  # function/class name, or the file name for module chunks
    language: str
    imports: list[str] = field(default_factory=list)
    docstring: str = ""
    signature: str = ""  # callable signature, kept as retrieval context
    parent_class: str = ""  # enclosing class name when this is a method

    @property
    def chunk_id(self) -> str:
        """Stable identifier: "<file_path>:<name>:<12-hex content digest>"."""
        digest = hashlib.sha256(self.content.encode()).hexdigest()
        return ":".join((self.file_path, self.name, digest[:12]))

    @property
    def line_count(self) -> int:
        """Inclusive count of source lines spanned by this chunk."""
        return 1 + self.end_line - self.start_line
|
||||||
|
|
||||||
|
|
||||||
|
# File-extension -> language identifier lookup table.
LANGUAGE_MAP = {
    ".py": "python",
    ".pyi": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".c": "c",
    ".h": "c",
    ".cpp": "cpp",
    ".cxx": "cpp",
    ".cc": "cpp",
    ".hpp": "cpp",
    ".hxx": "cpp",
    ".rs": "rust",
    ".go": "go",
    ".java": "java",
    ".kt": "kotlin",
    ".rb": "ruby",
    ".php": "php",
    ".cs": "csharp",
    ".swift": "swift",
    ".scala": "scala",
    ".md": "markdown",
    ".rst": "restructuredtext",
    ".txt": "text",
}

# Chunks longer than this get split by the language-specific chunkers.
MAX_CHUNK_LINES = 500


def detect_language(file_path: str) -> str:
    """Return the language name for *file_path*'s extension.

    Matching is case-insensitive on the suffix; paths with no extension
    or an unrecognized one yield "unknown".
    """
    return LANGUAGE_MAP.get(Path(file_path).suffix.lower(), "unknown")
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_file(content: str, file_path: str) -> list[Chunk]:
|
||||||
|
"""
|
||||||
|
Chunk a file based on detected language.
|
||||||
|
|
||||||
|
Dispatches to language-specific chunker or falls back to
|
||||||
|
line-based chunking for unknown languages.
|
||||||
|
"""
|
||||||
|
language = detect_language(file_path)
|
||||||
|
|
||||||
|
if language == "python":
|
||||||
|
return chunk_python(content, file_path)
|
||||||
|
elif language in ("javascript", "typescript"):
|
||||||
|
return chunk_javascript(content, file_path)
|
||||||
|
elif language in ("c", "cpp"):
|
||||||
|
return chunk_cpp(content, file_path)
|
||||||
|
elif language in ("markdown", "restructuredtext", "text"):
|
||||||
|
return chunk_prose(content, file_path, language)
|
||||||
|
else:
|
||||||
|
# Generic line-based chunking
|
||||||
|
return chunk_generic(content, file_path, language)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_python(content: str, file_path: str) -> list[Chunk]:
    """Chunk Python source using the ast module.

    Each top-level function becomes one chunk and each top-level class
    becomes one or more chunks (see _extract_python_class). Every import
    found anywhere in the file is collected and attached to each chunk as
    context. Files that fail to parse degrade to generic line chunking;
    a file with no top-level definitions becomes a single "module" chunk.
    """
    try:
        tree = ast.parse(content)
    except SyntaxError:
        # Unparsable source: degrade gracefully to line-based chunks.
        return chunk_generic(content, file_path, "python")

    lines = content.splitlines()

    # Gather import statements (including nested ones) as shared context.
    imports: list[str] = []
    for n in ast.walk(tree):
        if isinstance(n, ast.Import):
            imports.extend(f"import {alias.name}" for alias in n.names)
        elif isinstance(n, ast.ImportFrom):
            names = ", ".join(alias.name for alias in n.names)
            imports.append(f"from {n.module or ''} import {names}")

    chunks: list[Chunk] = []
    for top in ast.iter_child_nodes(tree):
        if isinstance(top, (ast.FunctionDef, ast.AsyncFunctionDef)):
            chunks.append(_extract_python_function(top, lines, file_path, imports))
        elif isinstance(top, ast.ClassDef):
            chunks.extend(_extract_python_class(top, lines, file_path, imports))

    # Nothing extracted but the file is non-empty: keep it whole.
    if not chunks and content.strip():
        chunks.append(
            Chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(lines),
                chunk_type="module",
                name=Path(file_path).stem,
                language="python",
                imports=imports,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_python_function(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
    lines: list[str],
    file_path: str,
    imports: list[str],
    parent_class: str = "",
) -> Chunk:
    """Extract a Python function or method as a single Chunk.

    Args:
        node: The function's AST node.
        lines: Full file source, split into lines (1-indexed via lineno).
        file_path: Path of the source file (stored on the chunk).
        imports: Module-level import context to attach.
        parent_class: Enclosing class name, if this is a method.

    Returns:
        A Chunk of type "method" (when parent_class is set) or "function",
        carrying the source slice, docstring, and a rendered signature.
    """
    start_line = node.lineno
    end_line = node.end_lineno or start_line

    # Slice raw source; AST line numbers are 1-indexed.
    content = "\n".join(lines[start_line - 1 : end_line])

    docstring = ast.get_docstring(node) or ""

    def _fmt(arg: ast.arg) -> str:
        # Render "name: annotation", silently dropping annotations that
        # cannot be unparsed (e.g. exotic nodes on older interpreters).
        text = arg.arg
        if arg.annotation is not None:
            try:
                text += f": {ast.unparse(arg.annotation)}"
            except Exception:
                pass
        return text

    # Build the complete parameter list. The previous version only
    # rendered node.args.args, dropping positional-only parameters,
    # *args, keyword-only parameters, and **kwargs from the signature.
    a = node.args
    parts: list[str] = [_fmt(p) for p in a.posonlyargs]
    if a.posonlyargs:
        parts.append("/")
    parts.extend(_fmt(p) for p in a.args)
    if a.vararg is not None:
        parts.append(f"*{_fmt(a.vararg)}")
    elif a.kwonlyargs:
        # Bare "*" separator when there are kw-only params but no *args.
        parts.append("*")
    parts.extend(_fmt(p) for p in a.kwonlyargs)
    if a.kwarg is not None:
        parts.append(f"**{_fmt(a.kwarg)}")

    returns = ""
    if node.returns is not None:
        try:
            returns = f" -> {ast.unparse(node.returns)}"
        except Exception:
            pass

    async_prefix = "async " if isinstance(node, ast.AsyncFunctionDef) else ""
    signature = f"{async_prefix}def {node.name}({', '.join(parts)}){returns}"

    chunk_type = "method" if parent_class else "function"

    return Chunk(
        content=content,
        file_path=file_path,
        start_line=start_line,
        end_line=end_line,
        chunk_type=chunk_type,
        name=node.name,
        language="python",
        imports=imports,
        docstring=docstring,
        signature=signature,
        parent_class=parent_class,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_python_class(
    node: ast.ClassDef,
    lines: list[str],
    file_path: str,
    imports: list[str],
) -> list[Chunk]:
    """Extract a Python class and its methods as chunks.

    Small classes (<= MAX_CHUNK_LINES source lines) become one "class"
    chunk. Larger classes become a "class" header chunk plus one
    "method" chunk per direct method (via _extract_python_function).
    """
    chunks: list[Chunk] = []

    start_line = node.lineno
    end_line = node.end_lineno or start_line

    # Full class source; AST line numbers are 1-indexed.
    class_lines = lines[start_line - 1 : end_line]
    class_content = "\n".join(class_lines)

    # Class docstring (empty string when absent).
    docstring = ast.get_docstring(node) or ""

    # Build class signature with base classes; unparse failures are
    # silently skipped so one odd base doesn't lose the whole signature.
    bases = []
    for base in node.bases:
        try:
            bases.append(ast.unparse(base))
        except Exception:
            pass

    base_str = f"({', '.join(bases)})" if bases else ""
    signature = f"class {node.name}{base_str}"

    # If class is small enough, keep as single chunk
    if len(class_lines) <= MAX_CHUNK_LINES:
        chunks.append(
            Chunk(
                content=class_content,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                chunk_type="class",
                name=node.name,
                language="python",
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )
    else:
        # Large class: chunk into class header + individual methods.
        # Default header span is ~50 lines; shortened to end just before
        # the first method when one exists.
        header_end = start_line + min(50, len(class_lines) - 1)

        for child in node.body:
            if isinstance(child, ast.FunctionDef | ast.AsyncFunctionDef):
                header_end = child.lineno - 1
                break

        # Header chunk: class statement, docstring, class-level attributes.
        header_lines = lines[start_line - 1 : header_end]
        chunks.append(
            Chunk(
                content="\n".join(header_lines),
                file_path=file_path,
                start_line=start_line,
                end_line=header_end,
                chunk_type="class",
                name=node.name,
                language="python",
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )

        # One chunk per direct method; parent_class marks them as methods.
        # NOTE(review): non-method statements between methods (nested
        # classes, assignments) are not captured by any chunk here.
        for child in node.body:
            if isinstance(child, ast.FunctionDef | ast.AsyncFunctionDef):
                method_chunk = _extract_python_function(
                    child, lines, file_path, imports, parent_class=node.name
                )
                chunks.append(method_chunk)

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_javascript(content: str, file_path: str) -> list[Chunk]:
    """
    Regex-based JavaScript/TypeScript chunking.

    Extracts:
    - Function declarations
    - Arrow functions assigned to const/let/var
    - Class definitions

    Each definition's chunk runs from its own first line to the line
    before the next definition (so nested/local helpers are folded into
    the enclosing chunk). A preceding JSDoc block, when found, is stored
    as the chunk's docstring.
    """
    chunks: list[Chunk] = []
    lines = content.splitlines()
    language = detect_language(file_path)

    # Collect ES-module imports and CommonJS require() assignments
    # (line-anchored via MULTILINE) as shared context for every chunk.
    imports: list[str] = []
    import_pattern = re.compile(
        r'^(?:import\s+.*?from\s+[\'"].*?[\'"]|import\s+[\'"].*?[\'"]|'
        r'const\s+\w+\s*=\s*require\([\'"].*?[\'"]\))',
        re.MULTILINE,
    )
    for match in import_pattern.finditer(content):
        imports.append(match.group(0))

    # Function pattern: function name(...) or async function name(...),
    # optionally exported. Anchored at line start, so indented (nested or
    # method) functions are intentionally not matched.
    func_pattern = re.compile(
        r"^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\([^)]*\)",
        re.MULTILINE,
    )

    # Arrow function pattern: const name = (...) => or const name = async (...) =>
    # NOTE(review): requires parenthesized params — `const f = x => ...`
    # (single bare parameter) is not matched.
    arrow_pattern = re.compile(
        r"^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>",
        re.MULTILINE,
    )

    # Class pattern (optionally export/default).
    class_pattern = re.compile(
        r"^(?:export\s+)?(?:default\s+)?class\s+(\w+)",
        re.MULTILINE,
    )

    # Find all definitions and their positions
    definitions: list[tuple[int, str, str, str]] = []  # (line, type, name, signature)

    for match in func_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "function", match.group(1), match.group(0)))

    for match in arrow_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "function", match.group(1), match.group(0)))

    for match in class_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "class", match.group(1), match.group(0)))

    # Sort by line number so chunks appear in source order.
    definitions.sort(key=lambda x: x[0])

    # Create chunks
    for i, (start_line, chunk_type, name, signature) in enumerate(definitions):
        # End line is start of next definition - 1, or end of file
        if i + 1 < len(definitions):
            end_line = definitions[i + 1][0] - 1
        else:
            end_line = len(lines)

        # Trim trailing empty lines
        while end_line > start_line and not lines[end_line - 1].strip():
            end_line -= 1

        chunk_lines = lines[start_line - 1 : end_line]
        chunk_content = "\n".join(chunk_lines)

        # Extract JSDoc comment if present: scan backwards (up to ~20
        # lines) from the line just above the definition for the "/**"
        # opener, prepending lines as we go to preserve order.
        docstring = ""
        if start_line > 1:
            prev_line = lines[start_line - 2].strip()
            if prev_line.endswith("*/"):
                # Look back for JSDoc start
                doc_lines = []
                for j in range(start_line - 2, max(0, start_line - 20), -1):
                    doc_lines.insert(0, lines[j])
                    if "/**" in lines[j]:
                        break
                docstring = "\n".join(doc_lines)

        chunks.append(
            Chunk(
                content=chunk_content,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                chunk_type=chunk_type,
                name=name,
                language=language,
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )

    # If no definitions were found, fall back to one whole-file chunk.
    if not chunks and content.strip():
        chunks.append(
            Chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(lines),
                chunk_type="module",
                name=Path(file_path).stem,
                language=language,
                imports=imports,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_cpp(content: str, file_path: str) -> list[Chunk]:
    """
    Regex-based C/C++ chunking.

    Extracts:
    - Function definitions
    - Class definitions
    - Struct definitions

    Function chunks are tightened to their closing brace when a matched
    brace pair can be found; otherwise a chunk runs to the line before
    the next detected definition. A preceding Doxygen comment block is
    stored as the chunk's docstring.
    """
    chunks: list[Chunk] = []
    lines = content.splitlines()
    language = detect_language(file_path)

    # Collect #include directives as shared context for every chunk.
    imports: list[str] = []
    include_pattern = re.compile(r'^#include\s+[<"].*?[>"]', re.MULTILINE)
    for match in include_pattern.finditer(content):
        imports.append(match.group(0))

    # Function pattern (simplified): return_type name(params) {
    # This is a simplified pattern that won't catch all cases
    # (e.g. multi-line parameter lists, operators, pointer returns
    # with unusual spacing).
    func_pattern = re.compile(
        r"^(?:(?:static|inline|virtual|explicit|constexpr|template\s*<[^>]*>\s*)*"
        r"(?:\w+(?:::\w+)*\s+)+)"  # Return type
        r"(\w+)\s*\([^)]*\)\s*(?:const\s*)?(?:override\s*)?(?:noexcept\s*)?[{;]",
        re.MULTILINE,
    )

    # Class/struct pattern, with optional leading template<...>.
    class_pattern = re.compile(
        r"^(?:template\s*<[^>]*>\s*)?(?:class|struct)\s+(\w+)",
        re.MULTILINE,
    )

    # (line, type, name, signature) tuples, later sorted by line.
    definitions: list[tuple[int, str, str, str]] = []

    for match in func_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        name = match.group(1)
        # Skip common false positives: control-flow keywords that the
        # loose function regex can mistake for a function name.
        if name not in ("if", "while", "for", "switch", "return"):
            definitions.append((line_num, "function", name, match.group(0).strip()))

    for match in class_pattern.finditer(content):
        line_num = content[: match.start()].count("\n") + 1
        definitions.append((line_num, "class", match.group(1), match.group(0)))

    definitions.sort(key=lambda x: x[0])

    # Create chunks (similar to JS)
    for i, (start_line, chunk_type, name, signature) in enumerate(definitions):
        if i + 1 < len(definitions):
            end_line = definitions[i + 1][0] - 1
        else:
            end_line = len(lines)

        # Trim trailing empty lines.
        while end_line > start_line and not lines[end_line - 1].strip():
            end_line -= 1

        # For functions, try to find matching brace so the chunk ends at
        # the function body's closing "}" rather than the next definition.
        # NOTE(review): counts braces inside string literals and comments
        # too, so unbalanced braces in those can skew the end line.
        if chunk_type == "function":
            brace_count = 0
            found_open = False
            for j in range(start_line - 1, min(end_line, len(lines))):
                for char in lines[j]:
                    if char == "{":
                        brace_count += 1
                        found_open = True
                    elif char == "}":
                        brace_count -= 1
                        if found_open and brace_count == 0:
                            end_line = j + 1
                            break
                if found_open and brace_count == 0:
                    break

        chunk_lines = lines[start_line - 1 : end_line]
        chunk_content = "\n".join(chunk_lines)

        # Extract Doxygen comment if present: scan backwards (up to ~30
        # lines) from the line above the definition for "/**" or "/*!".
        docstring = ""
        if start_line > 1:
            prev_line = lines[start_line - 2].strip()
            if prev_line.endswith("*/"):
                doc_lines = []
                for j in range(start_line - 2, max(0, start_line - 30), -1):
                    doc_lines.insert(0, lines[j])
                    if "/**" in lines[j] or "/*!" in lines[j]:
                        break
                docstring = "\n".join(doc_lines)

        chunks.append(
            Chunk(
                content=chunk_content,
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                chunk_type=chunk_type,
                name=name,
                language=language,
                imports=imports,
                docstring=docstring,
                signature=signature,
            )
        )

    # If no definitions were found, fall back to one whole-file chunk.
    if not chunks and content.strip():
        chunks.append(
            Chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(lines),
                chunk_type="module",
                name=Path(file_path).stem,
                language=language,
                imports=imports,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_prose(content: str, file_path: str, language: str) -> list[Chunk]:
    """
    Chunk prose documents (Markdown, RST, plain text).

    Splits on headings/sections, keeping chunks under MAX_CHUNK_LINES.
    A new chunk starts at each heading line (or when the current chunk
    reaches the size cap); chunk names come from the heading text.
    """
    chunks: list[Chunk] = []
    lines = content.splitlines()

    # Heading detection: "#"-style ATX headings for Markdown; otherwise a
    # line made entirely of =, -, or ~ (RST/plain-text underline style).
    # NOTE(review): the underline pattern also matches horizontal rules
    # and table borders, which will start spurious sections.
    if language == "markdown":
        heading_pattern = re.compile(r"^#{1,6}\s+(.+)$")
    else:
        heading_pattern = re.compile(r"^[=\-~]+$")  # RST underline headings

    current_chunk_lines: list[str] = []
    current_start = 1
    current_name = Path(file_path).stem

    for i, line in enumerate(lines, 1):
        match = heading_pattern.match(line)

        # New section or chunk too large
        if match or len(current_chunk_lines) >= MAX_CHUNK_LINES:
            # Save current chunk if non-empty
            if current_chunk_lines:
                chunks.append(
                    Chunk(
                        content="\n".join(current_chunk_lines),
                        file_path=file_path,
                        start_line=current_start,
                        end_line=i - 1,
                        chunk_type="section",
                        name=current_name,
                        language=language,
                    )
                )

            # Start new chunk at the heading (or overflow) line.
            current_chunk_lines = [line]
            current_start = i
            # Section name: the heading text for Markdown; for RST the
            # title is the line ABOVE the underline (lines[i - 2]).
            # NOTE(review): that RST title line was already appended to
            # the previous chunk, so the new section's content starts at
            # the underline — confirm this is intended.
            if match:
                current_name = match.group(1) if language == "markdown" else lines[i - 2] if i > 1 else current_name
        else:
            current_chunk_lines.append(line)

    # Save final chunk
    if current_chunk_lines:
        chunks.append(
            Chunk(
                content="\n".join(current_chunk_lines),
                file_path=file_path,
                start_line=current_start,
                end_line=len(lines),
                chunk_type="section",
                name=current_name,
                language=language,
            )
        )

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_generic(content: str, file_path: str, language: str) -> list[Chunk]:
    """Fallback line-based chunker for languages without a dedicated parser.

    Accumulates lines and emits a "block" chunk whenever MAX_CHUNK_LINES
    is reached, preferring to cut at the last blank line within the final
    50 accumulated lines so blocks tend to break between paragraphs.
    """
    lines = content.splitlines()
    if not lines:
        return []

    chunks: list[Chunk] = []
    buffer: list[str] = []
    chunk_start = 1
    stem = Path(file_path).stem

    for line in lines:
        buffer.append(line)
        if len(buffer) < MAX_CHUNK_LINES:
            continue

        # Prefer splitting on a blank line near the end of the buffer;
        # otherwise cut at the size cap exactly.
        cut = len(buffer)
        for idx in range(len(buffer) - 1, max(0, len(buffer) - 50), -1):
            if not buffer[idx].strip():
                cut = idx
                break

        chunks.append(
            Chunk(
                content="\n".join(buffer[:cut]),
                file_path=file_path,
                start_line=chunk_start,
                end_line=chunk_start + cut - 1,
                chunk_type="block",
                name=f"{stem}:{chunk_start}",
                language=language,
            )
        )

        # Carry the tail (from the cut point) into the next chunk.
        buffer = buffer[cut:]
        chunk_start += cut

    # Flush whatever remains as the final chunk.
    if buffer:
        chunks.append(
            Chunk(
                content="\n".join(buffer),
                file_path=file_path,
                start_line=chunk_start,
                end_line=len(lines),
                chunk_type="block",
                name=f"{stem}:{chunk_start}",
                language=language,
            )
        )

    return chunks
|
||||||
246
xml_pipeline/librarian/handler.py
Normal file
246
xml_pipeline/librarian/handler.py
Normal file
|
|
@ -0,0 +1,246 @@
|
||||||
|
"""
|
||||||
|
handler.py — Message handlers for Premium Librarian.
|
||||||
|
|
||||||
|
These handlers process librarian requests through the organism's message bus.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
from xml_pipeline.message_bus.message_state import HandlerMetadata, HandlerResponse
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.primitives import (
|
||||||
|
LibrarianIngest,
|
||||||
|
LibrarianIngested,
|
||||||
|
LibrarianQuery,
|
||||||
|
LibrarianAnswer,
|
||||||
|
LibrarianList,
|
||||||
|
LibrarianLibraries,
|
||||||
|
LibrarianDelete,
|
||||||
|
LibrarianDeleted,
|
||||||
|
LibrarianGetChunk,
|
||||||
|
LibrarianChunk,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_ingest(
    payload: LibrarianIngest,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a codebase ingestion request.

    Clones the git repository, chunks all files, and stores in eXist-db.
    On failure, responds with an empty LibrarianIngested whose `errors`
    field carries the exception text.
    """
    from xml_pipeline.librarian.ingest import ingest_git_repo

    logger.info(f"Ingesting codebase from {payload.git_url}")

    try:
        result = await ingest_git_repo(
            url=payload.git_url,
            branch=payload.branch,
            library_name=payload.library_name,
        )
        response = LibrarianIngested(
            library_id=result.library_id,
            library_name=result.library_name,
            files_processed=result.files_processed,
            chunks_created=result.chunks_created,
            index_built=result.index_built,
            errors="\n".join(result.errors) if result.errors else "",
        )
    except Exception as e:
        logger.error(f"Ingest failed: {e}")
        # Error path: empty result fields, exception text in `errors`.
        response = LibrarianIngested(
            library_id="",
            library_name=payload.library_name or "",
            files_processed=0,
            chunks_created=0,
            index_built=False,
            errors=str(e),
        )

    return HandlerResponse.respond(payload=response)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_query(
    payload: LibrarianQuery,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a library query request.

    Searches for relevant code chunks and synthesizes an answer using an
    LLM. On failure, responds with an empty LibrarianAnswer whose `error`
    field carries the exception text.
    """
    from xml_pipeline.librarian.query import query_library, format_sources_xml

    logger.info(f"Querying library {payload.library_id}: {payload.question[:100]}...")

    try:
        result = await query_library(
            library_id=payload.library_id,
            question=payload.question,
            max_chunks=payload.max_chunks,
            model=payload.model,
        )
        answer = LibrarianAnswer(
            answer=result.answer,
            sources=format_sources_xml(result.sources) if result.sources else "",
            tokens_used=result.tokens_used,
            chunks_examined=result.chunks_examined,
            error=result.error,
        )
    except Exception as e:
        logger.error(f"Query failed: {e}")
        answer = LibrarianAnswer(
            answer="",
            sources="",
            tokens_used=0,
            chunks_examined=0,
            error=str(e),
        )

    return HandlerResponse.respond(payload=answer)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_list(
    payload: LibrarianList,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a request to list all ingested libraries.

    Responds with a LibrarianLibraries payload whose `libraries` field is
    an XML <libraries> document (one <library> element per ingested
    library). On failure, an empty document with count 0 is returned.
    """
    from xml_pipeline.librarian.index import list_libraries

    logger.info("Listing all libraries")

    try:
        libraries = await list_libraries()

        # Format each library as a <library> element.
        lib_items = []
        for lib in libraries:
            lib_items.append(
                f"""  <library>
    <library-id>{xml_escape(lib.library_id)}</library-id>
    <name>{xml_escape(lib.name)}</name>
    <source-url>{xml_escape(lib.source_url)}</source-url>
    <created-at>{xml_escape(lib.created_at)}</created-at>
    <total-files>{lib.total_files}</total-files>
    <total-chunks>{lib.total_chunks}</total-chunks>
  </library>"""
            )

        # Fix: with zero libraries, joining an empty list previously
        # produced "<libraries>\n\n</libraries>". Emit the same empty
        # form the error path uses, for consistent output.
        if lib_items:
            libraries_xml = "<libraries>\n" + "\n".join(lib_items) + "\n</libraries>"
        else:
            libraries_xml = "<libraries></libraries>"

        return HandlerResponse.respond(
            payload=LibrarianLibraries(
                count=len(libraries),
                libraries=libraries_xml,
            )
        )

    except Exception as e:
        logger.error(f"List failed: {e}")
        # Error path: empty document, zero count.
        return HandlerResponse.respond(
            payload=LibrarianLibraries(
                count=0,
                libraries="<libraries></libraries>",
            )
        )
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_delete(
    payload: LibrarianDelete,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a request to delete a library.

    Responds with a LibrarianDeleted payload; `error` is empty on
    success, a short message when deletion failed, or the exception
    text when the call itself raised.
    """
    from xml_pipeline.librarian.index import delete_library

    logger.info(f"Deleting library {payload.library_id}")

    try:
        success = await delete_library(payload.library_id)
        outcome = LibrarianDeleted(
            library_id=payload.library_id,
            success=success,
            error="" if success else "Delete operation failed",
        )
    except Exception as e:
        logger.error(f"Delete failed: {e}")
        outcome = LibrarianDeleted(
            library_id=payload.library_id,
            success=False,
            error=str(e),
        )

    return HandlerResponse.respond(payload=outcome)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_librarian_get_chunk(
    payload: LibrarianGetChunk,
    metadata: HandlerMetadata,
) -> HandlerResponse:
    """
    Handle a request to retrieve a specific code chunk.

    Responds with the full chunk (content, docstring, signature, span)
    or, when the chunk is missing or the lookup raised, a LibrarianChunk
    carrying only the requested id and an error message.
    """
    from xml_pipeline.librarian.query import get_chunk_by_id

    logger.info(f"Getting chunk {payload.chunk_id} from library {payload.library_id}")

    try:
        chunk = await get_chunk_by_id(payload.library_id, payload.chunk_id)
        if chunk is None:
            result = LibrarianChunk(
                chunk_id=payload.chunk_id,
                error=f"Chunk not found: {payload.chunk_id}",
            )
        else:
            result = LibrarianChunk(
                chunk_id=chunk.chunk_id,
                file_path=chunk.file_path,
                name=chunk.name,
                chunk_type=chunk.chunk_type,
                language=chunk.language,
                start_line=chunk.start_line,
                end_line=chunk.end_line,
                content=chunk.content,
                docstring=chunk.docstring,
                signature=chunk.signature,
                error="",
            )
    except Exception as e:
        logger.error(f"Get chunk failed: {e}")
        result = LibrarianChunk(
            chunk_id=payload.chunk_id,
            error=str(e),
        )

    return HandlerResponse.respond(payload=result)
|
||||||
328
xml_pipeline/librarian/index.py
Normal file
328
xml_pipeline/librarian/index.py
Normal file
|
|
@ -0,0 +1,328 @@
|
||||||
|
"""
|
||||||
|
index.py — Library index management for Premium Librarian.
|
||||||
|
|
||||||
|
Builds and queries structural indices for ingested codebases.
|
||||||
|
The index provides fast lookup of files, functions, and classes
|
||||||
|
without needing full-text search.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LibraryIndex:
    """Structural index for an ingested library.

    Summarizes what a library contains (files, top-level functions and
    classes, modules) so lookups don't need full-text search. Instances
    round-trip through XML via _index_to_xml / _parse_index_xml.
    """

    library_id: str  # unique identifier of the library
    name: str  # human-readable library name
    source_url: str  # URL the library was ingested from
    created_at: str  # creation timestamp (string; format set at build time)
    files: list[str] = field(default_factory=list)  # file paths in the library
    functions: dict[str, str] = field(default_factory=dict)  # function name → file path
    classes: dict[str, str] = field(default_factory=dict)  # class name → file path
    modules: list[str] = field(default_factory=list)  # module names
    stats: dict[str, int] = field(default_factory=dict)  # counters, e.g. "chunks"

    @property
    def total_chunks(self) -> int:
        """Total number of chunks in this library (0 if uncounted)."""
        return self.stats.get("chunks", 0)

    @property
    def total_files(self) -> int:
        """Total number of files in this library."""
        return len(self.files)
|
||||||
|
|
||||||
|
|
||||||
|
def _index_to_xml(index: LibraryIndex) -> str:
|
||||||
|
"""Convert index to XML document for storage."""
|
||||||
|
files_xml = "\n".join(f" <file>{xml_escape(f)}</file>" for f in index.files)
|
||||||
|
|
||||||
|
functions_xml = "\n".join(
|
||||||
|
f' <function name="{xml_escape(name)}" file="{xml_escape(path)}"/>'
|
||||||
|
for name, path in index.functions.items()
|
||||||
|
)
|
||||||
|
|
||||||
|
classes_xml = "\n".join(
|
||||||
|
f' <class name="{xml_escape(name)}" file="{xml_escape(path)}"/>'
|
||||||
|
for name, path in index.classes.items()
|
||||||
|
)
|
||||||
|
|
||||||
|
modules_xml = "\n".join(f" <module>{xml_escape(m)}</module>" for m in index.modules)
|
||||||
|
|
||||||
|
stats_xml = "\n".join(
|
||||||
|
f' <stat name="{xml_escape(k)}">{v}</stat>'
|
||||||
|
for k, v in index.stats.items()
|
||||||
|
)
|
||||||
|
|
||||||
|
return f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<library-index xmlns="https://xml-pipeline.org/ns/librarian/v1">
|
||||||
|
<library-id>{xml_escape(index.library_id)}</library-id>
|
||||||
|
<name>{xml_escape(index.name)}</name>
|
||||||
|
<source-url>{xml_escape(index.source_url)}</source-url>
|
||||||
|
<created-at>{xml_escape(index.created_at)}</created-at>
|
||||||
|
<files>
|
||||||
|
{files_xml}
|
||||||
|
</files>
|
||||||
|
<functions>
|
||||||
|
{functions_xml}
|
||||||
|
</functions>
|
||||||
|
<classes>
|
||||||
|
{classes_xml}
|
||||||
|
</classes>
|
||||||
|
<modules>
|
||||||
|
{modules_xml}
|
||||||
|
</modules>
|
||||||
|
<stats>
|
||||||
|
{stats_xml}
|
||||||
|
</stats>
|
||||||
|
</library-index>"""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_index_xml(xml_content: str) -> Optional[LibraryIndex]:
    """Rehydrate a LibraryIndex from its XML storage document.

    Any failure (lxml unavailable, malformed XML, bad stat values)
    is logged and reported as None.
    """
    try:
        from lxml import etree

        root = etree.fromstring(xml_content.encode())
        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}

        def text_of(path: str) -> str:
            # Element text, defaulting to "" when the element is absent.
            return root.findtext(path, "", namespaces=ns)

        return LibraryIndex(
            library_id=text_of("l:library-id"),
            name=text_of("l:name"),
            source_url=text_of("l:source-url"),
            created_at=text_of("l:created-at"),
            files=[
                el.text or ""
                for el in root.findall("l:files/l:file", namespaces=ns)
            ],
            functions={
                el.get("name", ""): el.get("file", "")
                for el in root.findall("l:functions/l:function", namespaces=ns)
            },
            classes={
                el.get("name", ""): el.get("file", "")
                for el in root.findall("l:classes/l:class", namespaces=ns)
            },
            modules=[
                el.text or ""
                for el in root.findall("l:modules/l:module", namespaces=ns)
            ],
            stats={
                el.get("name", ""): int(el.text or 0)
                for el in root.findall("l:stats/l:stat", namespaces=ns)
            },
        )

    except Exception as e:
        logger.error(f"Failed to parse index XML: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def build_index(
    library_id: str,
    library_name: str,
    source_url: str,
) -> LibraryIndex:
    """
    Build structural index from stored chunks.

    Queries eXist-db for all chunks belonging to this library and
    extracts structural information (files, functions, classes, modules,
    per-language chunk counts), then persists the result as
    ``index.xml`` in the library's collection.

    Args:
        library_id: Unique library identifier (used in the collection path).
        library_name: Human-readable library name.
        source_url: Origin of the library (git URL or local path).

    Returns:
        The freshly built LibraryIndex. If the chunk query fails, a
        minimal index (identity fields only) is returned; a failed store
        of the index document is logged but does not raise.
    """
    from xml_pipeline.tools.librarian import librarian_query, librarian_store

    # Query for all chunks in this library; only the structural fields
    # are projected, not chunk content.
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    return <item>
        <file>{{$chunk/l:file-path/text()}}</file>
        <type>{{$chunk/l:chunk-type/text()}}</type>
        <name>{{$chunk/l:name/text()}}</name>
        <language>{{$chunk/l:language/text()}}</language>
    </item>
    """

    result = await librarian_query(query=xquery, collection=f"/db/librarian/{library_id}")

    if not result.success:
        logger.warning(f"Failed to query chunks for index: {result.error}")
        # Create minimal index so callers still get a usable object.
        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
        )
    else:
        # Parse results into the structural maps.
        files: set[str] = set()
        functions: dict[str, str] = {}
        classes: dict[str, str] = {}
        modules: list[str] = []
        lang_stats: dict[str, int] = {}
        chunk_count = 0

        try:
            from lxml import etree

            # Wrap results in root element for parsing (the query returns
            # a bare sequence of <item> elements).
            xml_str = f"<results>{result.data.get('results', '')}</results>"
            root = etree.fromstring(xml_str.encode())

            for item in root.findall("item"):
                chunk_count += 1
                file_path = item.findtext("file", "")
                chunk_type = item.findtext("type", "")
                name = item.findtext("name", "")
                language = item.findtext("language", "")

                if file_path:
                    files.add(file_path)

                # NOTE: duplicate function/class names across files collapse
                # to the last-seen file, since the maps are keyed by name.
                if chunk_type == "function" or chunk_type == "method":
                    functions[name] = file_path
                elif chunk_type == "class":
                    classes[name] = file_path
                elif chunk_type == "module":
                    modules.append(file_path)

                if language:
                    lang_stats[language] = lang_stats.get(language, 0) + 1

        except Exception as e:
            # Parsing failure degrades to whatever was collected so far.
            logger.warning(f"Failed to parse chunk query results: {e}")

        index = LibraryIndex(
            library_id=library_id,
            name=library_name,
            source_url=source_url,
            created_at=datetime.now(timezone.utc).isoformat(),
            files=sorted(files),
            functions=functions,
            classes=classes,
            modules=modules,
            stats={
                "chunks": chunk_count,
                "files": len(files),
                "functions": len(functions),
                "classes": len(classes),
                **{f"lang_{k}": v for k, v in lang_stats.items()},
            },
        )

    # Store index document alongside the chunks collection.
    index_xml = _index_to_xml(index)
    store_result = await librarian_store(
        collection=f"/db/librarian/{library_id}",
        document_name="index.xml",
        content=index_xml,
    )

    if not store_result.success:
        logger.warning(f"Failed to store index: {store_result.error}")

    return index
|
||||||
|
|
||||||
|
|
||||||
|
async def get_index(library_id: str) -> Optional[LibraryIndex]:
    """
    Retrieve library index from eXist-db.

    Fetches ``/db/librarian/<library_id>/index.xml`` and parses it.
    Returns None when the document is missing or unparseable.
    """
    from xml_pipeline.tools.librarian import librarian_get

    document_path = f"/db/librarian/{library_id}/index.xml"
    fetched = await librarian_get(document_path)
    if fetched.success:
        return _parse_index_xml(fetched.data.get("content", ""))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
async def list_libraries() -> list[LibraryIndex]:
    """
    List all ingested libraries.

    Collects every stored ``<library-index>`` document under
    ``/db/librarian`` and parses each into a LibraryIndex. Query or
    parse failures are logged and yield an empty (or partial) list.
    """
    from xml_pipeline.tools.librarian import librarian_query

    xquery = """
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    for $index in collection("/db/librarian")//l:library-index
    return $index
    """

    result = await librarian_query(query=xquery, collection="/db/librarian")
    if not result.success:
        logger.warning(f"Failed to list libraries: {result.error}")
        return []

    libraries: list[LibraryIndex] = []

    try:
        from lxml import etree

        raw = result.data.get("results", "")
        if raw.strip():
            # The query returns a bare sequence of documents; wrap them so
            # lxml sees a single root element.
            root = etree.fromstring(f"<results>{raw}</results>".encode())
            tag = "{https://xml-pipeline.org/ns/librarian/v1}library-index"
            for element in root.findall(tag):
                parsed = _parse_index_xml(etree.tostring(element, encoding="unicode"))
                if parsed:
                    libraries.append(parsed)

    except Exception as e:
        logger.warning(f"Failed to parse library list: {e}")

    return libraries
|
||||||
|
|
||||||
|
|
||||||
|
async def delete_library(library_id: str) -> bool:
    """
    Delete a library and all its chunks from eXist-db.

    Drops the whole ``/db/librarian/<library_id>`` collection (chunks
    and index together). Returns True on success; logs and returns
    False otherwise.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Delete the entire collection in a single xmldb call.
    xquery = f"""
    xmldb:remove("/db/librarian/{library_id}")
    """

    result = await librarian_query(query=xquery)
    if result.success:
        return True

    logger.warning(f"Failed to delete library {library_id}: {result.error}")
    return False
|
||||||
393
xml_pipeline/librarian/ingest.py
Normal file
393
xml_pipeline/librarian/ingest.py
Normal file
|
|
@ -0,0 +1,393 @@
|
||||||
|
"""
|
||||||
|
ingest.py — Codebase ingestion for Premium Librarian.
|
||||||
|
|
||||||
|
Clones git repositories, walks files, chunks them, and stores in eXist-db.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import AsyncIterator, Optional
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
from xml_pipeline.librarian.chunker import Chunk, chunk_file, detect_language
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# File patterns to skip during ingestion.
# Matched against individual path components (see _should_skip_path),
# so e.g. "node_modules" anywhere in the path excludes the file.
SKIP_PATTERNS = {
    # Version control
    ".git",
    ".svn",
    ".hg",
    # Dependencies
    "node_modules",
    "vendor",
    ".venv",
    "venv",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    # Build artifacts
    "dist",
    "build",
    "target",
    "out",
    ".next",
    # IDE
    ".idea",
    ".vscode",
    # OS
    ".DS_Store",
    "Thumbs.db",
}

# File extensions to process (lower-cased suffix match; see
# _should_process_file for the extensionless special cases).
CODE_EXTENSIONS = {
    ".py", ".pyi",  # Python
    ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",  # JavaScript/TypeScript
    ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx",  # C/C++
    ".rs",  # Rust
    ".go",  # Go
    ".java",  # Java
    ".kt", ".kts",  # Kotlin
    ".rb",  # Ruby
    ".php",  # PHP
    ".cs",  # C#
    ".swift",  # Swift
    ".scala",  # Scala
    ".md", ".rst", ".txt",  # Documentation
    ".yaml", ".yml", ".toml", ".json",  # Config
    ".xml", ".xsd",  # XML
    ".sql",  # SQL
    ".sh", ".bash", ".zsh",  # Shell
    ".dockerfile", ".containerfile",  # Docker
}

# Max file size to process (1MB); larger files are skipped entirely.
MAX_FILE_SIZE = 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IngestResult:
    """Result of a codebase ingestion."""

    library_id: str  # unique id (name + random suffix) assigned to this ingestion
    library_name: str  # human-readable name
    files_processed: int  # files successfully read and chunked
    chunks_created: int  # chunks successfully stored in eXist-db
    index_built: bool  # whether the structural index was built after ingestion
    errors: list[str] = field(default_factory=list)  # per-file / per-chunk error messages
    stats: dict[str, int] = field(default_factory=dict)  # language -> file count
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IngestConfig:
    """Configuration for ingestion."""

    branch: str = "main"  # git branch to clone
    max_file_size: int = MAX_FILE_SIZE  # skip files larger than this (bytes)
    # Copies so per-ingest mutation never touches the module-level defaults.
    skip_patterns: set[str] = field(default_factory=lambda: SKIP_PATTERNS.copy())
    extensions: set[str] = field(default_factory=lambda: CODE_EXTENSIONS.copy())
|
||||||
|
|
||||||
|
|
||||||
|
def _should_skip_path(path: Path, config: IngestConfig) -> bool:
    """Return True when any component of *path* marks it as skippable.

    A component is skippable when it appears in ``config.skip_patterns``,
    or when it is hidden (leading dot) and not one of the allowed
    dot-directories (.github / .gitlab).
    """
    allowed_hidden = {".github", ".gitlab"}
    return any(
        part in config.skip_patterns
        or (part.startswith(".") and part not in allowed_hidden)
        for part in path.parts
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _should_process_file(path: Path, config: IngestConfig) -> bool:
    """Return True when *path* is a file worth chunking.

    A file qualifies when its (lower-cased) suffix is in
    ``config.extensions`` — or its name is a well-known extensionless
    file like Makefile — and its size does not exceed
    ``config.max_file_size``. Files that cannot be stat'ed are rejected.
    """
    if path.suffix.lower() not in config.extensions:
        # Extensionless files we still want (Dockerfile, Makefile, ...).
        if path.name.lower() not in {"dockerfile", "makefile", "rakefile", "gemfile"}:
            return False

    try:
        size = path.stat().st_size
    except OSError:
        return False

    return size <= config.max_file_size
|
||||||
|
|
||||||
|
|
||||||
|
async def _clone_repo(url: str, branch: str, target_dir: Path) -> None:
    """Shallow-clone *url* at *branch* into *target_dir*.

    Prefers GitPython when installed; otherwise falls back to the
    ``git`` CLI via an async subprocess.

    Raises:
        RuntimeError: if the CLI clone exits non-zero.
        Exception: whatever GitPython raises on a failed clone.
    """
    try:
        from git import Repo
    except ImportError:
        Repo = None

    if Repo is not None:
        logger.info(f"Cloning {url} (branch: {branch}) to {target_dir}")
        # Repo.clone_from is a blocking call; run it in a worker thread so
        # the event loop stays responsive during large clones.
        await asyncio.to_thread(Repo.clone_from, url, target_dir, branch=branch, depth=1)
    else:
        # Fall back to git CLI.
        logger.info("GitPython not available, using git CLI")
        proc = await asyncio.create_subprocess_exec(
            "git", "clone", "--depth", "1", "--branch", branch, url, str(target_dir),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            raise RuntimeError(f"git clone failed: {stderr.decode()}")
|
||||||
|
|
||||||
|
|
||||||
|
async def _walk_files(root: Path, config: IngestConfig) -> AsyncIterator[Path]:
    """Yield files under *root* that pass the skip and process filters.

    Skip rules are applied to the path relative to *root*, so patterns
    match repository-internal directories rather than the (temporary)
    checkout location.
    """
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        relative = candidate.relative_to(root)
        if _should_skip_path(relative, config):
            continue
        if _should_process_file(candidate, config):
            yield candidate
|
||||||
|
|
||||||
|
|
||||||
|
def _chunk_to_xml(chunk: Chunk, library_id: str) -> str:
|
||||||
|
"""Convert a chunk to XML document for storage."""
|
||||||
|
# Escape content for XML
|
||||||
|
content_escaped = xml_escape(chunk.content)
|
||||||
|
docstring_escaped = xml_escape(chunk.docstring) if chunk.docstring else ""
|
||||||
|
signature_escaped = xml_escape(chunk.signature) if chunk.signature else ""
|
||||||
|
|
||||||
|
imports_xml = "\n".join(f" <import>{xml_escape(imp)}</import>" for imp in chunk.imports)
|
||||||
|
|
||||||
|
return f"""<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<chunk xmlns="https://xml-pipeline.org/ns/librarian/v1">
|
||||||
|
<id>{xml_escape(chunk.chunk_id)}</id>
|
||||||
|
<library-id>{xml_escape(library_id)}</library-id>
|
||||||
|
<file-path>{xml_escape(chunk.file_path)}</file-path>
|
||||||
|
<start-line>{chunk.start_line}</start-line>
|
||||||
|
<end-line>{chunk.end_line}</end-line>
|
||||||
|
<chunk-type>{xml_escape(chunk.chunk_type)}</chunk-type>
|
||||||
|
<name>{xml_escape(chunk.name)}</name>
|
||||||
|
<language>{xml_escape(chunk.language)}</language>
|
||||||
|
<parent-class>{xml_escape(chunk.parent_class)}</parent-class>
|
||||||
|
<signature>{signature_escaped}</signature>
|
||||||
|
<docstring>{docstring_escaped}</docstring>
|
||||||
|
<imports>
|
||||||
|
{imports_xml}
|
||||||
|
</imports>
|
||||||
|
<content><![CDATA[{chunk.content}]]></content>
|
||||||
|
</chunk>"""
|
||||||
|
|
||||||
|
|
||||||
|
async def _store_chunk(
    chunk: Chunk,
    library_id: str,
    collection: str,
) -> bool:
    """Serialize *chunk* and store it in the given eXist-db collection.

    The document name is derived from the chunk id, replacing ':' and
    '/' (awkward in document names) with '_'. Returns the store
    operation's success flag.
    """
    from xml_pipeline.tools.librarian import librarian_store

    payload = _chunk_to_xml(chunk, library_id)
    safe_id = chunk.chunk_id.replace(":", "_").replace("/", "_")

    stored = await librarian_store(
        collection=collection,
        document_name=f"{safe_id}.xml",
        content=payload,
    )
    return stored.success
|
||||||
|
|
||||||
|
|
||||||
|
async def ingest_git_repo(
    url: str,
    branch: str = "main",
    library_name: str = "",
    config: Optional[IngestConfig] = None,
) -> IngestResult:
    """
    Clone and ingest a git repository.

    Args:
        url: Git repository URL
        branch: Branch to clone (default: main)
        library_name: Human-readable name (derived from URL if empty)
        config: Ingestion configuration

    Returns:
        IngestResult with statistics and library_id. Per-file failures
        are recorded in ``result.errors`` and do not abort the ingest.

    Raises:
        Whatever _clone_repo raises when the clone itself fails.
    """
    if config is None:
        config = IngestConfig(branch=branch)

    # Derive library name from URL if not provided:
    #   https://github.com/user/repo.git -> repo
    #   git@github.com:user/repo.git     -> repo
    if not library_name:
        trimmed = url.rstrip("/")
        # BUGFIX: str.rstrip(".git") strips the *characters* '.', 'g', 'i',
        # 't' from the end (mangling names like "digit" or "git");
        # removesuffix removes the literal ".git" suffix only.
        trimmed = trimmed.removesuffix(".git")
        library_name = trimmed.split("/")[-1].split(":")[-1]

    # Unique library ID: readable name plus a short random suffix.
    library_id = f"{library_name}-{uuid.uuid4().hex[:8]}"

    result = IngestResult(
        library_id=library_id,
        library_name=library_name,
        files_processed=0,
        chunks_created=0,
        index_built=False,
    )

    # Clone into a throwaway directory; removed in the finally block.
    temp_dir = Path(tempfile.mkdtemp(prefix="librarian_"))

    try:
        await _clone_repo(url, config.branch, temp_dir)

        # eXist-db collection holding this library's chunk documents.
        collection = f"/db/librarian/{library_id}/chunks"

        # Track language statistics (per file, by detected language).
        lang_stats: dict[str, int] = {}

        async for file_path in _walk_files(temp_dir, config):
            try:
                content = file_path.read_text(encoding="utf-8", errors="replace")

                # Store paths relative to the repo root, not the temp dir.
                rel_path = str(file_path.relative_to(temp_dir))

                language = detect_language(rel_path)
                lang_stats[language] = lang_stats.get(language, 0) + 1

                chunks = chunk_file(content, rel_path)

                for chunk in chunks:
                    if await _store_chunk(chunk, library_id, collection):
                        result.chunks_created += 1
                    else:
                        result.errors.append(f"Failed to store chunk: {chunk.chunk_id}")

                result.files_processed += 1

            except Exception as e:
                # Keep going: one unreadable file must not abort the ingest.
                result.errors.append(f"Error processing {file_path}: {e}")
                logger.warning(f"Error processing {file_path}: {e}")

        result.stats = lang_stats

        # Build the structural index (best-effort).
        from xml_pipeline.librarian.index import build_index

        try:
            await build_index(library_id, library_name, url)
            result.index_built = True
        except Exception as e:
            result.errors.append(f"Index build failed: {e}")
            logger.warning(f"Index build failed: {e}")

        logger.info(
            f"Ingested {library_name}: {result.files_processed} files, "
            f"{result.chunks_created} chunks"
        )

    finally:
        # Cleanup temp directory
        shutil.rmtree(temp_dir, ignore_errors=True)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def ingest_directory(
    path: str | Path,
    library_name: str,
    config: Optional[IngestConfig] = None,
) -> IngestResult:
    """
    Ingest a local directory (for testing or local codebases).

    Mirrors ingest_git_repo without the clone step: walks *path*,
    chunks each eligible file, stores the chunks in eXist-db, then
    builds the structural index (best-effort).

    Args:
        path: Path to directory
        library_name: Human-readable name
        config: Ingestion configuration

    Returns:
        IngestResult with statistics and library_id; per-file failures
        are recorded in ``result.errors`` and do not abort the ingest.

    Raises:
        ValueError: if *path* is not a directory.
    """
    if config is None:
        config = IngestConfig()

    root = Path(path)
    if not root.is_dir():
        raise ValueError(f"Not a directory: {path}")

    # Generate unique library ID (readable name + short random suffix).
    library_id = f"{library_name}-{uuid.uuid4().hex[:8]}"

    result = IngestResult(
        library_id=library_id,
        library_name=library_name,
        files_processed=0,
        chunks_created=0,
        index_built=False,
    )

    # eXist-db collection holding this library's chunk documents.
    collection = f"/db/librarian/{library_id}/chunks"
    lang_stats: dict[str, int] = {}

    async for file_path in _walk_files(root, config):
        try:
            content = file_path.read_text(encoding="utf-8", errors="replace")
            # Store paths relative to the ingested root.
            rel_path = str(file_path.relative_to(root))

            language = detect_language(rel_path)
            lang_stats[language] = lang_stats.get(language, 0) + 1

            chunks = chunk_file(content, rel_path)

            for chunk in chunks:
                success = await _store_chunk(chunk, library_id, collection)
                if success:
                    result.chunks_created += 1
                else:
                    result.errors.append(f"Failed to store chunk: {chunk.chunk_id}")

            result.files_processed += 1

        except Exception as e:
            # One bad file must not abort the whole ingest.
            result.errors.append(f"Error processing {file_path}: {e}")
            logger.warning(f"Error processing {file_path}: {e}")

    result.stats = lang_stats

    # Build index (best-effort; failure is recorded, not raised).
    from xml_pipeline.librarian.index import build_index

    try:
        await build_index(library_id, library_name, str(root))
        result.index_built = True
    except Exception as e:
        result.errors.append(f"Index build failed: {e}")

    return result
|
||||||
167
xml_pipeline/librarian/primitives.py
Normal file
167
xml_pipeline/librarian/primitives.py
Normal file
|
|
@ -0,0 +1,167 @@
|
||||||
|
"""
|
||||||
|
primitives.py — XML payload dataclasses for Premium Librarian.
|
||||||
|
|
||||||
|
These are the message types that flow through the organism's message bus.
|
||||||
|
|
||||||
|
Note: Do NOT use `from __future__ import annotations` here
|
||||||
|
as it breaks the xmlify decorator which needs concrete types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from third_party.xmlable import xmlify
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianIngest:
    """
    Request to ingest a codebase into the Premium Librarian.

    Supports git URLs. The library will be cloned, chunked, and stored
    in eXist-db for subsequent querying.
    """

    git_url: str = ""  # git repository URL (https or ssh)
    branch: str = "main"  # branch to clone
    library_name: str = ""  # Optional; derived from URL if empty
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianIngested:
    """
    Response after successful codebase ingestion.

    Contains the library_id needed for subsequent queries.
    """

    library_id: str = ""  # id to use in LibrarianQuery / LibrarianDelete
    library_name: str = ""  # human-readable name
    files_processed: int = 0  # files read and chunked
    chunks_created: int = 0  # chunks stored in eXist-db
    index_built: bool = False  # whether the structural index was built
    errors: str = ""  # Newline-separated error messages
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianQuery:
    """
    Query an ingested library with a natural language question.

    The system will search for relevant code chunks and synthesize
    an answer using the configured LLM.
    """

    library_id: str = ""  # id returned by LibrarianIngested
    question: str = ""  # natural-language question
    max_chunks: int = 20  # Max chunks to include in context
    model: str = ""  # Optional; uses default if empty
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianAnswer:
    """
    Response to a library query.

    Contains the synthesized answer and source references.
    """

    answer: str = ""  # LLM-synthesized answer text
    sources: str = ""  # XML-formatted source list
    tokens_used: int = 0  # LLM tokens consumed
    chunks_examined: int = 0  # chunks retrieved for context
    error: str = ""  # non-empty when the query failed
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianList:
    """
    Request to list all ingested libraries.

    Carries no fields; its arrival alone triggers the listing.
    """

    pass  # No parameters needed
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibraryInfo:
    """
    Information about a single ingested library.
    """

    library_id: str = ""  # unique library identifier
    name: str = ""  # human-readable name
    source_url: str = ""  # git URL or local path it was ingested from
    created_at: str = ""  # ISO-8601 ingestion timestamp
    total_files: int = 0  # files in the library
    total_chunks: int = 0  # stored chunks
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianLibraries:
    """
    Response listing all ingested libraries.
    """

    count: int = 0  # number of libraries in the list
    libraries: str = ""  # XML-formatted library list
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianDelete:
    """
    Request to delete an ingested library.
    """

    library_id: str = ""  # id of the library to remove
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianDeleted:
    """
    Response after library deletion.
    """

    library_id: str = ""  # id of the library that was targeted
    success: bool = False  # whether the deletion succeeded
    error: str = ""  # non-empty on failure
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianGetChunk:
    """
    Request to retrieve a specific code chunk.

    Useful for examining source code referenced in a query response.
    """

    library_id: str = ""  # library the chunk belongs to
    chunk_id: str = ""  # chunk identifier, as reported in sources
|
||||||
|
|
||||||
|
|
||||||
|
@xmlify
@dataclass
class LibrarianChunk:
    """
    Response with a specific code chunk.
    """

    chunk_id: str = ""  # chunk identifier
    file_path: str = ""  # path of the file the chunk came from
    name: str = ""  # function/class/module name
    chunk_type: str = ""  # e.g. function, method, class, module
    language: str = ""  # detected language
    start_line: int = 0  # first line of the chunk in its file
    end_line: int = 0  # last line of the chunk in its file
    content: str = ""  # raw source text of the chunk
    docstring: str = ""  # extracted docstring, if any
    signature: str = ""  # extracted signature, if any
    error: str = ""  # non-empty when the chunk could not be found
|
||||||
436
xml_pipeline/librarian/query.py
Normal file
436
xml_pipeline/librarian/query.py
Normal file
|
|
@ -0,0 +1,436 @@
|
||||||
|
"""
|
||||||
|
query.py — RAG-based query system for Premium Librarian.
|
||||||
|
|
||||||
|
Searches indexed codebases and synthesizes answers using Online LLM.
|
||||||
|
The flow: Search → Retrieve → Synthesize → Return with sources.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Source:
    """A source chunk used in answering a query."""

    file_path: str  # file the chunk came from
    name: str  # function/class/module name
    chunk_type: str  # e.g. function, method, class, module
    start_line: int  # first line in the file
    end_line: int  # last line in the file
    relevance_score: float  # search relevance (higher is better)
    snippet: str = ""  # First ~200 chars of content
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class QueryResult:
    """Result of a library query."""

    answer: str  # synthesized answer text
    sources: list[Source] = field(default_factory=list)  # chunks cited in the answer
    tokens_used: int = 0  # LLM tokens consumed
    chunks_examined: int = 0  # chunks retrieved as context
    error: str = ""  # non-empty when the query failed
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RetrievedChunk:
    """A chunk retrieved from eXist-db for RAG."""

    chunk_id: str  # stored chunk identifier
    file_path: str  # file the chunk came from
    name: str  # function/class/module name
    chunk_type: str  # e.g. function, method, class, module
    language: str  # detected language
    start_line: int  # first line in the file
    end_line: int  # last line in the file
    content: str  # raw source text
    docstring: str  # extracted docstring, if any
    signature: str  # extracted signature, if any
    score: float  # search relevance score
|
||||||
|
|
||||||
|
|
||||||
|
async def _search_chunks(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Search for relevant chunks using Lucene full-text search.

    Scores content matches x2 and name matches x3 over docstring
    matches, orders by score, and returns up to *max_results* chunks.
    Falls back to a plain contains() search when the Lucene query
    fails (e.g. no full-text index configured).

    NOTE(review): the user question is interpolated directly into the
    XQuery string; the quote-escaping below is best-effort, so inputs
    are assumed to be trusted — confirm at the call site.
    NOTE(review): the Lucene module URI/import below should be checked
    against the deployed eXist-db version.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # Escape query for XQuery (quotes only; see NOTE above).
    query_escaped = query.replace('"', '\\"').replace("'", "\\'")

    # Full-text search using Lucene
    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";
    import module namespace ft = "http://exist-db.org/xquery/lucene";

    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    let $content := $chunk/l:content/text()
    let $name := $chunk/l:name/text()
    let $docstring := $chunk/l:docstring/text()
    let $score := (
        if (ft:query($content, "{query_escaped}")) then ft:score($content) * 2
        else if (ft:query($name, "{query_escaped}")) then ft:score($name) * 3
        else if (ft:query($docstring, "{query_escaped}")) then ft:score($docstring)
        else 0
    )
    where $score > 0
    order by $score descending
    return <result score="{{$score}}">
        <id>{{$chunk/l:id/text()}}</id>
        <file-path>{{$chunk/l:file-path/text()}}</file-path>
        <name>{{$chunk/l:name/text()}}</name>
        <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
        <language>{{$chunk/l:language/text()}}</language>
        <start-line>{{$chunk/l:start-line/text()}}</start-line>
        <end-line>{{$chunk/l:end-line/text()}}</end-line>
        <signature>{{$chunk/l:signature/text()}}</signature>
        <docstring>{{$chunk/l:docstring/text()}}</docstring>
        <content>{{$chunk/l:content/text()}}</content>
    </result>
    """

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    chunks: list[RetrievedChunk] = []

    if not result.success:
        logger.warning(f"Search failed: {result.error}")
        # Fall back to simple query without Lucene
        return await _search_chunks_fallback(library_id, query, max_results)

    try:
        from lxml import etree

        # The query returns a bare sequence of <result> elements; wrap
        # them so lxml sees a single root.
        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())

        # max_results is applied client-side after retrieval; the query
        # itself is unbounded.
        for item in root.findall("result")[:max_results]:
            score = float(item.get("score", 0))

            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line", "0")),
                    end_line=int(item.findtext("end-line", "0")),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )

    except Exception as e:
        # Parse failure degrades to whatever was collected (likely empty).
        logger.warning(f"Failed to parse search results: {e}")

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
async def _search_chunks_fallback(
    library_id: str,
    query: str,
    max_results: int = 20,
) -> list[RetrievedChunk]:
    """
    Fallback search using contains() when Lucene is not available.

    Less accurate but works without Lucene indexing.

    Args:
        library_id: ID of the ingested library whose chunks are searched.
        query: Free-text query; lowercased and split into up to 5 terms,
            each matched with contains() against content, name and docstring.
        max_results: Maximum number of chunks to return.

    Returns:
        Up to max_results RetrievedChunk objects with synthetic,
        order-based relevance scores (no real ranking is available here).
    """
    from xml_pipeline.tools.librarian import librarian_query

    def _xq_escape(term: str) -> str:
        # XQuery string literals escape an embedded double quote by
        # DOUBLING it ("" not \"); backslash has no escaping role in
        # XQuery and would corrupt the literal.  Bare & and < are also
        # illegal inside XQuery string literals, so entity-encode them.
        return (
            term.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace('"', '""')
        )

    # Build contains() conditions per term (escape AFTER splitting so an
    # escape sequence can never be torn across two terms).
    terms = query.lower().split()

    conditions = []
    for term in terms[:5]:  # Limit to 5 terms
        safe = _xq_escape(term)
        conditions.append(
            f'(contains(lower-case($chunk/l:content), "{safe}") or '
            f'contains(lower-case($chunk/l:name), "{safe}") or '
            f'contains(lower-case($chunk/l:docstring), "{safe}"))'
        )

    where_clause = " or ".join(conditions) if conditions else "true()"

    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";

    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    where {where_clause}
    return <result>
        <id>{{$chunk/l:id/text()}}</id>
        <file-path>{{$chunk/l:file-path/text()}}</file-path>
        <name>{{$chunk/l:name/text()}}</name>
        <chunk-type>{{$chunk/l:chunk-type/text()}}</chunk-type>
        <language>{{$chunk/l:language/text()}}</language>
        <start-line>{{$chunk/l:start-line/text()}}</start-line>
        <end-line>{{$chunk/l:end-line/text()}}</end-line>
        <signature>{{$chunk/l:signature/text()}}</signature>
        <docstring>{{$chunk/l:docstring/text()}}</docstring>
        <content>{{$chunk/l:content/text()}}</content>
    </result>
    """

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    chunks: list[RetrievedChunk] = []

    if not result.success:
        logger.warning(f"Fallback search failed: {result.error}")
        return chunks

    try:
        from lxml import etree

        # Wrap the result sequence in a single root so it parses as one doc.
        xml_str = f"<results>{result.data.get('results', '')}</results>"
        root = etree.fromstring(xml_str.encode())

        for i, item in enumerate(root.findall("result")[:max_results]):
            # Assign decreasing score based on order (no Lucene ranking).
            score = 1.0 - (i * 0.05)

            chunks.append(
                RetrievedChunk(
                    chunk_id=item.findtext("id", ""),
                    file_path=item.findtext("file-path", ""),
                    name=item.findtext("name", ""),
                    chunk_type=item.findtext("chunk-type", ""),
                    language=item.findtext("language", ""),
                    start_line=int(item.findtext("start-line", "0")),
                    end_line=int(item.findtext("end-line", "0")),
                    content=item.findtext("content", ""),
                    docstring=item.findtext("docstring", ""),
                    signature=item.findtext("signature", ""),
                    score=score,
                )
            )

    except Exception as e:
        logger.warning(f"Failed to parse fallback search results: {e}")

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _build_rag_prompt(
    question: str,
    chunks: list[RetrievedChunk],
    library_name: str,
) -> str:
    """Assemble the RAG prompt: numbered code context, then the question."""

    def _render(idx: int, ck: RetrievedChunk) -> str:
        # One numbered context entry: location header, optional signature,
        # then the (possibly truncated) code fenced with its language tag.
        header = f"[{idx}] {ck.file_path}:{ck.start_line}-{ck.end_line}"
        if ck.signature:
            header += f"\n    {ck.signature}"

        body = ck.content
        if len(body) > 2000:
            body = body[:2000] + "\n... (truncated)"

        return f"{header}\n```{ck.language}\n{body}\n```"

    context = "\n\n".join(
        _render(idx, ck) for idx, ck in enumerate(chunks, 1)
    )

    return f"""You are a code assistant analyzing the "{library_name}" codebase.

Answer the following question based ONLY on the provided code context.
If the answer is not in the context, say so clearly.
Reference specific files and line numbers when relevant.

## Code Context

{context}

## Question

{question}

## Instructions

1. Answer based on the code context above
2. Cite sources using [1], [2], etc. format
3. Include relevant code snippets if helpful
4. Be concise but complete"""
|
||||||
|
|
||||||
|
|
||||||
|
async def query_library(
    library_id: str,
    question: str,
    max_chunks: int = 20,
    model: str = "",
) -> QueryResult:
    """
    Answer a natural-language question about an ingested library via RAG.

    Retrieves up to ``max_chunks`` relevant code chunks, builds a context
    prompt, and asks the LLM to synthesize an answer with citations.

    Args:
        library_id: ID of the ingested library.
        question: Natural language question about the codebase.
        max_chunks: Upper bound on chunks retrieved for context.
        model: LLM model name; empty string selects the default.

    Returns:
        QueryResult carrying the answer, cited sources and token usage,
        or an error description when lookup or generation fails.
    """
    from xml_pipeline.librarian.index import get_index
    from xml_pipeline.llm import complete

    # Resolve the library's structural index; bail out if unknown.
    index = await get_index(library_id)
    if not index:
        return QueryResult(
            answer="",
            error=f"Library not found: {library_id}",
        )

    # Retrieve candidate chunks relevant to the question.
    retrieved = await _search_chunks(library_id, question, max_chunks)
    if not retrieved:
        return QueryResult(
            answer=f"No relevant code found for your question in the '{index.name}' codebase.",
            chunks_examined=0,
        )

    rag_prompt = _build_rag_prompt(question, retrieved, index.name)

    # Synthesize the answer with the LLM; response attribute access stays
    # inside the try so any malformed response is reported, not raised.
    try:
        llm_response = await complete(
            model=model or "grok-4.1",  # Default model
            messages=[{"role": "user", "content": rag_prompt}],
        )
        answer_text = llm_response.content
        token_count = llm_response.usage.get("total_tokens", 0)
    except Exception as e:
        logger.error(f"LLM call failed: {e}")
        return QueryResult(
            answer="",
            error=f"Failed to generate answer: {e}",
            chunks_examined=len(retrieved),
        )

    # Map every retrieved chunk to a citation record.
    cited = [
        Source(
            file_path=ck.file_path,
            name=ck.name,
            chunk_type=ck.chunk_type,
            start_line=ck.start_line,
            end_line=ck.end_line,
            relevance_score=ck.score,
            snippet=ck.content[:200] if ck.content else "",
        )
        for ck in retrieved
    ]

    return QueryResult(
        answer=answer_text,
        sources=cited,
        tokens_used=token_count,
        chunks_examined=len(retrieved),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def format_sources_xml(sources: list[Source]) -> str:
    """Render a list of Source records as a <sources> XML fragment."""

    def _one(idx: int, src: Source) -> str:
        # Escape every text field; the snippet is capped at 100 chars.
        snippet = xml_escape(src.snippet[:100]) if src.snippet else ""
        return (
            f'    <source index="{idx}">\n'
            f"        <file-path>{xml_escape(src.file_path)}</file-path>\n"
            f"        <name>{xml_escape(src.name)}</name>\n"
            f"        <type>{xml_escape(src.chunk_type)}</type>\n"
            f"        <lines>{src.start_line}-{src.end_line}</lines>\n"
            f"        <score>{src.relevance_score:.2f}</score>\n"
            f"        <snippet>{snippet}</snippet>\n"
            f"    </source>"
        )

    items = [_one(idx, src) for idx, src in enumerate(sources, 1)]
    return "<sources>\n" + "\n".join(items) + "\n</sources>"
|
||||||
|
|
||||||
|
|
||||||
|
async def get_chunk_by_id(library_id: str, chunk_id: str) -> Optional[RetrievedChunk]:
    """
    Retrieve a specific chunk by ID.

    Useful for follow-up queries about a specific piece of code.

    Args:
        library_id: ID of the ingested library to look in.
        chunk_id: Exact chunk ID to fetch.

    Returns:
        The matching RetrievedChunk (score fixed at 1.0), or None when the
        chunk is missing or its XML cannot be parsed.
    """
    from xml_pipeline.tools.librarian import librarian_query

    # XQuery string literals escape an embedded double quote by DOUBLING it
    # ("" not \"); backslash has no escaping role in XQuery and would have
    # corrupted the comparison value.  Bare & and < are also illegal inside
    # XQuery string literals, so entity-encode them too.
    chunk_id_escaped = (
        chunk_id.replace("&", "&amp;").replace("<", "&lt;").replace('"', '""')
    )

    xquery = f"""
    declare namespace l = "https://xml-pipeline.org/ns/librarian/v1";

    for $chunk in collection("/db/librarian/{library_id}/chunks")//l:chunk
    where $chunk/l:id = "{chunk_id_escaped}"
    return $chunk
    """

    result = await librarian_query(
        query=xquery,
        collection=f"/db/librarian/{library_id}",
    )

    if not result.success:
        return None

    raw = result.data.get("results", "")
    if not raw.strip():
        # No match: do not feed an empty document to the XML parser (that
        # would only raise and log a spurious parse warning below).
        return None

    try:
        from lxml import etree

        ns = {"l": "https://xml-pipeline.org/ns/librarian/v1"}
        root = etree.fromstring(raw.encode())

        # The query may return the chunk element itself or a wrapper node.
        chunk_elem = root if root.tag.endswith("chunk") else root.find("l:chunk", namespaces=ns)
        if chunk_elem is None:
            return None

        return RetrievedChunk(
            chunk_id=chunk_elem.findtext("l:id", "", namespaces=ns),
            file_path=chunk_elem.findtext("l:file-path", "", namespaces=ns),
            name=chunk_elem.findtext("l:name", "", namespaces=ns),
            chunk_type=chunk_elem.findtext("l:chunk-type", "", namespaces=ns),
            language=chunk_elem.findtext("l:language", "", namespaces=ns),
            start_line=int(chunk_elem.findtext("l:start-line", "0", namespaces=ns)),
            end_line=int(chunk_elem.findtext("l:end-line", "0", namespaces=ns)),
            content=chunk_elem.findtext("l:content", "", namespaces=ns),
            docstring=chunk_elem.findtext("l:docstring", "", namespaces=ns),
            signature=chunk_elem.findtext("l:signature", "", namespaces=ns),
            score=1.0,
        )

    except Exception as e:
        logger.warning(f"Failed to parse chunk: {e}")
        return None
|
||||||
Loading…
Reference in a new issue