Implements an RLM-powered codebase intelligence system that:
- Ingests git repositories and chunks code intelligently
- Stores chunks in eXist-db for RAG retrieval
- Answers natural language queries using LLM synthesis

New package xml_pipeline/librarian/ with:
- chunker.py: AST-based code chunking (Python, JS/TS, C++)
- ingest.py: Git clone + file walking + chunk storage
- index.py: Structural index building (files, functions, classes)
- query.py: RAG search + LLM synthesis with source citations
- primitives.py: XML payloads (LibrarianIngest, LibrarianQuery, etc.)
- handler.py: Message handlers for organism integration

Also adds GitPython and aiohttp as optional [librarian] dependencies.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
375 lines
10 KiB
Python
375 lines
10 KiB
Python
"""
|
|
Tests for the Premium Librarian code chunker.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from xml_pipeline.librarian.chunker import (
|
|
Chunk,
|
|
chunk_file,
|
|
chunk_python,
|
|
chunk_javascript,
|
|
chunk_cpp,
|
|
chunk_prose,
|
|
chunk_generic,
|
|
detect_language,
|
|
)
|
|
|
|
|
|
class TestLanguageDetection:
    """Check the language name detect_language reports for a file path."""

    def test_python_detection(self) -> None:
        """Paths ending in .py or .pyi are reported as python."""
        for path in ("foo.py", "path/to/module.py", "types.pyi"):
            assert detect_language(path) == "python"

    def test_javascript_detection(self) -> None:
        """Paths ending in .js, .jsx or .mjs are reported as javascript."""
        for path in ("app.js", "component.jsx", "index.mjs"):
            assert detect_language(path) == "javascript"

    def test_typescript_detection(self) -> None:
        """Paths ending in .ts or .tsx are reported as typescript."""
        for path in ("app.ts", "component.tsx"):
            assert detect_language(path) == "typescript"

    def test_cpp_detection(self) -> None:
        """Paths ending in .cpp, .hpp or .cc are reported as cpp."""
        for path in ("main.cpp", "header.hpp", "source.cc"):
            assert detect_language(path) == "cpp"

    def test_c_detection(self) -> None:
        """Paths ending in .c or .h are reported as c."""
        for path in ("main.c", "header.h"):
            assert detect_language(path) == "c"

    def test_unknown_language(self) -> None:
        """An unrecognised or missing extension is reported as unknown."""
        for path in ("data.xyz", "noextension"):
            assert detect_language(path) == "unknown"

    def test_case_insensitive(self) -> None:
        """Upper-case extensions are detected like their lower-case forms."""
        expected_by_path = {
            "Module.PY": "python",
            "APP.JS": "javascript",
        }
        for path, expected in expected_by_path.items():
            assert detect_language(path) == expected
|
|
|
|
|
class TestPythonChunker:
    """Exercise chunk_python on small Python snippets."""

    def test_simple_function(self) -> None:
        """A lone annotated function yields exactly one function chunk."""
        source = '''
def hello(name: str) -> str:
    """Say hello."""
    return f"Hello, {name}!"
'''
        result = chunk_python(source, "test.py")
        assert len(result) == 1
        only = result[0]
        assert only.name == "hello"
        assert only.chunk_type == "function"
        assert only.docstring == "Say hello."
        assert "str" in only.signature

    def test_async_function(self) -> None:
        """An async def is a function chunk whose signature mentions async."""
        source = '''
async def fetch_data(url: str) -> dict:
    """Fetch data from URL."""
    pass
'''
        result = chunk_python(source, "test.py")
        assert len(result) == 1
        only = result[0]
        assert only.name == "fetch_data"
        assert only.chunk_type == "function"
        assert "async" in only.signature

    def test_class_with_methods(self) -> None:
        """A small class produces exactly one class chunk with its docstring."""
        source = '''
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        """Subtract two numbers."""
        return a - b
'''
        result = chunk_python(source, "test.py")
        # Should create a class chunk (small enough to keep together)
        assert len(result) >= 1
        classes = [item for item in result if item.chunk_type == "class"]
        assert len(classes) == 1
        calculator = classes[0]
        assert calculator.name == "Calculator"
        assert calculator.docstring == "A simple calculator."

    def test_imports_extracted(self) -> None:
        """Module-level import statements are attached to the chunk."""
        source = '''
import os
from typing import Optional, List

def process():
    pass
'''
        result = chunk_python(source, "test.py")
        assert len(result) == 1
        recorded = result[0].imports
        assert "import os" in recorded
        assert any("from typing import" in line for line in recorded)

    def test_empty_file(self) -> None:
        """Empty source produces no chunks."""
        result = chunk_python("", "test.py")
        assert len(result) == 0

    def test_module_with_only_imports(self) -> None:
        """An import-only module yields nothing or a leading module chunk."""
        source = '''
import os
import sys
'''
        result = chunk_python(source, "test.py")
        # Should create a module chunk for files with no functions/classes
        assert len(result) == 0 or result[0].chunk_type == "module"

    def test_syntax_error_fallback(self) -> None:
        """Unparseable source must not raise; any output must be Chunk objects."""
        source = '''
def broken(
    # Missing closing paren
'''
        # Should fall back to generic chunking instead of raising.
        result = chunk_python(source, "test.py")
        # The previous `len(result) >= 0` check could never fail. The fallback
        # may or may not produce chunks, so only the element type is pinned.
        assert all(isinstance(item, Chunk) for item in result)
|
|
|
|
|
|
class TestJavaScriptChunker:
    """Exercise chunk_javascript on small JavaScript snippets."""

    def test_function_declaration(self) -> None:
        """A plain function declaration yields one function chunk."""
        source = '''
function greet(name) {
    return `Hello, ${name}!`;
}
'''
        result = chunk_javascript(source, "test.js")
        assert len(result) == 1
        only = result[0]
        assert only.name == "greet"
        assert only.chunk_type == "function"

    def test_async_function(self) -> None:
        """An async function declaration yields one chunk named after it."""
        source = '''
async function fetchData(url) {
    const response = await fetch(url);
    return response.json();
}
'''
        result = chunk_javascript(source, "test.js")
        assert len(result) == 1
        assert result[0].name == "fetchData"

    def test_arrow_function(self) -> None:
        """A const-bound arrow function is reported as a function chunk."""
        source = '''
const multiply = (a, b) => {
    return a * b;
};
'''
        result = chunk_javascript(source, "test.js")
        assert len(result) == 1
        only = result[0]
        assert only.name == "multiply"
        assert only.chunk_type == "function"

    def test_class_definition(self) -> None:
        """A class with methods produces exactly one class chunk."""
        source = '''
class Calculator {
    constructor() {
        this.result = 0;
    }

    add(value) {
        this.result += value;
        return this;
    }
}
'''
        result = chunk_javascript(source, "test.js")
        assert len(result) >= 1
        classes = [item for item in result if item.chunk_type == "class"]
        assert len(classes) == 1
        assert classes[0].name == "Calculator"

    def test_export_function(self) -> None:
        """An exported function is chunked under its own name."""
        source = '''
export function exportedFunc() {
    return 42;
}
'''
        result = chunk_javascript(source, "test.js")
        assert len(result) == 1
        assert result[0].name == "exportedFunc"

    def test_imports_extracted(self) -> None:
        """The first chunk records the file's `import React` line."""
        source = '''
import React from 'react';
import { useState } from 'react';
const lodash = require('lodash');

function Component() {
    return null;
}
'''
        result = chunk_javascript(source, "test.jsx")
        assert len(result) >= 1
        recorded = result[0].imports
        assert any("import React" in line for line in recorded)
|
|
|
|
|
|
class TestCppChunker:
    """Exercise chunk_cpp on small C++ snippets."""

    def test_function_definition(self) -> None:
        """A free function definition yields exactly one function chunk."""
        source = '''
int add(int a, int b) {
    return a + b;
}
'''
        result = chunk_cpp(source, "test.cpp")
        assert len(result) >= 1
        functions = [item for item in result if item.chunk_type == "function"]
        assert len(functions) == 1
        assert functions[0].name == "add"

    def test_class_definition(self) -> None:
        """A class declaration yields exactly one class chunk."""
        source = '''
class Calculator {
public:
    int add(int a, int b);
    int subtract(int a, int b);
};
'''
        result = chunk_cpp(source, "test.cpp")
        assert len(result) >= 1
        classes = [item for item in result if item.chunk_type == "class"]
        assert len(classes) == 1
        assert classes[0].name == "Calculator"

    def test_includes_extracted(self) -> None:
        """The first chunk records the `#include <iostream>` directive."""
        source = '''
#include <iostream>
#include "myheader.h"

int main() {
    return 0;
}
'''
        result = chunk_cpp(source, "test.cpp")
        assert len(result) >= 1
        recorded = result[0].imports
        assert any("#include <iostream>" in line for line in recorded)
|
|
|
|
|
|
class TestProseChunker:
    """Exercise chunk_prose on markdown input."""

    def test_markdown_headings(self) -> None:
        """A document with three headings gives at least two chunks."""
        document = '''# Introduction

This is the introduction section.

## Getting Started

Follow these steps to get started.

## Advanced Topics

More advanced content here.
'''
        sections = chunk_prose(document, "readme.md", "markdown")
        assert len(sections) >= 2
        # First chunk should be introduction
        leading = sections[0]
        assert leading.name == "Introduction"

    def test_empty_document(self) -> None:
        """Empty input produces no chunks."""
        sections = chunk_prose("", "empty.md", "markdown")
        assert len(sections) == 0
|
|
|
|
|
|
class TestGenericChunker:
    """Exercise chunk_generic, the line-based chunker."""

    def test_small_file(self) -> None:
        """Three short lines come back as one chunk with unchanged content."""
        text = "line1\nline2\nline3"
        pieces = chunk_generic(text, "test.txt", "text")
        assert len(pieces) == 1
        assert pieces[0].content == text

    def test_empty_file(self) -> None:
        """Empty input produces no chunks."""
        pieces = chunk_generic("", "empty.txt", "text")
        assert len(pieces) == 0
|
|
|
|
|
|
class TestChunkFile:
    """Check the language tag on chunks returned by chunk_file.

    Each test inspects every returned chunk, so it passes trivially when
    chunk_file returns nothing.
    """

    def test_dispatches_to_python(self) -> None:
        """Chunks of a .py file are tagged python."""
        produced = chunk_file("def foo(): pass", "test.py")
        for chunk in produced:
            assert chunk.language == "python"

    def test_dispatches_to_javascript(self) -> None:
        """Chunks of a .js file are tagged javascript."""
        produced = chunk_file("function foo() {}", "test.js")
        for chunk in produced:
            assert chunk.language == "javascript"

    def test_dispatches_to_cpp(self) -> None:
        """Chunks of a .cpp file are tagged cpp."""
        produced = chunk_file("int main() { return 0; }", "test.cpp")
        for chunk in produced:
            assert chunk.language == "cpp"

    def test_unknown_language_uses_generic(self) -> None:
        """Chunks of a file with an unrecognised extension are tagged unknown."""
        produced = chunk_file("some content", "test.xyz")
        for chunk in produced:
            assert chunk.language == "unknown"
|
|
|
|
|
|
class TestChunkProperties:
    """Check the chunk_id and line_count attributes of Chunk."""

    def test_chunk_id_generation(self) -> None:
        """chunk_id is truthy and contains the file path and chunk name."""
        subject = Chunk(
            content="def foo(): pass",
            file_path="test.py",
            start_line=1,
            end_line=1,
            chunk_type="function",
            name="foo",
            language="python",
        )
        assert subject.chunk_id
        for fragment in ("test.py", "foo"):
            assert fragment in subject.chunk_id

    def test_chunk_id_uniqueness(self) -> None:
        """Two chunks that differ only in content get different ids."""
        shared = {
            "file_path": "test.py",
            "start_line": 1,
            "end_line": 1,
            "chunk_type": "function",
            "name": "foo",
            "language": "python",
        }
        first = Chunk(content="def foo(): pass", **shared)
        second = Chunk(content="def foo(): return 1", **shared)
        # Different content should produce different IDs
        assert first.chunk_id != second.chunk_id

    def test_line_count(self) -> None:
        """A chunk spanning lines 10 through 12 reports a line_count of 3."""
        subject = Chunk(
            content="line1\nline2\nline3",
            file_path="test.py",
            start_line=10,
            end_line=12,
            chunk_type="block",
            name="test",
            language="python",
        )
        assert subject.line_count == 3
|