gitingest/tests/test_query_ingestion.py
Filip Christiansen 58dbe2cb7e
refactor: code cleanup, utils extraction, and test improvements (#141)
* Refactor code for lifespan, template usage, and improved tests

- Move background tasks and rate-limit handler into utils.py
- Reference TEMPLATES from config instead of inline Jinja2Templates
- Adopt Given/When/Then docstrings for test clarity
- Parametrize some tests and consolidate code across query_parser tests
- Add pytest.warns context handler to test_parse_repo_source_with_failed_git_command
2025-01-23 02:46:17 +01:00

209 lines
7.9 KiB
Python

"""
Tests for the `query_ingestion` module.
These tests validate directory scanning, file content extraction, notebook handling, and the overall ingestion logic,
including filtering patterns and subpaths.
"""
from pathlib import Path
from unittest.mock import patch
import pytest
from gitingest.query_ingestion import _extract_files_content, _read_file_content, _scan_directory, run_ingest_query
from gitingest.query_parser import ParsedQuery
def test_scan_directory(temp_directory: Path, sample_query: ParsedQuery) -> None:
    """
    Verify the node returned by `_scan_directory` when no filtering patterns are changed.

    Given the populated `temp_directory` fixture:
    When `_scan_directory` is run against it,
    Then the root node should be a directory reporting 8 files, 4 directories and 5 direct children.
    """
    sample_query.local_path = temp_directory

    root_node = _scan_directory(temp_directory, query=sample_query)

    assert root_node is not None, "Expected a valid directory node structure"
    assert root_node["type"] == "directory"

    # Aggregate counters cover the whole tree, not just the top level.
    assert root_node["file_count"] == 8, "Should count all .txt and .py files"
    assert root_node["dir_count"] == 4, "Should include src, src/subdir, dir1, dir2"

    # Only the immediate entries of the root are listed as children.
    top_level_entries = root_node["children"]
    assert len(top_level_entries) == 5, "Should contain file1.txt, file2.py, src, dir1, dir2"
def test_extract_files_content(temp_directory: Path, sample_query: ParsedQuery) -> None:
    """
    Test `_extract_files_content` to ensure it gathers contents from scanned nodes.

    Given a populated test directory:
    When `_extract_files_content` is called with a valid scan result,
    Then it should return 8 file entries, and each expected file name should appear in one of the returned paths.
    """
    sample_query.local_path = temp_directory

    nodes = _scan_directory(temp_directory, query=sample_query)
    assert nodes is not None, "Expected a valid scan result"

    files = _extract_files_content(query=sample_query, node=nodes)
    assert len(files) == 8, "Should extract all .txt and .py files"

    paths = [f["path"] for f in files]

    # Check all 8 files by name, including src/subdir/file_subdir.py, so that the name checks agree with the
    # count assertion above.
    # NOTE: matching is by substring, so a short name such as "file1.txt" is also satisfied by "subfile1.txt".
    expected_names = (
        "file1.txt",
        "subfile1.txt",
        "file2.py",
        "subfile2.py",
        "file_subdir.txt",
        "file_subdir.py",
        "file_dir1.txt",
        "file_dir2.txt",
    )
    for expected_name in expected_names:
        assert any(expected_name in p for p in paths), f"Missing expected file: {expected_name}"
def test_read_file_content_with_notebook(tmp_path: Path) -> None:
    """
    Check that `_read_file_content` hands `.ipynb` files to `process_notebook`.

    Given a file named `dummy_notebook.ipynb` containing an empty JSON object:
    When `_read_file_content` is called with its path,
    Then the patched `process_notebook` should be called exactly once with that same path.
    """
    ipynb_file = tmp_path / "dummy_notebook.ipynb"
    ipynb_file.write_text("{}", encoding="utf-8")  # an empty JSON object is enough for this test

    # Patch the name as looked up inside `gitingest.query_ingestion`, so the real handler never runs.
    with patch("gitingest.query_ingestion.process_notebook") as notebook_handler:
        _read_file_content(ipynb_file)

    notebook_handler.assert_called_once_with(ipynb_file)
def test_read_file_content_with_non_notebook(tmp_path: Path) -> None:
    """
    Test `_read_file_content` with a non-notebook file.

    Given a standard .py file:
    When `_read_file_content` is called,
    Then `process_notebook` should not be triggered.
    """
    py_file_path = tmp_path / "dummy_file.py"
    py_file_path.write_text("print('Hello')", encoding="utf-8")

    # Patch the name as looked up inside `gitingest.query_ingestion` so any call to it would be recorded.
    with patch("gitingest.query_ingestion.process_notebook") as mock_process:
        _read_file_content(py_file_path)

    mock_process.assert_not_called()
def test_include_txt_pattern(temp_directory: Path, sample_query: ParsedQuery) -> None:
    """
    Restrict ingestion to text files through the `*.txt` include pattern.

    Given a directory holding both .txt and .py files:
    When `include_patterns` is `{"*.txt"}` and the directory is scanned and extracted,
    Then exactly the 5 .txt files should be returned and no .py file should appear.
    """
    sample_query.include_patterns = {"*.txt"}
    sample_query.local_path = temp_directory

    tree = _scan_directory(temp_directory, query=sample_query)
    assert tree is not None, "Expected a valid directory node structure"

    extracted = _extract_files_content(query=sample_query, node=tree)
    file_paths = [entry["path"] for entry in extracted]

    assert len(extracted) == 5, "Should find exactly 5 .txt files"
    assert all(path.endswith(".txt") for path in file_paths), "Should only include .txt files"

    # Each expected name must occur somewhere in at least one returned path.
    for expected_file in ("file1.txt", "subfile1.txt", "file_subdir.txt", "file_dir1.txt", "file_dir2.txt"):
        assert any(expected_file in path for path in file_paths), f"Missing expected file: {expected_file}"

    assert all(not path.endswith(".py") for path in file_paths), "No .py files should be included"
def test_include_nonexistent_extension(temp_directory: Path, sample_query: ParsedQuery) -> None:
    """
    Use an include pattern (`*.query`) that matches nothing in the test directory.

    Given a directory that contains no `*.query` files:
    When the directory is scanned and extracted with that include pattern,
    Then a directory node should still be returned, but with no files, no directories and no children.
    """
    sample_query.include_patterns = {"*.query"}  # extension that no fixture file uses
    sample_query.local_path = temp_directory

    tree = _scan_directory(temp_directory, query=sample_query)
    assert tree is not None, "Expected a valid directory node structure"

    extracted = _extract_files_content(query=sample_query, node=tree)
    assert len(extracted) == 0, "Should not find any files matching *.query"

    # The root node itself survives; it is simply empty.
    assert tree["type"] == "directory"
    assert tree["file_count"] == 0, "No files counted with this pattern"
    assert tree["dir_count"] == 0
    assert len(tree["children"]) == 0
@pytest.mark.parametrize("include_pattern", ["src/*", "src/**", "src*"])
def test_include_src_patterns(temp_directory: Path, sample_query: ParsedQuery, include_pattern: str) -> None:
    """
    Select everything beneath `src` using several equivalent include patterns.

    Given a directory whose `src` folder contains files and a nested `subdir`:
    When `include_patterns` holds one of `src/*`, `src/**` or `src*`,
    Then scanning and extracting should yield exactly the four files located under `src`.

    Note: Windows is not supported; backslashes in returned paths are replaced with forward slashes before the
    comparison.
    """
    sample_query.local_path = temp_directory
    sample_query.include_patterns = {include_pattern}

    tree = _scan_directory(temp_directory, query=sample_query)
    assert tree is not None, "Expected a valid directory node structure"

    extracted = _extract_files_content(query=sample_query, node=tree)

    # Normalise path separators to "/" so the comparison below uses a single form.
    normalized_paths = set()
    for entry in extracted:
        normalized_paths.add(entry["path"].replace("\\", "/"))

    files_under_src = {
        "src/subfile1.txt",
        "src/subfile2.py",
        "src/subdir/file_subdir.txt",
        "src/subdir/file_subdir.py",
    }
    assert normalized_paths == files_under_src, "Missing or unexpected files in result"
def test_run_ingest_query(temp_directory: Path, sample_query: ParsedQuery) -> None:
    """
    Exercise `run_ingest_query` end to end on the test directory.

    Given a directory with .txt and .py files and a query whose `subpath` is "/" and whose `type` is None:
    When `run_ingest_query` is invoked with that query,
    Then the summary should name the repository and report 8 analyzed files, and the content string should mention
    every expected file path.
    """
    sample_query.type = None
    sample_query.subpath = "/"
    sample_query.local_path = temp_directory

    # The middle element of the returned triple is not inspected by this test.
    summary, _unused, content = run_ingest_query(sample_query)

    for summary_fragment in ("Repository: test_user/test_repo", "Files analyzed: 8"):
        assert summary_fragment in summary

    # Check presence of key files in the content
    paths_in_content = (
        "src/subfile1.txt",
        "src/subfile2.py",
        "src/subdir/file_subdir.txt",
        "src/subdir/file_subdir.py",
        "file1.txt",
        "file2.py",
        "dir1/file_dir1.txt",
        "dir2/file_dir2.txt",
    )
    for file_path in paths_in_content:
        assert file_path in content
# TODO: Additional tests:
# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"].
# - Edge cases with weird file names or deep subdirectory structures.