refactor: code cleanup, utils extraction, and test improvements (#141)

* Refactor code for lifespan, template usage, and improved tests

- Move background tasks and rate-limit handler into utils.py
- Reference TEMPLATES from config instead of inline Jinja2Templates
- Adopt Given/When/Then docstrings for test clarity
- Parametrize some tests and consolidate code across query_parser tests
- Add pytest.warns context handler to test_parse_repo_source_with_failed_git_command
This commit is contained in:
Filip Christiansen 2025-01-23 02:46:17 +01:00 committed by GitHub
parent 18368097f9
commit 58dbe2cb7e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 705 additions and 401 deletions

View file

@ -1,4 +1,9 @@
""" Tests for the query_parser module. """
"""
Tests for the `query_parser` module.
These tests cover URL parsing, pattern parsing, and handling of branches/subpaths for HTTP(S) repositories and local
paths.
"""
from pathlib import Path
from unittest.mock import AsyncMock, patch
@ -9,10 +14,14 @@ from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
from gitingest.query_parser import _parse_patterns, _parse_repo_source, parse_query
@pytest.mark.asyncio
async def test_parse_url_valid_https() -> None:
"""
Test `_parse_repo_source` with valid HTTPS URLs from supported platforms (GitHub, GitLab, Bitbucket, Gitea).
Verifies that user and repository names are correctly extracted.
Test `_parse_repo_source` with valid HTTPS URLs.
Given various HTTPS URLs on supported platforms:
When `_parse_repo_source` is called,
Then user name, repo name, and the URL should be extracted correctly.
"""
test_cases = [
"https://github.com/user/repo",
@ -24,15 +33,20 @@ async def test_parse_url_valid_https() -> None:
]
for url in test_cases:
parsed_query = await _parse_repo_source(url)
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
assert parsed_query.url == url
@pytest.mark.asyncio
async def test_parse_url_valid_http() -> None:
"""
Test `_parse_repo_source` with valid HTTP URLs from supported platforms.
Verifies that user and repository names, as well as the slug, are correctly extracted.
Test `_parse_repo_source` with valid HTTP URLs.
Given various HTTP URLs on supported platforms:
When `_parse_repo_source` is called,
Then user name, repo name, and the slug should be extracted correctly.
"""
test_cases = [
"http://github.com/user/repo",
@ -44,71 +58,99 @@ async def test_parse_url_valid_http() -> None:
]
for url in test_cases:
parsed_query = await _parse_repo_source(url)
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
assert parsed_query.slug == "user-repo"
@pytest.mark.asyncio
async def test_parse_url_invalid() -> None:
"""
Test `_parse_repo_source` with an invalid URL that does not include a repository structure.
Verifies that a ValueError is raised with an appropriate error message.
Test `_parse_repo_source` with an invalid URL.
Given an HTTPS URL lacking a repository structure (e.g., "https://github.com"),
When `_parse_repo_source` is called,
Then a ValueError should be raised indicating an invalid repository URL.
"""
url = "https://github.com"
with pytest.raises(ValueError, match="Invalid repository URL"):
await _parse_repo_source(url)
async def test_parse_query_basic() -> None:
@pytest.mark.asyncio
@pytest.mark.parametrize("url", ["https://github.com/user/repo", "https://gitlab.com/user/repo"])
async def test_parse_query_basic(url):
"""
Test `parse_query` with basic inputs including valid repository URLs.
Verifies that user and repository names, URL, and ignore patterns are correctly parsed.
Test `parse_query` with a basic valid repository URL.
Given an HTTPS URL and ignore_patterns="*.txt":
When `parse_query` is called,
Then user/repo, URL, and ignore patterns should be parsed correctly.
"""
test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"]
for url in test_cases:
parsed_query = await parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt")
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
assert parsed_query.url == url
assert parsed_query.ignore_patterns
assert "*.txt" in parsed_query.ignore_patterns
parsed_query = await parse_query(source=url, max_file_size=50, from_web=True, ignore_patterns="*.txt")
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
assert parsed_query.url == url
assert parsed_query.ignore_patterns
assert "*.txt" in parsed_query.ignore_patterns
@pytest.mark.asyncio
async def test_parse_query_mixed_case() -> None:
"""
Test `parse_query` with mixed case URLs.
Test `parse_query` with mixed-case URLs.
Given a URL with mixed-case parts (e.g. "Https://GitHub.COM/UsEr/rEpO"):
When `parse_query` is called,
Then the user and repo names should be normalized to lowercase.
"""
url = "Https://GitHub.COM/UsEr/rEpO"
parsed_query = await parse_query(url, max_file_size=50, from_web=True)
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
@pytest.mark.asyncio
async def test_parse_query_include_pattern() -> None:
"""
Test `parse_query` with an include pattern.
Verifies that the include pattern is set correctly and default ignore patterns are applied.
Test `parse_query` with a specified include pattern.
Given a URL and include_patterns="*.py":
When `parse_query` is called,
Then the include pattern should be set, and default ignore patterns remain applied.
"""
url = "https://github.com/user/repo"
parsed_query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py")
assert parsed_query.include_patterns == {"*.py"}
assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS
@pytest.mark.asyncio
async def test_parse_query_invalid_pattern() -> None:
"""
Test `parse_query` with an invalid pattern containing special characters.
Verifies that a ValueError is raised with an appropriate error message.
Test `parse_query` with an invalid pattern.
Given an include pattern containing special characters (e.g., "*.py;rm -rf"):
When `parse_query` is called,
Then a ValueError should be raised indicating invalid characters.
"""
url = "https://github.com/user/repo"
with pytest.raises(ValueError, match="Pattern.*contains invalid characters"):
await parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf")
@pytest.mark.asyncio
async def test_parse_url_with_subpaths() -> None:
"""
Test `_parse_repo_source` with a URL containing a branch and subpath.
Verifies that user name, repository name, branch, and subpath are correctly extracted.
Test `_parse_repo_source` with a URL containing branch and subpath.
Given a URL referencing a branch ("main") and a subdir ("subdir/file"):
When `_parse_repo_source` is called with remote branch fetching,
Then user, repo, branch, and subpath should be identified correctly.
"""
url = "https://github.com/user/repo/tree/main/subdir/file"
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
@ -118,16 +160,21 @@ async def test_parse_url_with_subpaths() -> None:
) as mock_fetch_branches:
mock_fetch_branches.return_value = ["main", "dev", "feature-branch"]
parsed_query = await _parse_repo_source(url)
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
assert parsed_query.branch == "main"
assert parsed_query.subpath == "/subdir/file"
@pytest.mark.asyncio
async def test_parse_url_invalid_repo_structure() -> None:
"""
Test `_parse_repo_source` with an invalid repository structure in the URL.
Verifies that a ValueError is raised with an appropriate error message.
Test `_parse_repo_source` with a URL missing a repository name.
Given a URL like "https://github.com/user":
When `_parse_repo_source` is called,
Then a ValueError should be raised indicating an invalid repository URL.
"""
url = "https://github.com/user"
with pytest.raises(ValueError, match="Invalid repository URL"):
@ -136,50 +183,71 @@ async def test_parse_url_invalid_repo_structure() -> None:
def test_parse_patterns_valid() -> None:
"""
Test `_parse_patterns` with valid patterns separated by commas.
Verifies that the patterns are correctly parsed into a list.
Test `_parse_patterns` with valid comma-separated patterns.
Given patterns like "*.py, *.md, docs/*":
When `_parse_patterns` is called,
Then it should return a set of parsed strings.
"""
patterns = "*.py, *.md, docs/*"
parsed_patterns = _parse_patterns(patterns)
assert parsed_patterns == {"*.py", "*.md", "docs/*"}
def test_parse_patterns_invalid_characters() -> None:
"""
Test `_parse_patterns` with invalid patterns containing special characters.
Verifies that a ValueError is raised with an appropriate error message.
Test `_parse_patterns` with invalid characters.
Given a pattern string containing special characters (e.g. "*.py;rm -rf"):
When `_parse_patterns` is called,
Then a ValueError should be raised indicating invalid pattern syntax.
"""
patterns = "*.py;rm -rf"
with pytest.raises(ValueError, match="Pattern.*contains invalid characters"):
_parse_patterns(patterns)
@pytest.mark.asyncio
async def test_parse_query_with_large_file_size() -> None:
"""
Test `parse_query` with a very large file size limit.
Verifies that the file size limit and default ignore patterns are set correctly.
Given a URL and max_file_size=10**9:
When `parse_query` is called,
Then `max_file_size` should be set correctly and default ignore patterns remain unchanged.
"""
url = "https://github.com/user/repo"
parsed_query = await parse_query(url, max_file_size=10**9, from_web=True)
assert parsed_query.max_file_size == 10**9
assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS
@pytest.mark.asyncio
async def test_parse_query_empty_patterns() -> None:
"""
Test `parse_query` with empty include and ignore patterns.
Verifies that the include patterns are set to None and default ignore patterns are applied.
Test `parse_query` with empty patterns.
Given empty include_patterns and ignore_patterns:
When `parse_query` is called,
Then include_patterns becomes None and default ignore patterns apply.
"""
url = "https://github.com/user/repo"
parsed_query = await parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="")
assert parsed_query.include_patterns is None
assert parsed_query.ignore_patterns == DEFAULT_IGNORE_PATTERNS
@pytest.mark.asyncio
async def test_parse_query_include_and_ignore_overlap() -> None:
"""
Test `parse_query` with overlapping include and ignore patterns.
Verifies that overlapping patterns are removed from the ignore patterns.
Test `parse_query` with overlapping patterns.
Given include="*.py" and ignore={"*.py", "*.txt"}:
When `parse_query` is called,
Then "*.py" should be removed from ignore patterns.
"""
url = "https://github.com/user/repo"
parsed_query = await parse_query(
@ -189,102 +257,155 @@ async def test_parse_query_include_and_ignore_overlap() -> None:
include_patterns="*.py",
ignore_patterns={"*.py", "*.txt"},
)
assert parsed_query.include_patterns == {"*.py"}
assert parsed_query.ignore_patterns is not None
assert "*.py" not in parsed_query.ignore_patterns
assert "*.txt" in parsed_query.ignore_patterns
@pytest.mark.asyncio
async def test_parse_query_local_path() -> None:
"""
Test `parse_query` with a local file path.
Verifies that the local path is set, a unique ID is generated, and the slug is correctly created.
Given "/home/user/project" and from_web=False:
When `parse_query` is called,
Then the local path should be set, id generated, and slug formed accordingly.
"""
path = "/home/user/project"
parsed_query = await parse_query(path, max_file_size=100, from_web=False)
tail = Path("home/user/project")
assert parsed_query.local_path.parts[-len(tail.parts) :] == tail.parts
assert parsed_query.id is not None
assert parsed_query.slug == "user/project"
@pytest.mark.asyncio
async def test_parse_query_relative_path() -> None:
"""
Test `parse_query` with a relative file path.
Verifies that the local path and slug are correctly resolved.
Test `parse_query` with a relative path.
Given "./project" and from_web=False:
When `parse_query` is called,
Then local_path resolves relatively, and slug ends with "project".
"""
path = "./project"
parsed_query = await parse_query(path, max_file_size=100, from_web=False)
tail = Path("project")
assert parsed_query.local_path.parts[-len(tail.parts) :] == tail.parts
assert parsed_query.slug.endswith("project")
@pytest.mark.asyncio
async def test_parse_query_empty_source() -> None:
"""
Test `parse_query` with an empty source input.
Verifies that a ValueError is raised with an appropriate error message.
Test `parse_query` with an empty string.
Given an empty source string:
When `parse_query` is called,
Then a ValueError should be raised indicating an invalid repository URL.
"""
with pytest.raises(ValueError, match="Invalid repository URL"):
await parse_query("", max_file_size=100, from_web=True)
async def test_parse_url_branch_and_commit_distinction() -> None:
@pytest.mark.asyncio
@pytest.mark.parametrize(
"url, expected_branch, expected_commit",
[
("https://github.com/user/repo/tree/main", "main", None),
(
"https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234",
None,
"abcd1234abcd1234abcd1234abcd1234abcd1234",
),
],
)
async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch: str, expected_commit: str) -> None:
"""
Test `_parse_repo_source` with URLs containing either a branch name or a commit hash.
Verifies that the branch and commit are correctly distinguished.
"""
url_branch = "https://github.com/user/repo/tree/main"
url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234"
Test `_parse_repo_source` distinguishing branch vs. commit hash.
Given either a branch URL (e.g., ".../tree/main") or a 40-character commit URL:
When `_parse_repo_source` is called with branch fetching,
Then the function should correctly set `branch` or `commit` based on the URL content.
"""
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
# Mocking the return value to include 'main' and some additional branches
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
with patch(
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
) as mock_fetch_branches:
mock_fetch_branches.return_value = ["main", "dev", "feature-branch"]
parsed_query_with_branch = await _parse_repo_source(url_branch)
parsed_query_with_commit = await _parse_repo_source(url_commit)
parsed_query = await _parse_repo_source(url)
assert parsed_query_with_branch.branch == "main"
assert parsed_query_with_branch.commit is None
assert parsed_query_with_commit.branch is None
assert parsed_query_with_commit.commit == "abcd1234abcd1234abcd1234abcd1234abcd1234"
# Verify that `branch` and `commit` match our expectations
assert parsed_query.branch == expected_branch
assert parsed_query.commit == expected_commit
@pytest.mark.asyncio
async def test_parse_query_uuid_uniqueness() -> None:
"""
Test `parse_query` to ensure that each call generates a unique UUID for the query.
Test `parse_query` for unique UUID generation.
Given the same path twice:
When `parse_query` is called repeatedly,
Then each call should produce a different query id.
"""
path = "/home/user/project"
parsed_query_1 = await parse_query(path, max_file_size=100, from_web=False)
parsed_query_2 = await parse_query(path, max_file_size=100, from_web=False)
assert parsed_query_1.id != parsed_query_2.id
@pytest.mark.asyncio
async def test_parse_url_with_query_and_fragment() -> None:
"""
Test `_parse_repo_source` with a URL containing query parameters and a fragment.
Verifies that the URL is cleaned and other fields are correctly extracted.
Test `_parse_repo_source` with query parameters and a fragment.
Given a URL like "https://github.com/user/repo?arg=value#fragment":
When `_parse_repo_source` is called,
Then those parts should be stripped, leaving a clean user/repo URL.
"""
url = "https://github.com/user/repo?arg=value#fragment"
parsed_query = await _parse_repo_source(url)
assert parsed_query.user_name == "user"
assert parsed_query.repo_name == "repo"
assert parsed_query.url == "https://github.com/user/repo" # URL should be cleaned
@pytest.mark.asyncio
async def test_parse_url_unsupported_host() -> None:
"""
Test `_parse_repo_source` with an unsupported host.
Given "https://only-domain.com":
When `_parse_repo_source` is called,
Then a ValueError should be raised for the unknown domain.
"""
url = "https://only-domain.com"
with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"):
await _parse_repo_source(url)
@pytest.mark.asyncio
async def test_parse_query_with_branch() -> None:
"""
Test `parse_query` when a branch is specified in a blob path.
Given "https://github.com/pandas-dev/pandas/blob/2.2.x/...":
When `parse_query` is called,
Then the branch should be identified, subpath set, and commit remain None.
"""
url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
parsed_query = await parse_query(url, max_file_size=10**9, from_web=True)
assert parsed_query.user_name == "pandas-dev"
assert parsed_query.repo_name == "pandas"
assert parsed_query.url == "https://github.com/pandas-dev/pandas"
@ -307,16 +428,25 @@ async def test_parse_query_with_branch() -> None:
)
async def test_parse_repo_source_with_failed_git_command(url, expected_branch, expected_subpath):
"""
Test `_parse_repo_source` when git command fails.
Verifies that the function returns the first path component as the branch.
Test `_parse_repo_source` when git fetch fails.
Given a URL referencing a branch, but Git fetching fails:
When `_parse_repo_source` is called,
Then it should fall back to path components for branch identification.
"""
with patch("gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches:
mock_fetch_branches.side_effect = Exception("Failed to fetch branch list")
parsed_query = await _parse_repo_source(url)
with pytest.warns(
RuntimeWarning,
match="Warning: Failed to fetch branch list: Git command failed: "
"git ls-remote --heads https://github.com/user/repo",
):
assert parsed_query.branch == expected_branch
assert parsed_query.subpath == expected_subpath
parsed_query = await _parse_repo_source(url)
assert parsed_query.branch == expected_branch
assert parsed_query.subpath == expected_subpath
@pytest.mark.asyncio
@ -332,6 +462,13 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e
],
)
async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, expected_subpath):
"""
Test `_parse_repo_source` with various URL patterns.
Given multiple branch/blob patterns (including nonexistent branches):
When `_parse_repo_source` is called with remote branch fetching,
Then the correct branch/subpath should be set or None if unmatched.
"""
with (
patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command,
patch("gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches,
@ -344,5 +481,6 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch,
mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch"]
parsed_query = await _parse_repo_source(url)
assert parsed_query.branch == expected_branch
assert parsed_query.subpath == expected_subpath