feat: partial cloning (#188)

This commit introduces the `partial_clone_repo` function, which performs a sparse clone
of a repository (`git clone --filter=blob:none --sparse`) based on query parameters
from a `ParsedQuery` object.

- Add a new method (`extact_clone_config`) to `ParsedQuery` that encapsulates the creation
  of a `CloneConfig` from query parameters.
- Replace repeated `CloneConfig` instantiation in `repository_ingest.py` and
  `query_processor.py` with calls to the new method.
- Simplify code and improve maintainability by centralizing `CloneConfig` logic.

* Refactor cloning logic to support subpath-based partial clones

- Add `repo_name` and `subpath` fields to `CloneConfig` for flexible cloning.
- Split out `partial_clone_repo` and `full_clone_repo` to handle subpath vs. full clones.
- Update `CloneConfig` to include `repo_name` and `subpath`.
- Simplify query processing to always call `clone_repo`, which now delegates to partial or full clone.
- Improve docstrings to reflect new parameters and return types.

---------

Co-authored-by: cyclotruc <romain@coderamp.io>
This commit is contained in:
Filip Christiansen 2025-02-19 01:36:08 -08:00 committed by GitHub
parent f90595de7c
commit f4fd4bbe7a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 199 additions and 139 deletions

View file

@@ -153,7 +153,7 @@ async def test_parse_url_with_subpaths() -> None:
Then user, repo, branch, and subpath should be identified correctly.
"""
url = "https://github.com/user/repo/tree/main/subdir/file"
-with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
+with patch("gitingest.repository_clone._run_command", new_callable=AsyncMock) as mock_run_git_command:
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
with patch(
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
@@ -332,7 +332,7 @@ async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch
When `_parse_repo_source` is called with branch fetching,
Then the function should correctly set `branch` or `commit` based on the URL content.
"""
-with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
+with patch("gitingest.repository_clone._run_command", new_callable=AsyncMock) as mock_run_git_command:
# Mocking the return value to include 'main' and some additional branches
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
with patch(
@@ -439,7 +439,7 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e
with pytest.warns(
RuntimeWarning,
-match="Warning: Failed to fetch branch list: Git command failed: "
+match="Warning: Failed to fetch branch list: Command failed: "
"git ls-remote --heads https://github.com/user/repo",
):
@@ -469,7 +469,7 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch,
When `_parse_repo_source` is called with remote branch fetching,
Then the correct branch/subpath should be set or None if unmatched.
"""
-with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
+with patch("gitingest.repository_clone._run_command", new_callable=AsyncMock) as mock_run_git_command:
with patch(
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
) as mock_fetch_branches: