mirror of
https://github.com/cyclotruc/gitingest.git
synced 2026-04-28 08:09:31 +00:00
feat: ignore .gitignore files by default (use --include-gitignored to stay
Some checks are pending
CI / test (macos-latest, 3.10) (push) Waiting to run
CI / test (macos-latest, 3.11) (push) Waiting to run
CI / test (macos-latest, 3.12) (push) Waiting to run
CI / test (macos-latest, 3.13) (push) Waiting to run
CI / test (macos-latest, 3.8) (push) Waiting to run
CI / test (macos-latest, 3.9) (push) Waiting to run
CI / test (ubuntu-latest, 3.10) (push) Waiting to run
CI / test (ubuntu-latest, 3.11) (push) Waiting to run
CI / test (ubuntu-latest, 3.12) (push) Waiting to run
CI / test (ubuntu-latest, 3.13) (push) Waiting to run
CI / test (ubuntu-latest, 3.8) (push) Waiting to run
CI / test (ubuntu-latest, 3.9) (push) Waiting to run
CI / test (windows-latest, 3.10) (push) Waiting to run
CI / test (windows-latest, 3.11) (push) Waiting to run
CI / test (windows-latest, 3.12) (push) Waiting to run
CI / test (windows-latest, 3.13) (push) Waiting to run
CI / test (windows-latest, 3.8) (push) Waiting to run
CI / test (windows-latest, 3.9) (push) Waiting to run
OSSF Scorecard / Scorecard analysis (push) Waiting to run
Some checks are pending
CI / test (macos-latest, 3.10) (push) Waiting to run
CI / test (macos-latest, 3.11) (push) Waiting to run
CI / test (macos-latest, 3.12) (push) Waiting to run
CI / test (macos-latest, 3.13) (push) Waiting to run
CI / test (macos-latest, 3.8) (push) Waiting to run
CI / test (macos-latest, 3.9) (push) Waiting to run
CI / test (ubuntu-latest, 3.10) (push) Waiting to run
CI / test (ubuntu-latest, 3.11) (push) Waiting to run
CI / test (ubuntu-latest, 3.12) (push) Waiting to run
CI / test (ubuntu-latest, 3.13) (push) Waiting to run
CI / test (ubuntu-latest, 3.8) (push) Waiting to run
CI / test (ubuntu-latest, 3.9) (push) Waiting to run
CI / test (windows-latest, 3.10) (push) Waiting to run
CI / test (windows-latest, 3.11) (push) Waiting to run
CI / test (windows-latest, 3.12) (push) Waiting to run
CI / test (windows-latest, 3.13) (push) Waiting to run
CI / test (windows-latest, 3.8) (push) Waiting to run
CI / test (windows-latest, 3.9) (push) Waiting to run
OSSF Scorecard / Scorecard analysis (push) Waiting to run
* use_gitignore flag to exclude gitignore --------- Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
This commit is contained in:
parent
c19f275010
commit
ba701a80c9
10 changed files with 165 additions and 17 deletions
|
|
@ -105,6 +105,7 @@ repos:
|
|||
starlette>=0.40.0,
|
||||
tiktoken,
|
||||
tomli,
|
||||
pathspec,
|
||||
uvicorn>=0.11.7,
|
||||
]
|
||||
- id: pylint
|
||||
|
|
@ -124,6 +125,7 @@ repos:
|
|||
starlette>=0.40.0,
|
||||
tiktoken,
|
||||
tomli,
|
||||
pathspec,
|
||||
uvicorn>=0.11.7,
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -109,6 +109,9 @@ export GITHUB_TOKEN=github_pat_...
|
|||
gitingest https://github.com/username/private-repo
|
||||
```
|
||||
|
||||
By default, files matched by `.gitignore` rules are skipped. Use `--include-gitignored` if you
|
||||
need those files in the digest.
|
||||
|
||||
By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways:
|
||||
|
||||
- Use `--output/-o <filename>` to write to a specific file.
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ dependencies = [
|
|||
"starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
|
||||
"tiktoken>=0.7.0", # Support for o200k_base encoding
|
||||
"tomli",
|
||||
"pathspec>=0.12.1",
|
||||
"typing_extensions; python_version < '3.10'",
|
||||
"uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
click>=8.0.0
|
||||
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
|
||||
pathspec>=0.12.1
|
||||
pydantic
|
||||
python-dotenv
|
||||
slowapi
|
||||
|
|
|
|||
|
|
@ -44,6 +44,12 @@ from gitingest.entrypoint import ingest_async
|
|||
),
|
||||
)
|
||||
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
|
||||
@click.option(
|
||||
"--include-gitignored",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Include files matched by .gitignore",
|
||||
)
|
||||
@click.option(
|
||||
"--token",
|
||||
"-t",
|
||||
|
|
@ -61,6 +67,7 @@ def main(
|
|||
exclude_pattern: Tuple[str, ...],
|
||||
include_pattern: Tuple[str, ...],
|
||||
branch: Optional[str],
|
||||
include_gitignored: bool,
|
||||
token: Optional[str],
|
||||
):
|
||||
"""
|
||||
|
|
@ -83,11 +90,12 @@ def main(
|
|||
Glob patterns for including files in the output.
|
||||
branch : str, optional
|
||||
Specific branch to ingest (defaults to the repository's default).
|
||||
include_gitignored : bool
|
||||
If provided, include files normally ignored by .gitignore.
|
||||
token: str, optional
|
||||
GitHub personal-access token (PAT). Needed when *source* refers to a
|
||||
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
|
||||
"""
|
||||
|
||||
asyncio.run(
|
||||
_async_main(
|
||||
source=source,
|
||||
|
|
@ -96,6 +104,7 @@ def main(
|
|||
exclude_pattern=exclude_pattern,
|
||||
include_pattern=include_pattern,
|
||||
branch=branch,
|
||||
include_gitignored=include_gitignored,
|
||||
token=token,
|
||||
)
|
||||
)
|
||||
|
|
@ -108,6 +117,7 @@ async def _async_main(
|
|||
exclude_pattern: Tuple[str, ...],
|
||||
include_pattern: Tuple[str, ...],
|
||||
branch: Optional[str],
|
||||
include_gitignored: bool,
|
||||
token: Optional[str],
|
||||
) -> None:
|
||||
"""
|
||||
|
|
@ -132,6 +142,8 @@ async def _async_main(
|
|||
Glob patterns for including files in the output.
|
||||
branch : str, optional
|
||||
Specific branch to ingest (defaults to the repository's default).
|
||||
include_gitignored : bool
|
||||
If provided, include files normally ignored by .gitignore.
|
||||
token: str, optional
|
||||
GitHub personal-access token (PAT). Needed when *source* refers to a
|
||||
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
|
||||
|
|
@ -160,6 +172,7 @@ async def _async_main(
|
|||
exclude_patterns=exclude_patterns,
|
||||
branch=branch,
|
||||
output=output_target,
|
||||
include_gitignored=include_gitignored,
|
||||
token=token,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ from gitingest.cloning import clone_repo
|
|||
from gitingest.config import TMP_BASE_PATH
|
||||
from gitingest.ingestion import ingest_query
|
||||
from gitingest.query_parsing import IngestionQuery, parse_query
|
||||
from gitingest.utils.ignore_patterns import load_gitignore_patterns
|
||||
|
||||
|
||||
async def ingest_async(
|
||||
|
|
@ -19,6 +20,7 @@ async def ingest_async(
|
|||
include_patterns: Optional[Union[str, Set[str]]] = None,
|
||||
exclude_patterns: Optional[Union[str, Set[str]]] = None,
|
||||
branch: Optional[str] = None,
|
||||
include_gitignored: bool = False,
|
||||
token: Optional[str] = None,
|
||||
output: Optional[str] = None,
|
||||
) -> Tuple[str, str, str]:
|
||||
|
|
@ -42,6 +44,8 @@ async def ingest_async(
|
|||
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
|
||||
branch : str, optional
|
||||
The branch to clone and ingest. If `None`, the default branch is used.
|
||||
include_gitignored : bool
|
||||
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
|
||||
token : str, optional
|
||||
GitHub personal-access token (PAT). Needed when *source* refers to a
|
||||
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
|
||||
|
|
@ -76,6 +80,10 @@ async def ingest_async(
|
|||
token=token,
|
||||
)
|
||||
|
||||
if not include_gitignored:
|
||||
gitignore_patterns = load_gitignore_patterns(query.local_path)
|
||||
query.ignore_patterns.update(gitignore_patterns)
|
||||
|
||||
if query.url:
|
||||
selected_branch = branch if branch else query.branch # prioritize branch argument
|
||||
query.branch = selected_branch
|
||||
|
|
@ -117,6 +125,7 @@ def ingest(
|
|||
include_patterns: Optional[Union[str, Set[str]]] = None,
|
||||
exclude_patterns: Optional[Union[str, Set[str]]] = None,
|
||||
branch: Optional[str] = None,
|
||||
include_gitignored: bool = False,
|
||||
token: Optional[str] = None,
|
||||
output: Optional[str] = None,
|
||||
) -> Tuple[str, str, str]:
|
||||
|
|
@ -140,6 +149,8 @@ def ingest(
|
|||
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
|
||||
branch : str, optional
|
||||
The branch to clone and ingest. If `None`, the default branch is used.
|
||||
include_gitignored : bool
|
||||
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
|
||||
token : str, optional
|
||||
GitHub personal-access token (PAT). Needed when *source* refers to a
|
||||
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
|
||||
|
|
@ -165,6 +176,7 @@ def ingest(
|
|||
include_patterns=include_patterns,
|
||||
exclude_patterns=exclude_patterns,
|
||||
branch=branch,
|
||||
include_gitignored=include_gitignored,
|
||||
token=token,
|
||||
output=output,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
"""Default ignore patterns for Gitingest."""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Set
|
||||
|
||||
DEFAULT_IGNORE_PATTERNS: Set[str] = {
|
||||
|
|
@ -160,3 +162,47 @@ DEFAULT_IGNORE_PATTERNS: Set[str] = {
|
|||
# Gitingest
|
||||
"digest.txt",
|
||||
}
|
||||
|
||||
|
||||
def _scope_gitignore_pattern(body: str, prefix: str) -> str:
    """
    Rewrite a single ``.gitignore`` pattern so that it is relative to the walk root.

    Parameters
    ----------
    body : str
        The pattern text with any leading ``!`` already removed.
    prefix : str
        POSIX-style path, relative to the root, of the directory holding the ``.gitignore`` file.
        Empty when the file sits in the root directory itself.

    Returns
    -------
    str
        The pattern expressed relative to the root directory.
    """
    if not prefix:
        # Root-level patterns already mean the right thing relative to the root. Keeping a leading
        # "/" preserves anchoring instead of producing "./name", which never matches "name".
        return body

    # A separator at the start or in the middle ties the pattern to the .gitignore's own directory.
    # A trailing separator alone only means "directories only", so it is ignored for this check.
    if "/" in body.rstrip("/"):
        return f"{prefix}/{body.lstrip('/')}"

    # A slash-free pattern may match at any depth below the .gitignore's directory.
    return f"{prefix}/**/{body}"


def load_gitignore_patterns(root: Path) -> Set[str]:
    """
    Recursively load ignore patterns from all .gitignore files under the given root directory.

    Each pattern is rewritten so that it is relative to ``root`` while keeping gitignore semantics:
    anchored patterns stay anchored to the directory of their ``.gitignore`` file, slash-free patterns
    from nested files match at any depth below that directory, and negations keep their leading ``!``.
    Blank lines and comment lines (starting with ``#``) are skipped. Backslash escapes inside a pattern
    are left untouched; only the directory prefix is normalised to forward slashes.

    Parameters
    ----------
    root : Path
        The root directory to search for .gitignore files.

    Returns
    -------
    Set[str]
        A set of ignore patterns extracted from all .gitignore files found under the root directory.
    """
    patterns: Set[str] = set()
    for dirpath, _, filenames in os.walk(root):
        if ".gitignore" not in filenames:
            continue

        # Computed once per directory; only the directory part has OS-specific separators to normalise.
        rel_dir = os.path.relpath(dirpath, root)
        prefix = "" if rel_dir == "." else rel_dir.replace(os.sep, "/")

        gitignore_path = Path(dirpath) / ".gitignore"
        with gitignore_path.open("r", encoding="utf-8") as f:
            for line in f:
                stripped = line.strip()

                if not stripped or stripped.startswith("#"):
                    continue

                negated = stripped.startswith("!")
                body = stripped[1:] if negated else stripped

                # A bare "!" or "/" carries no pattern and cannot match anything.
                if not body.strip("/"):
                    continue

                scoped = _scope_gitignore_pattern(body, prefix)
                patterns.add(f"!{scoped}" if negated else scoped)

    return patterns
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
"""Utility functions for the ingestion process."""
|
||||
|
||||
from fnmatch import fnmatch
|
||||
from pathlib import Path
|
||||
from typing import Set
|
||||
|
||||
from pathspec import PathSpec
|
||||
|
||||
|
||||
def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool:
|
||||
"""
|
||||
|
|
@ -38,10 +39,8 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
|
|||
if path.is_dir():
|
||||
return True
|
||||
|
||||
for pattern in include_patterns:
|
||||
if fnmatch(rel_str, pattern):
|
||||
return True
|
||||
return False
|
||||
spec = PathSpec.from_lines("gitwildmatch", include_patterns)
|
||||
return spec.match_file(rel_str)
|
||||
|
||||
|
||||
def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool:
|
||||
|
|
@ -73,7 +72,5 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> b
|
|||
return True
|
||||
|
||||
rel_str = str(rel_path)
|
||||
for pattern in ignore_patterns:
|
||||
if pattern and fnmatch(rel_str, pattern):
|
||||
return True
|
||||
return False
|
||||
spec = PathSpec.from_lines("gitwildmatch", ignore_patterns)
|
||||
return spec.match_file(rel_str)
|
||||
|
|
|
|||
73
tests/test_gitignore_feature.py
Normal file
73
tests/test_gitignore_feature.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
"""
|
||||
Tests for the gitignore functionality in Gitingest.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from gitingest.entrypoint import ingest_async
|
||||
from gitingest.utils.ignore_patterns import load_gitignore_patterns
|
||||
|
||||
|
||||
@pytest.fixture(name="repo_path")
def repo_fixture(tmp_path: Path) -> Path:
    """
    Build a throwaway repository layout inside ``tmp_path`` and return its root.

    The layout consists of:
    - ``.gitignore`` containing the single rule ``exclude.txt``
    - ``include.txt`` (expected to be processed)
    - ``exclude.txt`` (expected to be skipped while gitignore rules are respected)
    """
    # File name -> content, written in the same order as listed.
    layout = {
        ".gitignore": "exclude.txt\n",
        "include.txt": "This file should be included.",
        "exclude.txt": "This file should be excluded.",
    }
    for file_name, text in layout.items():
        (tmp_path / file_name).write_text(text)

    return tmp_path
|
||||
|
||||
|
||||
def test_load_gitignore_patterns(tmp_path: Path):
    """
    Verify that load_gitignore_patterns() picks up the rules of a root-level .gitignore file.
    """
    # Two real rules plus one comment line that must not leak into the result.
    (tmp_path / ".gitignore").write_text("exclude.txt\n*.log\n# a comment\n")

    loaded = load_gitignore_patterns(tmp_path)

    # Both rules are present ...
    assert "exclude.txt" in loaded
    assert "*.log" in loaded
    # ... and no entry is a comment.
    assert all(not entry.startswith("#") for entry in loaded)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ingest_with_gitignore(repo_path: Path):
    """
    Integration test: ingest_async() honours .gitignore rules unless told otherwise.

    With ``include_gitignored`` left at its default (``False``) the content of 'exclude.txt' must be
    absent from the digest. With ``include_gitignored=True`` the content of both files must be present.
    """
    kept_text = "This file should be included."
    ignored_text = "This file should be excluded."
    source = str(repo_path)

    # Default call: gitignore rules are applied, so only 'include.txt' contributes content.
    _, _, content_with_ignore = await ingest_async(source=source)
    assert ignored_text not in content_with_ignore
    assert kept_text in content_with_ignore

    # Opt-out call: gitignore rules are bypassed, so both files contribute content.
    _, _, content_without_ignore = await ingest_async(source=source, include_gitignored=True)
    assert ignored_text in content_without_ignore
    assert kept_text in content_without_ignore
|
||||
|
|
@ -84,10 +84,10 @@ class PatternScenario(TypedDict):
|
|||
"*/file_dir2.txt",
|
||||
},
|
||||
"ignore_patterns": {*()},
|
||||
"expected_num_files": 3,
|
||||
"expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"},
|
||||
"expected_structure": {"test_repo/", "dir2/"},
|
||||
"expected_not_structure": {"src/", "subdir/", "dir1/"},
|
||||
"expected_num_files": 4,
|
||||
"expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"},
|
||||
"expected_structure": {"test_repo/", "dir1/", "dir2/"},
|
||||
"expected_not_structure": {"src/", "subdir/"},
|
||||
}
|
||||
),
|
||||
id="include-wildcard-directory",
|
||||
|
|
@ -114,9 +114,10 @@ class PatternScenario(TypedDict):
|
|||
{
|
||||
"include_patterns": {"**/file_dir2.txt", "src/**/*.py"},
|
||||
"ignore_patterns": {*()},
|
||||
"expected_num_files": 2,
|
||||
"expected_num_files": 3,
|
||||
"expected_content": {
|
||||
"dir2/file_dir2.txt",
|
||||
"src/subfile2.py",
|
||||
"src/subdir/file_subdir.py",
|
||||
},
|
||||
"expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"},
|
||||
|
|
@ -169,12 +170,11 @@ class PatternScenario(TypedDict):
|
|||
{
|
||||
"include_patterns": {*()},
|
||||
"ignore_patterns": {"src/**/*.py"},
|
||||
"expected_num_files": 7,
|
||||
"expected_num_files": 6,
|
||||
"expected_content": {
|
||||
"file1.txt",
|
||||
"file2.py",
|
||||
"src/subfile1.txt",
|
||||
"src/subfile2.py",
|
||||
"src/subdir/file_subdir.txt",
|
||||
"dir1/file_dir1.txt",
|
||||
"dir2/file_dir2.txt",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue