feat: ignore .gitignore files by default (use --include-gitignored to stay
Some checks are pending
CI / test (macos-latest, 3.10) (push) Waiting to run
CI / test (macos-latest, 3.11) (push) Waiting to run
CI / test (macos-latest, 3.12) (push) Waiting to run
CI / test (macos-latest, 3.13) (push) Waiting to run
CI / test (macos-latest, 3.8) (push) Waiting to run
CI / test (macos-latest, 3.9) (push) Waiting to run
CI / test (ubuntu-latest, 3.10) (push) Waiting to run
CI / test (ubuntu-latest, 3.11) (push) Waiting to run
CI / test (ubuntu-latest, 3.12) (push) Waiting to run
CI / test (ubuntu-latest, 3.13) (push) Waiting to run
CI / test (ubuntu-latest, 3.8) (push) Waiting to run
CI / test (ubuntu-latest, 3.9) (push) Waiting to run
CI / test (windows-latest, 3.10) (push) Waiting to run
CI / test (windows-latest, 3.11) (push) Waiting to run
CI / test (windows-latest, 3.12) (push) Waiting to run
CI / test (windows-latest, 3.13) (push) Waiting to run
CI / test (windows-latest, 3.8) (push) Waiting to run
CI / test (windows-latest, 3.9) (push) Waiting to run
OSSF Scorecard / Scorecard analysis (push) Waiting to run

* use_gitignore flag to exclude gitignore
---------

Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
This commit is contained in:
Arman 2025-06-24 23:04:50 -04:00 committed by GitHub
parent c19f275010
commit ba701a80c9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 165 additions and 17 deletions

View file

@ -105,6 +105,7 @@ repos:
starlette>=0.40.0,
tiktoken,
tomli,
pathspec,
uvicorn>=0.11.7,
]
- id: pylint
@ -124,6 +125,7 @@ repos:
starlette>=0.40.0,
tiktoken,
tomli,
pathspec,
uvicorn>=0.11.7,
]

View file

@ -109,6 +109,9 @@ export GITHUB_TOKEN=github_pat_...
gitingest https://github.com/username/private-repo
```
By default, files matched by the patterns in `.gitignore` files are skipped. Pass
`--include-gitignored` to keep those files in the digest.
By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways:
- Use `--output/-o <filename>` to write to a specific file.

View file

@ -13,6 +13,7 @@ dependencies = [
"starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
"tiktoken>=0.7.0", # Support for o200k_base encoding
"tomli",
"pathspec>=0.12.1",
"typing_extensions; python_version < '3.10'",
"uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150
]

View file

@ -1,5 +1,6 @@
click>=8.0.0
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
pathspec>=0.12.1
pydantic
python-dotenv
slowapi

View file

@ -44,6 +44,12 @@ from gitingest.entrypoint import ingest_async
),
)
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
@click.option(
"--include-gitignored",
is_flag=True,
default=False,
help="Include files matched by .gitignore",
)
@click.option(
"--token",
"-t",
@ -61,6 +67,7 @@ def main(
exclude_pattern: Tuple[str, ...],
include_pattern: Tuple[str, ...],
branch: Optional[str],
include_gitignored: bool,
token: Optional[str],
):
"""
@ -83,11 +90,12 @@ def main(
Glob patterns for including files in the output.
branch : str, optional
Specific branch to ingest (defaults to the repository's default).
include_gitignored : bool
If provided, include files normally ignored by .gitignore.
token: str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
"""
asyncio.run(
_async_main(
source=source,
@ -96,6 +104,7 @@ def main(
exclude_pattern=exclude_pattern,
include_pattern=include_pattern,
branch=branch,
include_gitignored=include_gitignored,
token=token,
)
)
@ -108,6 +117,7 @@ async def _async_main(
exclude_pattern: Tuple[str, ...],
include_pattern: Tuple[str, ...],
branch: Optional[str],
include_gitignored: bool,
token: Optional[str],
) -> None:
"""
@ -132,6 +142,8 @@ async def _async_main(
Glob patterns for including files in the output.
branch : str, optional
Specific branch to ingest (defaults to the repository's default).
include_gitignored : bool
If provided, include files normally ignored by .gitignore.
token: str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
@ -160,6 +172,7 @@ async def _async_main(
exclude_patterns=exclude_patterns,
branch=branch,
output=output_target,
include_gitignored=include_gitignored,
token=token,
)

View file

@ -11,6 +11,7 @@ from gitingest.cloning import clone_repo
from gitingest.config import TMP_BASE_PATH
from gitingest.ingestion import ingest_query
from gitingest.query_parsing import IngestionQuery, parse_query
from gitingest.utils.ignore_patterns import load_gitignore_patterns
async def ingest_async(
@ -19,6 +20,7 @@ async def ingest_async(
include_patterns: Optional[Union[str, Set[str]]] = None,
exclude_patterns: Optional[Union[str, Set[str]]] = None,
branch: Optional[str] = None,
include_gitignored: bool = False,
token: Optional[str] = None,
output: Optional[str] = None,
) -> Tuple[str, str, str]:
@ -42,6 +44,8 @@ async def ingest_async(
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
branch : str, optional
The branch to clone and ingest. If `None`, the default branch is used.
include_gitignored : bool
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
@ -76,6 +80,10 @@ async def ingest_async(
token=token,
)
if not include_gitignored:
gitignore_patterns = load_gitignore_patterns(query.local_path)
query.ignore_patterns.update(gitignore_patterns)
if query.url:
selected_branch = branch if branch else query.branch # prioritize branch argument
query.branch = selected_branch
@ -117,6 +125,7 @@ def ingest(
include_patterns: Optional[Union[str, Set[str]]] = None,
exclude_patterns: Optional[Union[str, Set[str]]] = None,
branch: Optional[str] = None,
include_gitignored: bool = False,
token: Optional[str] = None,
output: Optional[str] = None,
) -> Tuple[str, str, str]:
@ -140,6 +149,8 @@ def ingest(
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
branch : str, optional
The branch to clone and ingest. If `None`, the default branch is used.
include_gitignored : bool
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
@ -165,6 +176,7 @@ def ingest(
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
branch=branch,
include_gitignored=include_gitignored,
token=token,
output=output,
)

View file

@ -1,5 +1,7 @@
"""Default ignore patterns for Gitingest."""
import os
from pathlib import Path
from typing import Set
DEFAULT_IGNORE_PATTERNS: Set[str] = {
@ -160,3 +162,47 @@ DEFAULT_IGNORE_PATTERNS: Set[str] = {
# Gitingest
"digest.txt",
}
def load_gitignore_patterns(root: Path) -> Set[str]:
    """
    Recursively load ignore patterns from all .gitignore files under the given root directory.

    Each pattern is rewritten so that it is expressed relative to ``root`` (always with
    forward slashes), which lets patterns from nested ``.gitignore`` files be merged into
    one flat set. Blank lines and comment lines are skipped, and a leading ``!`` (negation)
    is preserved on the rewritten pattern.

    Parameters
    ----------
    root : Path
        The root directory to search for .gitignore files.

    Returns
    -------
    Set[str]
        A set of ignore patterns extracted from all .gitignore files found under the root directory.
    """
    patterns: Set[str] = set()

    for dirpath, _, filenames in os.walk(root):
        if ".gitignore" not in filenames:
            continue

        # The directory prefix is the same for every line of this file, so compute it once.
        # Only the directory part is normalised to forward slashes; the pattern text itself
        # is left untouched so that gitignore escapes such as "\#" or "\!" survive.
        rel_dir = os.path.relpath(dirpath, root).replace(os.sep, "/")
        prefix = "" if rel_dir == "." else f"{rel_dir}/"

        gitignore_path = Path(dirpath) / ".gitignore"
        with gitignore_path.open("r", encoding="utf-8") as f:
            for line in f:
                stripped = line.strip()
                if not stripped or stripped.startswith("#"):
                    continue

                negated = stripped.startswith("!")
                if negated:
                    stripped = stripped[1:]

                pattern_body = _scope_gitignore_pattern(stripped, prefix)
                if not pattern_body:
                    # A bare "!" or "/" carries no pattern at all.
                    continue

                patterns.add(f"!{pattern_body}" if negated else pattern_body)

    return patterns


def _scope_gitignore_pattern(pattern: str, prefix: str) -> str:
    """
    Rewrite one .gitignore pattern (negation already removed) relative to the search root.

    Parameters
    ----------
    pattern : str
        The pattern exactly as written in the .gitignore file, without a leading ``!``.
    prefix : str
        ``""`` for the root .gitignore, otherwise the POSIX-style directory of the
        .gitignore relative to the root, ending in ``/``.

    Returns
    -------
    str
        The rewritten pattern, or ``""`` when the pattern has no body.
    """
    body = pattern.lstrip("/")
    if not body:
        return ""

    if pattern.startswith("/"):
        # Anchored to the directory of its .gitignore. At the root the leading slash is
        # kept as the anchor (instead of producing "./name"); in a nested directory the
        # prefix already contains a slash, which anchors the pattern.
        return f"{prefix}{body}" if prefix else f"/{body}"

    if not prefix:
        # Root-level, non-anchored patterns already have the right scope.
        return pattern

    if "/" not in pattern.rstrip("/"):
        # No separator except an optional trailing one: gitignore matches such a pattern
        # at any depth below its .gitignore, so "**/" is inserted to keep that meaning.
        return f"{prefix}**/{pattern}"

    # A separator in the middle makes the pattern relative to its .gitignore directory.
    return f"{prefix}{pattern}"

View file

@ -1,9 +1,10 @@
"""Utility functions for the ingestion process."""
from fnmatch import fnmatch
from pathlib import Path
from typing import Set
from pathspec import PathSpec
def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool:
"""
@ -38,10 +39,8 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
if path.is_dir():
return True
for pattern in include_patterns:
if fnmatch(rel_str, pattern):
return True
return False
spec = PathSpec.from_lines("gitwildmatch", include_patterns)
return spec.match_file(rel_str)
def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool:
@ -73,7 +72,5 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> b
return True
rel_str = str(rel_path)
for pattern in ignore_patterns:
if pattern and fnmatch(rel_str, pattern):
return True
return False
spec = PathSpec.from_lines("gitwildmatch", ignore_patterns)
return spec.match_file(rel_str)

View file

@ -0,0 +1,73 @@
"""
Tests for the gitignore functionality in Gitingest.
"""
from pathlib import Path
import pytest
from gitingest.entrypoint import ingest_async
from gitingest.utils.ignore_patterns import load_gitignore_patterns
@pytest.fixture(name="repo_path")
def repo_fixture(tmp_path: Path) -> Path:
    """
    Build a minimal on-disk repository for the gitignore tests.

    The directory contains:
    - A .gitignore whose only entry is 'exclude.txt'
    - 'include.txt' (expected in the digest)
    - 'exclude.txt' (expected to be dropped when gitignore rules are respected)
    """
    # File name -> content, written in this order.
    file_contents = {
        ".gitignore": "exclude.txt\n",
        "include.txt": "This file should be included.",
        "exclude.txt": "This file should be excluded.",
    }

    for file_name, text in file_contents.items():
        (tmp_path / file_name).write_text(text)

    return tmp_path
def test_load_gitignore_patterns(tmp_path: Path):
    """
    Check that load_gitignore_patterns() returns the patterns of a .gitignore file and drops comments.
    """
    # Two real patterns followed by one comment line
    (tmp_path / ".gitignore").write_text("exclude.txt\n*.log\n# a comment\n")

    loaded = load_gitignore_patterns(tmp_path)

    # Both real patterns must be present
    assert "exclude.txt" in loaded
    assert "*.log" in loaded

    # No comment line may leak into the result
    assert not any(entry.startswith("#") for entry in loaded)
@pytest.mark.asyncio
async def test_ingest_with_gitignore(repo_path: Path):
    """
    Integration test: ingest_async() honours .gitignore rules unless told otherwise.

    With the default ``include_gitignored=False`` the content of 'exclude.txt' must be absent.
    With ``include_gitignored=True`` the content of both files must be present.
    """
    source = str(repo_path)
    kept_text = "This file should be included."
    dropped_text = "This file should be excluded."

    # Default behaviour: .gitignore rules are applied.
    _summary, _tree, digest_respecting = await ingest_async(source=source)

    assert dropped_text not in digest_respecting
    assert kept_text in digest_respecting

    # Opt-out: gitignored files are ingested as well.
    _summary, _tree, digest_everything = await ingest_async(source=source, include_gitignored=True)

    assert dropped_text in digest_everything
    assert kept_text in digest_everything

View file

@ -84,10 +84,10 @@ class PatternScenario(TypedDict):
"*/file_dir2.txt",
},
"ignore_patterns": {*()},
"expected_num_files": 3,
"expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"},
"expected_structure": {"test_repo/", "dir2/"},
"expected_not_structure": {"src/", "subdir/", "dir1/"},
"expected_num_files": 4,
"expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"},
"expected_structure": {"test_repo/", "dir1/", "dir2/"},
"expected_not_structure": {"src/", "subdir/"},
}
),
id="include-wildcard-directory",
@ -114,9 +114,10 @@ class PatternScenario(TypedDict):
{
"include_patterns": {"**/file_dir2.txt", "src/**/*.py"},
"ignore_patterns": {*()},
"expected_num_files": 2,
"expected_num_files": 3,
"expected_content": {
"dir2/file_dir2.txt",
"src/subfile2.py",
"src/subdir/file_subdir.py",
},
"expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"},
@ -169,12 +170,11 @@ class PatternScenario(TypedDict):
{
"include_patterns": {*()},
"ignore_patterns": {"src/**/*.py"},
"expected_num_files": 7,
"expected_num_files": 6,
"expected_content": {
"file1.txt",
"file2.py",
"src/subfile1.txt",
"src/subfile2.py",
"src/subdir/file_subdir.txt",
"dir1/file_dir1.txt",
"dir2/file_dir2.txt",