refactor: centralize PAT validation, streamline repo checks & misc cleanup (#349)

* refactor: centralize PAT validation, streamline repo checks & housekeeping

* `.venv*` to `.gitignore`
* `# type: ignore[attr-defined]` hints in `compat_typing.py` for IDE-agnostic imports
* Helpful PAT string in `InvalidGitHubTokenError` for easier debugging

* Bump **ruff-pre-commit** hook → `v0.12.1`
* CONTRIBUTING:
  * Require **Python 3.9+**
  * Recommend signed (`-S`) commits
* PAT validation now happens **only** in entry points
  (`utils.auth.resolve_token` for CLI/lib, `server.process_query` for Web UI)
* Unified `_check_github_repo_exists` into `check_repo_exists`, replacing
  `curl -I` with `curl --silent --location --write-out %{http_code} -o /dev/null`
* Broaden `_GITHUB_PAT_PATTERN`
* `create_git_auth_header` raises `ValueError` when hostname is missing
* Tests updated to expect raw HTTP-code output

* Superfluous “token can be set via `GITHUB_TOKEN`” notes in docstrings
* `.gitingestignore` & `.terraform` from `DEFAULT_IGNORE_PATTERNS`
* Token validation inside `create_git_command`
* Obsolete `test_create_git_command_invalid_token`

* Adjust `test_clone.py` and `test_git_utils.py` for new status-code handling
* Consolidate mocks after token-validation relocation

BREAKING CHANGE:
`create_git_command` no longer validates GitHub tokens; callers must ensure
tokens are valid (via `validate_github_token`) before invoking lower-level
git helpers.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Filip Christiansen 2025-07-01 14:21:13 +02:00 committed by Vitalii Barenin
parent 6fe62cc2da
commit cea0eddce8
No known key found for this signature in database
GPG key ID: 9458A9ACC018AD28
5 changed files with 82 additions and 40 deletions

View file

@ -151,4 +151,4 @@ repos:
- repo: meta
hooks:
- id: check-hooks-apply
- id: check-useless-excludes
- id: check-useless-excludes

View file

@ -89,4 +89,4 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
13. **Iterate** on any review feedback—update your branch and repeat **6–11** as needed.
*(Optional) Invite a maintainer to your branch for easier collaboration.*
*(Optional) Invite a maintainer to your branch for easier collaboration.*

View file

@ -4,15 +4,27 @@ from __future__ import annotations
import asyncio
import base64
import os
import re
from typing import Final
import sys
from typing import Final
from urllib.parse import urlparse
import httpx
from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND
from gitingest.utils.compat_func import removesuffix
from starlette.status import (
HTTP_200_OK,
HTTP_301_MOVED_PERMANENTLY,
HTTP_302_FOUND,
HTTP_401_UNAUTHORIZED,
HTTP_403_FORBIDDEN,
HTTP_404_NOT_FOUND,
)
from gitingest.utils.exceptions import InvalidGitHubTokenError
from server.server_utils import Colors
@ -130,28 +142,46 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool:
If the host returns an unrecognised status code.
"""
headers = {}
# TODO: use `requests` instead of `curl`
cmd: list[str] = [
"curl",
"--silent",
"--location",
"--head",
"--write-out",
"%{http_code}",
"-o",
os.devnull,
]
if token and is_github_host(url):
host, owner, repo = _parse_github_url(url)
# Public GitHub vs. GitHub Enterprise
base_api = "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
url = f"{base_api}/repos/{owner}/{repo}"
headers["Authorization"] = f"Bearer {token}"
cmd += [f"Authorization: Bearer {token}"]
async with httpx.AsyncClient(follow_redirects=True) as client:
try:
response = await client.head(url, headers=headers)
except httpx.RequestError:
return False
cmd.append(url)
status_code = response.status_code
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, _ = await proc.communicate()
if status_code == HTTP_200_OK:
return True
if status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND}:
if proc.returncode != 0:
return False
msg = f"Unexpected HTTP status {status_code} for {url}"
status = int(stdout.decode().strip())
if status in {HTTP_200_OK, HTTP_301_MOVED_PERMANENTLY}:
return True
# TODO: handle 302 redirects
if status in {HTTP_404_NOT_FOUND, HTTP_302_FOUND}:
return False
if status in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN}:
return False
msg = f"Unexpected HTTP status {status} for {url}"
raise RuntimeError(msg)
@ -183,7 +213,7 @@ def _parse_github_url(url: str) -> tuple[str, str, str]:
msg = f"Un-recognised GitHub hostname: {parsed.hostname!r}"
raise ValueError(msg)
parts = removesuffix(parsed.path, ".git").strip("/").split("/")
parts = parsed.path.strip("/").removesuffix(".git").split("/")
expected_path_length = 2
if len(parts) != expected_path_length:
msg = f"Path must look like /<owner>/<repo>: {parsed.path!r}"
@ -216,28 +246,13 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
If the ``ref_type`` parameter is not "branches" or "tags".
"""
if ref_type not in ("branches", "tags"):
msg = f"Invalid fetch type: {ref_type}"
raise ValueError(msg)
cmd = ["git"]
# Add authentication if needed
if token and is_github_host(url):
cmd += ["-c", create_git_auth_header(token, url=url)]
cmd += ["ls-remote"]
fetch_tags = ref_type == "tags"
to_fetch = "tags" if fetch_tags else "heads"
cmd += [f"--{to_fetch}"]
# `--refs` filters out the peeled tag objects (those ending with "^{}") (for tags)
if fetch_tags:
cmd += ["--refs"]
cmd += [url]
cmd += ["ls-remote", "--heads", url]
await ensure_git_installed()
stdout, _ = await run_command(*cmd)
@ -246,9 +261,9 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str |
# - Skip empty lines and lines that don't contain "refs/{to_fetch}/"
# - Extract the branch or tag name after "refs/{to_fetch}/"
return [
line.split(f"refs/{to_fetch}/", 1)[1]
line.split("refs/heads/", 1)[1]
for line in stdout.decode().splitlines()
if line.strip() and f"refs/{to_fetch}/" in line
if line.strip() and "refs/heads/" in line
]

View file

@ -9,8 +9,12 @@ from gitingest.clone import clone_repo
from gitingest.ingestion import ingest_query
from gitingest.query_parser import IngestionQuery, parse_query
from gitingest.utils.git_utils import validate_github_token
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse
from server.server_config import MAX_DISPLAY_SIZE
from server.server_config import (
DEFAULT_FILE_SIZE_KB,
EXAMPLE_REPOS,
MAX_DISPLAY_SIZE,
templates,
)
from server.server_utils import Colors, log_slider_to_size
@ -63,6 +67,8 @@ async def process_query(
if token:
validate_github_token(token)
template = "index.jinja" if is_index else "git.jinja"
template_response = partial(templates.TemplateResponse, name=template)
max_file_size = log_slider_to_size(slider_position)
query: IngestionQuery | None = None
@ -99,7 +105,10 @@ async def process_query(
print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="")
print(f"{Colors.RED}{exc}{Colors.END}")
return IngestErrorResponse(error=str(exc), repo_url=short_repo_url)
context["error_message"] = f"Error: {exc}"
if "405" in str(exc):
context["error_message"] = "Repository not found. Please make sure it is public."
return template_response(context=context)
if len(content) > MAX_DISPLAY_SIZE:
content = (

View file

@ -91,10 +91,9 @@ async def test_clone_nonexistent_repository(repo_exists_true: AsyncMock) -> None
@pytest.mark.parametrize(
("status_code", "expected"),
[
(HTTP_200_OK, True),
(HTTP_401_UNAUTHORIZED, False),
(HTTP_403_FORBIDDEN, False),
(HTTP_404_NOT_FOUND, False),
(b"200\n", 0, True), # Existing repo
(b"404\n", 0, False), # Non-existing repo
(b"200\n", 1, False), # Failed request
],
)
async def test_check_repo_exists(status_code: int, *, expected: bool, mocker: MockerFixture) -> None:
@ -209,6 +208,25 @@ async def test_check_repo_exists_with_redirect(mocker: MockerFixture) -> None:
assert repo_exists is False
@pytest.mark.asyncio
async def test_check_repo_exists_with_permanent_redirect(mocker: MockerFixture) -> None:
"""Test ``check_repo_exists`` when a permanent redirect (301) is returned.
Given a URL that responds with "301 Moved Permanently":
When ``check_repo_exists`` is called,
Then it should return ``True``, indicating the repo may exist at the new location.
"""
mock_exec = mocker.patch("asyncio.create_subprocess_exec", new_callable=AsyncMock)
mock_process = AsyncMock()
mock_process.communicate.return_value = (b"301\n", b"")
mock_process.returncode = 0 # Simulate successful request
mock_exec.return_value = mock_process
repo_exists = await check_repo_exists(DEMO_URL)
assert repo_exists
@pytest.mark.asyncio
async def test_clone_with_timeout(run_command_mock: AsyncMock) -> None:
"""Test cloning a repository when a timeout occurs.