feat: serve cached digest if available (#462)

Co-authored-by: Nicolas IRAGNE <nicoragne@hotmail.fr>
Authored by Mickael on 2025-07-29 18:02:57 +02:00, committed by GitHub
parent a63ed9ed2b
commit efe5a26861
4 changed files with 427 additions and 38 deletions
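
Before the file-by-file diff, the shape of the change in one place: process_query now checks S3 for an existing digest (keyed by the resolved commit plus include/ignore patterns) before cloning, and on a miss it clones, ingests, and uploads both the digest and a small metadata JSON so the next identical request is a cache hit. A self-contained sketch of that control flow, with illustrative stub bodies rather than the project's real helpers:

from __future__ import annotations

import asyncio


async def check_cache(digest_key: str) -> str | None:
    """Stand-in for _check_s3_cache: return the cached digest URL, or None."""
    return None  # pretend the digest is not on S3 yet


async def clone_ingest_and_upload(digest_key: str) -> str:
    """Stand-in for clone_repo + ingest_query + _store_digest_content."""
    return f"https://s3.example/{digest_key}"


async def serve_digest(digest_key: str) -> str:
    cached = await check_cache(digest_key)
    if cached is not None:
        return cached  # cache hit: no clone, no ingestion
    return await clone_ingest_and_upload(digest_key)  # cache miss: full pipeline


print(asyncio.run(serve_digest("user/repo/abc123/digest.txt")))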


@@ -17,7 +17,7 @@ jobs:
test:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: true
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]


@@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel):
IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
class S3Metadata(BaseModel):
"""Model for S3 metadata structure.
Attributes
----------
summary : str
Summary of the ingestion process including token estimates.
tree : str
File tree structure of the repository.
content : str
Processed content from the repository files.
"""
summary: str = Field(..., description="Ingestion summary with token estimates")
tree: str = Field(..., description="File tree structure")
content: str = Field(..., description="Processed file content")
class QueryForm(BaseModel):
"""Form data for the query.


@@ -2,19 +2,211 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import cast
from typing import TYPE_CHECKING, cast
from gitingest.clone import clone_repo
from gitingest.ingestion import ingest_query
from gitingest.query_parser import parse_remote_repo
from gitingest.utils.git_utils import validate_github_token
from gitingest.utils.git_utils import resolve_commit, validate_github_token
from gitingest.utils.pattern_utils import process_patterns
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
from server.s3_utils import (
_build_s3_url,
check_s3_object_exists,
generate_s3_file_path,
get_metadata_from_s3,
is_s3_enabled,
upload_metadata_to_s3,
upload_to_s3,
)
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import Colors
if TYPE_CHECKING:
from gitingest.schemas.cloning import CloneConfig
from gitingest.schemas.ingestion import IngestionQuery
logger = logging.getLogger(__name__)
async def _check_s3_cache(
query: IngestionQuery,
input_text: str,
max_file_size: int,
pattern_type: str,
pattern: str,
token: str | None,
) -> IngestSuccessResponse | None:
"""Check if digest already exists on S3 and return response if found.
Parameters
----------
query : IngestionQuery
The parsed query object.
input_text : str
Original input text.
max_file_size : int
Maximum file size in KB.
pattern_type : str
Pattern type (include/exclude).
pattern : str
Pattern string.
token : str | None
GitHub token.
Returns
-------
IngestSuccessResponse | None
Response if file exists on S3, None otherwise.
"""
if not is_s3_enabled():
return None
try:
# Use git ls-remote to get commit SHA without cloning
clone_config = query.extract_clone_config()
query.commit = await resolve_commit(clone_config, token=token)
# Generate S3 file path using the resolved commit
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
# Check if file exists on S3
if check_s3_object_exists(s3_file_path):
# File exists on S3, serve it directly without cloning
s3_url = _build_s3_url(s3_file_path)
query.s3_url = s3_url
short_repo_url = f"{query.user_name}/{query.repo_name}"
# Try to get cached metadata
metadata = get_metadata_from_s3(s3_file_path)
if metadata:
# Use cached metadata if available
summary = metadata.summary
tree = metadata.tree
content = metadata.content
else:
# Fallback to placeholder messages if metadata not available
summary = "Digest served from cache (S3). Download the full digest to see content details."
tree = "Digest served from cache. Download the full digest to see the file tree."
content = "Digest served from cache. Download the full digest to see the content."
return IngestSuccessResponse(
repo_url=input_text,
short_repo_url=short_repo_url,
summary=summary,
digest_url=s3_url,
tree=tree,
content=content,
default_max_file_size=max_file_size,
pattern_type=pattern_type,
pattern=pattern,
)
except Exception as exc:
# Log the exception but don't fail the entire request
logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
return None
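
The resolve_commit call above is what keeps the cache check cheap: the commit is resolved with git ls-remote, so no clone happens unless the digest is missing. The real helper lives in gitingest.utils.git_utils; an illustrative sketch of the idea, not the project's implementation:

import asyncio


async def ls_remote_sha(repo_url: str, ref: str = "HEAD") -> str:
    """Resolve a ref to a commit SHA without cloning (illustrative sketch only)."""
    proc = await asyncio.create_subprocess_exec(
        "git", "ls-remote", repo_url, ref,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()
    if proc.returncode != 0 or not stdout:
        raise RuntimeError(f"git ls-remote failed for {repo_url}")
    # Output looks like "<sha>\tHEAD"; the SHA is the first whitespace-separated field.
    return stdout.decode().split()[0]


# asyncio.run(ls_remote_sha("https://github.com/user/repo.git"))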
def _store_digest_content(
query: IngestionQuery,
clone_config: CloneConfig,
digest_content: str,
summary: str,
tree: str,
content: str,
) -> None:
"""Store digest content either to S3 or locally based on configuration.
Parameters
----------
query : IngestionQuery
The query object containing repository information.
clone_config : CloneConfig
The clone configuration object.
digest_content : str
The complete digest content to store.
summary : str
The summary content for metadata.
tree : str
The tree content for metadata.
content : str
The file content for metadata.
"""
if is_s3_enabled():
# Upload to S3 instead of storing locally
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
# Also upload metadata JSON for caching
metadata = S3Metadata(
summary=summary,
tree=tree,
content=content,
)
try:
upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
logger.debug("Successfully uploaded metadata to S3")
except Exception as metadata_exc:
# Log the error but don't fail the entire request
logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
# Store S3 URL in query for later use
query.s3_url = s3_url
else:
# Store locally
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
with local_txt_file.open("w", encoding="utf-8") as f:
f.write(digest_content)
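
The digest that gets stored is just the tree followed by the content; when S3 is disabled it lands next to the clone directory as a .txt file. A minimal sketch of that local-path logic (the clone path here is a made-up stand-in for clone_config.local_path):

import tempfile
from pathlib import Path

tree = "src/\n  main.py"
content = "# main.py\nprint('hello')"
digest_content = tree + "\n" + content  # same concatenation process_query builds

local_path = Path(tempfile.mkdtemp()) / "abc123"  # hypothetical clone_config.local_path
local_txt_file = local_path.with_suffix(".txt")   # .../abc123.txt, next to the clone dir
local_txt_file.write_text(digest_content, encoding="utf-8")
print(local_txt_file.read_text(encoding="utf-8"))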
def _generate_digest_url(query: IngestionQuery) -> str:
"""Generate the digest URL based on S3 configuration.
Parameters
----------
query : IngestionQuery
The query object containing repository information.
Returns
-------
str
The digest URL.
Raises
------
RuntimeError
If S3 is enabled but no S3 URL was generated.
"""
if is_s3_enabled():
digest_url = getattr(query, "s3_url", None)
if not digest_url:
# This should not happen if S3 upload was successful
msg = "S3 is enabled but no S3 URL was generated"
raise RuntimeError(msg)
return digest_url
return f"/api/download/file/{query.id}"
async def process_query(
input_text: str,
@@ -69,10 +261,22 @@ async def process_query(
include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
)
# Check if digest already exists on S3 before cloning
s3_response = await _check_s3_cache(
query=query,
input_text=input_text,
max_file_size=max_file_size,
pattern_type=pattern_type.value,
pattern=pattern,
token=token,
)
if s3_response:
return s3_response
clone_config = query.extract_clone_config()
await clone_repo(clone_config, token=token)
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
short_repo_url = f"{query.user_name}/{query.repo_name}"
# The commit hash should always be available at this point
if not query.commit:
@@ -81,30 +285,8 @@ async def process_query(
try:
summary, tree, content = ingest_query(query)
# Prepare the digest content (tree + content)
digest_content = tree + "\n" + content
# Store digest based on S3 configuration
if is_s3_enabled():
# Upload to S3 instead of storing locally
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
# Store S3 URL in query for later use
query.s3_url = s3_url
else:
# Store locally
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
with local_txt_file.open("w", encoding="utf-8") as f:
f.write(digest_content)
_store_digest_content(query, clone_config, digest_content, summary, tree, content)
except Exception as exc:
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
return IngestErrorResponse(error=str(exc))
@@ -123,15 +305,7 @@ async def process_query(
summary=summary,
)
# Generate digest_url based on S3 configuration
if is_s3_enabled():
digest_url = getattr(query, "s3_url", None)
if not digest_url:
# This should not happen if S3 upload was successful
msg = "S3 is enabled but no S3 URL was generated"
raise RuntimeError(msg)
else:
digest_url = f"/api/download/file/{query.id}"
digest_url = _generate_digest_url(query)
return IngestSuccessResponse(
repo_url=input_text,


@@ -11,13 +11,21 @@ from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) neede
import boto3
from botocore.exceptions import ClientError
from prometheus_client import Counter
from server.models import S3Metadata
if TYPE_CHECKING:
from botocore.client import BaseClient
# Initialize logger for this module
logger = logging.getLogger(__name__)
_cache_lookup_counter = Counter("gitingest_cache_lookup", "Number of cache lookups", ["url"])
_cache_hit_counter = Counter("gitingest_cache_hit", "Number of cache hits", ["url"])
_cache_miss_counter = Counter("gitingest_cache_miss", "Number of cache misses", ["url"])
class S3UploadError(Exception):
"""Custom exception for S3 upload failures."""
@@ -231,6 +239,149 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
return public_url
def upload_metadata_to_s3(metadata: S3Metadata, s3_file_path: str, ingest_id: UUID) -> str:
"""Upload metadata JSON to S3 alongside the digest file.
Parameters
----------
metadata : S3Metadata
The metadata struct containing summary, tree, and content.
s3_file_path : str
The S3 file path for the digest (metadata will use .json extension).
ingest_id : UUID
The ingest ID to store as an S3 object tag.
Returns
-------
str
Public URL to access the uploaded metadata file.
Raises
------
ValueError
If S3 is not enabled.
S3UploadError
If the upload to S3 fails.
"""
if not is_s3_enabled():
msg = "S3 is not enabled"
logger.error(msg)
raise ValueError(msg)
# Generate metadata file path by replacing .txt with .json
metadata_file_path = s3_file_path.replace(".txt", ".json")
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
extra_fields = {
"bucket_name": bucket_name,
"metadata_file_path": metadata_file_path,
"ingest_id": str(ingest_id),
"metadata_size": len(metadata.model_dump_json()),
}
# Log upload attempt
logger.debug("Starting S3 metadata upload", extra=extra_fields)
try:
# Upload the metadata with ingest_id as tag
s3_client.put_object(
Bucket=bucket_name,
Key=metadata_file_path,
Body=metadata.model_dump_json(indent=2).encode("utf-8"),
ContentType="application/json",
Tagging=f"ingest_id={ingest_id!s}",
)
except ClientError as err:
# Log upload failure
logger.exception(
"S3 metadata upload failed",
extra={
"bucket_name": bucket_name,
"metadata_file_path": metadata_file_path,
"ingest_id": str(ingest_id),
"error_code": err.response.get("Error", {}).get("Code"),
"error_message": str(err),
},
)
msg = f"Failed to upload metadata to S3: {err}"
raise S3UploadError(msg) from err
# Generate public URL
alias_host = get_s3_alias_host()
if alias_host:
# Use alias host if configured
public_url = f"{alias_host.rstrip('/')}/{metadata_file_path}"
else:
# Fallback to direct S3 URL
endpoint = get_s3_config().get("endpoint_url")
if endpoint:
public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{metadata_file_path}"
else:
public_url = (
f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{metadata_file_path}"
)
# Log successful upload
logger.debug(
"S3 metadata upload completed successfully",
extra={
"bucket_name": bucket_name,
"metadata_file_path": metadata_file_path,
"ingest_id": str(ingest_id),
"public_url": public_url,
},
)
return public_url
def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None:
"""Retrieve metadata JSON from S3.
Parameters
----------
s3_file_path : str
The S3 file path for the digest (metadata will use .json extension).
Returns
-------
S3Metadata | None
The metadata struct if found, None otherwise.
"""
if not is_s3_enabled():
return None
# Generate metadata file path by replacing .txt with .json
metadata_file_path = s3_file_path.replace(".txt", ".json")
try:
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
# Get the metadata object
response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path)
metadata_content = response["Body"].read().decode("utf-8")
return S3Metadata.model_validate_json(metadata_content)
except ClientError as err:
# Object doesn't exist if we get a 404 error
error_code = err.response.get("Error", {}).get("Code")
if error_code == "404":
logger.debug("Metadata file not found: %s", metadata_file_path)
return None
# Log other errors but don't fail
logger.warning("Failed to retrieve metadata from S3: %s", err)
return None
except Exception as exc:
# For any other exception, log and return None
logger.warning("Unexpected error retrieving metadata from S3: %s", exc)
return None
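
Putting upload_metadata_to_s3 and get_metadata_from_s3 together: the digest key and its metadata key differ only by extension, so a cache hit costs one head_object call plus one get_object call. A hedged usage sketch, assuming S3 is enabled and configured, with a hypothetical key layout (real keys come from generate_s3_file_path):

from uuid import uuid4

from server.models import S3Metadata
from server.s3_utils import (
    check_s3_object_exists,
    get_metadata_from_s3,
    upload_metadata_to_s3,
    upload_to_s3,
)

ingest_id = uuid4()
digest_key = "user/repo/abc123/digest.txt"  # hypothetical; not the real key format

# First request (cache miss): store the digest, then its metadata beside it
# (the helper swaps .txt for .json internally).
upload_to_s3(content="tree\n...content...", s3_file_path=digest_key, ingest_id=ingest_id)
upload_metadata_to_s3(
    metadata=S3Metadata(summary="...", tree="...", content="..."),
    s3_file_path=digest_key,
    ingest_id=ingest_id,
)

# Later request (cache hit): cheap existence check, then the cached metadata.
if check_s3_object_exists(digest_key):
    meta = get_metadata_from_s3(digest_key)  # None if only the digest was uploaded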
def _build_s3_url(key: str) -> str:
"""Build S3 URL for a given key."""
alias_host = get_s3_alias_host()
@@ -257,6 +408,51 @@ def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target
return False
def check_s3_object_exists(s3_file_path: str) -> bool:
"""Check if an S3 object exists at the given path.
Parameters
----------
s3_file_path : str
The S3 file path to check.
Returns
-------
bool
True if the object exists, False otherwise.
Raises
------
ClientError
If there's an S3 error other than 404 (not found).
"""
if not is_s3_enabled():
return False
_cache_lookup_counter.labels(url=s3_file_path).inc()
try:
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
# Use head_object to check if the object exists without downloading it
s3_client.head_object(Bucket=bucket_name, Key=s3_file_path)
except ClientError as err:
# Object doesn't exist if we get a 404 error
error_code = err.response.get("Error", {}).get("Code")
if error_code == "404":
_cache_miss_counter.labels(url=s3_file_path).inc()
return False
# Re-raise other errors (permissions, etc.)
raise
except Exception:
# For any other exception, assume object doesn't exist
_cache_miss_counter.labels(url=s3_file_path).inc()
return False
else:
_cache_hit_counter.labels(url=s3_file_path).inc()
return True
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
"""Get S3 URL for a given ingest ID if it exists.