feat: serve cached digest if available (#462)

Co-authored-by: Nicolas IRAGNE <nicoragne@hotmail.fr>
Authored by Mickael on 2025-07-29 18:02:57 +02:00, committed by GitHub
parent a63ed9ed2b
commit efe5a26861
4 changed files with 427 additions and 38 deletions
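
Before the file-by-file diff, the shape of the change in one place: process_query now checks S3 for an existing digest (keyed by the resolved commit plus include/ignore patterns) before cloning, and on a miss it clones, ingests, and uploads both the digest and a small metadata JSON so the next identical request is a cache hit. A self-contained sketch of that control flow, with illustrative stub bodies rather than the project's real helpers:

from __future__ import annotations

import asyncio


async def check_cache(digest_key: str) -> str | None:
    """Stand-in for _check_s3_cache: return the cached digest URL, or None."""
    return None  # pretend the digest is not on S3 yet


async def clone_ingest_and_upload(digest_key: str) -> str:
    """Stand-in for clone_repo + ingest_query + _store_digest_content."""
    return f"https://s3.example/{digest_key}"


async def serve_digest(digest_key: str) -> str:
    cached = await check_cache(digest_key)
    if cached is not None:
        return cached  # cache hit: no clone, no ingestion
    return await clone_ingest_and_upload(digest_key)  # cache miss: full pipeline


print(asyncio.run(serve_digest("user/repo/abc123/digest.txt")))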


@@ -17,7 +17,7 @@ jobs:
test:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: true
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]


@@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel):
IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
class S3Metadata(BaseModel):
"""Model for S3 metadata structure.
Attributes
----------
summary : str
Summary of the ingestion process including token estimates.
tree : str
File tree structure of the repository.
content : str
Processed content from the repository files.
"""
summary: str = Field(..., description="Ingestion summary with token estimates")
tree: str = Field(..., description="File tree structure")
content: str = Field(..., description="Processed file content")
class QueryForm(BaseModel):
"""Form data for the query.


@@ -2,19 +2,211 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import cast
from typing import TYPE_CHECKING, cast
from gitingest.clone import clone_repo
from gitingest.ingestion import ingest_query
from gitingest.query_parser import parse_remote_repo
from gitingest.utils.git_utils import validate_github_token
from gitingest.utils.git_utils import resolve_commit, validate_github_token
from gitingest.utils.pattern_utils import process_patterns
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
from server.s3_utils import (
_build_s3_url,
check_s3_object_exists,
generate_s3_file_path,
get_metadata_from_s3,
is_s3_enabled,
upload_metadata_to_s3,
upload_to_s3,
)
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import Colors
if TYPE_CHECKING:
from gitingest.schemas.cloning import CloneConfig
from gitingest.schemas.ingestion import IngestionQuery
logger = logging.getLogger(__name__)
async def _check_s3_cache(
query: IngestionQuery,
input_text: str,
max_file_size: int,
pattern_type: str,
pattern: str,
token: str | None,
) -> IngestSuccessResponse | None:
"""Check if digest already exists on S3 and return response if found.
Parameters
----------
query : IngestionQuery
The parsed query object.
input_text : str
Original input text.
max_file_size : int
Maximum file size in KB.
pattern_type : str
Pattern type (include/exclude).
pattern : str
Pattern string.
token : str | None
GitHub token.
Returns
-------
IngestSuccessResponse | None
Response if file exists on S3, None otherwise.
"""
if not is_s3_enabled():
return None
try:
# Use git ls-remote to get commit SHA without cloning
clone_config = query.extract_clone_config()
query.commit = await resolve_commit(clone_config, token=token)
# Generate S3 file path using the resolved commit
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
# Check if file exists on S3
if check_s3_object_exists(s3_file_path):
# File exists on S3, serve it directly without cloning
s3_url = _build_s3_url(s3_file_path)
query.s3_url = s3_url
short_repo_url = f"{query.user_name}/{query.repo_name}"
# Try to get cached metadata
metadata = get_metadata_from_s3(s3_file_path)
if metadata:
# Use cached metadata if available
summary = metadata.summary
tree = metadata.tree
content = metadata.content
else:
# Fallback to placeholder messages if metadata not available
summary = "Digest served from cache (S3). Download the full digest to see content details."
tree = "Digest served from cache. Download the full digest to see the file tree."
content = "Digest served from cache. Download the full digest to see the content."
return IngestSuccessResponse(
repo_url=input_text,
short_repo_url=short_repo_url,
summary=summary,
digest_url=s3_url,
tree=tree,
content=content,
default_max_file_size=max_file_size,
pattern_type=pattern_type,
pattern=pattern,
)
except Exception as exc:
# Log the exception but don't fail the entire request
logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
return None
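
The resolve_commit call above is what keeps the cache check cheap: the commit is resolved with git ls-remote, so no clone happens unless the digest is missing. The real helper lives in gitingest.utils.git_utils; an illustrative sketch of the idea, not the project's implementation:

import asyncio


async def ls_remote_sha(repo_url: str, ref: str = "HEAD") -> str:
    """Resolve a ref to a commit SHA without cloning (illustrative sketch only)."""
    proc = await asyncio.create_subprocess_exec(
        "git", "ls-remote", repo_url, ref,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()
    if proc.returncode != 0 or not stdout:
        raise RuntimeError(f"git ls-remote failed for {repo_url}")
    # Output looks like "<sha>\tHEAD"; the SHA is the first whitespace-separated field.
    return stdout.decode().split()[0]


# asyncio.run(ls_remote_sha("https://github.com/user/repo.git"))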
def _store_digest_content(
query: IngestionQuery,
clone_config: CloneConfig,
digest_content: str,
summary: str,
tree: str,
content: str,
) -> None:
"""Store digest content either to S3 or locally based on configuration.
Parameters
----------
query : IngestionQuery
The query object containing repository information.
clone_config : CloneConfig
The clone configuration object.
digest_content : str
The complete digest content to store.
summary : str
The summary content for metadata.
tree : str
The tree content for metadata.
content : str
The file content for metadata.
"""
if is_s3_enabled():
# Upload to S3 instead of storing locally
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
# Also upload metadata JSON for caching
metadata = S3Metadata(
summary=summary,
tree=tree,
content=content,
)
try:
upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
logger.debug("Successfully uploaded metadata to S3")
except Exception as metadata_exc:
# Log the error but don't fail the entire request
logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
# Store S3 URL in query for later use
query.s3_url = s3_url
else:
# Store locally
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
with local_txt_file.open("w", encoding="utf-8") as f:
f.write(digest_content)
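
The digest that gets stored is just the tree followed by the content; when S3 is disabled it lands next to the clone directory as a .txt file. A minimal sketch of that local-path logic (the clone path here is a made-up stand-in for clone_config.local_path):

import tempfile
from pathlib import Path

tree = "src/\n  main.py"
content = "# main.py\nprint('hello')"
digest_content = tree + "\n" + content  # same concatenation process_query builds

local_path = Path(tempfile.mkdtemp()) / "abc123"  # hypothetical clone_config.local_path
local_txt_file = local_path.with_suffix(".txt")   # .../abc123.txt, next to the clone dir
local_txt_file.write_text(digest_content, encoding="utf-8")
print(local_txt_file.read_text(encoding="utf-8"))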
def _generate_digest_url(query: IngestionQuery) -> str:
"""Generate the digest URL based on S3 configuration.
Parameters
----------
query : IngestionQuery
The query object containing repository information.
Returns
-------
str
The digest URL.
Raises
------
RuntimeError
If S3 is enabled but no S3 URL was generated.
"""
if is_s3_enabled():
digest_url = getattr(query, "s3_url", None)
if not digest_url:
# This should not happen if S3 upload was successful
msg = "S3 is enabled but no S3 URL was generated"
raise RuntimeError(msg)
return digest_url
return f"/api/download/file/{query.id}"
async def process_query(
input_text: str,
@@ -69,10 +261,22 @@ async def process_query(
include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
)
# Check if digest already exists on S3 before cloning
s3_response = await _check_s3_cache(
query=query,
input_text=input_text,
max_file_size=max_file_size,
pattern_type=pattern_type.value,
pattern=pattern,
token=token,
)
if s3_response:
return s3_response
clone_config = query.extract_clone_config()
await clone_repo(clone_config, token=token)
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
short_repo_url = f"{query.user_name}/{query.repo_name}"
# The commit hash should always be available at this point
if not query.commit:
@@ -81,30 +285,8 @@ async def process_query(
try:
summary, tree, content = ingest_query(query)
# Prepare the digest content (tree + content)
digest_content = tree + "\n" + content
# Store digest based on S3 configuration
if is_s3_enabled():
# Upload to S3 instead of storing locally
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
# Store S3 URL in query for later use
query.s3_url = s3_url
else:
# Store locally
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
with local_txt_file.open("w", encoding="utf-8") as f:
f.write(digest_content)
_store_digest_content(query, clone_config, digest_content, summary, tree, content)
except Exception as exc:
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
return IngestErrorResponse(error=str(exc))
@@ -123,15 +305,7 @@ async def process_query(
summary=summary,
)
# Generate digest_url based on S3 configuration
if is_s3_enabled():
digest_url = getattr(query, "s3_url", None)
if not digest_url:
# This should not happen if S3 upload was successful
msg = "S3 is enabled but no S3 URL was generated"
raise RuntimeError(msg)
else:
digest_url = f"/api/download/file/{query.id}"
digest_url = _generate_digest_url(query)
return IngestSuccessResponse(
repo_url=input_text,


@@ -11,13 +11,21 @@ from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) neede
import boto3
from botocore.exceptions import ClientError
from prometheus_client import Counter
from server.models import S3Metadata
if TYPE_CHECKING:
from botocore.client import BaseClient
# Initialize logger for this module
logger = logging.getLogger(__name__)
_cache_lookup_counter = Counter("gitingest_cache_lookup", "Number of cache lookups", ["url"])
_cache_hit_counter = Counter("gitingest_cache_hit", "Number of cache hits", ["url"])
_cache_miss_counter = Counter("gitingest_cache_miss", "Number of cache misses", ["url"])
class S3UploadError(Exception):
"""Custom exception for S3 upload failures."""
@@ -231,6 +239,149 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
return public_url
def upload_metadata_to_s3(metadata: S3Metadata, s3_file_path: str, ingest_id: UUID) -> str:
"""Upload metadata JSON to S3 alongside the digest file.
Parameters
----------
metadata : S3Metadata
The metadata struct containing summary, tree, and content.
s3_file_path : str
The S3 file path for the digest (metadata will use .json extension).
ingest_id : UUID
The ingest ID to store as an S3 object tag.
Returns
-------
str
Public URL to access the uploaded metadata file.
Raises
------
ValueError
If S3 is not enabled.
S3UploadError
If the upload to S3 fails.
"""
if not is_s3_enabled():
msg = "S3 is not enabled"
logger.error(msg)
raise ValueError(msg)
# Generate metadata file path by replacing .txt with .json
metadata_file_path = s3_file_path.replace(".txt", ".json")
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
extra_fields = {
"bucket_name": bucket_name,
"metadata_file_path": metadata_file_path,
"ingest_id": str(ingest_id),
"metadata_size": len(metadata.model_dump_json()),
}
# Log upload attempt
logger.debug("Starting S3 metadata upload", extra=extra_fields)
try:
# Upload the metadata with ingest_id as tag
s3_client.put_object(
Bucket=bucket_name,
Key=metadata_file_path,
Body=metadata.model_dump_json(indent=2).encode("utf-8"),
ContentType="application/json",
Tagging=f"ingest_id={ingest_id!s}",
)
except ClientError as err:
# Log upload failure
logger.exception(
"S3 metadata upload failed",
extra={
"bucket_name": bucket_name,
"metadata_file_path": metadata_file_path,
"ingest_id": str(ingest_id),
"error_code": err.response.get("Error", {}).get("Code"),
"error_message": str(err),
},
)
msg = f"Failed to upload metadata to S3: {err}"
raise S3UploadError(msg) from err
# Generate public URL
alias_host = get_s3_alias_host()
if alias_host:
# Use alias host if configured
public_url = f"{alias_host.rstrip('/')}/{metadata_file_path}"
else:
# Fallback to direct S3 URL
endpoint = get_s3_config().get("endpoint_url")
if endpoint:
public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{metadata_file_path}"
else:
public_url = (
f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{metadata_file_path}"
)
# Log successful upload
logger.debug(
"S3 metadata upload completed successfully",
extra={
"bucket_name": bucket_name,
"metadata_file_path": metadata_file_path,
"ingest_id": str(ingest_id),
"public_url": public_url,
},
)
return public_url
def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None:
"""Retrieve metadata JSON from S3.
Parameters
----------
s3_file_path : str
The S3 file path for the digest (metadata will use .json extension).
Returns
-------
S3Metadata | None
The metadata struct if found, None otherwise.
"""
if not is_s3_enabled():
return None
# Generate metadata file path by replacing .txt with .json
metadata_file_path = s3_file_path.replace(".txt", ".json")
try:
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
# Get the metadata object
response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path)
metadata_content = response["Body"].read().decode("utf-8")
return S3Metadata.model_validate_json(metadata_content)
except ClientError as err:
# Object doesn't exist if we get a 404 error
error_code = err.response.get("Error", {}).get("Code")
if error_code == "404":
logger.debug("Metadata file not found: %s", metadata_file_path)
return None
# Log other errors but don't fail
logger.warning("Failed to retrieve metadata from S3: %s", err)
return None
except Exception as exc:
# For any other exception, log and return None
logger.warning("Unexpected error retrieving metadata from S3: %s", exc)
return None
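
Putting upload_metadata_to_s3 and get_metadata_from_s3 together: the digest key and its metadata key differ only by extension, so a cache hit costs one head_object call plus one get_object call. A hedged usage sketch, assuming S3 is enabled and configured, with a hypothetical key layout (real keys come from generate_s3_file_path):

from uuid import uuid4

from server.models import S3Metadata
from server.s3_utils import (
    check_s3_object_exists,
    get_metadata_from_s3,
    upload_metadata_to_s3,
    upload_to_s3,
)

ingest_id = uuid4()
digest_key = "user/repo/abc123/digest.txt"  # hypothetical; not the real key format

# First request (cache miss): store the digest, then its metadata beside it
# (the helper swaps .txt for .json internally).
upload_to_s3(content="tree\n...content...", s3_file_path=digest_key, ingest_id=ingest_id)
upload_metadata_to_s3(
    metadata=S3Metadata(summary="...", tree="...", content="..."),
    s3_file_path=digest_key,
    ingest_id=ingest_id,
)

# Later request (cache hit): cheap existence check, then the cached metadata.
if check_s3_object_exists(digest_key):
    meta = get_metadata_from_s3(digest_key)  # None if only the digest was uploaded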
def _build_s3_url(key: str) -> str:
"""Build S3 URL for a given key."""
alias_host = get_s3_alias_host()
@@ -257,6 +408,51 @@ def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target
return False
def check_s3_object_exists(s3_file_path: str) -> bool:
"""Check if an S3 object exists at the given path.
Parameters
----------
s3_file_path : str
The S3 file path to check.
Returns
-------
bool
True if the object exists, False otherwise.
Raises
------
ClientError
If there's an S3 error other than 404 (not found).
"""
if not is_s3_enabled():
return False
_cache_lookup_counter.labels(url=s3_file_path).inc()
try:
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
# Use head_object to check if the object exists without downloading it
s3_client.head_object(Bucket=bucket_name, Key=s3_file_path)
except ClientError as err:
# Object doesn't exist if we get a 404 error
error_code = err.response.get("Error", {}).get("Code")
if error_code == "404":
_cache_miss_counter.labels(url=s3_file_path).inc()
return False
# Re-raise other errors (permissions, etc.)
raise
except Exception:
# For any other exception, assume object doesn't exist
_cache_miss_counter.labels(url=s3_file_path).inc()
return False
else:
_cache_hit_counter.labels(url=s3_file_path).inc()
return True
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
"""Get S3 URL for a given ingest ID if it exists.