mirror of
https://github.com/cyclotruc/gitingest.git
synced 2026-04-28 08:39:29 +00:00
feat: serve cached digest if available (#462)
Some checks are pending
CI / test (macos-latest, 3.10) (push) Waiting to run
CI / test (macos-latest, 3.11) (push) Waiting to run
CI / test (macos-latest, 3.12) (push) Waiting to run
CI / test (macos-latest, 3.13) (push) Waiting to run
CI / test (macos-latest, 3.8) (push) Waiting to run
CI / test (macos-latest, 3.9) (push) Waiting to run
CI / test (true, ubuntu-latest, 3.13) (push) Waiting to run
CI / test (ubuntu-latest, 3.10) (push) Waiting to run
CI / test (ubuntu-latest, 3.11) (push) Waiting to run
CI / test (ubuntu-latest, 3.12) (push) Waiting to run
CI / test (ubuntu-latest, 3.8) (push) Waiting to run
CI / test (ubuntu-latest, 3.9) (push) Waiting to run
CI / test (windows-latest, 3.10) (push) Waiting to run
CI / test (windows-latest, 3.11) (push) Waiting to run
CI / test (windows-latest, 3.12) (push) Waiting to run
CI / test (windows-latest, 3.13) (push) Waiting to run
CI / test (windows-latest, 3.8) (push) Waiting to run
CI / test (windows-latest, 3.9) (push) Waiting to run
CodeQL / Analyze (push) Waiting to run
Build & Push Container / ECR (push) Waiting to run
Build & Push Container / GHCR (push) Waiting to run
release-please / release (push) Waiting to run
OSSF Scorecard / Scorecard analysis (push) Waiting to run
Some checks are pending
CI / test (macos-latest, 3.10) (push) Waiting to run
CI / test (macos-latest, 3.11) (push) Waiting to run
CI / test (macos-latest, 3.12) (push) Waiting to run
CI / test (macos-latest, 3.13) (push) Waiting to run
CI / test (macos-latest, 3.8) (push) Waiting to run
CI / test (macos-latest, 3.9) (push) Waiting to run
CI / test (true, ubuntu-latest, 3.13) (push) Waiting to run
CI / test (ubuntu-latest, 3.10) (push) Waiting to run
CI / test (ubuntu-latest, 3.11) (push) Waiting to run
CI / test (ubuntu-latest, 3.12) (push) Waiting to run
CI / test (ubuntu-latest, 3.8) (push) Waiting to run
CI / test (ubuntu-latest, 3.9) (push) Waiting to run
CI / test (windows-latest, 3.10) (push) Waiting to run
CI / test (windows-latest, 3.11) (push) Waiting to run
CI / test (windows-latest, 3.12) (push) Waiting to run
CI / test (windows-latest, 3.13) (push) Waiting to run
CI / test (windows-latest, 3.8) (push) Waiting to run
CI / test (windows-latest, 3.9) (push) Waiting to run
CodeQL / Analyze (push) Waiting to run
Build & Push Container / ECR (push) Waiting to run
Build & Push Container / GHCR (push) Waiting to run
release-please / release (push) Waiting to run
OSSF Scorecard / Scorecard analysis (push) Waiting to run
Co-authored-by: Nicolas IRAGNE <nicoragne@hotmail.fr>
This commit is contained in:
parent
a63ed9ed2b
commit
efe5a26861
4 changed files with 427 additions and 38 deletions
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
|
|
@ -17,7 +17,7 @@ jobs:
|
||||||
test:
|
test:
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: true
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||||
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
|
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
|
||||||
|
|
|
||||||
|
|
@ -116,6 +116,25 @@ class IngestErrorResponse(BaseModel):
|
||||||
IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
|
IngestResponse = Union[IngestSuccessResponse, IngestErrorResponse]
|
||||||
|
|
||||||
|
|
||||||
|
class S3Metadata(BaseModel):
|
||||||
|
"""Model for S3 metadata structure.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
summary : str
|
||||||
|
Summary of the ingestion process including token estimates.
|
||||||
|
tree : str
|
||||||
|
File tree structure of the repository.
|
||||||
|
content : str
|
||||||
|
Processed content from the repository files.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
summary: str = Field(..., description="Ingestion summary with token estimates")
|
||||||
|
tree: str = Field(..., description="File tree structure")
|
||||||
|
content: str = Field(..., description="Processed file content")
|
||||||
|
|
||||||
|
|
||||||
class QueryForm(BaseModel):
|
class QueryForm(BaseModel):
|
||||||
"""Form data for the query.
|
"""Form data for the query.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,19 +2,211 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import cast
|
from typing import TYPE_CHECKING, cast
|
||||||
|
|
||||||
from gitingest.clone import clone_repo
|
from gitingest.clone import clone_repo
|
||||||
from gitingest.ingestion import ingest_query
|
from gitingest.ingestion import ingest_query
|
||||||
from gitingest.query_parser import parse_remote_repo
|
from gitingest.query_parser import parse_remote_repo
|
||||||
from gitingest.utils.git_utils import validate_github_token
|
from gitingest.utils.git_utils import resolve_commit, validate_github_token
|
||||||
from gitingest.utils.pattern_utils import process_patterns
|
from gitingest.utils.pattern_utils import process_patterns
|
||||||
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
|
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
|
||||||
from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
|
from server.s3_utils import (
|
||||||
|
_build_s3_url,
|
||||||
|
check_s3_object_exists,
|
||||||
|
generate_s3_file_path,
|
||||||
|
get_metadata_from_s3,
|
||||||
|
is_s3_enabled,
|
||||||
|
upload_metadata_to_s3,
|
||||||
|
upload_to_s3,
|
||||||
|
)
|
||||||
from server.server_config import MAX_DISPLAY_SIZE
|
from server.server_config import MAX_DISPLAY_SIZE
|
||||||
from server.server_utils import Colors
|
from server.server_utils import Colors
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from gitingest.schemas.cloning import CloneConfig
|
||||||
|
from gitingest.schemas.ingestion import IngestionQuery
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def _check_s3_cache(
|
||||||
|
query: IngestionQuery,
|
||||||
|
input_text: str,
|
||||||
|
max_file_size: int,
|
||||||
|
pattern_type: str,
|
||||||
|
pattern: str,
|
||||||
|
token: str | None,
|
||||||
|
) -> IngestSuccessResponse | None:
|
||||||
|
"""Check if digest already exists on S3 and return response if found.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
query : IngestionQuery
|
||||||
|
The parsed query object.
|
||||||
|
input_text : str
|
||||||
|
Original input text.
|
||||||
|
max_file_size : int
|
||||||
|
Maximum file size in KB.
|
||||||
|
pattern_type : str
|
||||||
|
Pattern type (include/exclude).
|
||||||
|
pattern : str
|
||||||
|
Pattern string.
|
||||||
|
token : str | None
|
||||||
|
GitHub token.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
IngestSuccessResponse | None
|
||||||
|
Response if file exists on S3, None otherwise.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not is_s3_enabled():
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Use git ls-remote to get commit SHA without cloning
|
||||||
|
clone_config = query.extract_clone_config()
|
||||||
|
query.commit = await resolve_commit(clone_config, token=token)
|
||||||
|
# Generate S3 file path using the resolved commit
|
||||||
|
s3_file_path = generate_s3_file_path(
|
||||||
|
source=query.url,
|
||||||
|
user_name=cast("str", query.user_name),
|
||||||
|
repo_name=cast("str", query.repo_name),
|
||||||
|
commit=query.commit,
|
||||||
|
include_patterns=query.include_patterns,
|
||||||
|
ignore_patterns=query.ignore_patterns,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if file exists on S3
|
||||||
|
if check_s3_object_exists(s3_file_path):
|
||||||
|
# File exists on S3, serve it directly without cloning
|
||||||
|
s3_url = _build_s3_url(s3_file_path)
|
||||||
|
query.s3_url = s3_url
|
||||||
|
|
||||||
|
short_repo_url = f"{query.user_name}/{query.repo_name}"
|
||||||
|
|
||||||
|
# Try to get cached metadata
|
||||||
|
metadata = get_metadata_from_s3(s3_file_path)
|
||||||
|
|
||||||
|
if metadata:
|
||||||
|
# Use cached metadata if available
|
||||||
|
summary = metadata.summary
|
||||||
|
tree = metadata.tree
|
||||||
|
content = metadata.content
|
||||||
|
else:
|
||||||
|
# Fallback to placeholder messages if metadata not available
|
||||||
|
summary = "Digest served from cache (S3). Download the full digest to see content details."
|
||||||
|
tree = "Digest served from cache. Download the full digest to see the file tree."
|
||||||
|
content = "Digest served from cache. Download the full digest to see the content."
|
||||||
|
|
||||||
|
return IngestSuccessResponse(
|
||||||
|
repo_url=input_text,
|
||||||
|
short_repo_url=short_repo_url,
|
||||||
|
summary=summary,
|
||||||
|
digest_url=s3_url,
|
||||||
|
tree=tree,
|
||||||
|
content=content,
|
||||||
|
default_max_file_size=max_file_size,
|
||||||
|
pattern_type=pattern_type,
|
||||||
|
pattern=pattern,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
# Log the exception but don't fail the entire request
|
||||||
|
logger.warning("S3 cache check failed, falling back to normal cloning: %s", exc)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _store_digest_content(
|
||||||
|
query: IngestionQuery,
|
||||||
|
clone_config: CloneConfig,
|
||||||
|
digest_content: str,
|
||||||
|
summary: str,
|
||||||
|
tree: str,
|
||||||
|
content: str,
|
||||||
|
) -> None:
|
||||||
|
"""Store digest content either to S3 or locally based on configuration.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
query : IngestionQuery
|
||||||
|
The query object containing repository information.
|
||||||
|
clone_config : CloneConfig
|
||||||
|
The clone configuration object.
|
||||||
|
digest_content : str
|
||||||
|
The complete digest content to store.
|
||||||
|
summary : str
|
||||||
|
The summary content for metadata.
|
||||||
|
tree : str
|
||||||
|
The tree content for metadata.
|
||||||
|
content : str
|
||||||
|
The file content for metadata.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if is_s3_enabled():
|
||||||
|
# Upload to S3 instead of storing locally
|
||||||
|
s3_file_path = generate_s3_file_path(
|
||||||
|
source=query.url,
|
||||||
|
user_name=cast("str", query.user_name),
|
||||||
|
repo_name=cast("str", query.repo_name),
|
||||||
|
commit=query.commit,
|
||||||
|
include_patterns=query.include_patterns,
|
||||||
|
ignore_patterns=query.ignore_patterns,
|
||||||
|
)
|
||||||
|
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
|
||||||
|
|
||||||
|
# Also upload metadata JSON for caching
|
||||||
|
metadata = S3Metadata(
|
||||||
|
summary=summary,
|
||||||
|
tree=tree,
|
||||||
|
content=content,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
upload_metadata_to_s3(metadata=metadata, s3_file_path=s3_file_path, ingest_id=query.id)
|
||||||
|
logger.debug("Successfully uploaded metadata to S3")
|
||||||
|
except Exception as metadata_exc:
|
||||||
|
# Log the error but don't fail the entire request
|
||||||
|
logger.warning("Failed to upload metadata to S3: %s", metadata_exc)
|
||||||
|
|
||||||
|
# Store S3 URL in query for later use
|
||||||
|
query.s3_url = s3_url
|
||||||
|
else:
|
||||||
|
# Store locally
|
||||||
|
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
|
||||||
|
with local_txt_file.open("w", encoding="utf-8") as f:
|
||||||
|
f.write(digest_content)
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_digest_url(query: IngestionQuery) -> str:
|
||||||
|
"""Generate the digest URL based on S3 configuration.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
query : IngestionQuery
|
||||||
|
The query object containing repository information.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
The digest URL.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
RuntimeError
|
||||||
|
If S3 is enabled but no S3 URL was generated.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if is_s3_enabled():
|
||||||
|
digest_url = getattr(query, "s3_url", None)
|
||||||
|
if not digest_url:
|
||||||
|
# This should not happen if S3 upload was successful
|
||||||
|
msg = "S3 is enabled but no S3 URL was generated"
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
return digest_url
|
||||||
|
return f"/api/download/file/{query.id}"
|
||||||
|
|
||||||
|
|
||||||
async def process_query(
|
async def process_query(
|
||||||
input_text: str,
|
input_text: str,
|
||||||
|
|
@ -69,10 +261,22 @@ async def process_query(
|
||||||
include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
|
include_patterns=pattern if pattern_type == PatternType.INCLUDE else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Check if digest already exists on S3 before cloning
|
||||||
|
s3_response = await _check_s3_cache(
|
||||||
|
query=query,
|
||||||
|
input_text=input_text,
|
||||||
|
max_file_size=max_file_size,
|
||||||
|
pattern_type=pattern_type.value,
|
||||||
|
pattern=pattern,
|
||||||
|
token=token,
|
||||||
|
)
|
||||||
|
if s3_response:
|
||||||
|
return s3_response
|
||||||
|
|
||||||
clone_config = query.extract_clone_config()
|
clone_config = query.extract_clone_config()
|
||||||
await clone_repo(clone_config, token=token)
|
await clone_repo(clone_config, token=token)
|
||||||
|
|
||||||
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
|
short_repo_url = f"{query.user_name}/{query.repo_name}"
|
||||||
|
|
||||||
# The commit hash should always be available at this point
|
# The commit hash should always be available at this point
|
||||||
if not query.commit:
|
if not query.commit:
|
||||||
|
|
@ -81,30 +285,8 @@ async def process_query(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
summary, tree, content = ingest_query(query)
|
summary, tree, content = ingest_query(query)
|
||||||
|
|
||||||
# Prepare the digest content (tree + content)
|
|
||||||
digest_content = tree + "\n" + content
|
digest_content = tree + "\n" + content
|
||||||
|
_store_digest_content(query, clone_config, digest_content, summary, tree, content)
|
||||||
# Store digest based on S3 configuration
|
|
||||||
if is_s3_enabled():
|
|
||||||
# Upload to S3 instead of storing locally
|
|
||||||
s3_file_path = generate_s3_file_path(
|
|
||||||
source=query.url,
|
|
||||||
user_name=cast("str", query.user_name),
|
|
||||||
repo_name=cast("str", query.repo_name),
|
|
||||||
commit=query.commit,
|
|
||||||
include_patterns=query.include_patterns,
|
|
||||||
ignore_patterns=query.ignore_patterns,
|
|
||||||
)
|
|
||||||
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
|
|
||||||
# Store S3 URL in query for later use
|
|
||||||
query.s3_url = s3_url
|
|
||||||
else:
|
|
||||||
# Store locally
|
|
||||||
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
|
|
||||||
with local_txt_file.open("w", encoding="utf-8") as f:
|
|
||||||
f.write(digest_content)
|
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
|
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
|
||||||
return IngestErrorResponse(error=str(exc))
|
return IngestErrorResponse(error=str(exc))
|
||||||
|
|
@ -123,15 +305,7 @@ async def process_query(
|
||||||
summary=summary,
|
summary=summary,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate digest_url based on S3 configuration
|
digest_url = _generate_digest_url(query)
|
||||||
if is_s3_enabled():
|
|
||||||
digest_url = getattr(query, "s3_url", None)
|
|
||||||
if not digest_url:
|
|
||||||
# This should not happen if S3 upload was successful
|
|
||||||
msg = "S3 is enabled but no S3 URL was generated"
|
|
||||||
raise RuntimeError(msg)
|
|
||||||
else:
|
|
||||||
digest_url = f"/api/download/file/{query.id}"
|
|
||||||
|
|
||||||
return IngestSuccessResponse(
|
return IngestSuccessResponse(
|
||||||
repo_url=input_text,
|
repo_url=input_text,
|
||||||
|
|
|
||||||
|
|
@ -11,13 +11,21 @@ from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) neede
|
||||||
|
|
||||||
import boto3
|
import boto3
|
||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
|
from prometheus_client import Counter
|
||||||
|
|
||||||
|
from server.models import S3Metadata
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from botocore.client import BaseClient
|
from botocore.client import BaseClient
|
||||||
|
|
||||||
|
|
||||||
# Initialize logger for this module
|
# Initialize logger for this module
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_cache_lookup_counter = Counter("gitingest_cache_lookup", "Number of cache lookups", ["url"])
|
||||||
|
_cache_hit_counter = Counter("gitingest_cache_hit", "Number of cache hits", ["url"])
|
||||||
|
_cache_miss_counter = Counter("gitingest_cache_miss", "Number of cache misses", ["url"])
|
||||||
|
|
||||||
|
|
||||||
class S3UploadError(Exception):
|
class S3UploadError(Exception):
|
||||||
"""Custom exception for S3 upload failures."""
|
"""Custom exception for S3 upload failures."""
|
||||||
|
|
@ -231,6 +239,149 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
|
||||||
return public_url
|
return public_url
|
||||||
|
|
||||||
|
|
||||||
|
def upload_metadata_to_s3(metadata: S3Metadata, s3_file_path: str, ingest_id: UUID) -> str:
|
||||||
|
"""Upload metadata JSON to S3 alongside the digest file.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
metadata : S3Metadata
|
||||||
|
The metadata struct containing summary, tree, and content.
|
||||||
|
s3_file_path : str
|
||||||
|
The S3 file path for the digest (metadata will use .json extension).
|
||||||
|
ingest_id : UUID
|
||||||
|
The ingest ID to store as an S3 object tag.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
Public URL to access the uploaded metadata file.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If S3 is not enabled.
|
||||||
|
S3UploadError
|
||||||
|
If the upload to S3 fails.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not is_s3_enabled():
|
||||||
|
msg = "S3 is not enabled"
|
||||||
|
logger.error(msg)
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
|
# Generate metadata file path by replacing .txt with .json
|
||||||
|
metadata_file_path = s3_file_path.replace(".txt", ".json")
|
||||||
|
|
||||||
|
s3_client = create_s3_client()
|
||||||
|
bucket_name = get_s3_bucket_name()
|
||||||
|
|
||||||
|
extra_fields = {
|
||||||
|
"bucket_name": bucket_name,
|
||||||
|
"metadata_file_path": metadata_file_path,
|
||||||
|
"ingest_id": str(ingest_id),
|
||||||
|
"metadata_size": len(metadata.model_dump_json()),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Log upload attempt
|
||||||
|
logger.debug("Starting S3 metadata upload", extra=extra_fields)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Upload the metadata with ingest_id as tag
|
||||||
|
s3_client.put_object(
|
||||||
|
Bucket=bucket_name,
|
||||||
|
Key=metadata_file_path,
|
||||||
|
Body=metadata.model_dump_json(indent=2).encode("utf-8"),
|
||||||
|
ContentType="application/json",
|
||||||
|
Tagging=f"ingest_id={ingest_id!s}",
|
||||||
|
)
|
||||||
|
except ClientError as err:
|
||||||
|
# Log upload failure
|
||||||
|
logger.exception(
|
||||||
|
"S3 metadata upload failed",
|
||||||
|
extra={
|
||||||
|
"bucket_name": bucket_name,
|
||||||
|
"metadata_file_path": metadata_file_path,
|
||||||
|
"ingest_id": str(ingest_id),
|
||||||
|
"error_code": err.response.get("Error", {}).get("Code"),
|
||||||
|
"error_message": str(err),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
msg = f"Failed to upload metadata to S3: {err}"
|
||||||
|
raise S3UploadError(msg) from err
|
||||||
|
|
||||||
|
# Generate public URL
|
||||||
|
alias_host = get_s3_alias_host()
|
||||||
|
if alias_host:
|
||||||
|
# Use alias host if configured
|
||||||
|
public_url = f"{alias_host.rstrip('/')}/{metadata_file_path}"
|
||||||
|
else:
|
||||||
|
# Fallback to direct S3 URL
|
||||||
|
endpoint = get_s3_config().get("endpoint_url")
|
||||||
|
if endpoint:
|
||||||
|
public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{metadata_file_path}"
|
||||||
|
else:
|
||||||
|
public_url = (
|
||||||
|
f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{metadata_file_path}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Log successful upload
|
||||||
|
logger.debug(
|
||||||
|
"S3 metadata upload completed successfully",
|
||||||
|
extra={
|
||||||
|
"bucket_name": bucket_name,
|
||||||
|
"metadata_file_path": metadata_file_path,
|
||||||
|
"ingest_id": str(ingest_id),
|
||||||
|
"public_url": public_url,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
return public_url
|
||||||
|
|
||||||
|
|
||||||
|
def get_metadata_from_s3(s3_file_path: str) -> S3Metadata | None:
|
||||||
|
"""Retrieve metadata JSON from S3.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
s3_file_path : str
|
||||||
|
The S3 file path for the digest (metadata will use .json extension).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
S3Metadata | None
|
||||||
|
The metadata struct if found, None otherwise.
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not is_s3_enabled():
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Generate metadata file path by replacing .txt with .json
|
||||||
|
metadata_file_path = s3_file_path.replace(".txt", ".json")
|
||||||
|
|
||||||
|
try:
|
||||||
|
s3_client = create_s3_client()
|
||||||
|
bucket_name = get_s3_bucket_name()
|
||||||
|
|
||||||
|
# Get the metadata object
|
||||||
|
response = s3_client.get_object(Bucket=bucket_name, Key=metadata_file_path)
|
||||||
|
metadata_content = response["Body"].read().decode("utf-8")
|
||||||
|
|
||||||
|
return S3Metadata.model_validate_json(metadata_content)
|
||||||
|
except ClientError as err:
|
||||||
|
# Object doesn't exist if we get a 404 error
|
||||||
|
error_code = err.response.get("Error", {}).get("Code")
|
||||||
|
if error_code == "404":
|
||||||
|
logger.debug("Metadata file not found: %s", metadata_file_path)
|
||||||
|
return None
|
||||||
|
# Log other errors but don't fail
|
||||||
|
logger.warning("Failed to retrieve metadata from S3: %s", err)
|
||||||
|
return None
|
||||||
|
except Exception as exc:
|
||||||
|
# For any other exception, log and return None
|
||||||
|
logger.warning("Unexpected error retrieving metadata from S3: %s", exc)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _build_s3_url(key: str) -> str:
|
def _build_s3_url(key: str) -> str:
|
||||||
"""Build S3 URL for a given key."""
|
"""Build S3 URL for a given key."""
|
||||||
alias_host = get_s3_alias_host()
|
alias_host = get_s3_alias_host()
|
||||||
|
|
@ -257,6 +408,51 @@ def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def check_s3_object_exists(s3_file_path: str) -> bool:
|
||||||
|
"""Check if an S3 object exists at the given path.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
s3_file_path : str
|
||||||
|
The S3 file path to check.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
bool
|
||||||
|
True if the object exists, False otherwise.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ClientError
|
||||||
|
If there's an S3 error other than 404 (not found).
|
||||||
|
|
||||||
|
"""
|
||||||
|
if not is_s3_enabled():
|
||||||
|
return False
|
||||||
|
_cache_lookup_counter.labels(url=s3_file_path).inc()
|
||||||
|
try:
|
||||||
|
s3_client = create_s3_client()
|
||||||
|
bucket_name = get_s3_bucket_name()
|
||||||
|
|
||||||
|
# Use head_object to check if the object exists without downloading it
|
||||||
|
s3_client.head_object(Bucket=bucket_name, Key=s3_file_path)
|
||||||
|
except ClientError as err:
|
||||||
|
# Object doesn't exist if we get a 404 error
|
||||||
|
error_code = err.response.get("Error", {}).get("Code")
|
||||||
|
if error_code == "404":
|
||||||
|
_cache_miss_counter.labels(url=s3_file_path).inc()
|
||||||
|
return False
|
||||||
|
# Re-raise other errors (permissions, etc.)
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
# For any other exception, assume object doesn't exist
|
||||||
|
_cache_miss_counter.labels(url=s3_file_path).inc()
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
_cache_hit_counter.labels(url=s3_file_path).inc()
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
|
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
|
||||||
"""Get S3 URL for a given ingest ID if it exists.
|
"""Get S3 URL for a given ingest ID if it exists.
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue