feat: implement S3 integration for storing and retrieving digest files (#427)

Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Co-authored-by: Nicolas Iragne <nicoragne@hotmail.fr>
Mickael 2025-07-26 16:28:17 +02:00 committed by GitHub
parent 998cea15b4
commit 414e85189f
17 changed files with 688 additions and 38 deletions

.docker/minio/setup.sh Executable file

@@ -0,0 +1,33 @@
#!/bin/sh
# Simple script to set up MinIO bucket and user
# Based on example from MinIO issues
# Format bucket name to ensure compatibility
BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
# Configure MinIO client
mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}
# Remove bucket if it exists (for clean setup)
mc rm -r --force myminio/${BUCKET_NAME} || true
# Create bucket
mc mb myminio/${BUCKET_NAME}
# Set bucket policy to allow downloads
mc anonymous set download myminio/${BUCKET_NAME}
# Create user with access and secret keys
mc admin user add myminio ${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists"
# Create policy for the bucket
echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json
# Apply policy
mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists"
mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY}
echo "MinIO setup completed successfully"
echo "Bucket: ${BUCKET_NAME}"
echo "Access via console: http://localhost:9001"

@@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace
GITINGEST_SENTRY_SEND_DEFAULT_PII=true
# Environment name for Sentry (default: "")
GITINGEST_SENTRY_ENVIRONMENT=development
# MinIO Configuration (for development)
# Root user credentials for MinIO admin access
MINIO_ROOT_USER=minioadmin
MINIO_ROOT_PASSWORD=minioadmin
# S3 Configuration (for application)
# Set to "true" to enable S3 storage for digests
# S3_ENABLED=true
# Endpoint URL for the S3 service (MinIO in development)
S3_ENDPOINT=http://minio:9000
# Access key for the S3 bucket (created automatically in development)
S3_ACCESS_KEY=gitingest
# Secret key for the S3 bucket (created automatically in development)
S3_SECRET_KEY=gitingest123
# Name of the S3 bucket (created automatically in development)
S3_BUCKET_NAME=gitingest-bucket
# Region for the S3 bucket (default for MinIO)
S3_REGION=us-east-1
# Public URL/CDN for accessing S3 resources
S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket
# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
# S3_DIRECTORY_PREFIX=my-prefix

@@ -113,6 +113,7 @@ repos:
files: ^src/
additional_dependencies:
[
boto3>=1.28.0,
click>=8.0.0,
'fastapi[standard]>=0.109.1',
httpx,
@@ -138,6 +139,7 @@ repos:
- --rcfile=tests/.pylintrc
additional_dependencies:
[
boto3>=1.28.0,
click>=8.0.0,
'fastapi[standard]>=0.109.1',
httpx,

@@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default.
## 🐳 Self-host
### Using Docker
1. Build the image:
``` bash
@@ -239,6 +241,89 @@ The application can be configured using the following environment variables:
- **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0)
- **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace")
- **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true")
- **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, all S3 paths are prefixed with this value; see the sketch below)
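For illustration, the public link returned for a digest is the alias host joined with the generated object key, optionally prefixed by `S3_DIRECTORY_PREFIX`. This is a rough sketch of the behaviour of the helper added in `src/server/s3_utils.py`; the host, prefix, and key are example values:

```python
alias_host = "https://cdn.example.com"  # S3_ALIAS_HOST (example value)
prefix = "prod"                         # S3_DIRECTORY_PREFIX (optional, example value)
key = "ingest/github.com/octocat/Hello-World/<commit>/<patterns-hash>/octocat-Hello-World.txt"

digest_url = f"{alias_host.rstrip('/')}/{prefix}/{key}" if prefix else f"{alias_host.rstrip('/')}/{key}"
print(digest_url)
# https://cdn.example.com/prod/ingest/github.com/octocat/Hello-World/<commit>/<patterns-hash>/octocat-Hello-World.txt
```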
### Using Docker Compose
The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments.
#### Compose File Structure
The `compose.yml` file uses a YAML anchor (`&app-base`) and merge key (`<<: *app-base`) to define common configuration shared between services:
```yaml
# Common base configuration for all services
x-app-base: &app-base
build:
context: .
dockerfile: Dockerfile
ports:
- "${APP_WEB_BIND:-8000}:8000" # Main application port
- "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port
# ... other common configurations
```
#### Services
The file defines three services (plus a one-shot `minio-setup` helper that provisions the bucket and credentials):
1. **app**: Production service configuration
- Uses the `prod` profile
- Sets the Sentry environment to "production"
- Configured for stable operation with `restart: unless-stopped`
2. **app-dev**: Development service configuration
- Uses the `dev` profile
- Enables debug mode
- Mounts the source code for live development
- Uses hot reloading for faster development
3. **minio**: S3-compatible object storage for development
- Uses the `dev` profile (only available in development mode)
- Provides S3-compatible storage for local development
- Accessible via:
- API: Port 9000 ([localhost:9000](http://localhost:9000))
- Web Console: Port 9001 ([localhost:9001](http://localhost:9001))
- Default admin credentials:
- Username: `minioadmin`
- Password: `minioadmin`
- Configurable via environment variables:
- `MINIO_ROOT_USER`: Custom admin username (default: minioadmin)
- `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin)
- Includes persistent storage via Docker volume
- Auto-creates a bucket and application-specific credentials:
- Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`)
- Access key: `gitingest` (configurable via `S3_ACCESS_KEY`)
- Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`)
- These credentials are automatically passed to the app-dev service via environment variables:
- `S3_ENDPOINT`: URL of the MinIO server
- `S3_ACCESS_KEY`: Access key for the S3 bucket
- `S3_SECRET_KEY`: Secret key for the S3 bucket
- `S3_BUCKET_NAME`: Name of the S3 bucket
- `S3_REGION`: Region for the S3 bucket (default: us-east-1)
- `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
#### Usage Examples
To run the application in development mode:
```bash
docker compose --profile dev up
```
To run the application in production mode:
```bash
docker compose --profile prod up -d
```
To build and run the application:
```bash
docker compose --profile prod build
docker compose --profile prod up -d
```
## 🤝 Contributing

compose.yml Normal file

@@ -0,0 +1,111 @@
# Common base configuration for all services
x-app-base: &app-base
ports:
- "${APP_WEB_BIND:-8000}:8000" # Main application port
- "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port
environment:
# Python Configuration
- PYTHONUNBUFFERED=1
- PYTHONDONTWRITEBYTECODE=1
# Host Configuration
- ALLOWED_HOSTS=${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1}
# Metrics Configuration
- GITINGEST_METRICS_ENABLED=${GITINGEST_METRICS_ENABLED:-true}
- GITINGEST_METRICS_HOST=${GITINGEST_METRICS_HOST:-127.0.0.1}
- GITINGEST_METRICS_PORT=${GITINGEST_METRICS_PORT:-9090}
# Sentry Configuration
- GITINGEST_SENTRY_ENABLED=${GITINGEST_SENTRY_ENABLED:-false}
- GITINGEST_SENTRY_DSN=${GITINGEST_SENTRY_DSN:-}
- GITINGEST_SENTRY_TRACES_SAMPLE_RATE=${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0}
- GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE=${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0}
- GITINGEST_SENTRY_PROFILE_LIFECYCLE=${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace}
- GITINGEST_SENTRY_SEND_DEFAULT_PII=${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true}
user: "1000:1000"
command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]
services:
# Production service configuration
app:
<<: *app-base
image: ghcr.io/coderamp-labs/gitingest:latest
profiles:
- prod
environment:
- GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-production}
restart: unless-stopped
# Development service configuration
app-dev:
<<: *app-base
build:
context: .
dockerfile: Dockerfile
profiles:
- dev
environment:
- DEBUG=true
- GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-development}
# S3 Configuration
- S3_ENABLED=true
- S3_ENDPOINT=http://minio:9000
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
- S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
# Use lowercase bucket name to ensure compatibility with MinIO
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
- S3_REGION=${S3_REGION:-us-east-1}
- S3_DIRECTORY_PREFIX=${S3_DIRECTORY_PREFIX:-dev}
# Public URL for S3 resources
- S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}
volumes:
# Mount source code for live development
- ./src:/app:ro
# Use --reload flag for hot reloading during development
command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
depends_on:
minio-setup:
condition: service_completed_successfully
# MinIO S3-compatible object storage for development
minio:
image: minio/minio:latest
profiles:
- dev
ports:
- "9000:9000" # API port
- "9001:9001" # Console port
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
volumes:
- minio-data:/data
command: server /data --console-address ":9001"
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 30s
start_period: 30s
start_interval: 1s
# MinIO setup service to create bucket and user
minio-setup:
image: minio/mc
profiles:
- dev
depends_on:
minio:
condition: service_healthy
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
- S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
volumes:
- ./.docker/minio/setup.sh:/setup.sh:ro
entrypoint: sh
command: -c /setup.sh
volumes:
minio-data:
driver: local

@@ -44,6 +44,7 @@ dev = [
]
server = [
"boto3>=1.28.0", # AWS SDK for S3 support
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
"prometheus-client",
"sentry-sdk[fastapi]",

@@ -1,3 +1,4 @@
boto3>=1.28.0 # AWS SDK for S3 support
click>=8.0.0
fastapi[standard]>=0.109.1 # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
httpx

@@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
host = parsed_url.netloc
user, repo = _get_user_and_repo_from_path(parsed_url.path)
_id = str(uuid.uuid4())
_id = uuid.uuid4()
slug = f"{user}-{repo}"
local_path = TMP_BASE_PATH / _id / slug
local_path = TMP_BASE_PATH / str(_id) / slug
url = f"https://{host}/{user}/{repo}"
query = IngestionQuery(
@@ -132,7 +132,7 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
"""
path_obj = Path(path_str).resolve()
slug = path_obj.name if path_str == "." else path_str.strip("/")
return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
async def _configure_branch_or_tag(

@@ -3,6 +3,7 @@
from __future__ import annotations
from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
from pydantic import BaseModel, Field
@@ -27,7 +28,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
The URL of the repository.
slug : str
The slug of the repository.
id : str
id : UUID
The ID of the repository.
subpath : str
The subpath to the repository or file (default: ``"/"``).
@@ -47,6 +48,8 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
The patterns to include.
include_submodules : bool
Whether to include all Git submodules within the repository. (default: ``False``)
s3_url : str | None
The S3 URL where the digest is stored if S3 is enabled.
"""
@@ -56,7 +59,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
local_path: Path
url: str | None = None
slug: str
id: str
id: UUID
subpath: str = Field(default="/")
type: str | None = None
branch: str | None = None
@@ -66,6 +69,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
ignore_patterns: set[str] = Field(default_factory=set) # TODO: same type for ignore_* and include_* patterns
include_patterns: set[str] | None = None
include_submodules: bool = Field(default=False)
s3_url: str | None = None
def extract_clone_config(self) -> CloneConfig:
"""Extract the relevant fields for the CloneConfig object.

@@ -71,8 +71,8 @@ class IngestSuccessResponse(BaseModel):
Short form of repository URL (user/repo).
summary : str
Summary of the ingestion process including token estimates.
ingest_id : str
Ingestion id used to download full context.
digest_url : str
URL to download the full digest content (either S3 URL or local download endpoint).
tree : str
File tree structure of the repository.
content : str
@@ -89,7 +89,7 @@ class IngestSuccessResponse(BaseModel):
repo_url: str = Field(..., description="Original repository URL")
short_repo_url: str = Field(..., description="Short repository URL (user/repo)")
summary: str = Field(..., description="Ingestion summary with token estimates")
ingest_id: str = Field(..., description="Ingestion id used to download full context")
digest_url: str = Field(..., description="URL to download the full digest content")
tree: str = Field(..., description="File tree structure")
content: str = Field(..., description="Processed file content")
default_max_file_size: int = Field(..., description="File size slider position used")
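In practice `digest_url` takes one of two forms, mirroring the branching in `process_query`. A sketch with illustrative values:

```python
from uuid import uuid4

s3_enabled = False  # illustrative stand-in for is_s3_enabled()
ingest_id = uuid4()
s3_url = "https://cdn.example.com/ingest/github.com/octocat/Hello-World/<commit>/<hash>/octocat-Hello-World.txt"

digest_url = s3_url if s3_enabled else f"/api/download/file/{ingest_id}"
print(digest_url)  # local download route here; the S3/CDN link when S3 is enabled
```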

@@ -11,6 +11,7 @@ from gitingest.query_parser import parse_remote_repo
from gitingest.utils.git_utils import validate_github_token
from gitingest.utils.pattern_utils import process_patterns
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import Colors, log_slider_to_size
@@ -45,6 +46,11 @@ async def process_query(
IngestResponse
A union type, corresponding to IngestErrorResponse or IngestSuccessResponse
Raises
------
RuntimeError
If the commit hash is not found (should never happen).
"""
if token:
validate_github_token(token)
@@ -59,7 +65,6 @@
return IngestErrorResponse(error=str(exc))
query.url = cast("str", query.url)
query.host = cast("str", query.host)
query.max_file_size = max_file_size
query.ignore_patterns, query.include_patterns = process_patterns(
exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None,
@@ -71,13 +76,36 @@
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
# The commit hash should always be available at this point
if not query.commit:
msg = "Unexpected error: no commit hash found"
raise RuntimeError(msg)
try:
summary, tree, content = ingest_query(query)
# TODO: why are we writing the tree and content to a file here?
# Prepare the digest content (tree + content)
digest_content = tree + "\n" + content
# Store digest based on S3 configuration
if is_s3_enabled():
# Upload to S3 instead of storing locally
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
# Store S3 URL in query for later use
query.s3_url = s3_url
else:
# Store locally
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
with local_txt_file.open("w", encoding="utf-8") as f:
f.write(tree + "\n" + content)
f.write(digest_content)
except Exception as exc:
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
@@ -97,11 +125,21 @@
summary=summary,
)
# Generate digest_url based on S3 configuration
if is_s3_enabled():
digest_url = getattr(query, "s3_url", None)
if not digest_url:
# This should not happen if S3 upload was successful
msg = "S3 is enabled but no S3 URL was generated"
raise RuntimeError(msg)
else:
digest_url = f"/api/download/file/{query.id}"
return IngestSuccessResponse(
repo_url=input_text,
short_repo_url=short_repo_url,
summary=summary,
ingest_id=query.id,
digest_url=digest_url,
tree=tree,
content=content,
default_max_file_size=slider_position,

@@ -1,12 +1,16 @@
"""Ingest endpoint for the API."""
from typing import Union
from uuid import UUID
from fastapi import APIRouter, HTTPException, Request, status
from fastapi.responses import FileResponse, JSONResponse
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
from prometheus_client import Counter
from gitingest.config import TMP_BASE_PATH
from server.models import IngestRequest
from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
from server.s3_utils import is_s3_enabled
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import limiter
@@ -39,7 +43,7 @@
response = await _perform_ingestion(
input_text=ingest_request.input_text,
max_file_size=ingest_request.max_file_size,
pattern_type=ingest_request.pattern_type,
pattern_type=ingest_request.pattern_type.value,
pattern=ingest_request.pattern,
token=ingest_request.token,
)
@@ -90,30 +94,42 @@ async def api_ingest_get(
return response
@router.get("/api/download/file/{ingest_id}", response_class=FileResponse)
async def download_ingest(ingest_id: str) -> FileResponse:
@router.get("/api/download/file/{ingest_id}", response_model=None)
async def download_ingest(
ingest_id: UUID,
) -> Union[RedirectResponse, FileResponse]: # noqa: FA100 (future-rewritable-type-annotation) (pydantic)
"""Download the first text file produced for an ingest ID.
**This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
and returns it as a downloadable file. The file is streamed with media type ``text/plain``
and prompts the browser to download it.
and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled
and clients should use the S3 URL provided in the ingest response instead.
**Parameters**
- **ingest_id** (`str`): Identifier that the ingest step emitted
- **ingest_id** (`UUID`): Identifier that the ingest step emitted
**Returns**
- **FileResponse**: Streamed response with media type ``text/plain``
- **FileResponse**: Streamed response with media type ``text/plain`` for local files
**Raises**
- **HTTPException**: **503** - endpoint is disabled when S3 is enabled
- **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file
- **HTTPException**: **403** - the process lacks permission to read the directory or file
"""
# Disable download endpoint when S3 is enabled
if is_s3_enabled():
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Download endpoint is disabled when S3 is enabled. "
"Use the S3 URL provided in the ingest response instead.",
)
# Fall back to local file serving
# Normalize and validate the directory path
directory = (TMP_BASE_PATH / ingest_id).resolve()
directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
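From a client's perspective, the flow is now: POST the ingest request, read `digest_url` from the response, and fetch that URL, which is either the S3/CDN link or this local endpoint. A sketch with httpx, assuming a local deployment on port 8000, that the JSON endpoint is mounted at `/api/ingest`, and request fields mirroring the integration tests:

```python
import httpx

BASE = "http://localhost:8000"  # assumed local deployment

payload = {
    "input_text": "https://github.com/octocat/Hello-World",
    "max_file_size": 243,
    "pattern_type": "exclude",
    "pattern": "",
    "token": "",
}
resp = httpx.post(f"{BASE}/api/ingest", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

digest_url = data["digest_url"]
if digest_url.startswith("/"):  # local storage: relative download route
    digest_url = f"{BASE}{digest_url}"
digest = httpx.get(digest_url, follow_redirects=True, timeout=120).text
print(data["summary"])
print(len(digest), "characters in digest")
```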

src/server/s3_utils.py Normal file

@@ -0,0 +1,335 @@
"""S3 utility functions for uploading and managing digest files."""
from __future__ import annotations
import hashlib
import logging
import os
from typing import TYPE_CHECKING
from urllib.parse import urlparse
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
import boto3
from botocore.exceptions import ClientError
if TYPE_CHECKING:
from botocore.client import BaseClient
# Initialize logger for this module
logger = logging.getLogger(__name__)
class S3UploadError(Exception):
"""Custom exception for S3 upload failures."""
def is_s3_enabled() -> bool:
"""Check if S3 is enabled via environment variables."""
return os.getenv("S3_ENABLED", "false").lower() == "true"
def get_s3_config() -> dict[str, str | None]:
"""Get S3 configuration from environment variables."""
config = {
"endpoint_url": os.getenv("S3_ENDPOINT"),
"aws_access_key_id": os.getenv("S3_ACCESS_KEY"),
"aws_secret_access_key": os.getenv("S3_SECRET_KEY"),
"region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"),
}
return {k: v for k, v in config.items() if v is not None}
def get_s3_bucket_name() -> str:
"""Get S3 bucket name from environment variables."""
return os.getenv("S3_BUCKET_NAME", "gitingest-bucket")
def get_s3_alias_host() -> str | None:
"""Get S3 alias host for public URLs."""
return os.getenv("S3_ALIAS_HOST")
def generate_s3_file_path(
source: str,
user_name: str,
repo_name: str,
commit: str,
include_patterns: set[str] | None,
ignore_patterns: set[str],
) -> str:
"""Generate S3 file path with proper naming convention.
The file path is formatted as:
[<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/
<exclude&include hash>/<owner>-<repo-name>.txt
If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path.
The commit-ID is always included in the URL.
If no specific commit is provided, the actual commit hash from the cloned repository is used.
Parameters
----------
source : str
Git host (e.g., github, gitlab, bitbucket, etc.).
user_name : str
Repository owner or user.
repo_name : str
Repository name.
commit : str
Commit hash.
include_patterns : set[str] | None
Set of patterns specifying which files to include.
ignore_patterns : set[str]
Set of patterns specifying which files to exclude.
Returns
-------
str
S3 file path string.
Raises
------
ValueError
If the source URL is invalid.
"""
hostname = urlparse(source).hostname
if hostname is None:
msg = "Invalid source URL"
logger.error(msg)
raise ValueError(msg)
# Create hash of exclude/include patterns for uniqueness
patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
patterns_str += f"exclude:{sorted(ignore_patterns)}"
patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
# Build the base path using hostname directly
base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"
# Check for S3_DIRECTORY_PREFIX environment variable
s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")
if not s3_directory_prefix:
return base_path
# Remove trailing slash if present and add the prefix
s3_directory_prefix = s3_directory_prefix.rstrip("/")
return f"{s3_directory_prefix}/{base_path}"
def create_s3_client() -> BaseClient:
"""Create and return an S3 client with configuration from environment."""
config = get_s3_config()
# Log S3 client creation (excluding sensitive info)
log_config = config.copy()
access_key = log_config.pop("aws_access_key_id", None)
secret_key = log_config.pop("aws_secret_access_key", None)
has_credentials = bool(access_key or secret_key)  # pop both keys so neither ends up in the logged config
logger.debug(
msg="Creating S3 client",
extra={
"s3_config": log_config,
"has_credentials": has_credentials,
},
)
return boto3.client("s3", **config)
def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
"""Upload content to S3 and return the public URL.
This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file.
The ingest ID is stored as an S3 object tag.
Parameters
----------
content : str
The digest content to upload.
s3_file_path : str
The S3 file path where the content will be stored.
ingest_id : UUID
The ingest ID to store as an S3 object tag.
Returns
-------
str
Public URL to access the uploaded file.
Raises
------
ValueError
If S3 is not enabled.
S3UploadError
If the upload to S3 fails.
"""
if not is_s3_enabled():
msg = "S3 is not enabled"
logger.error(msg)
raise ValueError(msg)
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
extra_fields = {
"bucket_name": bucket_name,
"s3_file_path": s3_file_path,
"ingest_id": str(ingest_id),
"content_size": len(content),
}
# Log upload attempt
logger.debug("Starting S3 upload", extra=extra_fields)
try:
# Upload the content with ingest_id as tag
s3_client.put_object(
Bucket=bucket_name,
Key=s3_file_path,
Body=content.encode("utf-8"),
ContentType="text/plain",
Tagging=f"ingest_id={ingest_id!s}",
)
except ClientError as err:
# Log upload failure
logger.exception(
"S3 upload failed",
extra={
"bucket_name": bucket_name,
"s3_file_path": s3_file_path,
"ingest_id": str(ingest_id),
"error_code": err.response.get("Error", {}).get("Code"),
"error_message": str(err),
},
)
msg = f"Failed to upload to S3: {err}"
raise S3UploadError(msg) from err
# Generate public URL
alias_host = get_s3_alias_host()
if alias_host:
# Use alias host if configured
public_url = f"{alias_host.rstrip('/')}/{s3_file_path}"
else:
# Fallback to direct S3 URL
endpoint = get_s3_config().get("endpoint_url")
if endpoint:
public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}"
else:
public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}"
# Log successful upload
logger.debug(
"S3 upload completed successfully",
extra={
"bucket_name": bucket_name,
"s3_file_path": s3_file_path,
"ingest_id": str(ingest_id),
"public_url": public_url,
},
)
return public_url
def _build_s3_url(key: str) -> str:
"""Build S3 URL for a given key."""
alias_host = get_s3_alias_host()
if alias_host:
return f"{alias_host.rstrip('/')}/{key}"
bucket_name = get_s3_bucket_name()
config = get_s3_config()
endpoint = config.get("endpoint_url")  # absent when talking to AWS directly
if endpoint:
return f"{endpoint.rstrip('/')}/{bucket_name}/{key}"
return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}"
def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool:
"""Check if an S3 object has the matching ingest_id tag."""
try:
tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
return tags.get("ingest_id") == str(target_ingest_id)
except ClientError:
return False
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
"""Get S3 URL for a given ingest ID if it exists.
Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found.
Used by the download endpoint to redirect to S3 if available.
Parameters
----------
ingest_id : UUID
The ingest ID to search for in S3 object tags.
Returns
-------
str | None
S3 URL if file exists, None otherwise.
"""
if not is_s3_enabled():
logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id)
return None
logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)})
try:
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
# List all objects in the ingest/ prefix and check their tags
paginator = s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/")
objects_checked = 0
for page in page_iterator:
if "Contents" not in page:
continue
for obj in page["Contents"]:
key = obj["Key"]
objects_checked += 1
if _check_object_tags(
s3_client=s3_client,
bucket_name=bucket_name,
key=key,
target_ingest_id=ingest_id,
):
s3_url = _build_s3_url(key)
logger.debug(
msg="Found S3 object for ingest ID",
extra={
"ingest_id": str(ingest_id),
"s3_key": key,
"s3_url": s3_url,
"objects_checked": objects_checked,
},
)
return s3_url
logger.debug(
msg="No S3 object found for ingest ID",
extra={
"ingest_id": str(ingest_id),
"objects_checked": objects_checked,
},
)
except ClientError as err:
logger.exception(
msg="Error during S3 URL lookup",
extra={
"ingest_id": str(ingest_id),
"error_code": err.response.get("Error", {}).get("Code"),
"error_message": str(err),
},
)
return None
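Although the download endpoint added in this commit returns 503 when S3 is enabled rather than redirecting, a caller that wants to resolve an existing digest by ingest ID could use this helper roughly as follows (a hypothetical sketch, not part of this change):

```python
from uuid import UUID

from fastapi import HTTPException, status
from fastapi.responses import RedirectResponse

from server.s3_utils import get_s3_url_for_ingest_id


def redirect_to_digest(ingest_id: UUID) -> RedirectResponse:
    """Hypothetical helper: redirect to the S3 object tagged with this ingest ID."""
    s3_url = get_s3_url_for_ingest_id(ingest_id)
    if s3_url is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Digest not found")
    return RedirectResponse(url=s3_url, status_code=status.HTTP_302_FOUND)
```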

@@ -172,8 +172,8 @@ function handleSuccessfulResponse(data) {
// Show results section
showResults();
// Store the ingest_id for download functionality
window.currentIngestId = data.ingest_id;
// Store the digest_url for download functionality
window.currentDigestUrl = data.digest_url;
// Set plain text content for summary, tree, and content
document.getElementById('result-summary').value = data.summary || '';
@@ -271,9 +271,9 @@ function copyFullDigest() {
}
function downloadFullDigest() {
// Check if we have an ingest_id
if (!window.currentIngestId) {
console.error('No ingest_id available for download');
// Check if we have a digest_url
if (!window.currentDigestUrl) {
console.error('No digest_url available for download');
return;
}
@@ -289,10 +289,10 @@ function downloadFullDigest() {
Downloading...
`;
// Create a download link to the server endpoint
// Create a download link using the digest_url
const a = document.createElement('a');
a.href = `/api/download/file/${window.currentIngestId}`;
a.href = window.currentDigestUrl;
a.download = 'digest.txt';
document.body.appendChild(a);
a.click();

@@ -8,6 +8,7 @@ from __future__ import annotations
import json
import sys
import uuid
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict
from unittest.mock import AsyncMock
@@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery:
repo_name="test_repo",
local_path=Path("/tmp/test_repo").resolve(),
slug="test_user/test_repo",
id="id",
id=uuid.uuid4(),
branch="main",
max_file_size=1_000_000,
ignore_patterns={"*.pyc", "__pycache__", ".git"},

@@ -55,7 +55,7 @@ async def test_parse_query_without_host(
query = await parse_remote_repo(url)
# Compare against the canonical dict while ignoring unpredictable fields.
actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"})
actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"})
assert "commit" in actual
assert _is_valid_git_commit_hash(actual["commit"])

@@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/octocat/Hello-World",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None:
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/nonexistent/repo",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None:
# TODO: ingesting a large repo take too much time (eg: godotengine/godot repository)
form_data = {
"input_text": "https://github.com/octocat/hello-world",
"max_file_size": "10",
"max_file_size": 10,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None:
def make_request() -> None:
form_data = {
"input_text": "https://github.com/octocat/hello-world",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None:
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/octocat/Hello-World",
"max_file_size": "1",
"max_file_size": 1,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None:
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/octocat/Hello-World",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "include",
"pattern": "*.md",
"token": "",