feat: implement S3 integration for storing and retrieving digest files (#427)

Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Co-authored-by: Nicolas Iragne <nicoragne@hotmail.fr>
Mickael 2025-07-26 16:28:17 +02:00 committed by GitHub
parent 998cea15b4
commit 414e85189f
17 changed files with 688 additions and 38 deletions

.docker/minio/setup.sh Executable file

@@ -0,0 +1,33 @@
#!/bin/sh
# Simple script to set up MinIO bucket and user
# Based on example from MinIO issues
# Format bucket name to ensure compatibility
BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
# Configure MinIO client
mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}
# Remove bucket if it exists (for clean setup)
mc rm -r --force myminio/${BUCKET_NAME} || true
# Create bucket
mc mb myminio/${BUCKET_NAME}
# Set bucket policy to allow downloads
mc anonymous set download myminio/${BUCKET_NAME}
# Create user with access and secret keys
mc admin user add myminio ${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists"
# Create policy for the bucket
echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json
# Apply policy
mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists"
mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY}
echo "MinIO setup completed successfully"
echo "Bucket: ${BUCKET_NAME}"
echo "Access via console: http://localhost:9001"

@@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace
GITINGEST_SENTRY_SEND_DEFAULT_PII=true
# Environment name for Sentry (default: "")
GITINGEST_SENTRY_ENVIRONMENT=development
# MinIO Configuration (for development)
# Root user credentials for MinIO admin access
MINIO_ROOT_USER=minioadmin
MINIO_ROOT_PASSWORD=minioadmin
# S3 Configuration (for application)
# Set to "true" to enable S3 storage for digests
# S3_ENABLED=true
# Endpoint URL for the S3 service (MinIO in development)
S3_ENDPOINT=http://minio:9000
# Access key for the S3 bucket (created automatically in development)
S3_ACCESS_KEY=gitingest
# Secret key for the S3 bucket (created automatically in development)
S3_SECRET_KEY=gitingest123
# Name of the S3 bucket (created automatically in development)
S3_BUCKET_NAME=gitingest-bucket
# Region for the S3 bucket (default for MinIO)
S3_REGION=us-east-1
# Public URL/CDN for accessing S3 resources
S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket
# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
# S3_DIRECTORY_PREFIX=my-prefix

@@ -113,6 +113,7 @@ repos:
files: ^src/
additional_dependencies:
[
boto3>=1.28.0,
click>=8.0.0,
'fastapi[standard]>=0.109.1',
httpx,
@@ -138,6 +139,7 @@ repos:
- --rcfile=tests/.pylintrc
additional_dependencies:
[
boto3>=1.28.0,
click>=8.0.0,
'fastapi[standard]>=0.109.1',
httpx,

@@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default.
## 🐳 Self-host
### Using Docker
1. Build the image:
``` bash
@@ -239,6 +241,89 @@ The application can be configured using the following environment variables:
- **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0)
- **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace")
- **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true")
- **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, all S3 paths are prefixed with this value; see the sketch below)
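For illustration, the public link returned for a digest is the alias host joined with the generated object key, optionally prefixed by `S3_DIRECTORY_PREFIX`. This is a rough sketch of the behaviour of the helper added in `src/server/s3_utils.py`; the host, prefix, and key are example values:

```python
alias_host = "https://cdn.example.com"  # S3_ALIAS_HOST (example value)
prefix = "prod"                         # S3_DIRECTORY_PREFIX (optional, example value)
key = "ingest/github.com/octocat/Hello-World/<commit>/<patterns-hash>/octocat-Hello-World.txt"

digest_url = f"{alias_host.rstrip('/')}/{prefix}/{key}" if prefix else f"{alias_host.rstrip('/')}/{key}"
print(digest_url)
# https://cdn.example.com/prod/ingest/github.com/octocat/Hello-World/<commit>/<patterns-hash>/octocat-Hello-World.txt
```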
### Using Docker Compose
The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments.
#### Compose File Structure
The `compose.yml` file uses a YAML anchor (`&app-base`) and merge key (`<<: *app-base`) to define common configuration shared between services:
```yaml
# Common base configuration for all services
x-app-base: &app-base
build:
context: .
dockerfile: Dockerfile
ports:
- "${APP_WEB_BIND:-8000}:8000" # Main application port
- "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port
# ... other common configurations
```
#### Services
The file defines three services (plus a one-shot `minio-setup` helper that provisions the bucket and credentials):
1. **app**: Production service configuration
- Uses the `prod` profile
- Sets the Sentry environment to "production"
- Configured for stable operation with `restart: unless-stopped`
2. **app-dev**: Development service configuration
- Uses the `dev` profile
- Enables debug mode
- Mounts the source code for live development
- Uses hot reloading for faster development
3. **minio**: S3-compatible object storage for development
- Uses the `dev` profile (only available in development mode)
- Provides S3-compatible storage for local development
- Accessible via:
- API: Port 9000 ([localhost:9000](http://localhost:9000))
- Web Console: Port 9001 ([localhost:9001](http://localhost:9001))
- Default admin credentials:
- Username: `minioadmin`
- Password: `minioadmin`
- Configurable via environment variables:
- `MINIO_ROOT_USER`: Custom admin username (default: minioadmin)
- `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin)
- Includes persistent storage via Docker volume
- Auto-creates a bucket and application-specific credentials:
- Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`)
- Access key: `gitingest` (configurable via `S3_ACCESS_KEY`)
- Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`)
- These credentials are automatically passed to the app-dev service via environment variables:
- `S3_ENDPOINT`: URL of the MinIO server
- `S3_ACCESS_KEY`: Access key for the S3 bucket
- `S3_SECRET_KEY`: Secret key for the S3 bucket
- `S3_BUCKET_NAME`: Name of the S3 bucket
- `S3_REGION`: Region for the S3 bucket (default: us-east-1)
- `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
#### Usage Examples
To run the application in development mode:
```bash
docker compose --profile dev up
```
To run the application in production mode:
```bash
docker compose --profile prod up -d
```
To build and run the application:
```bash
docker compose --profile prod build
docker compose --profile prod up -d
```
## 🤝 Contributing

compose.yml Normal file

@@ -0,0 +1,111 @@
# Common base configuration for all services
x-app-base: &app-base
ports:
- "${APP_WEB_BIND:-8000}:8000" # Main application port
- "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port
environment:
# Python Configuration
- PYTHONUNBUFFERED=1
- PYTHONDONTWRITEBYTECODE=1
# Host Configuration
- ALLOWED_HOSTS=${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1}
# Metrics Configuration
- GITINGEST_METRICS_ENABLED=${GITINGEST_METRICS_ENABLED:-true}
- GITINGEST_METRICS_HOST=${GITINGEST_METRICS_HOST:-127.0.0.1}
- GITINGEST_METRICS_PORT=${GITINGEST_METRICS_PORT:-9090}
# Sentry Configuration
- GITINGEST_SENTRY_ENABLED=${GITINGEST_SENTRY_ENABLED:-false}
- GITINGEST_SENTRY_DSN=${GITINGEST_SENTRY_DSN:-}
- GITINGEST_SENTRY_TRACES_SAMPLE_RATE=${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0}
- GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE=${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0}
- GITINGEST_SENTRY_PROFILE_LIFECYCLE=${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace}
- GITINGEST_SENTRY_SEND_DEFAULT_PII=${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true}
user: "1000:1000"
command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]
services:
# Production service configuration
app:
<<: *app-base
image: ghcr.io/coderamp-labs/gitingest:latest
profiles:
- prod
environment:
- GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-production}
restart: unless-stopped
# Development service configuration
app-dev:
<<: *app-base
build:
context: .
dockerfile: Dockerfile
profiles:
- dev
environment:
- DEBUG=true
- GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-development}
# S3 Configuration
- S3_ENABLED=true
- S3_ENDPOINT=http://minio:9000
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
- S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
# Use lowercase bucket name to ensure compatibility with MinIO
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
- S3_REGION=${S3_REGION:-us-east-1}
- S3_DIRECTORY_PREFIX=${S3_DIRECTORY_PREFIX:-dev}
# Public URL for S3 resources
- S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}
volumes:
# Mount source code for live development
- ./src:/app:ro
# Use --reload flag for hot reloading during development
command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
depends_on:
minio-setup:
condition: service_completed_successfully
# MinIO S3-compatible object storage for development
minio:
image: minio/minio:latest
profiles:
- dev
ports:
- "9000:9000" # API port
- "9001:9001" # Console port
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
volumes:
- minio-data:/data
command: server /data --console-address ":9001"
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 30s
start_period: 30s
start_interval: 1s
# MinIO setup service to create bucket and user
minio-setup:
image: minio/mc
profiles:
- dev
depends_on:
minio:
condition: service_healthy
environment:
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
- S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
volumes:
- ./.docker/minio/setup.sh:/setup.sh:ro
entrypoint: sh
command: -c /setup.sh
volumes:
minio-data:
driver: local

@@ -44,6 +44,7 @@ dev = [
]
server = [
"boto3>=1.28.0", # AWS SDK for S3 support
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
"prometheus-client",
"sentry-sdk[fastapi]",

@@ -1,3 +1,4 @@
boto3>=1.28.0 # AWS SDK for S3 support
click>=8.0.0
fastapi[standard]>=0.109.1 # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
httpx

@@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
host = parsed_url.netloc
user, repo = _get_user_and_repo_from_path(parsed_url.path)
_id = str(uuid.uuid4())
_id = uuid.uuid4()
slug = f"{user}-{repo}"
local_path = TMP_BASE_PATH / _id / slug
local_path = TMP_BASE_PATH / str(_id) / slug
url = f"https://{host}/{user}/{repo}"
query = IngestionQuery(
@@ -132,7 +132,7 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
"""
path_obj = Path(path_str).resolve()
slug = path_obj.name if path_str == "." else path_str.strip("/")
return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
async def _configure_branch_or_tag(

@@ -3,6 +3,7 @@
from __future__ import annotations
from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
from pydantic import BaseModel, Field
@@ -27,7 +28,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
The URL of the repository.
slug : str
The slug of the repository.
id : str
id : UUID
The ID of the repository.
subpath : str
The subpath to the repository or file (default: ``"/"``).
@@ -47,6 +48,8 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
The patterns to include.
include_submodules : bool
Whether to include all Git submodules within the repository. (default: ``False``)
s3_url : str | None
The S3 URL where the digest is stored if S3 is enabled.
"""
@@ -56,7 +59,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
local_path: Path
url: str | None = None
slug: str
id: str
id: UUID
subpath: str = Field(default="/")
type: str | None = None
branch: str | None = None
@@ -66,6 +69,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
ignore_patterns: set[str] = Field(default_factory=set) # TODO: same type for ignore_* and include_* patterns
include_patterns: set[str] | None = None
include_submodules: bool = Field(default=False)
s3_url: str | None = None
def extract_clone_config(self) -> CloneConfig:
"""Extract the relevant fields for the CloneConfig object.

@@ -71,8 +71,8 @@ class IngestSuccessResponse(BaseModel):
Short form of repository URL (user/repo).
summary : str
Summary of the ingestion process including token estimates.
ingest_id : str
Ingestion id used to download full context.
digest_url : str
URL to download the full digest content (either S3 URL or local download endpoint).
tree : str
File tree structure of the repository.
content : str
@@ -89,7 +89,7 @@ class IngestSuccessResponse(BaseModel):
repo_url: str = Field(..., description="Original repository URL")
short_repo_url: str = Field(..., description="Short repository URL (user/repo)")
summary: str = Field(..., description="Ingestion summary with token estimates")
ingest_id: str = Field(..., description="Ingestion id used to download full context")
digest_url: str = Field(..., description="URL to download the full digest content")
tree: str = Field(..., description="File tree structure")
content: str = Field(..., description="Processed file content")
default_max_file_size: int = Field(..., description="File size slider position used")
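In practice `digest_url` takes one of two forms, mirroring the branching in `process_query`. A sketch with illustrative values:

```python
from uuid import uuid4

s3_enabled = False  # illustrative stand-in for is_s3_enabled()
ingest_id = uuid4()
s3_url = "https://cdn.example.com/ingest/github.com/octocat/Hello-World/<commit>/<hash>/octocat-Hello-World.txt"

digest_url = s3_url if s3_enabled else f"/api/download/file/{ingest_id}"
print(digest_url)  # local download route here; the S3/CDN link when S3 is enabled
```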

@@ -11,6 +11,7 @@ from gitingest.query_parser import parse_remote_repo
from gitingest.utils.git_utils import validate_github_token
from gitingest.utils.pattern_utils import process_patterns
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import Colors, log_slider_to_size
@@ -45,6 +46,11 @@ async def process_query(
IngestResponse
A union type, corresponding to IngestErrorResponse or IngestSuccessResponse
Raises
------
RuntimeError
If the commit hash is not found (should never happen).
"""
if token:
validate_github_token(token)
@@ -59,7 +65,6 @@
return IngestErrorResponse(error=str(exc))
query.url = cast("str", query.url)
query.host = cast("str", query.host)
query.max_file_size = max_file_size
query.ignore_patterns, query.include_patterns = process_patterns(
exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None,
@@ -71,13 +76,36 @@
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
# The commit hash should always be available at this point
if not query.commit:
msg = "Unexpected error: no commit hash found"
raise RuntimeError(msg)
try:
summary, tree, content = ingest_query(query)
# TODO: why are we writing the tree and content to a file here?
# Prepare the digest content (tree + content)
digest_content = tree + "\n" + content
# Store digest based on S3 configuration
if is_s3_enabled():
# Upload to S3 instead of storing locally
s3_file_path = generate_s3_file_path(
source=query.url,
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
# Store S3 URL in query for later use
query.s3_url = s3_url
else:
# Store locally
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
with local_txt_file.open("w", encoding="utf-8") as f:
f.write(tree + "\n" + content)
f.write(digest_content)
except Exception as exc:
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
@@ -97,11 +125,21 @@
summary=summary,
)
# Generate digest_url based on S3 configuration
if is_s3_enabled():
digest_url = getattr(query, "s3_url", None)
if not digest_url:
# This should not happen if S3 upload was successful
msg = "S3 is enabled but no S3 URL was generated"
raise RuntimeError(msg)
else:
digest_url = f"/api/download/file/{query.id}"
return IngestSuccessResponse(
repo_url=input_text,
short_repo_url=short_repo_url,
summary=summary,
ingest_id=query.id,
digest_url=digest_url,
tree=tree,
content=content,
default_max_file_size=slider_position,

@@ -1,12 +1,16 @@
"""Ingest endpoint for the API."""
from typing import Union
from uuid import UUID
from fastapi import APIRouter, HTTPException, Request, status
from fastapi.responses import FileResponse, JSONResponse
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
from prometheus_client import Counter
from gitingest.config import TMP_BASE_PATH
from server.models import IngestRequest
from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
from server.s3_utils import is_s3_enabled
from server.server_config import MAX_DISPLAY_SIZE
from server.server_utils import limiter
@@ -39,7 +43,7 @@
response = await _perform_ingestion(
input_text=ingest_request.input_text,
max_file_size=ingest_request.max_file_size,
pattern_type=ingest_request.pattern_type,
pattern_type=ingest_request.pattern_type.value,
pattern=ingest_request.pattern,
token=ingest_request.token,
)
@@ -90,30 +94,42 @@ async def api_ingest_get(
return response
@router.get("/api/download/file/{ingest_id}", response_class=FileResponse)
async def download_ingest(ingest_id: str) -> FileResponse:
@router.get("/api/download/file/{ingest_id}", response_model=None)
async def download_ingest(
ingest_id: UUID,
) -> Union[RedirectResponse, FileResponse]: # noqa: FA100 (future-rewritable-type-annotation) (pydantic)
"""Download the first text file produced for an ingest ID.
**This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
and returns it as a downloadable file. The file is streamed with media type ``text/plain``
and prompts the browser to download it.
and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled
and clients should use the S3 URL provided in the ingest response instead.
**Parameters**
- **ingest_id** (`str`): Identifier that the ingest step emitted
- **ingest_id** (`UUID`): Identifier that the ingest step emitted
**Returns**
- **FileResponse**: Streamed response with media type ``text/plain``
- **FileResponse**: Streamed response with media type ``text/plain`` for local files
**Raises**
- **HTTPException**: **503** - endpoint is disabled when S3 is enabled
- **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file
- **HTTPException**: **403** - the process lacks permission to read the directory or file
"""
# Disable download endpoint when S3 is enabled
if is_s3_enabled():
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Download endpoint is disabled when S3 is enabled. "
"Use the S3 URL provided in the ingest response instead.",
)
# Fall back to local file serving
# Normalize and validate the directory path
directory = (TMP_BASE_PATH / ingest_id).resolve()
directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
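From a client's perspective, the flow is now: POST the ingest request, read `digest_url` from the response, and fetch that URL, which is either the S3/CDN link or this local endpoint. A sketch with httpx, assuming a local deployment on port 8000, that the JSON endpoint is mounted at `/api/ingest`, and request fields mirroring the integration tests:

```python
import httpx

BASE = "http://localhost:8000"  # assumed local deployment

payload = {
    "input_text": "https://github.com/octocat/Hello-World",
    "max_file_size": 243,
    "pattern_type": "exclude",
    "pattern": "",
    "token": "",
}
resp = httpx.post(f"{BASE}/api/ingest", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

digest_url = data["digest_url"]
if digest_url.startswith("/"):  # local storage: relative download route
    digest_url = f"{BASE}{digest_url}"
digest = httpx.get(digest_url, follow_redirects=True, timeout=120).text
print(data["summary"])
print(len(digest), "characters in digest")
```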

src/server/s3_utils.py Normal file

@@ -0,0 +1,335 @@
"""S3 utility functions for uploading and managing digest files."""
from __future__ import annotations
import hashlib
import logging
import os
from typing import TYPE_CHECKING
from urllib.parse import urlparse
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
import boto3
from botocore.exceptions import ClientError
if TYPE_CHECKING:
from botocore.client import BaseClient
# Initialize logger for this module
logger = logging.getLogger(__name__)
class S3UploadError(Exception):
"""Custom exception for S3 upload failures."""
def is_s3_enabled() -> bool:
"""Check if S3 is enabled via environment variables."""
return os.getenv("S3_ENABLED", "false").lower() == "true"
def get_s3_config() -> dict[str, str | None]:
"""Get S3 configuration from environment variables."""
config = {
"endpoint_url": os.getenv("S3_ENDPOINT"),
"aws_access_key_id": os.getenv("S3_ACCESS_KEY"),
"aws_secret_access_key": os.getenv("S3_SECRET_KEY"),
"region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"),
}
return {k: v for k, v in config.items() if v is not None}
def get_s3_bucket_name() -> str:
"""Get S3 bucket name from environment variables."""
return os.getenv("S3_BUCKET_NAME", "gitingest-bucket")
def get_s3_alias_host() -> str | None:
"""Get S3 alias host for public URLs."""
return os.getenv("S3_ALIAS_HOST")
def generate_s3_file_path(
source: str,
user_name: str,
repo_name: str,
commit: str,
include_patterns: set[str] | None,
ignore_patterns: set[str],
) -> str:
"""Generate S3 file path with proper naming convention.
The file path is formatted as:
[<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/
<exclude&include hash>/<owner>-<repo-name>.txt
If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path.
The commit-ID is always included in the URL.
If no specific commit is provided, the actual commit hash from the cloned repository is used.
Parameters
----------
source : str
Git host (e.g., github, gitlab, bitbucket, etc.).
user_name : str
Repository owner or user.
repo_name : str
Repository name.
commit : str
Commit hash.
include_patterns : set[str] | None
Set of patterns specifying which files to include.
ignore_patterns : set[str]
Set of patterns specifying which files to exclude.
Returns
-------
str
S3 file path string.
Raises
------
ValueError
If the source URL is invalid.
"""
hostname = urlparse(source).hostname
if hostname is None:
msg = "Invalid source URL"
logger.error(msg)
raise ValueError(msg)
# Create hash of exclude/include patterns for uniqueness
patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
patterns_str += f"exclude:{sorted(ignore_patterns)}"
patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
# Build the base path using hostname directly
base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"
# Check for S3_DIRECTORY_PREFIX environment variable
s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")
if not s3_directory_prefix:
return base_path
# Remove trailing slash if present and add the prefix
s3_directory_prefix = s3_directory_prefix.rstrip("/")
return f"{s3_directory_prefix}/{base_path}"
def create_s3_client() -> BaseClient:
"""Create and return an S3 client with configuration from environment."""
config = get_s3_config()
# Log S3 client creation (excluding sensitive info)
log_config = config.copy()
access_key = log_config.pop("aws_access_key_id", None)
secret_key = log_config.pop("aws_secret_access_key", None)
has_credentials = bool(access_key or secret_key)  # pop both keys so neither ends up in the logged config
logger.debug(
msg="Creating S3 client",
extra={
"s3_config": log_config,
"has_credentials": has_credentials,
},
)
return boto3.client("s3", **config)
def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
"""Upload content to S3 and return the public URL.
This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file.
The ingest ID is stored as an S3 object tag.
Parameters
----------
content : str
The digest content to upload.
s3_file_path : str
The S3 file path where the content will be stored.
ingest_id : UUID
The ingest ID to store as an S3 object tag.
Returns
-------
str
Public URL to access the uploaded file.
Raises
------
ValueError
If S3 is not enabled.
S3UploadError
If the upload to S3 fails.
"""
if not is_s3_enabled():
msg = "S3 is not enabled"
logger.error(msg)
raise ValueError(msg)
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
extra_fields = {
"bucket_name": bucket_name,
"s3_file_path": s3_file_path,
"ingest_id": str(ingest_id),
"content_size": len(content),
}
# Log upload attempt
logger.debug("Starting S3 upload", extra=extra_fields)
try:
# Upload the content with ingest_id as tag
s3_client.put_object(
Bucket=bucket_name,
Key=s3_file_path,
Body=content.encode("utf-8"),
ContentType="text/plain",
Tagging=f"ingest_id={ingest_id!s}",
)
except ClientError as err:
# Log upload failure
logger.exception(
"S3 upload failed",
extra={
"bucket_name": bucket_name,
"s3_file_path": s3_file_path,
"ingest_id": str(ingest_id),
"error_code": err.response.get("Error", {}).get("Code"),
"error_message": str(err),
},
)
msg = f"Failed to upload to S3: {err}"
raise S3UploadError(msg) from err
# Generate public URL
alias_host = get_s3_alias_host()
if alias_host:
# Use alias host if configured
public_url = f"{alias_host.rstrip('/')}/{s3_file_path}"
else:
# Fallback to direct S3 URL
endpoint = get_s3_config().get("endpoint_url")
if endpoint:
public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}"
else:
public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}"
# Log successful upload
logger.debug(
"S3 upload completed successfully",
extra={
"bucket_name": bucket_name,
"s3_file_path": s3_file_path,
"ingest_id": str(ingest_id),
"public_url": public_url,
},
)
return public_url
def _build_s3_url(key: str) -> str:
"""Build S3 URL for a given key."""
alias_host = get_s3_alias_host()
if alias_host:
return f"{alias_host.rstrip('/')}/{key}"
bucket_name = get_s3_bucket_name()
config = get_s3_config()
endpoint = config.get("endpoint_url")  # absent when talking to AWS directly
if endpoint:
return f"{endpoint.rstrip('/')}/{bucket_name}/{key}"
return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}"
def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool:
"""Check if an S3 object has the matching ingest_id tag."""
try:
tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
return tags.get("ingest_id") == str(target_ingest_id)
except ClientError:
return False
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
"""Get S3 URL for a given ingest ID if it exists.
Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found.
Used by the download endpoint to redirect to S3 if available.
Parameters
----------
ingest_id : UUID
The ingest ID to search for in S3 object tags.
Returns
-------
str | None
S3 URL if file exists, None otherwise.
"""
if not is_s3_enabled():
logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id)
return None
logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)})
try:
s3_client = create_s3_client()
bucket_name = get_s3_bucket_name()
# List all objects in the ingest/ prefix and check their tags
paginator = s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/")
objects_checked = 0
for page in page_iterator:
if "Contents" not in page:
continue
for obj in page["Contents"]:
key = obj["Key"]
objects_checked += 1
if _check_object_tags(
s3_client=s3_client,
bucket_name=bucket_name,
key=key,
target_ingest_id=ingest_id,
):
s3_url = _build_s3_url(key)
logger.debug(
msg="Found S3 object for ingest ID",
extra={
"ingest_id": str(ingest_id),
"s3_key": key,
"s3_url": s3_url,
"objects_checked": objects_checked,
},
)
return s3_url
logger.debug(
msg="No S3 object found for ingest ID",
extra={
"ingest_id": str(ingest_id),
"objects_checked": objects_checked,
},
)
except ClientError as err:
logger.exception(
msg="Error during S3 URL lookup",
extra={
"ingest_id": str(ingest_id),
"error_code": err.response.get("Error", {}).get("Code"),
"error_message": str(err),
},
)
return None
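Although the download endpoint added in this commit returns 503 when S3 is enabled rather than redirecting, a caller that wants to resolve an existing digest by ingest ID could use this helper roughly as follows (a hypothetical sketch, not part of this change):

```python
from uuid import UUID

from fastapi import HTTPException, status
from fastapi.responses import RedirectResponse

from server.s3_utils import get_s3_url_for_ingest_id


def redirect_to_digest(ingest_id: UUID) -> RedirectResponse:
    """Hypothetical helper: redirect to the S3 object tagged with this ingest ID."""
    s3_url = get_s3_url_for_ingest_id(ingest_id)
    if s3_url is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Digest not found")
    return RedirectResponse(url=s3_url, status_code=status.HTTP_302_FOUND)
```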

@@ -172,8 +172,8 @@ function handleSuccessfulResponse(data) {
// Show results section
showResults();
// Store the ingest_id for download functionality
window.currentIngestId = data.ingest_id;
// Store the digest_url for download functionality
window.currentDigestUrl = data.digest_url;
// Set plain text content for summary, tree, and content
document.getElementById('result-summary').value = data.summary || '';
@@ -271,9 +271,9 @@ function copyFullDigest() {
}
function downloadFullDigest() {
// Check if we have an ingest_id
if (!window.currentIngestId) {
console.error('No ingest_id available for download');
// Check if we have a digest_url
if (!window.currentDigestUrl) {
console.error('No digest_url available for download');
return;
}
@@ -289,10 +289,10 @@ function downloadFullDigest() {
Downloading...
`;
// Create a download link to the server endpoint
// Create a download link using the digest_url
const a = document.createElement('a');
a.href = `/api/download/file/${window.currentIngestId}`;
a.href = window.currentDigestUrl;
a.download = 'digest.txt';
document.body.appendChild(a);
a.click();

@@ -8,6 +8,7 @@ from __future__ import annotations
import json
import sys
import uuid
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict
from unittest.mock import AsyncMock
@@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery:
repo_name="test_repo",
local_path=Path("/tmp/test_repo").resolve(),
slug="test_user/test_repo",
id="id",
id=uuid.uuid4(),
branch="main",
max_file_size=1_000_000,
ignore_patterns={"*.pyc", "__pycache__", ".git"},

@@ -55,7 +55,7 @@ async def test_parse_query_without_host(
query = await parse_remote_repo(url)
# Compare against the canonical dict while ignoring unpredictable fields.
actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"})
actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"})
assert "commit" in actual
assert _is_valid_git_commit_hash(actual["commit"])

@@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/octocat/Hello-World",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None:
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/nonexistent/repo",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None:
# TODO: ingesting a large repo take too much time (eg: godotengine/godot repository)
form_data = {
"input_text": "https://github.com/octocat/hello-world",
"max_file_size": "10",
"max_file_size": 10,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None:
def make_request() -> None:
form_data = {
"input_text": "https://github.com/octocat/hello-world",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None:
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/octocat/Hello-World",
"max_file_size": "1",
"max_file_size": 1,
"pattern_type": "exclude",
"pattern": "",
"token": "",
@@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None:
client = request.getfixturevalue("test_client")
form_data = {
"input_text": "https://github.com/octocat/Hello-World",
"max_file_size": "243",
"max_file_size": 243,
"pattern_type": "include",
"pattern": "*.md",
"token": "",