feat: implement S3 integration for storing and retrieving digest files (#427)
Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Co-authored-by: Nicolas Iragne <nicoragne@hotmail.fr>
parent 998cea15b4
commit 414e85189f
17 changed files with 688 additions and 38 deletions
.docker/minio/setup.sh (new executable file, 33 lines)

@@ -0,0 +1,33 @@
#!/bin/sh

# Simple script to set up MinIO bucket and user
# Based on example from MinIO issues

# Format bucket name to ensure compatibility
BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

# Configure MinIO client
mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}

# Remove bucket if it exists (for clean setup)
mc rm -r --force myminio/${BUCKET_NAME} || true

# Create bucket
mc mb myminio/${BUCKET_NAME}

# Set bucket policy to allow downloads
mc anonymous set download myminio/${BUCKET_NAME}

# Create user with access and secret keys
mc admin user add myminio ${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists"

# Create policy for the bucket
echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json

# Apply policy
mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists"
mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY}

echo "MinIO setup completed successfully"
echo "Bucket: ${BUCKET_NAME}"
echo "Access via console: http://localhost:9001"
.env.example (23 changes)

@@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace
 GITINGEST_SENTRY_SEND_DEFAULT_PII=true
 # Environment name for Sentry (default: "")
 GITINGEST_SENTRY_ENVIRONMENT=development
+
+# MinIO Configuration (for development)
+# Root user credentials for MinIO admin access
+MINIO_ROOT_USER=minioadmin
+MINIO_ROOT_PASSWORD=minioadmin
+
+# S3 Configuration (for application)
+# Set to "true" to enable S3 storage for digests
+# S3_ENABLED=true
+# Endpoint URL for the S3 service (MinIO in development)
+S3_ENDPOINT=http://minio:9000
+# Access key for the S3 bucket (created automatically in development)
+S3_ACCESS_KEY=gitingest
+# Secret key for the S3 bucket (created automatically in development)
+S3_SECRET_KEY=gitingest123
+# Name of the S3 bucket (created automatically in development)
+S3_BUCKET_NAME=gitingest-bucket
+# Region for the S3 bucket (default for MinIO)
+S3_REGION=us-east-1
+# Public URL/CDN for accessing S3 resources
+S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket
+# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
+# S3_DIRECTORY_PREFIX=my-prefix
.pre-commit-config.yaml

@@ -113,6 +113,7 @@ repos:
         files: ^src/
         additional_dependencies:
           [
+            boto3>=1.28.0,
             click>=8.0.0,
             'fastapi[standard]>=0.109.1',
             httpx,

@@ -138,6 +139,7 @@ repos:
           - --rcfile=tests/.pylintrc
         additional_dependencies:
           [
+            boto3>=1.28.0,
             click>=8.0.0,
             'fastapi[standard]>=0.109.1',
             httpx,
README.md (85 changes)

@@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default.
 
 ## 🐳 Self-host
 
+### Using Docker
+
 1. Build the image:
 
 ``` bash

@@ -239,6 +241,89 @@ The application can be configured using the following environment variables:
 - **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0)
 - **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace")
 - **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true")
+- **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
+- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
+
+### Using Docker Compose
+
+The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments.
+
+#### Compose File Structure
+
+The `compose.yml` file uses YAML anchoring with `&app-base` and `<<: *app-base` to define common configuration that is shared between services:
+
+```yaml
+# Common base configuration for all services
+x-app-base: &app-base
+  build:
+    context: .
+    dockerfile: Dockerfile
+  ports:
+    - "${APP_WEB_BIND:-8000}:8000"  # Main application port
+    - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090"  # Metrics port
+  # ... other common configurations
+```
+
+#### Services
+
+The file defines three services:
+
+1. **app**: Production service configuration
+   - Uses the `prod` profile
+   - Sets the Sentry environment to "production"
+   - Configured for stable operation with `restart: unless-stopped`
+
+2. **app-dev**: Development service configuration
+   - Uses the `dev` profile
+   - Enables debug mode
+   - Mounts the source code for live development
+   - Uses hot reloading for faster development
+
+3. **minio**: S3-compatible object storage for development
+   - Uses the `dev` profile (only available in development mode)
+   - Provides S3-compatible storage for local development
+   - Accessible via:
+     - API: Port 9000 ([localhost:9000](http://localhost:9000))
+     - Web Console: Port 9001 ([localhost:9001](http://localhost:9001))
+   - Default admin credentials:
+     - Username: `minioadmin`
+     - Password: `minioadmin`
+   - Configurable via environment variables:
+     - `MINIO_ROOT_USER`: Custom admin username (default: minioadmin)
+     - `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin)
+   - Includes persistent storage via Docker volume
+   - Auto-creates a bucket and application-specific credentials:
+     - Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`)
+     - Access key: `gitingest` (configurable via `S3_ACCESS_KEY`)
+     - Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`)
+   - These credentials are automatically passed to the app-dev service via environment variables:
+     - `S3_ENDPOINT`: URL of the MinIO server
+     - `S3_ACCESS_KEY`: Access key for the S3 bucket
+     - `S3_SECRET_KEY`: Secret key for the S3 bucket
+     - `S3_BUCKET_NAME`: Name of the S3 bucket
+     - `S3_REGION`: Region for the S3 bucket (default: us-east-1)
+     - `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
+
+#### Usage Examples
+
+To run the application in development mode:
+
+```bash
+docker compose --profile dev up
+```
+
+To run the application in production mode:
+
+```bash
+docker compose --profile prod up -d
+```
+
+To build and run the application:
+
+```bash
+docker compose --profile prod build
+docker compose --profile prod up -d
+```
+
 
 ## 🤝 Contributing
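Inside the dev container, these variables feed straight into the application's S3 client. A minimal sketch of that wiring, mirroring `create_s3_client` in `src/server/s3_utils.py` (shown later in this diff):

```python
import os

import boto3

# Sketch only: compose.yml injects these variables into the app-dev service.
client = boto3.client(
    "s3",
    endpoint_url=os.getenv("S3_ENDPOINT"),  # http://minio:9000 in development
    aws_access_key_id=os.getenv("S3_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("S3_SECRET_KEY"),
    region_name=os.getenv("S3_REGION", "us-east-1"),
)
```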
compose.yml (new file, 111 lines)

@@ -0,0 +1,111 @@
# Common base configuration for all services
x-app-base: &app-base
  ports:
    - "${APP_WEB_BIND:-8000}:8000"  # Main application port
    - "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090"  # Metrics port
  environment:
    # Python Configuration
    - PYTHONUNBUFFERED=1
    - PYTHONDONTWRITEBYTECODE=1
    # Host Configuration
    - ALLOWED_HOSTS=${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1}
    # Metrics Configuration
    - GITINGEST_METRICS_ENABLED=${GITINGEST_METRICS_ENABLED:-true}
    - GITINGEST_METRICS_HOST=${GITINGEST_METRICS_HOST:-127.0.0.1}
    - GITINGEST_METRICS_PORT=${GITINGEST_METRICS_PORT:-9090}
    # Sentry Configuration
    - GITINGEST_SENTRY_ENABLED=${GITINGEST_SENTRY_ENABLED:-false}
    - GITINGEST_SENTRY_DSN=${GITINGEST_SENTRY_DSN:-}
    - GITINGEST_SENTRY_TRACES_SAMPLE_RATE=${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0}
    - GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE=${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0}
    - GITINGEST_SENTRY_PROFILE_LIFECYCLE=${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace}
    - GITINGEST_SENTRY_SEND_DEFAULT_PII=${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true}
  user: "1000:1000"
  command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]

services:
  # Production service configuration
  app:
    <<: *app-base
    image: ghcr.io/coderamp-labs/gitingest:latest
    profiles:
      - prod
    environment:
      - GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-production}
    restart: unless-stopped

  # Development service configuration
  app-dev:
    <<: *app-base
    build:
      context: .
      dockerfile: Dockerfile
    profiles:
      - dev
    environment:
      - DEBUG=true
      - GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-development}
      # S3 Configuration
      - S3_ENABLED=true
      - S3_ENDPOINT=http://minio:9000
      - S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
      - S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
      # Use lowercase bucket name to ensure compatibility with MinIO
      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
      - S3_REGION=${S3_REGION:-us-east-1}
      - S3_DIRECTORY_PREFIX=${S3_DIRECTORY_PREFIX:-dev}
      # Public URL for S3 resources
      - S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}
    volumes:
      # Mount source code for live development
      - ./src:/app:ro
    # Use --reload flag for hot reloading during development
    command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
    depends_on:
      minio-setup:
        condition: service_completed_successfully

  # MinIO S3-compatible object storage for development
  minio:
    image: minio/minio:latest
    profiles:
      - dev
    ports:
      - "9000:9000"  # API port
      - "9001:9001"  # Console port
    environment:
      - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
      - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
    volumes:
      - minio-data:/data
    command: server /data --console-address ":9001"
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 30s
      start_period: 30s
      start_interval: 1s

  # MinIO setup service to create bucket and user
  minio-setup:
    image: minio/mc
    profiles:
      - dev
    depends_on:
      minio:
        condition: service_healthy
    environment:
      - MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
      - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
      - S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
      - S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
      - S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
    volumes:
      - ./.docker/minio/setup.sh:/setup.sh:ro
    entrypoint: sh
    command: -c /setup.sh

volumes:
  minio-data:
    driver: local
pyproject.toml

@@ -44,6 +44,7 @@ dev = [
 ]
 
 server = [
+  "boto3>=1.28.0",  # AWS SDK for S3 support
   "fastapi[standard]>=0.109.1",  # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
   "prometheus-client",
   "sentry-sdk[fastapi]",
requirements.txt

@@ -1,3 +1,4 @@
+boto3>=1.28.0  # AWS SDK for S3 support
 click>=8.0.0
 fastapi[standard]>=0.109.1  # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
 httpx
src/gitingest/query_parser.py

@@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
     host = parsed_url.netloc
     user, repo = _get_user_and_repo_from_path(parsed_url.path)
 
-    _id = str(uuid.uuid4())
+    _id = uuid.uuid4()
     slug = f"{user}-{repo}"
-    local_path = TMP_BASE_PATH / _id / slug
+    local_path = TMP_BASE_PATH / str(_id) / slug
     url = f"https://{host}/{user}/{repo}"
 
     query = IngestionQuery(

@@ -132,7 +132,7 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
     """
     path_obj = Path(path_str).resolve()
     slug = path_obj.name if path_str == "." else path_str.strip("/")
-    return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
+    return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
 
 
 async def _configure_branch_or_tag(
src/gitingest/schemas (IngestionQuery)

@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from pathlib import Path  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from pydantic import BaseModel, Field
 

@@ -27,7 +28,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The URL of the repository.
     slug : str
         The slug of the repository.
-    id : str
+    id : UUID
         The ID of the repository.
     subpath : str
         The subpath to the repository or file (default: ``"/"``).

@@ -47,6 +48,8 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The patterns to include.
     include_submodules : bool
         Whether to include all Git submodules within the repository. (default: ``False``)
+    s3_url : str | None
+        The S3 URL where the digest is stored if S3 is enabled.
 
     """
 

@@ -56,7 +59,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     local_path: Path
     url: str | None = None
     slug: str
-    id: str
+    id: UUID
     subpath: str = Field(default="/")
     type: str | None = None
     branch: str | None = None

@@ -66,6 +69,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     ignore_patterns: set[str] = Field(default_factory=set)  # TODO: same type for ignore_* and include_* patterns
     include_patterns: set[str] | None = None
     include_submodules: bool = Field(default=False)
+    s3_url: str | None = None
 
     def extract_clone_config(self) -> CloneConfig:
         """Extract the relevant fields for the CloneConfig object.
src/server/models.py

@@ -71,8 +71,8 @@ class IngestSuccessResponse(BaseModel):
         Short form of repository URL (user/repo).
     summary : str
         Summary of the ingestion process including token estimates.
-    ingest_id : str
-        Ingestion id used to download full context.
+    digest_url : str
+        URL to download the full digest content (either S3 URL or local download endpoint).
     tree : str
         File tree structure of the repository.
     content : str

@@ -89,7 +89,7 @@ class IngestSuccessResponse(BaseModel):
     repo_url: str = Field(..., description="Original repository URL")
     short_repo_url: str = Field(..., description="Short repository URL (user/repo)")
     summary: str = Field(..., description="Ingestion summary with token estimates")
-    ingest_id: str = Field(..., description="Ingestion id used to download full context")
+    digest_url: str = Field(..., description="URL to download the full digest content")
     tree: str = Field(..., description="File tree structure")
     content: str = Field(..., description="Processed file content")
     default_max_file_size: int = Field(..., description="File size slider position used")
src/server/query_processor.py

@@ -11,6 +11,7 @@ from gitingest.query_parser import parse_remote_repo
 from gitingest.utils.git_utils import validate_github_token
 from gitingest.utils.pattern_utils import process_patterns
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
+from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import Colors, log_slider_to_size
 

@@ -45,6 +46,11 @@ async def process_query(
     IngestResponse
         A union type, corresponding to IngestErrorResponse or IngestSuccessResponse
 
+    Raises
+    ------
+    RuntimeError
+        If the commit hash is not found (should never happen).
+
     """
     if token:
         validate_github_token(token)

@@ -59,7 +65,6 @@ async def process_query(
         return IngestErrorResponse(error=str(exc))
 
     query.url = cast("str", query.url)
     query.host = cast("str", query.host)
     query.max_file_size = max_file_size
     query.ignore_patterns, query.include_patterns = process_patterns(
         exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None,

@@ -71,13 +76,36 @@ async def process_query(
 
     short_repo_url = f"{query.user_name}/{query.repo_name}"  # Sets the "<user>/<repo>" for the page title
 
+    # The commit hash should always be available at this point
+    if not query.commit:
+        msg = "Unexpected error: no commit hash found"
+        raise RuntimeError(msg)
+
     try:
         summary, tree, content = ingest_query(query)
 
+        # TODO: why are we writing the tree and content to a file here?
+        # Prepare the digest content (tree + content)
+        digest_content = tree + "\n" + content
+
+        # Store digest based on S3 configuration
+        if is_s3_enabled():
+            # Upload to S3 instead of storing locally
+            s3_file_path = generate_s3_file_path(
+                source=query.url,
+                user_name=cast("str", query.user_name),
+                repo_name=cast("str", query.repo_name),
+                commit=query.commit,
+                include_patterns=query.include_patterns,
+                ignore_patterns=query.ignore_patterns,
+            )
+            s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
+            # Store S3 URL in query for later use
+            query.s3_url = s3_url
+        else:
+            # Store locally
             local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
             with local_txt_file.open("w", encoding="utf-8") as f:
-                f.write(tree + "\n" + content)
+                f.write(digest_content)
 
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)

@@ -97,11 +125,21 @@ async def process_query(
         summary=summary,
     )
 
+    # Generate digest_url based on S3 configuration
+    if is_s3_enabled():
+        digest_url = getattr(query, "s3_url", None)
+        if not digest_url:
+            # This should not happen if S3 upload was successful
+            msg = "S3 is enabled but no S3 URL was generated"
+            raise RuntimeError(msg)
+    else:
+        digest_url = f"/api/download/file/{query.id}"
+
     return IngestSuccessResponse(
         repo_url=input_text,
         short_repo_url=short_repo_url,
         summary=summary,
-        ingest_id=query.id,
+        digest_url=digest_url,
         tree=tree,
         content=content,
         default_max_file_size=slider_position,
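Depending on configuration, `digest_url` therefore takes one of two shapes: with S3 enabled it is the public object URL returned by `upload_to_s3` (with the dev defaults, something like `http://127.0.0.1:9000/gitingest-bucket/dev/ingest/...`, illustrative only); without S3 it is the local endpoint `/api/download/file/<uuid>` served by the router below.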
src/server/routers (ingest endpoint)

@@ -1,12 +1,16 @@
 """Ingest endpoint for the API."""
 
+from typing import Union
+from uuid import UUID
+
 from fastapi import APIRouter, HTTPException, Request, status
-from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
 from prometheus_client import Counter
 
 from gitingest.config import TMP_BASE_PATH
 from server.models import IngestRequest
 from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
+from server.s3_utils import is_s3_enabled
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import limiter
 

@@ -39,7 +43,7 @@ async def api_ingest(
     response = await _perform_ingestion(
         input_text=ingest_request.input_text,
         max_file_size=ingest_request.max_file_size,
-        pattern_type=ingest_request.pattern_type,
+        pattern_type=ingest_request.pattern_type.value,
         pattern=ingest_request.pattern,
         token=ingest_request.token,
     )

@@ -90,30 +94,42 @@ async def api_ingest_get(
     return response
 
 
-@router.get("/api/download/file/{ingest_id}", response_class=FileResponse)
-async def download_ingest(ingest_id: str) -> FileResponse:
+@router.get("/api/download/file/{ingest_id}", response_model=None)
+async def download_ingest(
+    ingest_id: UUID,
+) -> Union[RedirectResponse, FileResponse]:  # noqa: FA100 (future-rewritable-type-annotation) (pydantic)
     """Download the first text file produced for an ingest ID.
 
     **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
-    and returns it as a downloadable file. The file is streamed with media type ``text/plain``
-    and prompts the browser to download it.
+    and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled
+    and clients should use the S3 URL provided in the ingest response instead.
 
     **Parameters**
 
-    - **ingest_id** (`str`): Identifier that the ingest step emitted
+    - **ingest_id** (`UUID`): Identifier that the ingest step emitted
 
     **Returns**
 
-    - **FileResponse**: Streamed response with media type ``text/plain``
+    - **FileResponse**: Streamed response with media type ``text/plain`` for local files
 
     **Raises**
 
+    - **HTTPException**: **503** - endpoint is disabled when S3 is enabled
     - **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file
     - **HTTPException**: **403** - the process lacks permission to read the directory or file
 
     """
+    # Disable download endpoint when S3 is enabled
+    if is_s3_enabled():
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Download endpoint is disabled when S3 is enabled. "
+            "Use the S3 URL provided in the ingest response instead.",
+        )
+
+    # Fall back to local file serving
     # Normalize and validate the directory path
-    directory = (TMP_BASE_PATH / ingest_id).resolve()
+    directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
     if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
         raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
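A quick way to see the new behaviour end to end (a hedged sketch; `httpx` is already a project dependency, and the host, port, and ingest ID are placeholders):

```python
import httpx

# Placeholder UUID; a real one is emitted by a prior ingest call.
ingest_id = "00000000-0000-0000-0000-000000000000"
resp = httpx.get(f"http://localhost:8000/api/download/file/{ingest_id}")

# With S3 enabled the endpoint answers 503 and clients should follow the
# digest_url from the ingest response; without S3 it streams the locally
# stored digest as text/plain (or 404 if no digest exists for the ID).
print(resp.status_code)
```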
src/server/s3_utils.py (new file, 335 lines)

@@ -0,0 +1,335 @@
"""S3 utility functions for uploading and managing digest files."""

from __future__ import annotations

import hashlib
import logging
import os
from typing import TYPE_CHECKING
from urllib.parse import urlparse
from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)

import boto3
from botocore.exceptions import ClientError

if TYPE_CHECKING:
    from botocore.client import BaseClient

# Initialize logger for this module
logger = logging.getLogger(__name__)


class S3UploadError(Exception):
    """Custom exception for S3 upload failures."""


def is_s3_enabled() -> bool:
    """Check if S3 is enabled via environment variables."""
    return os.getenv("S3_ENABLED", "false").lower() == "true"


def get_s3_config() -> dict[str, str | None]:
    """Get S3 configuration from environment variables."""
    config = {
        "endpoint_url": os.getenv("S3_ENDPOINT"),
        "aws_access_key_id": os.getenv("S3_ACCESS_KEY"),
        "aws_secret_access_key": os.getenv("S3_SECRET_KEY"),
        "region_name": os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1"),
    }
    return {k: v for k, v in config.items() if v is not None}


def get_s3_bucket_name() -> str:
    """Get S3 bucket name from environment variables."""
    return os.getenv("S3_BUCKET_NAME", "gitingest-bucket")


def get_s3_alias_host() -> str | None:
    """Get S3 alias host for public URLs."""
    return os.getenv("S3_ALIAS_HOST")


def generate_s3_file_path(
    source: str,
    user_name: str,
    repo_name: str,
    commit: str,
    include_patterns: set[str] | None,
    ignore_patterns: set[str],
) -> str:
    """Generate S3 file path with proper naming convention.

    The file path is formatted as:
    [<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<commit-ID>/
    <exclude&include hash>/<owner>-<repo-name>.txt

    If the S3_DIRECTORY_PREFIX environment variable is set, it is prepended to the path.
    The commit ID is always included in the URL.
    If no specific commit is provided, the actual commit hash from the cloned repository is used.

    Parameters
    ----------
    source : str
        Git host (e.g., github, gitlab, bitbucket, etc.).
    user_name : str
        Repository owner or user.
    repo_name : str
        Repository name.
    commit : str
        Commit hash.
    include_patterns : set[str] | None
        Set of patterns specifying which files to include.
    ignore_patterns : set[str]
        Set of patterns specifying which files to exclude.

    Returns
    -------
    str
        S3 file path string.

    Raises
    ------
    ValueError
        If the source URL is invalid.

    """
    hostname = urlparse(source).hostname
    if hostname is None:
        msg = "Invalid source URL"
        logger.error(msg)
        raise ValueError(msg)

    # Create hash of exclude/include patterns for uniqueness
    patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
    patterns_str += f"exclude:{sorted(ignore_patterns)}"
    patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]

    # Build the base path using hostname directly
    base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"

    # Check for S3_DIRECTORY_PREFIX environment variable
    s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")

    if not s3_directory_prefix:
        return base_path

    # Remove trailing slash if present and add the prefix
    s3_directory_prefix = s3_directory_prefix.rstrip("/")
    return f"{s3_directory_prefix}/{base_path}"


def create_s3_client() -> BaseClient:
    """Create and return an S3 client with configuration from environment."""
    config = get_s3_config()
    # Log S3 client creation (excluding sensitive info)
    log_config = config.copy()
    has_credentials = bool(log_config.pop("aws_access_key_id", None) or log_config.pop("aws_secret_access_key", None))
    logger.debug(
        msg="Creating S3 client",
        extra={
            "s3_config": log_config,
            "has_credentials": has_credentials,
        },
    )
    return boto3.client("s3", **config)


def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
    """Upload content to S3 and return the public URL.

    This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file.
    The ingest ID is stored as an S3 object tag.

    Parameters
    ----------
    content : str
        The digest content to upload.
    s3_file_path : str
        The S3 file path where the content will be stored.
    ingest_id : UUID
        The ingest ID to store as an S3 object tag.

    Returns
    -------
    str
        Public URL to access the uploaded file.

    Raises
    ------
    ValueError
        If S3 is not enabled.
    S3UploadError
        If the upload to S3 fails.

    """
    if not is_s3_enabled():
        msg = "S3 is not enabled"
        logger.error(msg)
        raise ValueError(msg)

    s3_client = create_s3_client()
    bucket_name = get_s3_bucket_name()

    extra_fields = {
        "bucket_name": bucket_name,
        "s3_file_path": s3_file_path,
        "ingest_id": str(ingest_id),
        "content_size": len(content),
    }

    # Log upload attempt
    logger.debug("Starting S3 upload", extra=extra_fields)

    try:
        # Upload the content with ingest_id as tag
        s3_client.put_object(
            Bucket=bucket_name,
            Key=s3_file_path,
            Body=content.encode("utf-8"),
            ContentType="text/plain",
            Tagging=f"ingest_id={ingest_id!s}",
        )
    except ClientError as err:
        # Log upload failure
        logger.exception(
            "S3 upload failed",
            extra={
                "bucket_name": bucket_name,
                "s3_file_path": s3_file_path,
                "ingest_id": str(ingest_id),
                "error_code": err.response.get("Error", {}).get("Code"),
                "error_message": str(err),
            },
        )
        msg = f"Failed to upload to S3: {err}"
        raise S3UploadError(msg) from err

    # Generate public URL
    alias_host = get_s3_alias_host()
    if alias_host:
        # Use alias host if configured
        public_url = f"{alias_host.rstrip('/')}/{s3_file_path}"
    else:
        # Fallback to direct S3 URL
        endpoint = get_s3_config().get("endpoint_url")
        if endpoint:
            public_url = f"{endpoint.rstrip('/')}/{bucket_name}/{s3_file_path}"
        else:
            public_url = f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}"

    # Log successful upload
    logger.debug(
        "S3 upload completed successfully",
        extra={
            "bucket_name": bucket_name,
            "s3_file_path": s3_file_path,
            "ingest_id": str(ingest_id),
            "public_url": public_url,
        },
    )

    return public_url


def _build_s3_url(key: str) -> str:
    """Build S3 URL for a given key."""
    alias_host = get_s3_alias_host()
    if alias_host:
        return f"{alias_host.rstrip('/')}/{key}"

    bucket_name = get_s3_bucket_name()
    config = get_s3_config()

    endpoint = config["endpoint_url"]
    if endpoint:
        return f"{endpoint.rstrip('/')}/{bucket_name}/{key}"

    return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}"


def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool:
    """Check if an S3 object has the matching ingest_id tag."""
    try:
        tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
        tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
        return tags.get("ingest_id") == str(target_ingest_id)
    except ClientError:
        return False


def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
    """Get S3 URL for a given ingest ID if it exists.

    Search for files in S3 using object tags to find the matching ingest_id and return the S3 URL if found.
    Used by the download endpoint to redirect to S3 if available.

    Parameters
    ----------
    ingest_id : UUID
        The ingest ID to search for in S3 object tags.

    Returns
    -------
    str | None
        S3 URL if the file exists, None otherwise.

    """
    if not is_s3_enabled():
        logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id)
        return None

    logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)})

    try:
        s3_client = create_s3_client()
        bucket_name = get_s3_bucket_name()

        # List all objects in the ingest/ prefix and check their tags
        paginator = s3_client.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix="ingest/")

        objects_checked = 0
        for page in page_iterator:
            if "Contents" not in page:
                continue

            for obj in page["Contents"]:
                key = obj["Key"]
                objects_checked += 1
                if _check_object_tags(
                    s3_client=s3_client,
                    bucket_name=bucket_name,
                    key=key,
                    target_ingest_id=ingest_id,
                ):
                    s3_url = _build_s3_url(key)
                    logger.debug(
                        msg="Found S3 object for ingest ID",
                        extra={
                            "ingest_id": str(ingest_id),
                            "s3_key": key,
                            "s3_url": s3_url,
                            "objects_checked": objects_checked,
                        },
                    )
                    return s3_url

        logger.debug(
            msg="No S3 object found for ingest ID",
            extra={
                "ingest_id": str(ingest_id),
                "objects_checked": objects_checked,
            },
        )

    except ClientError as err:
        logger.exception(
            msg="Error during S3 URL lookup",
            extra={
                "ingest_id": str(ingest_id),
                "error_code": err.response.get("Error", {}).get("Code"),
                "error_message": str(err),
            },
        )

    return None
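For a concrete sense of the key layout, a hedged usage sketch of `generate_s3_file_path` (argument values are illustrative; the 16-character patterns hash depends on the include/ignore sets):

```python
from server.s3_utils import generate_s3_file_path

key = generate_s3_file_path(
    source="https://github.com/octocat/hello-world",
    user_name="octocat",
    repo_name="hello-world",
    commit="0123456789abcdef0123456789abcdef01234567",  # illustrative hash
    include_patterns=None,
    ignore_patterns={"*.pyc"},
)
# -> ingest/github.com/octocat/hello-world/0123456789abcdef.../<patterns-hash>/octocat-hello-world.txt
# With S3_DIRECTORY_PREFIX=dev (the compose default), the key gains a "dev/" prefix.
print(key)
```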
src/static/js (frontend download logic)

@@ -172,8 +172,8 @@ function handleSuccessfulResponse(data) {
     // Show results section
     showResults();
 
-    // Store the ingest_id for download functionality
-    window.currentIngestId = data.ingest_id;
+    // Store the digest_url for download functionality
+    window.currentDigestUrl = data.digest_url;
 
     // Set plain text content for summary, tree, and content
     document.getElementById('result-summary').value = data.summary || '';

@@ -271,9 +271,9 @@ function copyFullDigest() {
 }
 
 function downloadFullDigest() {
-    // Check if we have an ingest_id
-    if (!window.currentIngestId) {
-        console.error('No ingest_id available for download');
+    // Check if we have a digest_url
+    if (!window.currentDigestUrl) {
+        console.error('No digest_url available for download');
 
         return;
     }

@@ -289,10 +289,10 @@ function downloadFullDigest() {
         Downloading...
     `;
 
-    // Create a download link to the server endpoint
+    // Create a download link using the digest_url
     const a = document.createElement('a');
 
-    a.href = `/api/download/file/${window.currentIngestId}`;
+    a.href = window.currentDigestUrl;
     a.download = 'digest.txt';
     document.body.appendChild(a);
     a.click();
tests/conftest.py

@@ -8,6 +8,7 @@ from __future__ import annotations
 
 import json
 import sys
+import uuid
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict
 from unittest.mock import AsyncMock

@@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery:
         repo_name="test_repo",
         local_path=Path("/tmp/test_repo").resolve(),
         slug="test_user/test_repo",
-        id="id",
+        id=uuid.uuid4(),
         branch="main",
         max_file_size=1_000_000,
         ignore_patterns={"*.pyc", "__pycache__", ".git"},
tests (query parser)

@@ -55,7 +55,7 @@ async def test_parse_query_without_host(
     query = await parse_remote_repo(url)
 
     # Compare against the canonical dict while ignoring unpredictable fields.
-    actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"})
+    actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"})
 
     assert "commit" in actual
     assert _is_valid_git_commit_hash(actual["commit"])
tests (server endpoints)

@@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/octocat/Hello-World",
-        "max_file_size": "243",
+        "max_file_size": 243,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",

@@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/nonexistent/repo",
-        "max_file_size": "243",
+        "max_file_size": 243,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",

@@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None:
     # TODO: ingesting a large repo takes too much time (e.g. the godotengine/godot repository)
     form_data = {
         "input_text": "https://github.com/octocat/hello-world",
-        "max_file_size": "10",
+        "max_file_size": 10,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",

@@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None:
     def make_request() -> None:
         form_data = {
             "input_text": "https://github.com/octocat/hello-world",
-            "max_file_size": "243",
+            "max_file_size": 243,
             "pattern_type": "exclude",
             "pattern": "",
             "token": "",

@@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/octocat/Hello-World",
-        "max_file_size": "1",
+        "max_file_size": 1,
         "pattern_type": "exclude",
         "pattern": "",
         "token": "",

@@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None:
     client = request.getfixturevalue("test_client")
     form_data = {
         "input_text": "https://github.com/octocat/Hello-World",
-        "max_file_size": "243",
+        "max_file_size": 243,
         "pattern_type": "include",
         "pattern": "*.md",
         "token": "",