mirror of
https://github.com/cyclotruc/gitingest.git
synced 2026-04-26 15:40:40 +00:00
feat: implement S3 integration for storing and retrieving digest files (#427)
Co-authored-by: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Co-authored-by: Nicolas Iragne <nicoragne@hotmail.fr>
This commit is contained in:
parent
998cea15b4
commit
414e85189f
17 changed files with 688 additions and 38 deletions
33
.docker/minio/setup.sh
Executable file
33
.docker/minio/setup.sh
Executable file
|
|
@ -0,0 +1,33 @@
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
# Simple script to set up MinIO bucket and user
|
||||||
|
# Based on example from MinIO issues
|
||||||
|
|
||||||
|
# Format bucket name to ensure compatibility
|
||||||
|
BUCKET_NAME=$(echo "${S3_BUCKET_NAME}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
|
||||||
|
|
||||||
|
# Configure MinIO client
|
||||||
|
mc alias set myminio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}
|
||||||
|
|
||||||
|
# Remove bucket if it exists (for clean setup)
|
||||||
|
mc rm -r --force myminio/${BUCKET_NAME} || true
|
||||||
|
|
||||||
|
# Create bucket
|
||||||
|
mc mb myminio/${BUCKET_NAME}
|
||||||
|
|
||||||
|
# Set bucket policy to allow downloads
|
||||||
|
mc anonymous set download myminio/${BUCKET_NAME}
|
||||||
|
|
||||||
|
# Create user with access and secret keys
|
||||||
|
mc admin user add myminio ${S3_ACCESS_KEY} ${S3_SECRET_KEY} || echo "User already exists"
|
||||||
|
|
||||||
|
# Create policy for the bucket
|
||||||
|
echo '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:*"],"Resource":["arn:aws:s3:::'${BUCKET_NAME}'/*","arn:aws:s3:::'${BUCKET_NAME}'"]}]}' > /tmp/policy.json
|
||||||
|
|
||||||
|
# Apply policy
|
||||||
|
mc admin policy create myminio gitingest-policy /tmp/policy.json || echo "Policy already exists"
|
||||||
|
mc admin policy attach myminio gitingest-policy --user ${S3_ACCESS_KEY}
|
||||||
|
|
||||||
|
echo "MinIO setup completed successfully"
|
||||||
|
echo "Bucket: ${BUCKET_NAME}"
|
||||||
|
echo "Access via console: http://localhost:9001"
|
||||||
23
.env.example
23
.env.example
|
|
@ -33,3 +33,26 @@ GITINGEST_SENTRY_PROFILE_LIFECYCLE=trace
|
||||||
GITINGEST_SENTRY_SEND_DEFAULT_PII=true
|
GITINGEST_SENTRY_SEND_DEFAULT_PII=true
|
||||||
# Environment name for Sentry (default: "")
|
# Environment name for Sentry (default: "")
|
||||||
GITINGEST_SENTRY_ENVIRONMENT=development
|
GITINGEST_SENTRY_ENVIRONMENT=development
|
||||||
|
|
||||||
|
# MinIO Configuration (for development)
|
||||||
|
# Root user credentials for MinIO admin access
|
||||||
|
MINIO_ROOT_USER=minioadmin
|
||||||
|
MINIO_ROOT_PASSWORD=minioadmin
|
||||||
|
|
||||||
|
# S3 Configuration (for application)
|
||||||
|
# Set to "true" to enable S3 storage for digests
|
||||||
|
# S3_ENABLED=true
|
||||||
|
# Endpoint URL for the S3 service (MinIO in development)
|
||||||
|
S3_ENDPOINT=http://minio:9000
|
||||||
|
# Access key for the S3 bucket (created automatically in development)
|
||||||
|
S3_ACCESS_KEY=gitingest
|
||||||
|
# Secret key for the S3 bucket (created automatically in development)
|
||||||
|
S3_SECRET_KEY=gitingest123
|
||||||
|
# Name of the S3 bucket (created automatically in development)
|
||||||
|
S3_BUCKET_NAME=gitingest-bucket
|
||||||
|
# Region for the S3 bucket (default for MinIO)
|
||||||
|
S3_REGION=us-east-1
|
||||||
|
# Public URL/CDN for accessing S3 resources
|
||||||
|
S3_ALIAS_HOST=127.0.0.1:9000/gitingest-bucket
|
||||||
|
# Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
|
||||||
|
# S3_DIRECTORY_PREFIX=my-prefix
|
||||||
|
|
|
||||||
|
|
@ -113,6 +113,7 @@ repos:
|
||||||
files: ^src/
|
files: ^src/
|
||||||
additional_dependencies:
|
additional_dependencies:
|
||||||
[
|
[
|
||||||
|
boto3>=1.28.0,
|
||||||
click>=8.0.0,
|
click>=8.0.0,
|
||||||
'fastapi[standard]>=0.109.1',
|
'fastapi[standard]>=0.109.1',
|
||||||
httpx,
|
httpx,
|
||||||
|
|
@ -138,6 +139,7 @@ repos:
|
||||||
- --rcfile=tests/.pylintrc
|
- --rcfile=tests/.pylintrc
|
||||||
additional_dependencies:
|
additional_dependencies:
|
||||||
[
|
[
|
||||||
|
boto3>=1.28.0,
|
||||||
click>=8.0.0,
|
click>=8.0.0,
|
||||||
'fastapi[standard]>=0.109.1',
|
'fastapi[standard]>=0.109.1',
|
||||||
httpx,
|
httpx,
|
||||||
|
|
|
||||||
85
README.md
85
README.md
|
|
@ -204,6 +204,8 @@ This is because Jupyter notebooks are asynchronous by default.
|
||||||
|
|
||||||
## 🐳 Self-host
|
## 🐳 Self-host
|
||||||
|
|
||||||
|
### Using Docker
|
||||||
|
|
||||||
1. Build the image:
|
1. Build the image:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
|
|
@ -239,6 +241,89 @@ The application can be configured using the following environment variables:
|
||||||
- **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0)
|
- **GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE**: Sampling rate for profile sessions (default: "1.0", range: 0.0-1.0)
|
||||||
- **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace")
|
- **GITINGEST_SENTRY_PROFILE_LIFECYCLE**: Profile lifecycle mode (default: "trace")
|
||||||
- **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true")
|
- **GITINGEST_SENTRY_SEND_DEFAULT_PII**: Send default personally identifiable information (default: "true")
|
||||||
|
- **S3_ALIAS_HOST**: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
|
||||||
|
- **S3_DIRECTORY_PREFIX**: Optional prefix for S3 file paths (if set, prefixes all S3 paths with this value)
|
||||||
|
|
||||||
|
### Using Docker Compose
|
||||||
|
|
||||||
|
The project includes a `compose.yml` file that allows you to easily run the application in both development and production environments.
|
||||||
|
|
||||||
|
#### Compose File Structure
|
||||||
|
|
||||||
|
The `compose.yml` file uses YAML anchoring with `&app-base` and `<<: *app-base` to define common configuration that is shared between services:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Common base configuration for all services
|
||||||
|
x-app-base: &app-base
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
ports:
|
||||||
|
- "${APP_WEB_BIND:-8000}:8000" # Main application port
|
||||||
|
- "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port
|
||||||
|
# ... other common configurations
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Services
|
||||||
|
|
||||||
|
The file defines three services:
|
||||||
|
|
||||||
|
1. **app**: Production service configuration
|
||||||
|
- Uses the `prod` profile
|
||||||
|
- Sets the Sentry environment to "production"
|
||||||
|
- Configured for stable operation with `restart: unless-stopped`
|
||||||
|
|
||||||
|
2. **app-dev**: Development service configuration
|
||||||
|
- Uses the `dev` profile
|
||||||
|
- Enables debug mode
|
||||||
|
- Mounts the source code for live development
|
||||||
|
- Uses hot reloading for faster development
|
||||||
|
|
||||||
|
3. **minio**: S3-compatible object storage for development
|
||||||
|
- Uses the `dev` profile (only available in development mode)
|
||||||
|
- Provides S3-compatible storage for local development
|
||||||
|
- Accessible via:
|
||||||
|
- API: Port 9000 ([localhost:9000](http://localhost:9000))
|
||||||
|
- Web Console: Port 9001 ([localhost:9001](http://localhost:9001))
|
||||||
|
- Default admin credentials:
|
||||||
|
- Username: `minioadmin`
|
||||||
|
- Password: `minioadmin`
|
||||||
|
- Configurable via environment variables:
|
||||||
|
- `MINIO_ROOT_USER`: Custom admin username (default: minioadmin)
|
||||||
|
- `MINIO_ROOT_PASSWORD`: Custom admin password (default: minioadmin)
|
||||||
|
- Includes persistent storage via Docker volume
|
||||||
|
- Auto-creates a bucket and application-specific credentials:
|
||||||
|
- Bucket name: `gitingest-bucket` (configurable via `S3_BUCKET_NAME`)
|
||||||
|
- Access key: `gitingest` (configurable via `S3_ACCESS_KEY`)
|
||||||
|
- Secret key: `gitingest123` (configurable via `S3_SECRET_KEY`)
|
||||||
|
- These credentials are automatically passed to the app-dev service via environment variables:
|
||||||
|
- `S3_ENDPOINT`: URL of the MinIO server
|
||||||
|
- `S3_ACCESS_KEY`: Access key for the S3 bucket
|
||||||
|
- `S3_SECRET_KEY`: Secret key for the S3 bucket
|
||||||
|
- `S3_BUCKET_NAME`: Name of the S3 bucket
|
||||||
|
- `S3_REGION`: Region for the S3 bucket (default: us-east-1)
|
||||||
|
- `S3_ALIAS_HOST`: Public URL/CDN for accessing S3 resources (default: "127.0.0.1:9000/gitingest-bucket")
|
||||||
|
|
||||||
|
#### Usage Examples
|
||||||
|
|
||||||
|
To run the application in development mode:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose --profile dev up
|
||||||
|
```
|
||||||
|
|
||||||
|
To run the application in production mode:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose --profile prod up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
To build and run the application:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose --profile prod build
|
||||||
|
docker compose --profile prod up -d
|
||||||
|
```
|
||||||
|
|
||||||
## 🤝 Contributing
|
## 🤝 Contributing
|
||||||
|
|
||||||
|
|
|
||||||
111
compose.yml
Normal file
111
compose.yml
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
# Common base configuration for all services
|
||||||
|
x-app-base: &app-base
|
||||||
|
ports:
|
||||||
|
- "${APP_WEB_BIND:-8000}:8000" # Main application port
|
||||||
|
- "${GITINGEST_METRICS_HOST:-127.0.0.1}:${GITINGEST_METRICS_PORT:-9090}:9090" # Metrics port
|
||||||
|
environment:
|
||||||
|
# Python Configuration
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
- PYTHONDONTWRITEBYTECODE=1
|
||||||
|
# Host Configuration
|
||||||
|
- ALLOWED_HOSTS=${ALLOWED_HOSTS:-gitingest.com,*.gitingest.com,localhost,127.0.0.1}
|
||||||
|
# Metrics Configuration
|
||||||
|
- GITINGEST_METRICS_ENABLED=${GITINGEST_METRICS_ENABLED:-true}
|
||||||
|
- GITINGEST_METRICS_HOST=${GITINGEST_METRICS_HOST:-127.0.0.1}
|
||||||
|
- GITINGEST_METRICS_PORT=${GITINGEST_METRICS_PORT:-9090}
|
||||||
|
# Sentry Configuration
|
||||||
|
- GITINGEST_SENTRY_ENABLED=${GITINGEST_SENTRY_ENABLED:-false}
|
||||||
|
- GITINGEST_SENTRY_DSN=${GITINGEST_SENTRY_DSN:-}
|
||||||
|
- GITINGEST_SENTRY_TRACES_SAMPLE_RATE=${GITINGEST_SENTRY_TRACES_SAMPLE_RATE:-1.0}
|
||||||
|
- GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE=${GITINGEST_SENTRY_PROFILE_SESSION_SAMPLE_RATE:-1.0}
|
||||||
|
- GITINGEST_SENTRY_PROFILE_LIFECYCLE=${GITINGEST_SENTRY_PROFILE_LIFECYCLE:-trace}
|
||||||
|
- GITINGEST_SENTRY_SEND_DEFAULT_PII=${GITINGEST_SENTRY_SEND_DEFAULT_PII:-true}
|
||||||
|
user: "1000:1000"
|
||||||
|
command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|
||||||
|
services:
|
||||||
|
# Production service configuration
|
||||||
|
app:
|
||||||
|
<<: *app-base
|
||||||
|
image: ghcr.io/coderamp-labs/gitingest:latest
|
||||||
|
profiles:
|
||||||
|
- prod
|
||||||
|
environment:
|
||||||
|
- GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-production}
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
# Development service configuration
|
||||||
|
app-dev:
|
||||||
|
<<: *app-base
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
profiles:
|
||||||
|
- dev
|
||||||
|
environment:
|
||||||
|
- DEBUG=true
|
||||||
|
- GITINGEST_SENTRY_ENVIRONMENT=${GITINGEST_SENTRY_ENVIRONMENT:-development}
|
||||||
|
# S3 Configuration
|
||||||
|
- S3_ENABLED=true
|
||||||
|
- S3_ENDPOINT=http://minio:9000
|
||||||
|
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
|
||||||
|
- S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
|
||||||
|
# Use lowercase bucket name to ensure compatibility with MinIO
|
||||||
|
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
|
||||||
|
- S3_REGION=${S3_REGION:-us-east-1}
|
||||||
|
- S3_DIRECTORY_PREFIX=${S3_DIRECTORY_PREFIX:-dev}
|
||||||
|
# Public URL for S3 resources
|
||||||
|
- S3_ALIAS_HOST=${S3_ALIAS_HOST:-http://127.0.0.1:9000/${S3_BUCKET_NAME:-gitingest-bucket}}
|
||||||
|
volumes:
|
||||||
|
# Mount source code for live development
|
||||||
|
- ./src:/app:ro
|
||||||
|
# Use --reload flag for hot reloading during development
|
||||||
|
command: ["python", "-m", "uvicorn", "server.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||||
|
depends_on:
|
||||||
|
minio-setup:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
|
||||||
|
# MinIO S3-compatible object storage for development
|
||||||
|
minio:
|
||||||
|
image: minio/minio:latest
|
||||||
|
profiles:
|
||||||
|
- dev
|
||||||
|
ports:
|
||||||
|
- "9000:9000" # API port
|
||||||
|
- "9001:9001" # Console port
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
|
||||||
|
volumes:
|
||||||
|
- minio-data:/data
|
||||||
|
command: server /data --console-address ":9001"
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 30s
|
||||||
|
start_period: 30s
|
||||||
|
start_interval: 1s
|
||||||
|
|
||||||
|
# MinIO setup service to create bucket and user
|
||||||
|
minio-setup:
|
||||||
|
image: minio/mc
|
||||||
|
profiles:
|
||||||
|
- dev
|
||||||
|
depends_on:
|
||||||
|
minio:
|
||||||
|
condition: service_healthy
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
|
||||||
|
- S3_ACCESS_KEY=${S3_ACCESS_KEY:-gitingest}
|
||||||
|
- S3_SECRET_KEY=${S3_SECRET_KEY:-gitingest123}
|
||||||
|
- S3_BUCKET_NAME=${S3_BUCKET_NAME:-gitingest-bucket}
|
||||||
|
volumes:
|
||||||
|
- ./.docker/minio/setup.sh:/setup.sh:ro
|
||||||
|
entrypoint: sh
|
||||||
|
command: -c /setup.sh
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
minio-data:
|
||||||
|
driver: local
|
||||||
|
|
@ -44,6 +44,7 @@ dev = [
|
||||||
]
|
]
|
||||||
|
|
||||||
server = [
|
server = [
|
||||||
|
"boto3>=1.28.0", # AWS SDK for S3 support
|
||||||
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
|
"fastapi[standard]>=0.109.1", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
|
||||||
"prometheus-client",
|
"prometheus-client",
|
||||||
"sentry-sdk[fastapi]",
|
"sentry-sdk[fastapi]",
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
boto3>=1.28.0 # AWS SDK for S3 support
|
||||||
click>=8.0.0
|
click>=8.0.0
|
||||||
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
|
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
|
||||||
httpx
|
httpx
|
||||||
|
|
|
||||||
|
|
@ -44,9 +44,9 @@ async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQ
|
||||||
host = parsed_url.netloc
|
host = parsed_url.netloc
|
||||||
user, repo = _get_user_and_repo_from_path(parsed_url.path)
|
user, repo = _get_user_and_repo_from_path(parsed_url.path)
|
||||||
|
|
||||||
_id = str(uuid.uuid4())
|
_id = uuid.uuid4()
|
||||||
slug = f"{user}-{repo}"
|
slug = f"{user}-{repo}"
|
||||||
local_path = TMP_BASE_PATH / _id / slug
|
local_path = TMP_BASE_PATH / str(_id) / slug
|
||||||
url = f"https://{host}/{user}/{repo}"
|
url = f"https://{host}/{user}/{repo}"
|
||||||
|
|
||||||
query = IngestionQuery(
|
query = IngestionQuery(
|
||||||
|
|
@ -132,7 +132,7 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery:
|
||||||
"""
|
"""
|
||||||
path_obj = Path(path_str).resolve()
|
path_obj = Path(path_str).resolve()
|
||||||
slug = path_obj.name if path_str == "." else path_str.strip("/")
|
slug = path_obj.name if path_str == "." else path_str.strip("/")
|
||||||
return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
|
return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
|
||||||
|
|
||||||
|
|
||||||
async def _configure_branch_or_tag(
|
async def _configure_branch_or_tag(
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
|
from pathlib import Path # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
|
||||||
|
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
@ -27,7 +28,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
|
||||||
The URL of the repository.
|
The URL of the repository.
|
||||||
slug : str
|
slug : str
|
||||||
The slug of the repository.
|
The slug of the repository.
|
||||||
id : str
|
id : UUID
|
||||||
The ID of the repository.
|
The ID of the repository.
|
||||||
subpath : str
|
subpath : str
|
||||||
The subpath to the repository or file (default: ``"/"``).
|
The subpath to the repository or file (default: ``"/"``).
|
||||||
|
|
@ -47,6 +48,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
|
||||||
The patterns to include.
|
The patterns to include.
|
||||||
include_submodules : bool
|
include_submodules : bool
|
||||||
Whether to include all Git submodules within the repository. (default: ``False``)
|
Whether to include all Git submodules within the repository. (default: ``False``)
|
||||||
|
s3_url : str | None
|
||||||
|
The S3 URL where the digest is stored if S3 is enabled.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -56,7 +59,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
|
||||||
local_path: Path
|
local_path: Path
|
||||||
url: str | None = None
|
url: str | None = None
|
||||||
slug: str
|
slug: str
|
||||||
id: str
|
id: UUID
|
||||||
subpath: str = Field(default="/")
|
subpath: str = Field(default="/")
|
||||||
type: str | None = None
|
type: str | None = None
|
||||||
branch: str | None = None
|
branch: str | None = None
|
||||||
|
|
@ -66,6 +69,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
|
||||||
ignore_patterns: set[str] = Field(default_factory=set) # TODO: ssame type for ignore_* and include_* patterns
|
ignore_patterns: set[str] = Field(default_factory=set) # TODO: ssame type for ignore_* and include_* patterns
|
||||||
include_patterns: set[str] | None = None
|
include_patterns: set[str] | None = None
|
||||||
include_submodules: bool = Field(default=False)
|
include_submodules: bool = Field(default=False)
|
||||||
|
s3_url: str | None = None
|
||||||
|
|
||||||
def extract_clone_config(self) -> CloneConfig:
|
def extract_clone_config(self) -> CloneConfig:
|
||||||
"""Extract the relevant fields for the CloneConfig object.
|
"""Extract the relevant fields for the CloneConfig object.
|
||||||
|
|
|
||||||
|
|
@ -71,8 +71,8 @@ class IngestSuccessResponse(BaseModel):
|
||||||
Short form of repository URL (user/repo).
|
Short form of repository URL (user/repo).
|
||||||
summary : str
|
summary : str
|
||||||
Summary of the ingestion process including token estimates.
|
Summary of the ingestion process including token estimates.
|
||||||
ingest_id : str
|
digest_url : str
|
||||||
Ingestion id used to download full context.
|
URL to download the full digest content (either S3 URL or local download endpoint).
|
||||||
tree : str
|
tree : str
|
||||||
File tree structure of the repository.
|
File tree structure of the repository.
|
||||||
content : str
|
content : str
|
||||||
|
|
@ -89,7 +89,7 @@ class IngestSuccessResponse(BaseModel):
|
||||||
repo_url: str = Field(..., description="Original repository URL")
|
repo_url: str = Field(..., description="Original repository URL")
|
||||||
short_repo_url: str = Field(..., description="Short repository URL (user/repo)")
|
short_repo_url: str = Field(..., description="Short repository URL (user/repo)")
|
||||||
summary: str = Field(..., description="Ingestion summary with token estimates")
|
summary: str = Field(..., description="Ingestion summary with token estimates")
|
||||||
ingest_id: str = Field(..., description="Ingestion id used to download full context")
|
digest_url: str = Field(..., description="URL to download the full digest content")
|
||||||
tree: str = Field(..., description="File tree structure")
|
tree: str = Field(..., description="File tree structure")
|
||||||
content: str = Field(..., description="Processed file content")
|
content: str = Field(..., description="Processed file content")
|
||||||
default_max_file_size: int = Field(..., description="File size slider position used")
|
default_max_file_size: int = Field(..., description="File size slider position used")
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ from gitingest.query_parser import parse_remote_repo
|
||||||
from gitingest.utils.git_utils import validate_github_token
|
from gitingest.utils.git_utils import validate_github_token
|
||||||
from gitingest.utils.pattern_utils import process_patterns
|
from gitingest.utils.pattern_utils import process_patterns
|
||||||
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
|
from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType
|
||||||
|
from server.s3_utils import generate_s3_file_path, is_s3_enabled, upload_to_s3
|
||||||
from server.server_config import MAX_DISPLAY_SIZE
|
from server.server_config import MAX_DISPLAY_SIZE
|
||||||
from server.server_utils import Colors, log_slider_to_size
|
from server.server_utils import Colors, log_slider_to_size
|
||||||
|
|
||||||
|
|
@ -45,6 +46,11 @@ async def process_query(
|
||||||
IngestResponse
|
IngestResponse
|
||||||
A union type, corresponding to IngestErrorResponse or IngestSuccessResponse
|
A union type, corresponding to IngestErrorResponse or IngestSuccessResponse
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
RuntimeError
|
||||||
|
If the commit hash is not found (should never happen).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if token:
|
if token:
|
||||||
validate_github_token(token)
|
validate_github_token(token)
|
||||||
|
|
@ -59,7 +65,6 @@ async def process_query(
|
||||||
return IngestErrorResponse(error=str(exc))
|
return IngestErrorResponse(error=str(exc))
|
||||||
|
|
||||||
query.url = cast("str", query.url)
|
query.url = cast("str", query.url)
|
||||||
query.host = cast("str", query.host)
|
|
||||||
query.max_file_size = max_file_size
|
query.max_file_size = max_file_size
|
||||||
query.ignore_patterns, query.include_patterns = process_patterns(
|
query.ignore_patterns, query.include_patterns = process_patterns(
|
||||||
exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None,
|
exclude_patterns=pattern if pattern_type == PatternType.EXCLUDE else None,
|
||||||
|
|
@ -71,13 +76,36 @@ async def process_query(
|
||||||
|
|
||||||
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
|
short_repo_url = f"{query.user_name}/{query.repo_name}" # Sets the "<user>/<repo>" for the page title
|
||||||
|
|
||||||
|
# The commit hash should always be available at this point
|
||||||
|
if not query.commit:
|
||||||
|
msg = "Unexpected error: no commit hash found"
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
summary, tree, content = ingest_query(query)
|
summary, tree, content = ingest_query(query)
|
||||||
|
|
||||||
# TODO: why are we writing the tree and content to a file here?
|
# Prepare the digest content (tree + content)
|
||||||
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
|
digest_content = tree + "\n" + content
|
||||||
with local_txt_file.open("w", encoding="utf-8") as f:
|
|
||||||
f.write(tree + "\n" + content)
|
# Store digest based on S3 configuration
|
||||||
|
if is_s3_enabled():
|
||||||
|
# Upload to S3 instead of storing locally
|
||||||
|
s3_file_path = generate_s3_file_path(
|
||||||
|
source=query.url,
|
||||||
|
user_name=cast("str", query.user_name),
|
||||||
|
repo_name=cast("str", query.repo_name),
|
||||||
|
commit=query.commit,
|
||||||
|
include_patterns=query.include_patterns,
|
||||||
|
ignore_patterns=query.ignore_patterns,
|
||||||
|
)
|
||||||
|
s3_url = upload_to_s3(content=digest_content, s3_file_path=s3_file_path, ingest_id=query.id)
|
||||||
|
# Store S3 URL in query for later use
|
||||||
|
query.s3_url = s3_url
|
||||||
|
else:
|
||||||
|
# Store locally
|
||||||
|
local_txt_file = Path(clone_config.local_path).with_suffix(".txt")
|
||||||
|
with local_txt_file.open("w", encoding="utf-8") as f:
|
||||||
|
f.write(digest_content)
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
|
_print_error(query.url, exc, max_file_size, pattern_type, pattern)
|
||||||
|
|
@ -97,11 +125,21 @@ async def process_query(
|
||||||
summary=summary,
|
summary=summary,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Generate digest_url based on S3 configuration
|
||||||
|
if is_s3_enabled():
|
||||||
|
digest_url = getattr(query, "s3_url", None)
|
||||||
|
if not digest_url:
|
||||||
|
# This should not happen if S3 upload was successful
|
||||||
|
msg = "S3 is enabled but no S3 URL was generated"
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
else:
|
||||||
|
digest_url = f"/api/download/file/{query.id}"
|
||||||
|
|
||||||
return IngestSuccessResponse(
|
return IngestSuccessResponse(
|
||||||
repo_url=input_text,
|
repo_url=input_text,
|
||||||
short_repo_url=short_repo_url,
|
short_repo_url=short_repo_url,
|
||||||
summary=summary,
|
summary=summary,
|
||||||
ingest_id=query.id,
|
digest_url=digest_url,
|
||||||
tree=tree,
|
tree=tree,
|
||||||
content=content,
|
content=content,
|
||||||
default_max_file_size=slider_position,
|
default_max_file_size=slider_position,
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,16 @@
|
||||||
"""Ingest endpoint for the API."""
|
"""Ingest endpoint for the API."""
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
from fastapi import APIRouter, HTTPException, Request, status
|
from fastapi import APIRouter, HTTPException, Request, status
|
||||||
from fastapi.responses import FileResponse, JSONResponse
|
from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
|
||||||
from prometheus_client import Counter
|
from prometheus_client import Counter
|
||||||
|
|
||||||
from gitingest.config import TMP_BASE_PATH
|
from gitingest.config import TMP_BASE_PATH
|
||||||
from server.models import IngestRequest
|
from server.models import IngestRequest
|
||||||
from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
|
from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
|
||||||
|
from server.s3_utils import is_s3_enabled
|
||||||
from server.server_config import MAX_DISPLAY_SIZE
|
from server.server_config import MAX_DISPLAY_SIZE
|
||||||
from server.server_utils import limiter
|
from server.server_utils import limiter
|
||||||
|
|
||||||
|
|
@ -39,7 +43,7 @@ async def api_ingest(
|
||||||
response = await _perform_ingestion(
|
response = await _perform_ingestion(
|
||||||
input_text=ingest_request.input_text,
|
input_text=ingest_request.input_text,
|
||||||
max_file_size=ingest_request.max_file_size,
|
max_file_size=ingest_request.max_file_size,
|
||||||
pattern_type=ingest_request.pattern_type,
|
pattern_type=ingest_request.pattern_type.value,
|
||||||
pattern=ingest_request.pattern,
|
pattern=ingest_request.pattern,
|
||||||
token=ingest_request.token,
|
token=ingest_request.token,
|
||||||
)
|
)
|
||||||
|
|
@ -90,30 +94,42 @@ async def api_ingest_get(
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
@router.get("/api/download/file/{ingest_id}", response_class=FileResponse)
|
@router.get("/api/download/file/{ingest_id}", response_model=None)
|
||||||
async def download_ingest(ingest_id: str) -> FileResponse:
|
async def download_ingest(
|
||||||
|
ingest_id: UUID,
|
||||||
|
) -> Union[RedirectResponse, FileResponse]: # noqa: FA100 (future-rewritable-type-annotation) (pydantic)
|
||||||
"""Download the first text file produced for an ingest ID.
|
"""Download the first text file produced for an ingest ID.
|
||||||
|
|
||||||
**This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
|
**This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
|
||||||
and returns it as a downloadable file. The file is streamed with media type ``text/plain``
|
and returns it as a downloadable file. When S3 is enabled, this endpoint is disabled
|
||||||
and prompts the browser to download it.
|
and clients should use the S3 URL provided in the ingest response instead.
|
||||||
|
|
||||||
**Parameters**
|
**Parameters**
|
||||||
|
|
||||||
- **ingest_id** (`str`): Identifier that the ingest step emitted
|
- **ingest_id** (`UUID`): Identifier that the ingest step emitted
|
||||||
|
|
||||||
**Returns**
|
**Returns**
|
||||||
|
|
||||||
- **FileResponse**: Streamed response with media type ``text/plain``
|
- **FileResponse**: Streamed response with media type ``text/plain`` for local files
|
||||||
|
|
||||||
**Raises**
|
**Raises**
|
||||||
|
|
||||||
|
- **HTTPException**: **503** - endpoint is disabled when S3 is enabled
|
||||||
- **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file
|
- **HTTPException**: **404** - digest directory is missing or contains no ``*.txt`` file
|
||||||
- **HTTPException**: **403** - the process lacks permission to read the directory or file
|
- **HTTPException**: **403** - the process lacks permission to read the directory or file
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
# Disable download endpoint when S3 is enabled
|
||||||
|
if is_s3_enabled():
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
||||||
|
detail="Download endpoint is disabled when S3 is enabled. "
|
||||||
|
"Use the S3 URL provided in the ingest response instead.",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fall back to local file serving
|
||||||
# Normalize and validate the directory path
|
# Normalize and validate the directory path
|
||||||
directory = (TMP_BASE_PATH / ingest_id).resolve()
|
directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
|
||||||
if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
|
if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
|
||||||
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
|
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
|
||||||
|
|
||||||
|
|
|
||||||
335
src/server/s3_utils.py
Normal file
335
src/server/s3_utils.py
Normal file
|
|
@ -0,0 +1,335 @@
|
||||||
|
"""S3 utility functions for uploading and managing digest files."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from uuid import UUID # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
|
||||||
|
|
||||||
|
import boto3
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from botocore.client import BaseClient
|
||||||
|
|
||||||
|
# Initialize logger for this module
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class S3UploadError(Exception):
    """Raised when uploading a digest file to the configured S3 bucket fails."""
|
||||||
|
|
||||||
|
|
||||||
|
def is_s3_enabled() -> bool:
    """Return ``True`` when the ``S3_ENABLED`` environment variable is set to ``"true"``.

    The comparison is case-insensitive; any other value, or an unset variable,
    disables the S3 integration.
    """
    flag = os.environ.get("S3_ENABLED", "false")
    return flag.lower() == "true"
|
||||||
|
|
||||||
|
|
||||||
|
def get_s3_config() -> dict[str, str | None]:
    """Assemble boto3 client keyword arguments from ``S3_*`` environment variables.

    Only keys whose environment variable is actually set are included, so the
    result can be splatted directly into ``boto3.client("s3", **config)``.
    ``region_name`` is always present because it falls back to ``us-east-1``.
    """
    candidates = (
        ("endpoint_url", os.getenv("S3_ENDPOINT")),
        ("aws_access_key_id", os.getenv("S3_ACCESS_KEY")),
        ("aws_secret_access_key", os.getenv("S3_SECRET_KEY")),
        ("region_name", os.getenv("S3_REGION") or os.getenv("AWS_REGION", "us-east-1")),
    )
    return {key: value for key, value in candidates if value is not None}
|
||||||
|
|
||||||
|
|
||||||
|
def get_s3_bucket_name() -> str:
    """Return the target S3 bucket name, defaulting to ``gitingest-bucket``."""
    return os.environ.get("S3_BUCKET_NAME", "gitingest-bucket")
|
||||||
|
|
||||||
|
|
||||||
|
def get_s3_alias_host() -> str | None:
    """Return the public alias host used to build digest URLs, if configured."""
    return os.environ.get("S3_ALIAS_HOST")
|
||||||
|
|
||||||
|
|
||||||
|
def generate_s3_file_path(
    source: str,
    user_name: str,
    repo_name: str,
    commit: str,
    include_patterns: set[str] | None,
    ignore_patterns: set[str],
) -> str:
    """Generate S3 file path with proper naming convention.

    The file path is formatted as:
    [<S3_DIRECTORY_PREFIX>/]ingest/<git-host>/<repo-owner>/<repo-name>/<commit-ID>/
    <exclude&include hash>/<owner>-<repo-name>.txt

    (The previous docstring advertised ``<provider>`` and ``<branch>`` path
    components that the generated path never contained; the template above
    matches the actual output.)

    If the ``S3_DIRECTORY_PREFIX`` environment variable is set, it is prepended
    to the path. The commit ID is always included in the path; if no specific
    commit is provided by the caller, the actual commit hash from the cloned
    repository is expected to be passed in.

    Parameters
    ----------
    source : str
        Repository source URL (e.g. ``https://github.com/...``); only its
        hostname is used in the path.
    user_name : str
        Repository owner or user.
    repo_name : str
        Repository name.
    commit : str
        Commit hash.
    include_patterns : set[str] | None
        Set of patterns specifying which files to include.
    ignore_patterns : set[str]
        Set of patterns specifying which files to exclude.

    Returns
    -------
    str
        S3 file path string.

    Raises
    ------
    ValueError
        If the source URL has no parseable hostname.

    """
    hostname = urlparse(source).hostname
    if hostname is None:
        msg = "Invalid source URL"
        logger.error(msg)
        raise ValueError(msg)

    # Hash the include/exclude patterns so digests generated with different
    # filters do not overwrite each other; sorting makes the hash
    # order-independent. NOTE: do not change this format — it determines the
    # storage paths of already-uploaded digests.
    patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
    patterns_str += f"exclude:{sorted(ignore_patterns)}"
    patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]

    # Build the base path using the hostname directly
    base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"

    s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")
    if not s3_directory_prefix:
        return base_path

    # Strip any trailing slash from the prefix before joining
    return f"{s3_directory_prefix.rstrip('/')}/{base_path}"
|
||||||
|
|
||||||
|
|
||||||
|
def create_s3_client() -> BaseClient:
    """Create and return an S3 client configured from the environment.

    Returns
    -------
    BaseClient
        A boto3 S3 client built from :func:`get_s3_config`.

    """
    config = get_s3_config()

    # Never log credentials: remove BOTH keys before logging. The previous
    # ``pop(...) or pop(...)`` short-circuited, so whenever the access key was
    # present the secret key stayed in ``log_config`` and was written to the
    # debug log.
    log_config = config.copy()
    access_key = log_config.pop("aws_access_key_id", None)
    secret_key = log_config.pop("aws_secret_access_key", None)
    has_credentials = bool(access_key or secret_key)
    logger.debug(
        msg="Creating S3 client",
        extra={
            "s3_config": log_config,
            "has_credentials": has_credentials,
        },
    )
    return boto3.client("s3", **config)
|
||||||
|
|
||||||
|
|
||||||
|
def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
    """Upload digest *content* to S3 and return a public URL for it.

    The ingest ID is attached to the uploaded object as an S3 tag so the file
    can later be located by ID.

    Parameters
    ----------
    content : str
        The digest content to upload.
    s3_file_path : str
        The S3 object key under which the content is stored.
    ingest_id : UUID
        The ingest ID stored as an S3 object tag.

    Returns
    -------
    str
        Public URL to access the uploaded file.

    Raises
    ------
    ValueError
        If S3 is not enabled.
    S3UploadError
        If the upload to S3 fails.

    """
    if not is_s3_enabled():
        msg = "S3 is not enabled"
        logger.error(msg)
        raise ValueError(msg)

    client = create_s3_client()
    bucket = get_s3_bucket_name()

    log_fields = {
        "bucket_name": bucket,
        "s3_file_path": s3_file_path,
        "ingest_id": str(ingest_id),
        "content_size": len(content),
    }
    logger.debug("Starting S3 upload", extra=log_fields)

    try:
        # The ingest ID travels with the object as a tag so lookups by ID work later.
        client.put_object(
            Bucket=bucket,
            Key=s3_file_path,
            Body=content.encode("utf-8"),
            ContentType="text/plain",
            Tagging=f"ingest_id={ingest_id!s}",
        )
    except ClientError as err:
        logger.exception(
            "S3 upload failed",
            extra={
                "bucket_name": bucket,
                "s3_file_path": s3_file_path,
                "ingest_id": str(ingest_id),
                "error_code": err.response.get("Error", {}).get("Code"),
                "error_message": str(err),
            },
        )
        msg = f"Failed to upload to S3: {err}"
        raise S3UploadError(msg) from err

    # Prefer the configured alias host for the public URL; otherwise fall back
    # to the custom endpoint, and finally to the canonical AWS URL form.
    alias_host = get_s3_alias_host()
    if alias_host:
        public_url = f"{alias_host.rstrip('/')}/{s3_file_path}"
    else:
        endpoint = get_s3_config().get("endpoint_url")
        if endpoint:
            public_url = f"{endpoint.rstrip('/')}/{bucket}/{s3_file_path}"
        else:
            public_url = f"https://{bucket}.s3.{get_s3_config()['region_name']}.amazonaws.com/{s3_file_path}"

    logger.debug(
        "S3 upload completed successfully",
        extra={
            "bucket_name": bucket,
            "s3_file_path": s3_file_path,
            "ingest_id": str(ingest_id),
            "public_url": public_url,
        },
    )

    return public_url
|
||||||
|
|
||||||
|
|
||||||
|
def _build_s3_url(key: str) -> str:
    """Build the public URL for an S3 object *key*.

    Mirrors the URL scheme used by ``upload_to_s3``: a configured alias host
    takes precedence, then a custom endpoint, then the canonical AWS URL.
    """
    alias_host = get_s3_alias_host()
    if alias_host:
        return f"{alias_host.rstrip('/')}/{key}"

    bucket_name = get_s3_bucket_name()
    config = get_s3_config()

    # ``get_s3_config`` omits keys whose environment variable is unset, so use
    # ``.get`` here: ``config["endpoint_url"]`` raised KeyError whenever
    # S3_ENDPOINT was not configured.
    endpoint = config.get("endpoint_url")
    if endpoint:
        return f"{endpoint.rstrip('/')}/{bucket_name}/{key}"

    return f"https://{bucket_name}.s3.{config['region_name']}.amazonaws.com/{key}"
|
||||||
|
|
||||||
|
|
||||||
|
def _check_object_tags(s3_client: BaseClient, bucket_name: str, key: str, target_ingest_id: UUID) -> bool:
|
||||||
|
"""Check if an S3 object has the matching ingest_id tag."""
|
||||||
|
try:
|
||||||
|
tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
|
||||||
|
tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
|
||||||
|
return tags.get("ingest_id") == str(target_ingest_id)
|
||||||
|
except ClientError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
    """Get S3 URL for a given ingest ID if it exists.

    Scans objects under the ``ingest/`` prefix and compares each object's
    ``ingest_id`` tag against *ingest_id*, returning the public URL of the
    first match. Used by the download endpoint to redirect to S3 if available.

    Parameters
    ----------
    ingest_id : UUID
        The ingest ID to search for in S3 object tags.

    Returns
    -------
    str | None
        S3 URL if a matching file exists, ``None`` otherwise.

    """
    if not is_s3_enabled():
        logger.debug("S3 not enabled, skipping URL lookup for ingest_id: %s", ingest_id)
        return None

    logger.debug(msg="Starting S3 URL lookup for ingest ID", extra={"ingest_id": str(ingest_id)})

    try:
        client = create_s3_client()
        bucket = get_s3_bucket_name()

        # NOTE: this walks every object under ingest/ and fetches its tags,
        # so the lookup cost grows with the number of stored digests.
        pages = client.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix="ingest/")

        checked = 0
        for page in pages:
            for obj in page.get("Contents", []):
                key = obj["Key"]
                checked += 1
                if not _check_object_tags(
                    s3_client=client,
                    bucket_name=bucket,
                    key=key,
                    target_ingest_id=ingest_id,
                ):
                    continue
                url = _build_s3_url(key)
                logger.debug(
                    msg="Found S3 object for ingest ID",
                    extra={
                        "ingest_id": str(ingest_id),
                        "s3_key": key,
                        "s3_url": url,
                        "objects_checked": checked,
                    },
                )
                return url

        logger.debug(
            msg="No S3 object found for ingest ID",
            extra={
                "ingest_id": str(ingest_id),
                "objects_checked": checked,
            },
        )
    except ClientError as err:
        logger.exception(
            msg="Error during S3 URL lookup",
            extra={
                "ingest_id": str(ingest_id),
                "error_code": err.response.get("Error", {}).get("Code"),
                "error_message": str(err),
            },
        )

    return None
|
||||||
|
|
@ -172,8 +172,8 @@ function handleSuccessfulResponse(data) {
|
||||||
// Show results section
|
// Show results section
|
||||||
showResults();
|
showResults();
|
||||||
|
|
||||||
// Store the ingest_id for download functionality
|
// Store the digest_url for download functionality
|
||||||
window.currentIngestId = data.ingest_id;
|
window.currentDigestUrl = data.digest_url;
|
||||||
|
|
||||||
// Set plain text content for summary, tree, and content
|
// Set plain text content for summary, tree, and content
|
||||||
document.getElementById('result-summary').value = data.summary || '';
|
document.getElementById('result-summary').value = data.summary || '';
|
||||||
|
|
@ -271,9 +271,9 @@ function copyFullDigest() {
|
||||||
}
|
}
|
||||||
|
|
||||||
function downloadFullDigest() {
|
function downloadFullDigest() {
|
||||||
// Check if we have an ingest_id
|
// Check if we have a digest_url
|
||||||
if (!window.currentIngestId) {
|
if (!window.currentDigestUrl) {
|
||||||
console.error('No ingest_id available for download');
|
console.error('No digest_url available for download');
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -289,10 +289,10 @@ function downloadFullDigest() {
|
||||||
Downloading...
|
Downloading...
|
||||||
`;
|
`;
|
||||||
|
|
||||||
// Create a download link to the server endpoint
|
// Create a download link using the digest_url
|
||||||
const a = document.createElement('a');
|
const a = document.createElement('a');
|
||||||
|
|
||||||
a.href = `/api/download/file/${window.currentIngestId}`;
|
a.href = window.currentDigestUrl;
|
||||||
a.download = 'digest.txt';
|
a.download = 'digest.txt';
|
||||||
document.body.appendChild(a);
|
document.body.appendChild(a);
|
||||||
a.click();
|
a.click();
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Callable, Dict
|
from typing import TYPE_CHECKING, Any, Callable, Dict
|
||||||
from unittest.mock import AsyncMock
|
from unittest.mock import AsyncMock
|
||||||
|
|
@ -62,7 +63,7 @@ def sample_query() -> IngestionQuery:
|
||||||
repo_name="test_repo",
|
repo_name="test_repo",
|
||||||
local_path=Path("/tmp/test_repo").resolve(),
|
local_path=Path("/tmp/test_repo").resolve(),
|
||||||
slug="test_user/test_repo",
|
slug="test_user/test_repo",
|
||||||
id="id",
|
id=uuid.uuid4(),
|
||||||
branch="main",
|
branch="main",
|
||||||
max_file_size=1_000_000,
|
max_file_size=1_000_000,
|
||||||
ignore_patterns={"*.pyc", "__pycache__", ".git"},
|
ignore_patterns={"*.pyc", "__pycache__", ".git"},
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,7 @@ async def test_parse_query_without_host(
|
||||||
query = await parse_remote_repo(url)
|
query = await parse_remote_repo(url)
|
||||||
|
|
||||||
# Compare against the canonical dict while ignoring unpredictable fields.
|
# Compare against the canonical dict while ignoring unpredictable fields.
|
||||||
actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns"})
|
actual = query.model_dump(exclude={"id", "local_path", "ignore_patterns", "s3_url"})
|
||||||
|
|
||||||
assert "commit" in actual
|
assert "commit" in actual
|
||||||
assert _is_valid_git_commit_hash(actual["commit"])
|
assert _is_valid_git_commit_hash(actual["commit"])
|
||||||
|
|
|
||||||
|
|
@ -50,7 +50,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non
|
||||||
client = request.getfixturevalue("test_client")
|
client = request.getfixturevalue("test_client")
|
||||||
form_data = {
|
form_data = {
|
||||||
"input_text": "https://github.com/octocat/Hello-World",
|
"input_text": "https://github.com/octocat/Hello-World",
|
||||||
"max_file_size": "243",
|
"max_file_size": 243,
|
||||||
"pattern_type": "exclude",
|
"pattern_type": "exclude",
|
||||||
"pattern": "",
|
"pattern": "",
|
||||||
"token": "",
|
"token": "",
|
||||||
|
|
@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None:
|
||||||
client = request.getfixturevalue("test_client")
|
client = request.getfixturevalue("test_client")
|
||||||
form_data = {
|
form_data = {
|
||||||
"input_text": "https://github.com/nonexistent/repo",
|
"input_text": "https://github.com/nonexistent/repo",
|
||||||
"max_file_size": "243",
|
"max_file_size": 243,
|
||||||
"pattern_type": "exclude",
|
"pattern_type": "exclude",
|
||||||
"pattern": "",
|
"pattern": "",
|
||||||
"token": "",
|
"token": "",
|
||||||
|
|
@ -97,7 +97,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None:
|
||||||
# TODO: ingesting a large repo take too much time (eg: godotengine/godot repository)
|
# TODO: ingesting a large repo take too much time (eg: godotengine/godot repository)
|
||||||
form_data = {
|
form_data = {
|
||||||
"input_text": "https://github.com/octocat/hello-world",
|
"input_text": "https://github.com/octocat/hello-world",
|
||||||
"max_file_size": "10",
|
"max_file_size": 10,
|
||||||
"pattern_type": "exclude",
|
"pattern_type": "exclude",
|
||||||
"pattern": "",
|
"pattern": "",
|
||||||
"token": "",
|
"token": "",
|
||||||
|
|
@ -122,7 +122,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None:
|
||||||
def make_request() -> None:
|
def make_request() -> None:
|
||||||
form_data = {
|
form_data = {
|
||||||
"input_text": "https://github.com/octocat/hello-world",
|
"input_text": "https://github.com/octocat/hello-world",
|
||||||
"max_file_size": "243",
|
"max_file_size": 243,
|
||||||
"pattern_type": "exclude",
|
"pattern_type": "exclude",
|
||||||
"pattern": "",
|
"pattern": "",
|
||||||
"token": "",
|
"token": "",
|
||||||
|
|
@ -149,7 +149,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None:
|
||||||
client = request.getfixturevalue("test_client")
|
client = request.getfixturevalue("test_client")
|
||||||
form_data = {
|
form_data = {
|
||||||
"input_text": "https://github.com/octocat/Hello-World",
|
"input_text": "https://github.com/octocat/Hello-World",
|
||||||
"max_file_size": "1",
|
"max_file_size": 1,
|
||||||
"pattern_type": "exclude",
|
"pattern_type": "exclude",
|
||||||
"pattern": "",
|
"pattern": "",
|
||||||
"token": "",
|
"token": "",
|
||||||
|
|
@ -172,7 +172,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None:
|
||||||
client = request.getfixturevalue("test_client")
|
client = request.getfixturevalue("test_client")
|
||||||
form_data = {
|
form_data = {
|
||||||
"input_text": "https://github.com/octocat/Hello-World",
|
"input_text": "https://github.com/octocat/Hello-World",
|
||||||
"max_file_size": "243",
|
"max_file_size": 243,
|
||||||
"pattern_type": "include",
|
"pattern_type": "include",
|
||||||
"pattern": "*.md",
|
"pattern": "*.md",
|
||||||
"token": "",
|
"token": "",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue