airi/apps/server/docker-compose.otel.yml
RainbowBird f259e7eb92
Some checks are pending
Cloudflare Workers (server-dev) / Deploy - stage-web (server-dev) (push) Waiting to run
feat(server): replace legacy health endpoints with K8s-style /livez and /readyz probes
2026-05-15 19:02:57 +08:00

122 lines
3.8 KiB
YAML

# Observability stack for the application: an OpenTelemetry Collector in
# front of Prometheus (metrics), Loki (logs) and Tempo (traces), with Grafana
# for visualization.
#
# Only the collector's OTLP ports and Grafana's UI are published to the host;
# Prometheus, Loki and Tempo are reachable only on the Compose network.
name: proj-airi-otel

services:
  # ============================================================
  # OpenTelemetry Collector
  # Receives traces, metrics, and logs from the application
  # and exports them to the appropriate backends.
  # ============================================================
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.120.0
    command: ['--config=/etc/otelcol/otel-collector.yaml']
    volumes:
      - ./otel/collector/otel-collector.yaml:/etc/otelcol/otel-collector.yaml:ro
    ports:
      - '4317:4317'  # OTLP gRPC
      - '4318:4318'  # OTLP HTTP
    # Only start ordering is enforced here (service_started), not readiness
    # of the backends.
    depends_on:
      loki:
        condition: service_started
      tempo:
        condition: service_started
    # No healthcheck on purpose: the upstream collector image is built FROM
    # scratch and ships no shell, wget or curl, so an in-container
    # `wget http://localhost:13133/` probe can never succeed and would leave
    # the service permanently "unhealthy". Probe the collector's health
    # endpoint from outside the container if a check is needed.
    restart: unless-stopped

  # ============================================================
  # Prometheus - Metrics storage and querying
  # ============================================================
  prometheus:
    image: prom/prometheus:v3.2.1
    command:
      - --config.file=/etc/prometheus/prometheus.yaml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=7d
      # Accept metrics pushed via the remote-write API.
      - --web.enable-remote-write-receiver
      - --enable-feature=exemplar-storage
      - --enable-feature=native-histograms
    volumes:
      - ./otel/prometheus/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro
      - prometheus_data:/prometheus
    healthcheck:
      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:9090/-/healthy']
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped

  # ============================================================
  # Loki - Log aggregation
  # ============================================================
  loki:
    image: grafana/loki:3.4.3
    # Exec (list) form, matching the other services in this file.
    command: ['-config.file=/etc/loki/loki.yaml']
    volumes:
      - ./otel/loki/loki.yaml:/etc/loki/loki.yaml:ro
      - loki_data:/loki
    healthcheck:
      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3100/ready']
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped

  # ============================================================
  # Tempo - Distributed tracing backend
  # ============================================================
  tempo:
    image: grafana/tempo:2.7.2
    command: ['-config.file=/etc/tempo/tempo.yaml']
    volumes:
      - ./otel/tempo/tempo.yaml:/etc/tempo/tempo.yaml:ro
      - tempo_data:/var/tempo
    healthcheck:
      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3200/ready']
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped

  # ============================================================
  # Grafana - Visualization and dashboards
  # ============================================================
  grafana:
    image: grafana/grafana:11.5.2
    # NOTE(review): default admin/admin credentials plus anonymous Viewer
    # access look intended for local development only - confirm before
    # exposing this stack beyond a developer machine.
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor tempoSearch tempoServiceGraph
    volumes:
      - ./otel/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./otel/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana_data:/var/lib/grafana
    ports:
      - '3001:3000'  # host 3001 -> Grafana UI on container port 3000
    # Grafana starts only after every datasource backend reports healthy.
    depends_on:
      prometheus:
        condition: service_healthy
      loki:
        condition: service_healthy
      tempo:
        condition: service_healthy
    healthcheck:
      # /api/health is Grafana's documented health endpoint. /livez belongs
      # to the application server's own probes, not to Grafana.
      test: ['CMD', 'wget', '--spider', '-q', 'http://localhost:3000/api/health']
      interval: 10s
      timeout: 5s
      retries: 5
    restart: unless-stopped

# Named volumes that persist each backend's data across container restarts.
volumes:
  prometheus_data:
    driver: local
  loki_data:
    driver: local
  tempo_data:
    driver: local
  grafana_data:
    driver: local