Update docs and helm chart for agent health endpoints

- Add health-addr config option to UNIFIED_AGENT.md
- Document /healthz, /readyz, /metrics endpoints
- Add Kubernetes probe examples to docs
- Add liveness/readiness probes to helm chart agent template
- Add healthPort, livenessProbe, readinessProbe to values.yaml
- Update values.schema.json with new agent probe options
This commit is contained in:
rcourtman 2025-12-02 22:45:24 +00:00
parent 7fc15417e4
commit da43588189
4 changed files with 133 additions and 0 deletions

View file

@ -96,6 +96,26 @@ spec:
resources:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- /* Liveness probe: HTTP GET on the agent health port; rendered only when agent.livenessProbe.enabled is true. */}}
{{- if .Values.agent.livenessProbe.enabled }}
livenessProbe:
  httpGet:
    {{- /* Quote the path so an empty or special-character value still renders as a YAML string. */}}
    path: {{ .Values.agent.livenessProbe.path | quote }}
    port: {{ .Values.agent.healthPort }}
  initialDelaySeconds: {{ .Values.agent.livenessProbe.initialDelaySeconds }}
  periodSeconds: {{ .Values.agent.livenessProbe.periodSeconds }}
  timeoutSeconds: {{ .Values.agent.livenessProbe.timeoutSeconds }}
  failureThreshold: {{ .Values.agent.livenessProbe.failureThreshold }}
{{- end }}
{{- /* Readiness probe: HTTP GET on the agent health port; rendered only when agent.readinessProbe.enabled is true. */}}
{{- if .Values.agent.readinessProbe.enabled }}
readinessProbe:
  httpGet:
    {{- /* Quote the path so an empty or special-character value still renders as a YAML string. */}}
    path: {{ .Values.agent.readinessProbe.path | quote }}
    port: {{ .Values.agent.healthPort }}
  initialDelaySeconds: {{ .Values.agent.readinessProbe.initialDelaySeconds }}
  periodSeconds: {{ .Values.agent.readinessProbe.periodSeconds }}
  timeoutSeconds: {{ .Values.agent.readinessProbe.timeoutSeconds }}
  failureThreshold: {{ .Values.agent.readinessProbe.failureThreshold }}
{{- end }}
{{- if or .Values.agent.dockerSocket.enabled .Values.agent.extraVolumes }}
volumes:
{{- if .Values.agent.dockerSocket.enabled }}

View file

@ -159,6 +159,62 @@
"type": "string",
"enum": ["DaemonSet", "Deployment"],
"description": "Agent deployment type"
},
"healthPort": {
  "type": "integer",
  "minimum": 1,
  "maximum": 65535,
  "description": "Port of the agent health/metrics HTTP server; used as the target port of the liveness and readiness probes"
},
"livenessProbe": {
  "type": "object",
  "description": "Liveness probe settings for the agent container",
  "properties": {
    "enabled": {
      "type": "boolean",
      "description": "Enable liveness probe"
    },
    "path": {
      "type": "string",
      "description": "Liveness probe path"
    },
    "initialDelaySeconds": {
      "type": "integer",
      "minimum": 0,
      "description": "Seconds to wait after container start before the first liveness probe"
    },
    "periodSeconds": {
      "type": "integer",
      "minimum": 0,
      "description": "Seconds between liveness probes"
    },
    "timeoutSeconds": {
      "type": "integer",
      "minimum": 0,
      "description": "Seconds after which a liveness probe times out"
    },
    "failureThreshold": {
      "type": "integer",
      "minimum": 0,
      "description": "Consecutive liveness probe failures before the container is restarted"
    }
  }
},
"readinessProbe": {
  "type": "object",
  "description": "Readiness probe settings for the agent container",
  "properties": {
    "enabled": {
      "type": "boolean",
      "description": "Enable readiness probe"
    },
    "path": {
      "type": "string",
      "description": "Readiness probe path"
    },
    "initialDelaySeconds": {
      "type": "integer",
      "minimum": 0,
      "description": "Seconds to wait after container start before the first readiness probe"
    },
    "periodSeconds": {
      "type": "integer",
      "minimum": 0,
      "description": "Seconds between readiness probes"
    },
    "timeoutSeconds": {
      "type": "integer",
      "minimum": 0,
      "description": "Seconds after which a readiness probe times out"
    },
    "failureThreshold": {
      "type": "integer",
      "minimum": 0,
      "description": "Consecutive readiness probe failures before the pod is marked not ready"
    }
  }
}
}
},

View file

@ -138,6 +138,21 @@ agent:
hostPathType: Socket
extraVolumes: []
extraVolumeMounts: []
# Port the liveness/readiness probes connect to. The agent's health/metrics
# server listens on :9191 by default (--health-addr / PULSE_HEALTH_ADDR).
healthPort: 9191
# Liveness probe: HTTP GET against the agent health server.
livenessProbe:
  enabled: true
  path: /healthz
  initialDelaySeconds: 5
  periodSeconds: 10
  timeoutSeconds: 3
  failureThreshold: 3
# Readiness probe: HTTP GET against the agent health server.
readinessProbe:
  enabled: true
  path: /readyz
  initialDelaySeconds: 5
  periodSeconds: 5
  timeoutSeconds: 3
  failureThreshold: 3
# Monitoring configuration
monitoring:

View file

@ -45,6 +45,7 @@ curl -fsSL http://<pulse-ip>:7655/install.sh | \
| `--insecure` | `PULSE_INSECURE_SKIP_VERIFY` | Skip TLS verification | `false` |
| `--hostname` | `PULSE_HOSTNAME` | Override hostname | *(OS hostname)* |
| `--agent-id` | `PULSE_AGENT_ID` | Unique agent identifier | *(machine-id)* |
| `--health-addr` | `PULSE_HEALTH_ADDR` | Health/metrics server address | `:9191` |
## Installation Options
@ -105,6 +106,47 @@ The install script automatically removes legacy agents when installing the unifi
No manual cleanup is required.
## Health Checks & Metrics
The agent exposes HTTP endpoints for health checks and Prometheus metrics on port 9191 by default. Use `--health-addr` (or `PULSE_HEALTH_ADDR`) to change the listen address.
### Endpoints
| Endpoint | Description |
|----------|-------------|
| `/healthz` | Liveness probe - returns 200 if agent is running |
| `/readyz` | Readiness probe - returns 200 when agents are initialized |
| `/metrics` | Prometheus metrics |
### Prometheus Metrics
| Metric | Type | Description |
|--------|------|-------------|
| `pulse_agent_info` | Gauge | Agent info with version, host_enabled, docker_enabled labels |
| `pulse_agent_up` | Gauge | 1 when running, 0 when shutting down |
### Kubernetes Probes
```yaml
# Restart the container when the agent process stops responding.
livenessProbe:
  httpGet:
    path: /healthz
    port: 9191
  initialDelaySeconds: 5
  periodSeconds: 10
# Mark the pod ready once /readyz returns 200.
readinessProbe:
  httpGet:
    path: /readyz
    port: 9191
  initialDelaySeconds: 5
  periodSeconds: 5
```
### Disable Health Server
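Set `--health-addr=""` or `PULSE_HEALTH_ADDR=""` to disable the health/metrics server. If you disable it, also disable any Kubernetes liveness/readiness probes that target it (in the Helm chart, set `agent.livenessProbe.enabled` and `agent.readinessProbe.enabled` to `false`); otherwise the probes will fail.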
## Troubleshooting
### Agent Not Updating