Pulse/internal/api/deploy_handlers.go

2049 lines
63 KiB
Go

package api
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/deploy"
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
unifiedresources "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/pkg/auth"
"github.com/rs/zerolog/log"
)
// DeployHandlers provides HTTP handlers for cluster agent deployment.
//
// The handlers cover candidate discovery, preflight jobs (creation, status and
// SSE event streaming), bootstrap-token enrollment and deploy job creation.
// Instances are built with NewDeployHandlers, which also initialises sseSubs.
type DeployHandlers struct {
// store persists deploy jobs, targets and events.
store *deploy.Store
// monitor supplies the unified read-state used to enumerate cluster nodes.
monitor *monitoring.Monitor
// execServer is the channel to connected agents: connectivity checks,
// command dispatch and deploy-progress subscriptions.
execServer *agentexec.Server
// reservation is not referenced by the handlers visible in this part of the
// file; presumably it tracks license slot reservations — TODO confirm.
reservation *deploy.ReservationManager
// resolvePublicURL derives the Pulse URL for agent reachability checks.
resolvePublicURL func(req *http.Request) string
// config and persistence for token minting/validation in enroll flow.
// persistence may be nil, in which case token changes are kept in memory only.
config *config.Config
persistence *config.ConfigPersistence
// Active preflight SSE subscriptions keyed by preflightID.
// sseMu guards the sseSubs map itself; each entry has its own lock.
sseMu sync.Mutex
sseSubs map[string]*deploySSESub
}
// deploySSESub tracks SSE clients for a single preflight job.
type deploySSESub struct {
// clients holds one event channel per connected SSE client. Channels are
// closed and removed from the map together while mu is held.
clients map[string]chan []byte // clientID -> event channel
// mu guards clients.
mu sync.Mutex
}
// NewDeployHandlers wires the supplied collaborators into a DeployHandlers and
// gives it an empty SSE subscription registry. Arguments are stored as-is; no
// validation is performed here.
func NewDeployHandlers(
	store *deploy.Store,
	monitor *monitoring.Monitor,
	execServer *agentexec.Server,
	reservation *deploy.ReservationManager,
	resolvePublicURL func(req *http.Request) string,
	cfg *config.Config,
	persistence *config.ConfigPersistence,
) *DeployHandlers {
	h := new(DeployHandlers)

	// Deployment collaborators.
	h.store = store
	h.monitor = monitor
	h.execServer = execServer
	h.reservation = reservation
	h.resolvePublicURL = resolvePublicURL

	// Token minting / persistence for the enroll flow.
	h.config = cfg
	h.persistence = persistence

	// The registry must be non-nil before any SSE client is added.
	h.sseSubs = map[string]*deploySSESub{}
	return h
}
// --- Candidates ---
// candidateNode is the per-node response in the candidates list.
type candidateNode struct {
// NodeID is the node's ID from the unified read-state.
NodeID string `json:"nodeId"`
// Name is the display name produced by nodeName.
Name string `json:"name"`
// IP is derived from the node's host URL via nodeIP; omitted when empty.
IP string `json:"ip,omitempty"`
// HasAgent is true when the node already has a linked agent ID.
HasAgent bool `json:"hasAgent"`
// Deployable is true only for nodes without a linked agent.
Deployable bool `json:"deployable"`
// Reason explains why a node is not deployable ("already_agent"); omitted otherwise.
Reason string `json:"reason,omitempty"`
}
// sourceAgentInfo describes a connected agent that can execute SSH to peers.
type sourceAgentInfo struct {
// AgentID is the agent ID linked to the node.
AgentID string `json:"agentId"`
// NodeID is the cluster node the agent is linked to.
NodeID string `json:"nodeId"`
// Online is always true as built by HandleCandidates, which only lists
// agents that are currently connected to the exec server.
Online bool `json:"online"`
}
// candidatesResponse is the JSON body returned by HandleCandidates.
type candidatesResponse struct {
// ClusterID echoes the cluster identifier taken from the request path.
ClusterID string `json:"clusterId"`
// ClusterName is the cluster name reported by the first matching node;
// empty when no node matched.
ClusterName string `json:"clusterName"`
// SourceAgents lists connected agents on nodes of this cluster.
SourceAgents []sourceAgentInfo `json:"sourceAgents"`
// Nodes lists every cluster member node, deployable or not.
Nodes []candidateNode `json:"nodes"`
}
// HandleCandidates returns deployment candidate nodes for a cluster.
// GET /api/clusters/{clusterId}/agent-deploy/candidates
//
// Every cluster member node whose cluster name equals the {clusterId} path
// segment is listed. Nodes that already have a linked agent are reported as
// not deployable ("already_agent"); if that agent is currently connected to
// the exec server it is additionally listed as a possible source agent.
//
// Responses:
//   - 405 for non-GET requests
//   - 400 invalid_cluster_id when the cluster ID cannot be extracted
//   - 503 state_unavailable when no read-state is available
//   - 200 with a candidatesResponse otherwise; "nodes" and "sourceAgents" are
//     always JSON arrays (possibly empty), never null.
func (h *DeployHandlers) HandleCandidates(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	clusterID := extractClusterID(r.URL.Path, "/api/clusters/", "/agent-deploy/candidates")
	if clusterID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_cluster_id", "Cluster ID is required", nil)
		return
	}
	readState := h.monitor.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		writeErrorResponse(w, http.StatusServiceUnavailable, "state_unavailable", "Resource state is unavailable", nil)
		return
	}

	// Build connected agents set.
	connectedAgents := make(map[string]bool)
	for _, agent := range h.execServer.GetConnectedAgents() {
		connectedAgents[agent.AgentID] = true
	}

	// Start from empty, non-nil slices: encoding/json renders a nil slice as
	// null, which breaks clients that iterate the arrays unconditionally.
	clusterName := ""
	nodes := []candidateNode{}
	sourceAgents := []sourceAgentInfo{}

	for _, node := range readState.Nodes() {
		if node == nil || !node.IsClusterMember() {
			continue
		}
		// Match cluster by name (clusterID in URL = cluster name).
		if node.ClusterName() != clusterID {
			continue
		}
		if clusterName == "" {
			clusterName = node.ClusterName()
		}

		agentID := node.LinkedAgentID()
		cn := candidateNode{
			NodeID:   node.ID(),
			Name:     nodeName(node),
			IP:       nodeIP(node.HostURL()),
			HasAgent: agentID != "",
		}
		if cn.HasAgent {
			cn.Deployable = false
			cn.Reason = "already_agent"
			// This node has an agent — it is a source candidate if that
			// agent is currently connected.
			if connectedAgents[agentID] {
				sourceAgents = append(sourceAgents, sourceAgentInfo{
					AgentID: agentID,
					NodeID:  node.ID(),
					Online:  true,
				})
			}
		} else {
			cn.Deployable = true
		}
		nodes = append(nodes, cn)
	}

	resp := candidatesResponse{
		ClusterID:    clusterID,
		ClusterName:  clusterName,
		SourceAgents: sourceAgents,
		Nodes:        nodes,
	}
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		// Headers are already sent; all we can do is record the failure.
		log.Error().Err(err).Str("cluster_id", clusterID).Msg("Failed to encode candidates response")
	}
}
// --- Preflight ---
// createPreflightRequest is the JSON body accepted by HandleCreatePreflight.
type createPreflightRequest struct {
// SourceAgentID is the connected agent that runs the preflight; required
// (surrounding whitespace is trimmed by the handler).
SourceAgentID string `json:"sourceAgentId"`
// TargetNodeIDs lists the node IDs to check; the handler requires 1 to 100 entries.
TargetNodeIDs []string `json:"targetNodeIds"`
// MaxParallel is the requested concurrency; the handler defaults values
// <= 0 to 2 and caps the value at 10.
MaxParallel int `json:"maxParallel"`
}
// createPreflightResponse is the JSON body returned (with 202 Accepted) by
// HandleCreatePreflight.
type createPreflightResponse struct {
// PreflightID is the ID of the newly created preflight job.
PreflightID string `json:"preflightId"`
// Status is the job status at response time (the running status).
Status string `json:"status"`
// EventsURL is the relative SSE endpoint for this preflight's events.
EventsURL string `json:"eventsUrl"`
}
// HandleCreatePreflight creates a preflight job and dispatches to the source agent.
// POST /api/clusters/{clusterId}/agent-deploy/preflights
//
// Flow:
//  1. Validate the request (source agent ID, 1..100 target node IDs).
//  2. Require the source agent to be connected and linked to a node of the
//     requested cluster.
//  3. Persist the job, then one target per distinct requested node that is a
//     member of the cluster and has a resolvable IP. Unknown nodes, nodes
//     without an IP and repeated node IDs are skipped.
//  4. Subscribe to agent progress, send the preflight command and hand the
//     progress channel to processPreflightProgress in a background goroutine.
//
// Responses: 405, 400 (invalid_cluster_id, invalid_body, missing_source_agent,
// missing_targets, too_many_targets, source_not_in_cluster, no_valid_targets),
// 409 source_agent_offline, 503 state_unavailable, 500 (store_error,
// no_pulse_url, send_failed) and 202 with a createPreflightResponse on success.
func (h *DeployHandlers) HandleCreatePreflight(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	clusterID := extractClusterID(r.URL.Path, "/api/clusters/", "/agent-deploy/preflights")
	if clusterID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_cluster_id", "Cluster ID is required", nil)
		return
	}
	var req createPreflightRequest
	if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
		return
	}
	req.SourceAgentID = strings.TrimSpace(req.SourceAgentID)
	if req.SourceAgentID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_source_agent", "sourceAgentId is required", nil)
		return
	}
	if len(req.TargetNodeIDs) == 0 {
		writeErrorResponse(w, http.StatusBadRequest, "missing_targets", "At least one targetNodeIds entry is required", nil)
		return
	}
	if len(req.TargetNodeIDs) > 100 {
		writeErrorResponse(w, http.StatusBadRequest, "too_many_targets", "Maximum 100 targets per preflight", nil)
		return
	}
	// Verify source agent is connected.
	if !h.execServer.IsAgentConnected(req.SourceAgentID) {
		writeErrorResponse(w, http.StatusConflict, "source_agent_offline", "Source agent is not connected", nil)
		return
	}
	// Resolve cluster nodes from read-state.
	readState := h.monitor.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		writeErrorResponse(w, http.StatusServiceUnavailable, "state_unavailable", "Resource state is unavailable", nil)
		return
	}
	clusterName := ""
	sourceNodeID := ""
	nodesByID := make(map[string]*unifiedresources.NodeView)
	for _, node := range readState.Nodes() {
		if node == nil {
			continue
		}
		if node.ClusterName() == clusterID && node.IsClusterMember() {
			nodesByID[node.ID()] = node
			if clusterName == "" {
				clusterName = node.ClusterName()
			}
			if node.LinkedAgentID() == req.SourceAgentID {
				sourceNodeID = node.ID()
			}
		}
	}
	if sourceNodeID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "source_not_in_cluster",
			"Source agent is not linked to a node in this cluster", nil)
		return
	}

	// Build the job record.
	now := time.Now().UTC()
	jobID := generateID("pf")
	maxParallel := req.MaxParallel
	if maxParallel <= 0 {
		maxParallel = 2
	}
	if maxParallel > 10 {
		maxParallel = 10
	}
	job := &deploy.Job{
		ID:            jobID,
		ClusterID:     clusterID,
		ClusterName:   clusterName,
		SourceAgentID: req.SourceAgentID,
		SourceNodeID:  sourceNodeID,
		OrgID:         resolveTenantOrgID(r),
		Status:        deploy.JobQueued,
		MaxParallel:   maxParallel,
		RetryMax:      0, // preflights don't retry
		CreatedAt:     now,
		UpdatedAt:     now,
	}
	ctx := r.Context()
	if err := h.store.CreateJob(ctx, job); err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to create preflight job")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to create preflight job", nil)
		return
	}

	// markFailed records the job as failed. The status update stays
	// best-effort (the client receives its error response regardless), but a
	// failure to persist it is logged rather than dropped.
	markFailed := func() {
		if err := h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed); err != nil {
			log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to mark preflight job as failed")
		}
	}

	// Build deploy targets from requested node IDs. Repeated IDs are ignored
	// so a node never gets two targets within the same preflight.
	seen := make(map[string]struct{}, len(req.TargetNodeIDs))
	var targets []agentexec.DeployPreflightTarget
	for _, nodeID := range req.TargetNodeIDs {
		nodeID = strings.TrimSpace(nodeID)
		if _, dup := seen[nodeID]; dup {
			continue
		}
		seen[nodeID] = struct{}{}

		node, ok := nodesByID[nodeID]
		if !ok {
			continue // skip nodes not in cluster
		}
		ip := nodeIP(node.HostURL())
		if ip == "" {
			continue // skip nodes without IP
		}
		targetID := generateID("tgt")
		target := &deploy.Target{
			ID:        targetID,
			JobID:     jobID,
			NodeID:    nodeID,
			NodeName:  nodeName(node),
			NodeIP:    ip,
			Status:    deploy.TargetPending,
			CreatedAt: now,
			UpdatedAt: now,
		}
		if err := h.store.CreateTarget(ctx, target); err != nil {
			log.Error().Err(err).Str("target_id", targetID).Msg("Failed to create preflight target")
			continue
		}
		targets = append(targets, agentexec.DeployPreflightTarget{
			TargetID: targetID,
			NodeName: nodeName(node),
			NodeIP:   ip,
		})
	}
	if len(targets) == 0 {
		markFailed()
		writeErrorResponse(w, http.StatusBadRequest, "no_valid_targets",
			"None of the requested nodes are valid deployment targets", nil)
		return
	}

	// Resolve Pulse URL for agent reachability.
	pulseURL := h.resolvePublicURL(r)
	if pulseURL == "" {
		markFailed()
		writeErrorResponse(w, http.StatusInternalServerError, "no_pulse_url",
			"Cannot determine Pulse URL for agent reachability", nil)
		return
	}
	requestID := generateID("req")
	payload := agentexec.DeployPreflightPayload{
		RequestID:   requestID,
		JobID:       jobID,
		Targets:     targets,
		PulseURL:    pulseURL,
		MaxParallel: maxParallel,
		Timeout:     120,
	}

	// Transition to running.
	if err := h.store.UpdateJobStatus(ctx, jobID, deploy.JobRunning); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to mark preflight job as running")
	}
	// Append job-created event.
	if err := h.store.AppendEvent(ctx, &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventJobCreated,
		Message:   fmt.Sprintf("Preflight started for %d targets", len(targets)),
		CreatedAt: now,
	}); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to persist preflight job-created event")
	}

	// Subscribe to progress before sending command to avoid race.
	progressCh := h.execServer.SubscribeDeployProgress(req.SourceAgentID, jobID, 64)
	// Send command to agent.
	if err := h.execServer.SendDeployPreflight(ctx, req.SourceAgentID, payload); err != nil {
		h.execServer.UnsubscribeDeployProgress(req.SourceAgentID, jobID)
		markFailed()
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send preflight command")
		writeErrorResponse(w, http.StatusInternalServerError, "send_failed",
			"Failed to send preflight command to agent", nil)
		return
	}

	// Start background goroutine to process progress events. It ends when the
	// agent reports a final progress message or the progress channel closes.
	go h.processPreflightProgress(jobID, req.SourceAgentID, progressCh)

	resp := createPreflightResponse{
		PreflightID: jobID,
		Status:      string(deploy.JobRunning),
		EventsURL:   fmt.Sprintf("/api/agent-deploy/preflights/%s/events", jobID),
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to encode preflight response")
	}
}
// HandleGetPreflight reports the stored state of one preflight job together
// with its targets.
// GET /api/agent-deploy/preflights/{preflightId}
//
// A job owned by a different org is answered exactly like a missing job (404)
// so that job IDs are not disclosed across tenants.
func (h *DeployHandlers) HandleGetPreflight(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	id := extractPathSuffix(r.URL.Path, "/api/agent-deploy/preflights/")
	if id == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Preflight ID is required", nil)
		return
	}
	// Routing should never deliver an /events path here; strip it defensively.
	id = strings.TrimSuffix(id, "/events")

	ctx := r.Context()

	job, err := h.store.GetJob(ctx, id)
	if err != nil {
		log.Error().Err(err).Str("id", id).Msg("Failed to get preflight job")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight", nil)
		return
	}
	// Missing job, or tenant isolation: the job must belong to the caller's org.
	if job == nil || job.OrgID != resolveTenantOrgID(r) {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
		return
	}

	targets, err := h.store.GetTargetsForJob(ctx, id)
	if err != nil {
		log.Error().Err(err).Str("id", id).Msg("Failed to get preflight targets")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get targets", nil)
		return
	}

	// The job's own fields are flattened into the top-level object via
	// embedding; targets are attached under "targets".
	var body struct {
		*deploy.Job
		Targets []deploy.Target `json:"targets"`
	}
	body.Job = job
	body.Targets = targets

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(body)
}
// HandlePreflightEvents streams SSE events for a preflight job.
// GET /api/agent-deploy/preflights/{preflightId}/events
//
// After verifying that the job exists and belongs to the caller's org, the
// handler registers an SSE client, replays the events already persisted for
// the job and then forwards live events until the client disconnects or the
// job's subscription is closed. A comment-only heartbeat is written every 15
// seconds. If the job is already terminal, a single "job_complete" message is
// written after the replay and the stream ends.
//
// Because live events are buffered from the moment the client is registered
// while the replay is read afterwards, an event may be delivered twice.
func (h *DeployHandlers) HandlePreflightEvents(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	// Extract preflight ID: /api/agent-deploy/preflights/{id}/events
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/preflights/")
	preflightID := strings.TrimSuffix(path, "/events")
	// preflightID == path means the /events suffix was absent.
	if preflightID == "" || preflightID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Preflight ID is required", nil)
		return
	}
	ctx := r.Context()

	// Verify the preflight exists.
	job, err := h.store.GetJob(ctx, preflightID)
	if err != nil {
		log.Error().Err(err).Str("id", preflightID).Msg("Failed to get preflight job for SSE")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
		return
	}
	// Tenant isolation: verify the job belongs to the caller's org.
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
		return
	}
	flusher, ok := w.(http.Flusher)
	if !ok {
		writeErrorResponse(w, http.StatusInternalServerError, "streaming_unsupported", "Streaming not supported", nil)
		return
	}
	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// Register SSE client.
	clientID := generateID("sse")
	eventCh := h.addSSEClient(preflightID, clientID)
	defer h.removeSSEClient(preflightID, clientID)

	// Re-read the job now that we are subscribed. The job may have finished
	// between the first lookup and the registration above; in that case its
	// subscription was closed before we joined, nothing will ever close
	// eventCh, and the stale status would keep this stream alive on
	// heartbeats alone. On lookup failure, fall back to the earlier snapshot.
	if latest, latestErr := h.store.GetJob(ctx, preflightID); latestErr != nil {
		log.Warn().Err(latestErr).Str("id", preflightID).Msg("Failed to refresh preflight job for SSE")
	} else if latest != nil {
		job = latest
	}

	// Send existing events first (replay).
	events, replayErr := h.store.GetEventsForJob(ctx, preflightID)
	if replayErr != nil {
		log.Error().Err(replayErr).Str("id", preflightID).Msg("Failed to load events for SSE replay")
		// Send an error event so the client knows replay is incomplete.
		fmt.Fprintf(w, "event: error\ndata: {\"message\":\"failed to load event history\"}\n\n")
	}
	for _, evt := range events {
		data, _ := json.Marshal(evt)
		fmt.Fprintf(w, "data: %s\n\n", data)
	}
	flusher.Flush()

	// If job is already terminal, send final status and close.
	if isDeployJobTerminal(job.Status) {
		data, _ := json.Marshal(map[string]string{
			"type":   "job_complete",
			"status": string(job.Status),
		})
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
		return
	}

	// Stream new events.
	heartbeat := time.NewTicker(15 * time.Second)
	defer heartbeat.Stop()
	for {
		select {
		case <-ctx.Done():
			// Client went away or the request was canceled.
			return
		case eventData, ok := <-eventCh:
			if !ok {
				// Subscription closed: the job reached its end.
				return
			}
			fmt.Fprintf(w, "data: %s\n\n", eventData)
			flusher.Flush()
		case <-heartbeat.C:
			// SSE comment line; keeps intermediaries from timing out the stream.
			fmt.Fprint(w, ": heartbeat\n\n")
			flusher.Flush()
		}
	}
}
// --- Progress processing ---
// processPreflightProgress drains the agent's progress channel for one
// preflight job. Each progress message is stored as a deploy event, applied
// to the target's status when it names a target, and fanned out to SSE
// clients. The loop ends either on a message flagged Final (job status is
// derived from the targets) or when the channel closes without one (job is
// marked failed). In both cases a closing event is emitted and the job's SSE
// subscription is closed. The exec-server subscription is released on return.
//
// A background context is used because this runs detached from the HTTP
// request that started the preflight.
func (h *DeployHandlers) processPreflightProgress(jobID, agentID string, ch <-chan agentexec.DeployProgressPayload) {
	defer h.execServer.UnsubscribeDeployProgress(agentID, jobID)

	bg := context.Background()

	// conclude persists and broadcasts the closing event, then closes every
	// SSE channel registered for this job.
	conclude := func(last *deploy.Event) {
		_ = h.store.AppendEvent(bg, last)
		h.broadcastSSE(jobID, last)
		h.closeSSESub(jobID)
	}

	for {
		update, open := <-ch
		if !open {
			break
		}

		record := &deploy.Event{
			ID:        generateID("evt"),
			JobID:     jobID,
			TargetID:  update.TargetID,
			Type:      deploy.EventPreflightResult,
			Message:   update.Message,
			Data:      update.Data,
			CreatedAt: time.Now().UTC(),
		}
		if err := h.store.AppendEvent(bg, record); err != nil {
			log.Error().Err(err).Str("job_id", jobID).Msg("Failed to persist deploy event")
		}

		// Job-level messages carry no target and therefore change no target status.
		if update.TargetID != "" {
			h.updateTargetFromProgress(bg, update)
		}

		h.broadcastSSE(jobID, record)

		if !update.Final {
			continue
		}

		// Final message: the job outcome is derived from the target statuses.
		// For preflights, TargetReady means "passed" (not active).
		outcome := derivePreflightJobStatus(bg, h.store, jobID)
		_ = h.store.UpdateJobStatus(bg, jobID, outcome)
		conclude(&deploy.Event{
			ID:        generateID("evt"),
			JobID:     jobID,
			Type:      deploy.EventJobStatusChanged,
			Message:   fmt.Sprintf("Preflight completed: %s", outcome),
			CreatedAt: time.Now().UTC(),
		})
		return
	}

	// The channel closed before a final message arrived — agent disconnected.
	_ = h.store.UpdateJobStatus(bg, jobID, deploy.JobFailed)
	conclude(&deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventError,
		Message:   "Source agent disconnected during preflight",
		CreatedAt: time.Now().UTC(),
	})
}
// updateTargetFromProgress translates a progress message's (phase, status)
// pair into a target status and stores it for p.TargetID.
//
// Mapping:
//   - preflight complete + ok      -> ready
//   - preflight complete + failed  -> failed permanently (message kept as error)
//   - preflight complete + skipped -> skipped (agent)
//   - preflight SSH + started      -> preflighting
//   - preflight SSH + failed       -> failed permanently (message kept as error)
//   - canceled (any status)        -> canceled
//
// Any other combination is an intermediate step and leaves the target
// untouched. A store failure is logged, not returned.
func (h *DeployHandlers) updateTargetFromProgress(ctx context.Context, p agentexec.DeployProgressPayload) {
	var (
		next   deploy.TargetStatus
		reason string
	)

	switch p.Phase {
	case agentexec.DeployPhasePreflightComplete:
		switch p.Status {
		case agentexec.DeployStepOK:
			next = deploy.TargetReady
		case agentexec.DeployStepFailed:
			next, reason = deploy.TargetFailedPermanent, p.Message
		case agentexec.DeployStepSkipped:
			next = deploy.TargetSkippedAgent
		default:
			return // intermediate step, no status change
		}
	case agentexec.DeployPhasePreflightSSH:
		switch p.Status {
		case agentexec.DeployStepStarted:
			next = deploy.TargetPreflighting
		case agentexec.DeployStepFailed:
			next, reason = deploy.TargetFailedPermanent, p.Message
		default:
			return // intermediate step, no status change
		}
	case agentexec.DeployPhaseCanceled:
		next = deploy.TargetCanceled
	default:
		return // intermediate step, no status change
	}

	err := h.store.UpdateTargetStatus(ctx, p.TargetID, next, reason)
	if err == nil {
		return
	}
	log.Error().Err(err).
		Str("target_id", p.TargetID).
		Str("new_status", string(next)).
		Msg("Failed to update target status from progress")
}
// --- SSE subscription management ---
// addSSEClient registers clientID as a listener for jobID and returns the
// channel on which that client will receive serialized events. The job's
// subscription record is created on first use. The channel is buffered (64)
// so broadcasts never block on a single slow reader.
func (h *DeployHandlers) addSSEClient(jobID, clientID string) chan []byte {
	events := make(chan []byte, 64)

	h.sseMu.Lock()
	defer h.sseMu.Unlock()

	sub, found := h.sseSubs[jobID]
	if !found {
		sub = &deploySSESub{clients: map[string]chan []byte{}}
		h.sseSubs[jobID] = sub
	}

	// Registry lock is still held here, so the subscription cannot be
	// swapped out of the map while the client is being attached.
	sub.mu.Lock()
	defer sub.mu.Unlock()
	sub.clients[clientID] = events

	return events
}
// removeSSEClient detaches clientID from jobID's subscription and closes its
// channel. Unknown jobs or clients are ignored, so it is safe to call after
// closeSSESub has already torn the subscription down.
//
// When the last client leaves, the subscription record itself is removed from
// the registry. Without this, every job that ever had a listener — including
// listeners that connect to an already-finished job — would leave an empty
// entry in h.sseSubs indefinitely.
//
// The registry lock is held for the whole operation so that a concurrent
// addSSEClient cannot attach a client to a record that is being discarded.
// Lock order (registry, then subscription) matches addSSEClient.
func (h *DeployHandlers) removeSSEClient(jobID, clientID string) {
	h.sseMu.Lock()
	defer h.sseMu.Unlock()

	sub, ok := h.sseSubs[jobID]
	if !ok {
		return
	}

	sub.mu.Lock()
	defer sub.mu.Unlock()

	if ch, exists := sub.clients[clientID]; exists {
		close(ch)
		delete(sub.clients, clientID)
	}
	if len(sub.clients) == 0 {
		delete(h.sseSubs, jobID)
	}
}
// broadcastSSE serializes evt as JSON and offers it to every SSE client
// currently subscribed to jobID. It does nothing when the job has no
// subscription or the event cannot be marshaled. Delivery never blocks: a
// client whose buffer is full simply misses this event.
func (h *DeployHandlers) broadcastSSE(jobID string, evt *deploy.Event) {
	h.sseMu.Lock()
	sub, found := h.sseSubs[jobID]
	h.sseMu.Unlock()
	if !found {
		return
	}

	payload, marshalErr := json.Marshal(evt)
	if marshalErr != nil {
		return
	}

	// offer performs a non-blocking send; slow clients drop the event.
	offer := func(dst chan []byte) {
		select {
		case dst <- payload:
		default:
		}
	}

	sub.mu.Lock()
	for _, client := range sub.clients {
		offer(client)
	}
	sub.mu.Unlock()
}
// closeSSESub tears down the SSE subscription for jobID: the record is taken
// out of the registry first, then every client channel is closed and removed.
// Closing the channels is what tells streaming handlers to stop. Calling it
// for a job without a subscription is a no-op.
func (h *DeployHandlers) closeSSESub(jobID string) {
	h.sseMu.Lock()
	sub, found := h.sseSubs[jobID]
	// Deleting an absent key is harmless, so no need to branch on found here.
	delete(h.sseSubs, jobID)
	h.sseMu.Unlock()

	if !found {
		return
	}

	sub.mu.Lock()
	defer sub.mu.Unlock()
	for clientID, events := range sub.clients {
		close(events)
		delete(sub.clients, clientID)
	}
}
// --- Bootstrap Enrollment ---
// MintBootstrapTokenForTarget issues a bootstrap token for one deploy target.
// The token carries only the agent-enroll scope, expires req.TTL from now, is
// assigned to req.OrgID and is tagged with the metadata produced by
// req.BuildMetadata. The enroll handler removes the token on first use.
//
// It returns the raw token (to hand to the agent) and the stored record's ID.
// A non-positive TTL, or a failure to generate the token or its record, is
// returned as an error. A failure to persist the token list is only logged;
// the token is still registered in the in-memory config in that case.
func (h *DeployHandlers) MintBootstrapTokenForTarget(req deploy.BootstrapTokenRequest) (rawToken string, tokenID string, err error) {
	if req.TTL <= 0 {
		return "", "", fmt.Errorf("TTL must be positive, got %v", req.TTL)
	}

	secret, err := auth.GenerateAPIToken()
	if err != nil {
		return "", "", fmt.Errorf("generate token: %w", err)
	}

	label := fmt.Sprintf("deploy-bootstrap:%s:%s", req.JobID, req.TargetID)
	record, err := config.NewAPITokenRecord(secret, label, []string{config.ScopeAgentEnroll})
	if err != nil {
		return "", "", fmt.Errorf("create token record: %w", err)
	}

	expiresAt := time.Now().UTC().Add(req.TTL)
	record.ExpiresAt = &expiresAt
	record.OrgID = req.OrgID
	record.Metadata = req.BuildMetadata()

	// Register the token and copy the token list while holding the config
	// lock; the copy is what gets persisted after the lock is released.
	snapshot := func() []config.APITokenRecord {
		config.Mu.Lock()
		defer config.Mu.Unlock()
		h.config.UpsertAPIToken(*record)
		out := make([]config.APITokenRecord, len(h.config.APITokens))
		copy(out, h.config.APITokens)
		return out
	}()

	if h.persistence != nil {
		if saveErr := h.persistence.SaveAPITokens(snapshot); saveErr != nil {
			log.Warn().Err(saveErr).Msg("Failed to persist bootstrap token")
		}
	}

	return secret, record.ID, nil
}
// enrollRequest matches the design doc Section 3 enrollment payload.
//
// HandleEnroll reads Hostname, CommandsEnabled and Proxmox.NodeName; the
// remaining fields are decoded but not consulted by that handler.
type enrollRequest struct {
// Hostname is required; it is trimmed, checked against the token's expected
// node binding and used to name the runtime token and agent ID.
Hostname string `json:"hostname"`
FQDN string `json:"fqdn,omitempty"`
MachineID string `json:"machineId,omitempty"`
OS string `json:"os"`
Arch string `json:"arch"`
AgentVersion string `json:"agentVersion"`
// CommandsEnabled, when true, adds the agent-exec scope to the runtime token.
CommandsEnabled bool `json:"commandsEnabled,omitempty"`
// Proxmox is optional; its NodeName is accepted as an alternative match for
// the token's expected node when Hostname differs.
Proxmox *struct {
ClusterName string `json:"clusterName,omitempty"`
NodeName string `json:"nodeName,omitempty"`
} `json:"proxmox,omitempty"`
// DeployJobID is supplied by the agent; the job binding that is actually
// enforced comes from the bootstrap token's metadata, not from this field.
DeployJobID string `json:"deployJobId,omitempty"`
}
// HandleEnroll processes bootstrap token enrollment from freshly-deployed agents.
// POST /api/agents/agent/enroll
//
// The bootstrap token is taken from the request context (set by the auth
// middleware). Its metadata must bind it to a deploy job and target, and —
// when an expected node is recorded — to the enrolling hostname or Proxmox
// node name. The target must exist, be in the enrolling or installing state
// and belong to the bound job.
//
// Ordering matters: the bootstrap token is removed BEFORE the runtime token
// is minted, and the removal result is checked, so two concurrent requests
// with the same token cannot both obtain a runtime token. A consequence is
// that a failure after step 7 leaves the bootstrap token consumed.
//
// Status updates, event persistence and token persistence after the swap are
// best-effort: failures are logged and the agent still receives its runtime
// token.
//
// Responses: 405, 400 (invalid_body, missing_hostname), 401 no_token,
// 403 (invalid_token, binding_mismatch), 404 target_not_found,
// 409 (invalid_target_state, token_already_consumed), 500 token_error, and
// 200 with agentId, runtimeToken, runtimeTokenId and reportInterval.
func (h *DeployHandlers) HandleEnroll(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}
	// 1. Decode request body.
	var req enrollRequest
	if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
		return
	}
	req.Hostname = strings.TrimSpace(req.Hostname)
	if req.Hostname == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_hostname", "hostname is required", nil)
		return
	}
	// 2. Get bootstrap token from context (set by RequireAuth middleware).
	bootstrapToken := getAPITokenRecordFromRequest(r)
	if bootstrapToken == nil {
		writeErrorResponse(w, http.StatusUnauthorized, "no_token", "Bootstrap token required", nil)
		return
	}
	// 3. Validate token binding metadata.
	meta := bootstrapToken.Metadata
	if meta == nil {
		writeErrorResponse(w, http.StatusForbidden, "invalid_token", "Token is not a bootstrap deploy token", nil)
		return
	}
	jobID := meta[deploy.MetaKeyJobID]
	targetID := meta[deploy.MetaKeyTargetID]
	expectedNode := meta[deploy.MetaKeyExpectedNode]
	if jobID == "" || targetID == "" {
		writeErrorResponse(w, http.StatusForbidden, "invalid_token", "Token missing deploy binding", nil)
		return
	}
	// 4. Validate node name binding (if set). Either the hostname or the
	// reported Proxmox node name may satisfy the binding.
	if expectedNode != "" && req.Hostname != expectedNode {
		proxmoxMatch := req.Proxmox != nil && req.Proxmox.NodeName == expectedNode
		if !proxmoxMatch {
			writeErrorResponse(w, http.StatusForbidden, "binding_mismatch",
				fmt.Sprintf("Token bound to node %q, got hostname %q", expectedNode, req.Hostname), nil)
			return
		}
	}
	// 5. Verify deploy target exists and is in correct state.
	ctx := r.Context()
	target, err := h.store.GetTarget(ctx, targetID)
	if err != nil || target == nil {
		if err != nil {
			// The response stays 404 as before, but a store failure must not
			// be indistinguishable from a missing target in the logs.
			log.Error().Err(err).Str("target_id", targetID).Str("job_id", jobID).
				Msg("Failed to load deploy target during enroll")
		}
		writeErrorResponse(w, http.StatusNotFound, "target_not_found", "Deploy target not found", nil)
		return
	}
	if target.Status != deploy.TargetEnrolling && target.Status != deploy.TargetInstalling {
		writeErrorResponse(w, http.StatusConflict, "invalid_target_state",
			fmt.Sprintf("Target is in state %q, expected enrolling or installing", target.Status), nil)
		return
	}
	// 6. Verify target belongs to the job referenced in the token.
	if target.JobID != jobID {
		writeErrorResponse(w, http.StatusForbidden, "binding_mismatch",
			"Token job binding does not match target", nil)
		return
	}
	// 7. Invalidate bootstrap token (single-use) BEFORE minting runtime token.
	// Check return value to prevent concurrent replay.
	config.Mu.Lock()
	removed := h.config.RemoveAPIToken(bootstrapToken.ID)
	tokensAfterRemove := make([]config.APITokenRecord, len(h.config.APITokens))
	copy(tokensAfterRemove, h.config.APITokens)
	config.Mu.Unlock()
	if removed == nil {
		writeErrorResponse(w, http.StatusConflict, "token_already_consumed",
			"Bootstrap token has already been used", nil)
		return
	}
	if h.persistence != nil {
		if err := h.persistence.SaveAPITokens(tokensAfterRemove); err != nil {
			log.Warn().Err(err).Msg("Failed to persist token removal during enroll")
		}
	}
	// 8. Mint runtime token (long-lived, host-bound).
	runtimeRaw, err := auth.GenerateAPIToken()
	if err != nil {
		log.Error().Err(err).Msg("Failed to generate runtime token during enroll")
		writeErrorResponse(w, http.StatusInternalServerError, "token_error", "Failed to generate runtime token", nil)
		return
	}
	runtimeScopes := []string{
		config.ScopeAgentReport, config.ScopeAgentConfigRead, config.ScopeAgentManage,
		config.ScopeDockerReport, config.ScopeKubernetesReport,
	}
	if req.CommandsEnabled {
		runtimeScopes = append(runtimeScopes, config.ScopeAgentExec)
	}
	runtimeRecord, err := config.NewAPITokenRecord(runtimeRaw,
		fmt.Sprintf("agent:%s", req.Hostname),
		runtimeScopes)
	if err != nil {
		log.Error().Err(err).Msg("Failed to create runtime token record during enroll")
		writeErrorResponse(w, http.StatusInternalServerError, "token_error", "Failed to create runtime token", nil)
		return
	}
	runtimeRecord.OrgID = bootstrapToken.OrgID
	runtimeRecord.Metadata = map[string]string{
		"bound_hostname": req.Hostname,
		"deploy_job_id":  jobID,
	}
	config.Mu.Lock()
	h.config.UpsertAPIToken(*runtimeRecord)
	tokensAfterMint := make([]config.APITokenRecord, len(h.config.APITokens))
	copy(tokensAfterMint, h.config.APITokens)
	config.Mu.Unlock()
	if h.persistence != nil {
		if err := h.persistence.SaveAPITokens(tokensAfterMint); err != nil {
			log.Warn().Err(err).Msg("Failed to persist runtime token during enroll")
		}
	}
	// 9. Update target status to VERIFYING. Best-effort: the agent already
	// holds a valid runtime token, so a store failure is logged, not returned.
	if err := h.store.UpdateTargetStatus(ctx, targetID, deploy.TargetVerifying, ""); err != nil {
		log.Warn().Err(err).Str("target_id", targetID).Str("job_id", jobID).
			Msg("Failed to update target status after enroll")
	}
	// 10. Append enroll event.
	enrollEvt := &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		TargetID:  targetID,
		Type:      deploy.EventEnrollComplete,
		Message:   fmt.Sprintf("Agent enrolled from %s", req.Hostname),
		CreatedAt: time.Now().UTC(),
	}
	if err := h.store.AppendEvent(ctx, enrollEvt); err != nil {
		log.Warn().Err(err).Str("target_id", targetID).Str("job_id", jobID).
			Msg("Failed to persist enroll event")
	}
	// 11. Broadcast to SSE clients.
	h.broadcastSSE(jobID, enrollEvt)
	// 12. Return runtime token + config to agent.
	canonicalAgentID := fmt.Sprintf("agent-%s", req.Hostname)
	resp := map[string]any{
		"agentId":        canonicalAgentID,
		"runtimeToken":   runtimeRaw,
		"runtimeTokenId": runtimeRecord.ID,
		"reportInterval": "30s",
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		// The bootstrap token is already consumed at this point, so a failed
		// write means the agent cannot retry with it; make that visible.
		log.Error().Err(err).Str("target_id", targetID).Str("job_id", jobID).
			Msg("Failed to write enroll response")
	}
}
// --- Deploy Jobs ---
// createJobRequest is the JSON body accepted by HandleCreateJob.
type createJobRequest struct {
// SourceAgentID is required and must equal the preflight's source agent.
SourceAgentID string `json:"sourceAgentId"`
// PreflightID is required; the referenced preflight must have succeeded or
// partially succeeded and belong to the same org and cluster.
PreflightID string `json:"preflightId"`
// TargetNodeIDs lists the nodes to install on; entries are trimmed and
// de-duplicated in request order, and only nodes whose preflight target is
// ready are accepted.
TargetNodeIDs []string `json:"targetNodeIds"`
// Mode is not read in the visible part of HandleCreateJob — TODO confirm usage.
Mode string `json:"mode"`
// MaxParallel defaults to 2 when <= 0 and is capped at 10.
MaxParallel int `json:"maxParallel"`
// RetryPolicy is optional; a positive MaxAttempts replaces the default of 3.
RetryPolicy *struct {
MaxAttempts int `json:"maxAttempts"`
} `json:"retryPolicy,omitempty"`
}
// createJobSkip records one requested node that was not accepted into a
// deploy job, together with the reason.
type createJobSkip struct {
// NodeID is the requested node ID that was skipped.
NodeID string `json:"nodeId"`
// Reason is a machine-readable code: "not_in_preflight",
// "preflight_status_<status>" or "skipped_license".
Reason string `json:"reason"`
}
// createJobResponse is the JSON body returned by HandleCreateJob.
// NOTE(review): the code that fills this struct lies beyond the visible part
// of the file; the field notes below follow the names and JSON tags only.
type createJobResponse struct {
// JobID presumably identifies the created deploy job.
JobID string `json:"jobId"`
// AcceptedTargets presumably lists the targets accepted into the job.
AcceptedTargets []string `json:"acceptedTargets"`
// SkippedTargets lists requested nodes that were not accepted, with reasons.
SkippedTargets []createJobSkip `json:"skippedTargets"`
// ReservedLicenseSlots presumably counts license slots reserved for the job.
ReservedLicenseSlots int `json:"reservedLicenseSlots"`
// EventsURL presumably points at the job's event stream.
EventsURL string `json:"eventsUrl"`
}
// HandleCreateJob creates a deploy install job from preflight results.
// POST /api/clusters/{clusterId}/agent-deploy/jobs
func (h *DeployHandlers) HandleCreateJob(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
return
}
clusterID := extractClusterID(r.URL.Path, "/api/clusters/", "/agent-deploy/jobs")
if clusterID == "" {
writeErrorResponse(w, http.StatusBadRequest, "invalid_cluster_id", "Cluster ID is required", nil)
return
}
var req createJobRequest
if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
return
}
req.SourceAgentID = strings.TrimSpace(req.SourceAgentID)
if req.SourceAgentID == "" {
writeErrorResponse(w, http.StatusBadRequest, "missing_source_agent", "sourceAgentId is required", nil)
return
}
req.PreflightID = strings.TrimSpace(req.PreflightID)
if req.PreflightID == "" {
writeErrorResponse(w, http.StatusBadRequest, "missing_preflight_id", "preflightId is required", nil)
return
}
if len(req.TargetNodeIDs) == 0 {
writeErrorResponse(w, http.StatusBadRequest, "missing_targets", "At least one targetNodeIds entry is required", nil)
return
}
// Verify source agent is connected.
if !h.execServer.IsAgentConnected(req.SourceAgentID) {
writeErrorResponse(w, http.StatusConflict, "source_agent_offline", "Source agent is not connected", nil)
return
}
ctx := r.Context()
orgID := resolveTenantOrgID(r)
// Verify preflight exists and belongs to same org.
pfJob, err := h.store.GetJob(ctx, req.PreflightID)
if err != nil {
log.Error().Err(err).Str("preflight_id", req.PreflightID).Msg("Failed to get preflight job")
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight job", nil)
return
}
if pfJob == nil {
writeErrorResponse(w, http.StatusNotFound, "preflight_not_found", "Preflight job not found", nil)
return
}
if pfJob.OrgID != orgID {
writeErrorResponse(w, http.StatusNotFound, "preflight_not_found", "Preflight job not found", nil)
return
}
if pfJob.Status != deploy.JobSucceeded && pfJob.Status != deploy.JobPartialSuccess {
writeErrorResponse(w, http.StatusConflict, "preflight_not_passed",
fmt.Sprintf("Preflight is in state %q, expected succeeded or partial_success", pfJob.Status), nil)
return
}
// Verify cluster and source agent consistency.
if pfJob.ClusterID != clusterID {
writeErrorResponse(w, http.StatusBadRequest, "cluster_mismatch",
"Preflight cluster does not match request cluster", nil)
return
}
if pfJob.SourceAgentID != req.SourceAgentID {
writeErrorResponse(w, http.StatusBadRequest, "source_agent_mismatch",
"Preflight source agent does not match request source agent", nil)
return
}
// Get preflight targets — filter requested nodeIDs against Ready targets.
pfTargets, err := h.store.GetTargetsForJob(ctx, req.PreflightID)
if err != nil {
log.Error().Err(err).Str("preflight_id", req.PreflightID).Msg("Failed to get preflight targets")
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight targets", nil)
return
}
// Build lookup of preflight targets by node ID.
pfTargetByNode := make(map[string]*deploy.Target)
for i := range pfTargets {
pfTargetByNode[pfTargets[i].NodeID] = &pfTargets[i]
}
// Deduplicate requested nodes while preserving request order.
seen := make(map[string]bool, len(req.TargetNodeIDs))
var orderedNodeIDs []string
for _, nid := range req.TargetNodeIDs {
nid = strings.TrimSpace(nid)
if nid != "" && !seen[nid] {
seen[nid] = true
orderedNodeIDs = append(orderedNodeIDs, nid)
}
}
// Filter: only accept targets that passed preflight (Ready state).
// Order is preserved from the request so license truncation is deterministic.
var acceptedPfTargets []*deploy.Target
var skipped []createJobSkip
for _, nodeID := range orderedNodeIDs {
pfTgt, ok := pfTargetByNode[nodeID]
if !ok {
skipped = append(skipped, createJobSkip{NodeID: nodeID, Reason: "not_in_preflight"})
continue
}
if pfTgt.Status != deploy.TargetReady {
skipped = append(skipped, createJobSkip{NodeID: nodeID, Reason: fmt.Sprintf("preflight_status_%s", pfTgt.Status)})
continue
}
acceptedPfTargets = append(acceptedPfTargets, pfTgt)
}
// License slot check.
maxLimit := maxMonitoredSystemsLimitForContext(ctx)
if maxLimit > 0 {
decision := monitoredSystemLimitDecisionForAdditionalSlots(ctx, h.monitor, 0)
if !decision.usageAvailable {
writeMonitoredSystemUsageUnavailable(w, decision.usageUnavailableReason)
return
}
available := decision.limit - decision.current
if available < 0 {
available = 0
}
if available < len(acceptedPfTargets) {
// Accept only what fits; skip the rest.
for i := available; i < len(acceptedPfTargets); i++ {
skipped = append(skipped, createJobSkip{
NodeID: acceptedPfTargets[i].NodeID,
Reason: "skipped_license",
})
}
acceptedPfTargets = acceptedPfTargets[:available]
}
}
if len(acceptedPfTargets) == 0 {
writeErrorResponse(w, http.StatusConflict, "no_eligible_targets",
"No targets are eligible for deployment", nil)
return
}
// Create deploy job.
now := time.Now().UTC()
jobID := generateID("dep")
maxParallel := req.MaxParallel
if maxParallel <= 0 {
maxParallel = 2
}
if maxParallel > 10 {
maxParallel = 10
}
retryMax := 3
if req.RetryPolicy != nil && req.RetryPolicy.MaxAttempts > 0 {
retryMax = req.RetryPolicy.MaxAttempts
}
job := &deploy.Job{
ID: jobID,
ClusterID: clusterID,
ClusterName: pfJob.ClusterName,
SourceAgentID: req.SourceAgentID,
SourceNodeID: pfJob.SourceNodeID,
OrgID: orgID,
Status: deploy.JobQueued,
MaxParallel: maxParallel,
RetryMax: retryMax,
CreatedAt: now,
UpdatedAt: now,
}
if err := h.store.CreateJob(ctx, job); err != nil {
log.Error().Err(err).Str("job_id", jobID).Msg("Failed to create deploy job")
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to create deploy job", nil)
return
}
// Resolve Pulse URL for install commands.
pulseURL := h.resolvePublicURL(r)
if pulseURL == "" {
_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
writeErrorResponse(w, http.StatusInternalServerError, "no_pulse_url",
"Cannot determine Pulse URL for agent installation", nil)
return
}
// Create targets and mint bootstrap tokens.
var installTargets []agentexec.DeployInstallTarget
var acceptedNodeIDs []string
for _, pfTgt := range acceptedPfTargets {
targetID := generateID("tgt")
arch := h.getTargetArchFromPreflight(ctx, req.PreflightID, pfTgt.NodeID)
target := &deploy.Target{
ID: targetID,
JobID: jobID,
NodeID: pfTgt.NodeID,
NodeName: pfTgt.NodeName,
NodeIP: pfTgt.NodeIP,
Arch: arch,
Status: deploy.TargetPending,
CreatedAt: now,
UpdatedAt: now,
}
if err := h.store.CreateTarget(ctx, target); err != nil {
log.Error().Err(err).Str("target_id", targetID).Msg("Failed to create deploy target")
continue
}
// Mint bootstrap token for this target.
rawToken, _, err := h.MintBootstrapTokenForTarget(deploy.BootstrapTokenRequest{
ClusterID: clusterID,
NodeID: pfTgt.NodeID,
ExpectedNode: pfTgt.NodeName,
JobID: jobID,
TargetID: targetID,
SourceAgentID: req.SourceAgentID,
OrgID: orgID,
TTL: 30 * time.Minute,
})
if err != nil {
log.Error().Err(err).Str("target_id", targetID).Msg("Failed to mint bootstrap token")
_ = h.store.UpdateTargetStatus(ctx, targetID, deploy.TargetFailedPermanent, "failed to mint bootstrap token")
continue
}
installTargets = append(installTargets, agentexec.DeployInstallTarget{
TargetID: targetID,
NodeName: pfTgt.NodeName,
NodeIP: pfTgt.NodeIP,
Arch: arch,
BootstrapToken: rawToken,
})
acceptedNodeIDs = append(acceptedNodeIDs, pfTgt.NodeID)
}
if len(installTargets) == 0 {
_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
writeErrorResponse(w, http.StatusInternalServerError, "target_setup_failed",
"Failed to set up any deployment targets", nil)
return
}
// Reserve license slots based on actual dispatched target count.
if err := h.reservation.Reserve(jobID, orgID, len(installTargets), 1*time.Hour); err != nil {
log.Error().Err(err).Str("job_id", jobID).Msg("Failed to reserve license slots")
// Non-fatal — continue. The reservation is for proactive slot tracking.
}
// Transition to running.
_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobRunning)
// Append job-created event.
_ = h.store.AppendEvent(ctx, &deploy.Event{
ID: generateID("evt"),
JobID: jobID,
Type: deploy.EventJobCreated,
Message: fmt.Sprintf("Deploy started for %d targets", len(installTargets)),
CreatedAt: now,
})
requestID := generateID("req")
payload := agentexec.DeployInstallPayload{
RequestID: requestID,
JobID: jobID,
Targets: installTargets,
PulseURL: pulseURL,
MaxParallel: maxParallel,
Timeout: 300,
}
// Subscribe to progress before sending command.
progressCh := h.execServer.SubscribeDeployProgress(req.SourceAgentID, jobID, 64)
// Send install command to agent.
if err := h.execServer.SendDeployInstall(ctx, req.SourceAgentID, payload); err != nil {
h.execServer.UnsubscribeDeployProgress(req.SourceAgentID, jobID)
_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
h.reservation.Release(jobID)
// Mark pending targets as failed so they're eligible for retry.
for _, it := range installTargets {
_ = h.store.UpdateTargetStatus(ctx, it.TargetID, deploy.TargetFailedRetryable, "dispatch failed")
}
log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send install command")
writeErrorResponse(w, http.StatusInternalServerError, "send_failed",
"Failed to send install command to agent", nil)
return
}
// Start background goroutine to process progress events.
go h.processInstallProgress(jobID, req.SourceAgentID, job.RetryMax, progressCh)
resp := createJobResponse{
JobID: jobID,
AcceptedTargets: acceptedNodeIDs,
SkippedTargets: skipped,
ReservedLicenseSlots: len(installTargets),
EventsURL: fmt.Sprintf("/api/agent-deploy/jobs/%s/events", jobID),
}
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusAccepted)
json.NewEncoder(w).Encode(resp)
}
// HandleGetJob returns the current status of a deploy job together with its targets.
// GET /api/agent-deploy/jobs/{jobId}
//
// The response body is the job (embedded, so its JSON fields sit next to
// "targets") plus the list of targets loaded for that job. A job that belongs
// to a different org than the caller is reported as not found.
func (h *DeployHandlers) HandleGetJob(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// extractPathSuffix keeps only the first path segment after the prefix, so
	// any nested suffix such as "/events" has already been dropped here.
	jobID := extractPathSuffix(r.URL.Path, "/api/agent-deploy/jobs/")
	if jobID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
		return
	}

	ctx := r.Context()
	job, err := h.store.GetJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	// Answer with 404 (not 403) on an org mismatch so job IDs are not leaked
	// across tenants.
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	targets, err := h.store.GetTargetsForJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get job targets")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get targets", nil)
		return
	}

	resp := struct {
		*deploy.Job
		Targets []deploy.Target `json:"targets"`
	}{
		Job:     job,
		Targets: targets,
	}
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		// The status line is already sent; all we can do is record the failure.
		log.Warn().Err(err).Str("id", jobID).Msg("Failed to write deploy job response")
	}
}
// HandleJobEvents streams SSE events for a deploy job.
// GET /api/agent-deploy/jobs/{jobId}/events
//
// The handler first replays the events already stored for the job, then either
// emits a final "job_complete" frame (terminal job) or keeps the connection
// open, forwarding live events and a comment heartbeat every 15 seconds until
// the client disconnects or the subscription channel is closed.
func (h *DeployHandlers) HandleJobEvents(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Extract job ID: /api/agent-deploy/jobs/{id}/events
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/jobs/")
	jobID := strings.TrimSuffix(path, "/events")
	if jobID == "" || jobID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
		return
	}

	ctx := r.Context()
	job, err := h.store.GetJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job for SSE")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	flusher, ok := w.(http.Flusher)
	if !ok {
		writeErrorResponse(w, http.StatusInternalServerError, "streaming_unsupported", "Streaming not supported", nil)
		return
	}

	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	clientID := generateID("sse")
	eventCh := h.addSSEClient(jobID, clientID)
	defer h.removeSSEClient(jobID, clientID)

	// Re-read the job now that we are subscribed. The status loaded above may
	// be stale: if the job finished between that read and the subscription,
	// the progress processor has already closed the job's SSE channels and
	// nothing would ever end this stream. Reading the status before the replay
	// also guarantees that a terminal status implies the replay below sees the
	// complete history.
	// NOTE(review): assumes addSSEClient registers a fresh subscription when
	// none exists — confirm against its implementation.
	if latest, err := h.store.GetJob(ctx, jobID); err != nil {
		log.Warn().Err(err).Str("id", jobID).Msg("Failed to refresh deploy job status for SSE")
	} else if latest != nil {
		job = latest
	}

	// Replay existing events.
	events, replayErr := h.store.GetEventsForJob(ctx, jobID)
	if replayErr != nil {
		log.Error().Err(replayErr).Str("id", jobID).Msg("Failed to load events for SSE replay")
		fmt.Fprintf(w, "event: error\ndata: {\"message\":\"failed to load event history\"}\n\n")
	}
	for _, evt := range events {
		data, err := json.Marshal(evt)
		if err != nil {
			// Skip the event instead of emitting an empty "data:" frame.
			log.Warn().Err(err).Str("id", jobID).Msg("Failed to encode event for SSE replay")
			continue
		}
		fmt.Fprintf(w, "data: %s\n\n", data)
	}
	flusher.Flush()

	// If job is terminal, send final and close.
	if isDeployJobTerminal(job.Status) {
		// Marshaling a map[string]string cannot fail, so the error is ignored.
		data, _ := json.Marshal(map[string]string{
			"type":   "job_complete",
			"status": string(job.Status),
		})
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
		return
	}

	// Stream new events.
	heartbeat := time.NewTicker(15 * time.Second)
	defer heartbeat.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case eventData, ok := <-eventCh:
			if !ok {
				// Subscription closed by the producer: the job is done.
				return
			}
			fmt.Fprintf(w, "data: %s\n\n", eventData)
			flusher.Flush()
		case <-heartbeat.C:
			// SSE comment line; keeps intermediaries from timing out the stream.
			fmt.Fprint(w, ": heartbeat\n\n")
			flusher.Flush()
		}
	}
}
// HandleCancelJob cancels a running deploy job.
// POST /api/agent-deploy/jobs/{jobId}/cancel
//
// Only jobs in the running state can be canceled. The handler marks the job
// as canceling, asks the source agent to cancel, records a "Cancel requested"
// event and responds with the job in its canceling state. Every step after the
// state check is best-effort: failures are logged but do not fail the request.
func (h *DeployHandlers) HandleCancelJob(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Extract job ID: /api/agent-deploy/jobs/{id}/cancel
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/jobs/")
	jobID := strings.TrimSuffix(path, "/cancel")
	if jobID == "" || jobID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
		return
	}

	ctx := r.Context()
	job, err := h.store.GetJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job for cancel")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}
	if job.Status != deploy.JobRunning {
		writeErrorResponse(w, http.StatusConflict, "not_running",
			fmt.Sprintf("Job is in state %q, only running jobs can be canceled", job.Status), nil)
		return
	}

	// Transition to canceling. A persistence failure is logged rather than
	// returned so the cancel command below is still delivered to the agent.
	if err := h.store.UpdateJobStatus(ctx, jobID, deploy.JobCanceling); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to mark deploy job as canceling")
	}

	// Send cancel to source agent.
	cancelPayload := agentexec.DeployCancelPayload{
		RequestID: generateID("req"),
		JobID:     jobID,
	}
	if err := h.execServer.SendDeployCancel(ctx, job.SourceAgentID, cancelPayload); err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send cancel command")
		// Don't fail the request — the agent may have already disconnected.
		// processInstallProgress will handle the channel close.
	}

	// Append cancel event.
	cancelEvt := &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventJobStatusChanged,
		Message:   "Cancel requested",
		CreatedAt: time.Now().UTC(),
	}
	if err := h.store.AppendEvent(ctx, cancelEvt); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to persist cancel event")
	}

	// Return current state. The local copy is updated instead of re-reading
	// the store.
	job.Status = deploy.JobCanceling
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(job); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to write cancel response")
	}
}
// retryJobRequest is the optional JSON body accepted by HandleRetryJob.
type retryJobRequest struct {
// TargetIDs restricts the retry to the listed deploy target IDs. When it is
// empty or omitted, HandleRetryJob considers every eligible failed target.
TargetIDs []string `json:"targetIds,omitempty"`
}
// HandleRetryJob retries failed targets in a terminal deploy job.
// POST /api/agent-deploy/jobs/{jobId}/retry
//
// The optional JSON body ({"targetIds": [...]}) narrows the retry to specific
// targets; a missing or empty body retries every eligible target. A target is
// eligible when its status is failed_retryable or failed_permanent and it has
// used fewer than job.RetryMax attempts. When a license limit applies, only as
// many targets as there are free slots are retried.
//
// All preconditions (job state, agent connectivity, eligible targets, license
// slots, Pulse URL) are checked before any persisted state is changed, so a
// rejected request leaves the job and its targets exactly as they were.
// On success the handler responds 202 with the number of dispatched targets
// and the events URL; progress is processed in a background goroutine.
func (h *DeployHandlers) HandleRetryJob(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Extract job ID: /api/agent-deploy/jobs/{id}/retry
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/jobs/")
	jobID := strings.TrimSuffix(path, "/retry")
	if jobID == "" || jobID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
		return
	}

	var req retryJobRequest
	if r.Body != nil && r.ContentLength != 0 {
		decodeErr := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req)
		// EOF means empty body (e.g. chunked with no data) — treat as "retry all".
		if decodeErr != nil && !errors.Is(decodeErr, io.EOF) {
			writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
			return
		}
	}

	ctx := r.Context()
	job, err := h.store.GetJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job for retry")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}
	if !isDeployJobTerminal(job.Status) {
		writeErrorResponse(w, http.StatusConflict, "not_terminal",
			fmt.Sprintf("Job is in state %q, only terminal jobs can be retried", job.Status), nil)
		return
	}

	// Verify source agent is connected.
	if !h.execServer.IsAgentConnected(job.SourceAgentID) {
		writeErrorResponse(w, http.StatusConflict, "source_agent_offline", "Source agent is not connected", nil)
		return
	}

	// Load targets.
	targets, err := h.store.GetTargetsForJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get job targets for retry")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get targets", nil)
		return
	}

	// Filter to retryable failed targets.
	requestedIDs := make(map[string]bool, len(req.TargetIDs))
	for _, id := range req.TargetIDs {
		requestedIDs[strings.TrimSpace(id)] = true
	}
	var retryTargets []deploy.Target
	for _, t := range targets {
		if t.Status != deploy.TargetFailedRetryable && t.Status != deploy.TargetFailedPermanent {
			continue
		}
		if t.Attempts >= job.RetryMax {
			continue
		}
		if len(requestedIDs) > 0 && !requestedIDs[t.ID] {
			continue
		}
		retryTargets = append(retryTargets, t)
	}
	if len(retryTargets) == 0 {
		writeErrorResponse(w, http.StatusConflict, "nothing_to_retry",
			"No eligible targets to retry (all succeeded or exceeded max attempts)", nil)
		return
	}

	// License slot re-check.
	maxLimit := maxMonitoredSystemsLimitForContext(ctx)
	if maxLimit > 0 {
		decision := monitoredSystemLimitDecisionForAdditionalSlots(ctx, h.monitor, 0)
		if !decision.usageAvailable {
			writeMonitoredSystemUsageUnavailable(w, decision.usageUnavailableReason)
			return
		}
		available := decision.limit - decision.current
		if available < 0 {
			available = 0
		}
		if available < len(retryTargets) {
			retryTargets = retryTargets[:available]
		}
		if len(retryTargets) == 0 {
			writeErrorResponse(w, http.StatusConflict, "license_limit",
				"No license slots available for retry", nil)
			return
		}
	}

	// Resolve the Pulse URL before mutating anything. Without it no install
	// can be dispatched, and failing here (rather than after the reset below)
	// keeps the job's terminal status and the targets' failure states intact.
	pulseURL := h.resolvePublicURL(r)
	if pulseURL == "" {
		writeErrorResponse(w, http.StatusInternalServerError, "no_pulse_url",
			"Cannot determine Pulse URL for agent installation", nil)
		return
	}

	// Reset targets to pending.
	retryIDs := make([]string, len(retryTargets))
	for i, t := range retryTargets {
		retryIDs[i] = t.ID
	}
	resetCount, err := h.store.ResetTargetsForRetry(ctx, retryIDs)
	if err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to reset targets for retry")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to reset targets", nil)
		return
	}
	if resetCount == 0 {
		writeErrorResponse(w, http.StatusConflict, "nothing_to_retry",
			"No targets were in a retryable state", nil)
		return
	}

	// Transition job back to running (best-effort, as in the initial dispatch).
	_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobRunning)

	// Mint fresh bootstrap tokens and build install targets.
	installTargets := make([]agentexec.DeployInstallTarget, 0, len(retryTargets))
	for _, t := range retryTargets {
		rawToken, _, err := h.MintBootstrapTokenForTarget(deploy.BootstrapTokenRequest{
			ClusterID:     job.ClusterID,
			NodeID:        t.NodeID,
			ExpectedNode:  t.NodeName,
			JobID:         jobID,
			TargetID:      t.ID,
			SourceAgentID: job.SourceAgentID,
			OrgID:         orgID,
			TTL:           30 * time.Minute,
		})
		if err != nil {
			log.Error().Err(err).Str("target_id", t.ID).Msg("Failed to mint retry bootstrap token")
			_ = h.store.UpdateTargetStatus(ctx, t.ID, deploy.TargetFailedPermanent, "failed to mint bootstrap token for retry")
			continue
		}
		// Targets without a recorded architecture fall back to amd64.
		arch := t.Arch
		if arch == "" {
			arch = "amd64"
		}
		installTargets = append(installTargets, agentexec.DeployInstallTarget{
			TargetID:       t.ID,
			NodeName:       t.NodeName,
			NodeIP:         t.NodeIP,
			Arch:           arch,
			BootstrapToken: rawToken,
		})
	}
	if len(installTargets) == 0 {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		writeErrorResponse(w, http.StatusInternalServerError, "retry_setup_failed",
			"Failed to set up any retry targets", nil)
		return
	}

	// Reserve license slots based on actual dispatch count (after token minting).
	// Non-fatal: a failed reservation is logged and the retry proceeds.
	if err := h.reservation.Reserve(jobID+"-retry", orgID, len(installTargets), 1*time.Hour); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to reserve license slots for retry")
	}

	// Append retry event.
	_ = h.store.AppendEvent(ctx, &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventJobStatusChanged,
		Message:   fmt.Sprintf("Retry started for %d targets", len(installTargets)),
		CreatedAt: time.Now().UTC(),
	})

	payload := agentexec.DeployInstallPayload{
		RequestID:   generateID("req"),
		JobID:       jobID,
		Targets:     installTargets,
		PulseURL:    pulseURL,
		MaxParallel: job.MaxParallel,
		Timeout:     300,
	}

	// Subscribe before sending so no early progress event is missed.
	progressCh := h.execServer.SubscribeDeployProgress(job.SourceAgentID, jobID, 64)
	if err := h.execServer.SendDeployInstall(ctx, job.SourceAgentID, payload); err != nil {
		h.execServer.UnsubscribeDeployProgress(job.SourceAgentID, jobID)
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		h.reservation.Release(jobID + "-retry")
		// Mark retried targets back to failed so they can be retried again.
		for _, it := range installTargets {
			_ = h.store.UpdateTargetStatus(ctx, it.TargetID, deploy.TargetFailedRetryable, "dispatch failed")
		}
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send retry install command")
		writeErrorResponse(w, http.StatusInternalServerError, "send_failed",
			"Failed to send retry command to agent", nil)
		return
	}

	go h.processInstallProgress(jobID, job.SourceAgentID, job.RetryMax, progressCh)

	resp := map[string]any{
		"jobId":        jobID,
		"retryTargets": len(installTargets),
		"status":       "running",
		"eventsUrl":    fmt.Sprintf("/api/agent-deploy/jobs/%s/events", jobID),
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to write retry response")
	}
}
// processInstallProgress reads install progress events from the agent and
// persists them as deploy events, also broadcasting to SSE clients.
//
// It runs until either a progress payload with Final set arrives (the job
// status is then derived from the target statuses) or ch is closed without a
// final payload (the job is marked failed). In both cases the license
// reservations for the job are released, the job's SSE subscription is closed,
// and the progress subscription is removed on return. It uses a background
// context because it outlives the HTTP request that started it.
func (h *DeployHandlers) processInstallProgress(jobID, agentID string, retryMax int, ch <-chan agentexec.DeployProgressPayload) {
	defer h.execServer.UnsubscribeDeployProgress(agentID, jobID)
	ctx := context.Background()

	// setJobStatus persists the job status; a failure is logged because it
	// would otherwise leave the job stuck in a non-terminal state unnoticed.
	setJobStatus := func(status deploy.JobStatus) {
		if err := h.store.UpdateJobStatus(ctx, jobID, status); err != nil {
			log.Error().Err(err).
				Str("job_id", jobID).
				Str("status", string(status)).
				Msg("Failed to persist deploy job status")
		}
	}
	// publish persists an event and then fans it out to SSE clients.
	publish := func(evt *deploy.Event) {
		if err := h.store.AppendEvent(ctx, evt); err != nil {
			log.Error().Err(err).Str("job_id", jobID).Msg("Failed to persist deploy job event")
		}
		h.broadcastSSE(jobID, evt)
	}
	// finish releases both reservations the job may hold (initial dispatch
	// and retry) and closes the SSE channels for the job.
	finish := func() {
		h.reservation.Release(jobID)
		h.reservation.Release(jobID + "-retry")
		h.closeSSESub(jobID)
	}

	for progress := range ch {
		// Persist as event.
		evt := &deploy.Event{
			ID:        generateID("evt"),
			JobID:     jobID,
			TargetID:  progress.TargetID,
			Type:      deploy.EventInstallOutput,
			Message:   progress.Message,
			Data:      progress.Data,
			CreatedAt: time.Now().UTC(),
		}
		if err := h.store.AppendEvent(ctx, evt); err != nil {
			log.Error().Err(err).Str("job_id", jobID).Msg("Failed to persist install event")
		}

		// Update target status based on install phase.
		if progress.TargetID != "" {
			h.updateTargetFromInstallProgress(ctx, progress, retryMax)
		}

		// Broadcast to SSE clients.
		h.broadcastSSE(jobID, evt)

		if !progress.Final {
			continue
		}

		// Derive final job status from target statuses.
		// Use install-specific derivation that treats failed_retryable as terminal.
		targets, err := h.store.GetTargetsForJob(ctx, jobID)
		if err != nil {
			log.Error().Err(err).Str("job_id", jobID).Msg("Failed to get targets for job status derivation")
			setJobStatus(deploy.JobFailed)
		} else {
			finalStatus := deriveInstallJobStatus(targets)
			setJobStatus(finalStatus)
			// Record and broadcast the final status.
			publish(&deploy.Event{
				ID:        generateID("evt"),
				JobID:     jobID,
				Type:      deploy.EventJobStatusChanged,
				Message:   fmt.Sprintf("Deploy completed: %s", finalStatus),
				CreatedAt: time.Now().UTC(),
			})
		}
		finish()
		return
	}

	// Channel closed without final — agent disconnected.
	// NOTE(review): targets still pending/installing are left untouched here,
	// while HandleRetryJob only retries failed targets — confirm whether they
	// should be marked failed_retryable on disconnect.
	setJobStatus(deploy.JobFailed)
	publish(&deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventError,
		Message:   "Source agent disconnected during install",
		CreatedAt: time.Now().UTC(),
	})
	finish()
}
// updateTargetFromInstallProgress translates one install progress payload into
// a target status transition, if the (phase, status) pair calls for one.
//
// Transitions:
//   - install transfer started    -> installing
//   - install execute failed      -> failed_permanent when this failure reaches
//     retryMax attempts, otherwise failed_retryable; attempts incremented
//   - enroll wait started         -> enrolling
//   - enroll wait failed          -> failed_retryable; attempts incremented
//   - install complete failed     -> failed_retryable; attempts incremented
//   - canceled (any status)       -> canceled
//
// Every other combination, including install complete OK, leaves the target
// untouched.
func (h *DeployHandlers) updateTargetFromInstallProgress(ctx context.Context, p agentexec.DeployProgressPayload, retryMax int) {
	var (
		next   deploy.TargetStatus
		detail string
	)
	started := p.Status == agentexec.DeployStepStarted
	failed := p.Status == agentexec.DeployStepFailed

	switch p.Phase {
	case agentexec.DeployPhaseInstallTransfer:
		if !started {
			return
		}
		next = deploy.TargetInstalling

	case agentexec.DeployPhaseInstallExecute:
		if !failed {
			return
		}
		// Retryable unless the stored attempt count shows this failure uses up
		// the last allowed attempt. A failed lookup keeps the target retryable.
		next = deploy.TargetFailedRetryable
		target, err := h.store.GetTarget(ctx, p.TargetID)
		if err == nil && target != nil && target.Attempts+1 >= retryMax {
			next = deploy.TargetFailedPermanent
		}
		detail = p.Message
		_ = h.store.IncrementTargetAttempts(ctx, p.TargetID)

	case agentexec.DeployPhaseInstallEnrollWait:
		switch {
		case started:
			next = deploy.TargetEnrolling
		case failed:
			next = deploy.TargetFailedRetryable
			detail = p.Message
			_ = h.store.IncrementTargetAttempts(ctx, p.TargetID)
		default:
			return
		}

	case agentexec.DeployPhaseInstallComplete:
		// On OK the target stays in 'enrolling' until the enrollment endpoint
		// moves it to 'succeeded'; the source agent fires that event
		// immediately without waiting for enrollment. Only a failure is
		// recorded here.
		if !failed {
			return
		}
		next = deploy.TargetFailedRetryable
		detail = p.Message
		_ = h.store.IncrementTargetAttempts(ctx, p.TargetID)

	case agentexec.DeployPhaseCanceled:
		next = deploy.TargetCanceled

	default:
		// Intermediate step, no status change.
		return
	}

	err := h.store.UpdateTargetStatus(ctx, p.TargetID, next, detail)
	if err == nil {
		return
	}
	log.Error().Err(err).
		Str("target_id", p.TargetID).
		Str("new_status", string(next)).
		Msg("Failed to update target status from install progress")
}
// getTargetArchFromPreflight returns the CPU architecture recorded for nodeID
// during the given preflight job, falling back to "amd64" when none is known.
//
// Lookup order:
//  1. the Arch stored on the preflight target for the node;
//  2. the first preflight-result event for that target whose data decodes to
//     a non-empty arch.
//
// Events are only loaded when the target itself carries no arch, so an
// unavailable event log does not hide an arch already stored on the target.
// Store errors are not reported; they yield the default.
func (h *DeployHandlers) getTargetArchFromPreflight(ctx context.Context, preflightJobID string, nodeID string) string {
	const defaultArch = "amd64"

	// Get preflight targets to map the node ID to its target.
	pfTargets, err := h.store.GetTargetsForJob(ctx, preflightJobID)
	if err != nil {
		return defaultArch
	}
	targetIDForNode := ""
	for i := range pfTargets {
		if pfTargets[i].NodeID != nodeID {
			continue
		}
		// Prefer the arch stored on the target directly.
		if pfTargets[i].Arch != "" {
			return pfTargets[i].Arch
		}
		targetIDForNode = pfTargets[i].ID
		break
	}
	if targetIDForNode == "" {
		return defaultArch
	}

	// Fall back to the preflight result events for that target.
	events, err := h.store.GetEventsForJob(ctx, preflightJobID)
	if err != nil {
		return defaultArch
	}
	for _, evt := range events {
		if evt.TargetID != targetIDForNode {
			continue
		}
		if evt.Type != deploy.EventPreflightResult || evt.Data == "" {
			continue
		}
		var result agentexec.PreflightResultData
		if err := json.Unmarshal([]byte(evt.Data), &result); err == nil && result.Arch != "" {
			return result.Arch
		}
	}
	return defaultArch
}
// --- Helpers ---
// extractClusterID pulls the cluster ID out of a path shaped like
// /api/clusters/{id}/agent-deploy/...
//
// prefix is stripped from the front of path (when present); the ID is then
// everything up to the first occurrence of suffix. If suffix does not occur,
// the ID ends at the first "/" instead, or spans the whole remainder when
// there is no "/" either. Surrounding whitespace is trimmed from the result.
func extractClusterID(path, prefix, suffix string) string {
	rest := strings.TrimPrefix(path, prefix)
	id, _, found := strings.Cut(rest, suffix)
	if !found {
		// No suffix: fall back to the first path segment. When there is no
		// "/" at all, Cut hands back the entire remainder.
		id, _, _ = strings.Cut(rest, "/")
	}
	return strings.TrimSpace(id)
}
// extractPathSuffix returns the first path segment that follows prefix,
// e.g. path "/api/foo/bar" with prefix "/api/foo/" yields "bar".
//
// Anything from the next "/" onward (trailing slashes, nested paths) is
// discarded and the result is whitespace-trimmed. If path does not start with
// prefix it is processed unchanged.
func extractPathSuffix(path, prefix string) string {
	remainder := strings.TrimPrefix(path, prefix)
	// Cut returns the whole remainder when it contains no "/".
	segment, _, _ := strings.Cut(remainder, "/")
	return strings.TrimSpace(segment)
}
// nodeIP returns the host portion (hostname or IP, without port) of a node
// host URL, e.g. "https://198.51.100.2:8006" -> "198.51.100.2".
//
// It returns "" when the input is blank, cannot be parsed as a URL, or has no
// host component (for instance a bare address without a scheme).
func nodeIP(hostURL string) string {
	trimmed := strings.TrimSpace(hostURL)
	if trimmed == "" {
		return ""
	}
	u, err := url.Parse(trimmed)
	if err != nil {
		return ""
	}
	if u.Host == "" {
		return ""
	}
	// Hostname drops the port and any IPv6 brackets.
	return u.Hostname()
}
// nodeName returns the display name for a node view: the whitespace-trimmed
// NodeName() when that is non-empty, otherwise the trimmed Name(). A nil node
// yields "".
func nodeName(node *unifiedresources.NodeView) string {
	if node == nil {
		return ""
	}
	preferred := strings.TrimSpace(node.NodeName())
	if preferred == "" {
		return strings.TrimSpace(node.Name())
	}
	return preferred
}
// generateIDState remembers the last timestamp handed out by generateID so
// that IDs stay unique even when the clock does not advance between calls.
var generateIDState struct {
	mu   sync.Mutex
	last int64
}

// generateID creates a prefixed unique ID of the form "<prefix>_<n>", where n
// is the current Unix time in nanoseconds.
//
// A timestamp alone is not unique: two calls within the same clock tick (coarse
// clocks, concurrent goroutines) would collide. n is therefore bumped to one
// past the previously issued value whenever the clock has not moved forward,
// which makes IDs unique within this process regardless of prefix.
func generateID(prefix string) string {
	now := time.Now().UnixNano()

	generateIDState.mu.Lock()
	if now <= generateIDState.last {
		now = generateIDState.last + 1
	}
	generateIDState.last = now
	generateIDState.mu.Unlock()

	return fmt.Sprintf("%s_%d", prefix, now)
}
// isDeployJobTerminal reports whether s is one of the terminal job statuses:
// succeeded, partial success, failed, or canceled.
func isDeployJobTerminal(s deploy.JobStatus) bool {
	return s == deploy.JobSucceeded ||
		s == deploy.JobPartialSuccess ||
		s == deploy.JobFailed ||
		s == deploy.JobCanceled
}
// deriveInstallJobStatus computes the final install job status from target statuses.
// Unlike DeriveStatus, this treats TargetFailedRetryable as terminal (the agent
// has finished its work and signaled Final=true, so all targets are settled).
// TargetEnrolling is treated as succeeded because the source agent fires
// install_complete/ok immediately without waiting for async enrollment.
//
// Result:
//   - no targets, or every target succeeded/verifying/enrolling -> JobSucceeded
//   - at least one such target, but not all                     -> JobPartialSuccess
//   - none                                                      -> JobFailed
//
// Every other status (failed, skipped, canceled, and pending/installing, which
// should not occur at Final) counts as not succeeded.
func deriveInstallJobStatus(targets []deploy.Target) deploy.JobStatus {
	total := len(targets)
	if total == 0 {
		return deploy.JobSucceeded
	}

	// Only successes need counting; the outcome depends on nothing else.
	succeeded := 0
	for i := range targets {
		switch targets[i].Status {
		case deploy.TargetSucceeded, deploy.TargetVerifying, deploy.TargetEnrolling:
			// enrolling = install completed, enrollment is async and expected to succeed
			succeeded++
		}
	}

	switch {
	case succeeded == total:
		return deploy.JobSucceeded
	case succeeded > 0:
		return deploy.JobPartialSuccess
	default:
		return deploy.JobFailed
	}
}
// derivePreflightJobStatus loads the targets of a preflight job and computes
// the job's final status from them.
//
// Unlike DeriveStatus, a target in TargetReady counts as passed, and
// TargetSkippedAgent counts as a pass rather than a failure. The result is
// JobSucceeded when there are no targets or all passed, JobPartialSuccess when
// only some passed, and JobFailed otherwise — including when the targets
// cannot be loaded, which is also logged.
func derivePreflightJobStatus(ctx context.Context, store *deploy.Store, jobID string) deploy.JobStatus {
	targets, err := store.GetTargetsForJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to get targets for job status derivation")
		return deploy.JobFailed
	}

	total := len(targets)
	if total == 0 {
		return deploy.JobSucceeded
	}

	passed, failed := 0, 0
	for i := range targets {
		switch targets[i].Status {
		case deploy.TargetReady, deploy.TargetSucceeded, deploy.TargetSkippedAgent:
			passed++
		case deploy.TargetFailedPermanent, deploy.TargetFailedRetryable:
			failed++
		}
	}

	switch {
	case passed == total:
		return deploy.JobSucceeded
	case failed == total:
		return deploy.JobFailed
	case passed > 0:
		return deploy.JobPartialSuccess
	default:
		return deploy.JobFailed
	}
}