mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-06 16:16:26 +00:00
2049 lines
63 KiB
Go
2049 lines
63 KiB
Go
package api
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/agentexec"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/config"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/deploy"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
|
|
unifiedresources "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/auth"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// DeployHandlers provides HTTP handlers for cluster agent deployment.
type DeployHandlers struct {
	store       *deploy.Store              // persistence for deploy jobs/targets/events
	monitor     *monitoring.Monitor        // source of the unified resource read-state
	execServer  *agentexec.Server          // connection hub for deployed agents (command dispatch + progress)
	reservation *deploy.ReservationManager // license-slot reservations for deploy jobs

	// resolvePublicURL derives the Pulse URL for agent reachability checks.
	resolvePublicURL func(req *http.Request) string

	// config and persistence for token minting/validation in enroll flow.
	config      *config.Config
	persistence *config.ConfigPersistence

	// Active preflight SSE subscriptions keyed by preflightID.
	// sseMu guards sseSubs; each sub has its own lock for its clients map.
	sseMu   sync.Mutex
	sseSubs map[string]*deploySSESub
}
|
|
|
|
// deploySSESub tracks SSE clients for a single preflight job.
// mu guards clients; channels are closed by removeSSEClient/closeSSESub.
type deploySSESub struct {
	clients map[string]chan []byte // clientID -> event channel
	mu      sync.Mutex
}
|
|
|
|
// NewDeployHandlers creates a DeployHandlers instance.
|
|
func NewDeployHandlers(
|
|
store *deploy.Store,
|
|
monitor *monitoring.Monitor,
|
|
execServer *agentexec.Server,
|
|
reservation *deploy.ReservationManager,
|
|
resolvePublicURL func(req *http.Request) string,
|
|
cfg *config.Config,
|
|
persistence *config.ConfigPersistence,
|
|
) *DeployHandlers {
|
|
return &DeployHandlers{
|
|
store: store,
|
|
monitor: monitor,
|
|
execServer: execServer,
|
|
reservation: reservation,
|
|
resolvePublicURL: resolvePublicURL,
|
|
config: cfg,
|
|
persistence: persistence,
|
|
sseSubs: make(map[string]*deploySSESub),
|
|
}
|
|
}
|
|
|
|
// --- Candidates ---
|
|
|
|
// candidateNode is the per-node response in the candidates list.
type candidateNode struct {
	NodeID     string `json:"nodeId"`
	Name       string `json:"name"`
	IP         string `json:"ip,omitempty"`
	HasAgent   bool   `json:"hasAgent"`   // true when the node already has a linked agent
	Deployable bool   `json:"deployable"` // true when an agent can be deployed to this node
	Reason     string `json:"reason,omitempty"` // why the node is not deployable (e.g. "already_agent")
}
|
|
|
|
// sourceAgentInfo describes a connected agent that can execute SSH to peers.
type sourceAgentInfo struct {
	AgentID string `json:"agentId"`
	NodeID  string `json:"nodeId"`
	Online  bool   `json:"online"` // always true in responses: only connected agents are listed
}
|
|
|
|
// candidatesResponse is the payload returned by HandleCandidates.
type candidatesResponse struct {
	ClusterID    string            `json:"clusterId"`
	ClusterName  string            `json:"clusterName"`
	SourceAgents []sourceAgentInfo `json:"sourceAgents"`
	Nodes        []candidateNode   `json:"nodes"`
}
|
|
|
|
// HandleCandidates returns deployment candidate nodes for a cluster.
|
|
// GET /api/clusters/{clusterId}/agent-deploy/candidates
|
|
func (h *DeployHandlers) HandleCandidates(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodGet {
|
|
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
clusterID := extractClusterID(r.URL.Path, "/api/clusters/", "/agent-deploy/candidates")
|
|
if clusterID == "" {
|
|
writeErrorResponse(w, http.StatusBadRequest, "invalid_cluster_id", "Cluster ID is required", nil)
|
|
return
|
|
}
|
|
|
|
readState := h.monitor.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
writeErrorResponse(w, http.StatusServiceUnavailable, "state_unavailable", "Resource state is unavailable", nil)
|
|
return
|
|
}
|
|
|
|
// Build connected agents set
|
|
connectedAgents := make(map[string]bool)
|
|
for _, agent := range h.execServer.GetConnectedAgents() {
|
|
connectedAgents[agent.AgentID] = true
|
|
}
|
|
|
|
var (
|
|
clusterName string
|
|
nodes []candidateNode
|
|
sourceAgents []sourceAgentInfo
|
|
)
|
|
|
|
for _, node := range readState.Nodes() {
|
|
if node == nil {
|
|
continue
|
|
}
|
|
if !node.IsClusterMember() {
|
|
continue
|
|
}
|
|
// Match cluster by name (clusterID in URL = cluster name).
|
|
if node.ClusterName() != clusterID {
|
|
continue
|
|
}
|
|
if clusterName == "" {
|
|
clusterName = node.ClusterName()
|
|
}
|
|
|
|
hasAgent := node.LinkedAgentID() != ""
|
|
cn := candidateNode{
|
|
NodeID: node.ID(),
|
|
Name: nodeName(node),
|
|
IP: nodeIP(node.HostURL()),
|
|
HasAgent: hasAgent,
|
|
}
|
|
|
|
if hasAgent {
|
|
cn.Deployable = false
|
|
cn.Reason = "already_agent"
|
|
|
|
// This node has an agent — check if it's a source candidate.
|
|
hostID := node.LinkedAgentID()
|
|
if connectedAgents[hostID] {
|
|
sourceAgents = append(sourceAgents, sourceAgentInfo{
|
|
AgentID: hostID,
|
|
NodeID: node.ID(),
|
|
Online: true,
|
|
})
|
|
}
|
|
} else {
|
|
cn.Deployable = true
|
|
}
|
|
|
|
nodes = append(nodes, cn)
|
|
}
|
|
|
|
resp := candidatesResponse{
|
|
ClusterID: clusterID,
|
|
ClusterName: clusterName,
|
|
SourceAgents: sourceAgents,
|
|
Nodes: nodes,
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(resp)
|
|
}
|
|
|
|
// --- Preflight ---
|
|
|
|
// createPreflightRequest is the body of POST .../agent-deploy/preflights.
type createPreflightRequest struct {
	SourceAgentID string   `json:"sourceAgentId"` // connected agent that runs SSH preflight checks
	TargetNodeIDs []string `json:"targetNodeIds"` // nodes to check (1..100)
	MaxParallel   int      `json:"maxParallel"`   // concurrent checks; clamped to [1,10], defaults to 2
}
|
|
|
|
// createPreflightResponse is returned (202 Accepted) after a preflight is dispatched.
type createPreflightResponse struct {
	PreflightID string `json:"preflightId"`
	Status      string `json:"status"`
	EventsURL   string `json:"eventsUrl"` // SSE endpoint to follow progress
}
|
|
|
|
// HandleCreatePreflight creates a preflight job and dispatches to the source agent.
// POST /api/clusters/{clusterId}/agent-deploy/preflights
//
// Flow: validate the request, resolve the cluster's nodes from read-state,
// persist a Job plus one Target per valid node, subscribe to the agent's
// progress stream, then send the preflight command. Progress is consumed by
// a background goroutine (processPreflightProgress). Responds 202 Accepted
// with an SSE URL the client can follow.
func (h *DeployHandlers) HandleCreatePreflight(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	clusterID := extractClusterID(r.URL.Path, "/api/clusters/", "/agent-deploy/preflights")
	if clusterID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_cluster_id", "Cluster ID is required", nil)
		return
	}

	// Cap the body at 1 MiB to bound memory for untrusted input.
	var req createPreflightRequest
	if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
		return
	}

	req.SourceAgentID = strings.TrimSpace(req.SourceAgentID)
	if req.SourceAgentID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_source_agent", "sourceAgentId is required", nil)
		return
	}
	if len(req.TargetNodeIDs) == 0 {
		writeErrorResponse(w, http.StatusBadRequest, "missing_targets", "At least one targetNodeIds entry is required", nil)
		return
	}
	if len(req.TargetNodeIDs) > 100 {
		writeErrorResponse(w, http.StatusBadRequest, "too_many_targets", "Maximum 100 targets per preflight", nil)
		return
	}

	// Verify source agent is connected.
	if !h.execServer.IsAgentConnected(req.SourceAgentID) {
		writeErrorResponse(w, http.StatusConflict, "source_agent_offline", "Source agent is not connected", nil)
		return
	}

	// Resolve cluster nodes from read-state.
	readState := h.monitor.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		writeErrorResponse(w, http.StatusServiceUnavailable, "state_unavailable", "Resource state is unavailable", nil)
		return
	}

	// Index the cluster's nodes and locate the node the source agent is linked to.
	clusterName := ""
	sourceNodeID := ""
	nodesByID := make(map[string]*unifiedresources.NodeView)
	for _, node := range readState.Nodes() {
		if node == nil {
			continue
		}
		if node.ClusterName() == clusterID && node.IsClusterMember() {
			nodesByID[node.ID()] = node
			if clusterName == "" {
				clusterName = node.ClusterName()
			}
			if node.LinkedAgentID() == req.SourceAgentID {
				sourceNodeID = node.ID()
			}
		}
	}

	if sourceNodeID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "source_not_in_cluster",
			"Source agent is not linked to a node in this cluster", nil)
		return
	}

	// Build deploy targets from requested node IDs.
	now := time.Now().UTC()
	jobID := generateID("pf")
	maxParallel := req.MaxParallel
	if maxParallel <= 0 {
		maxParallel = 2
	}
	if maxParallel > 10 {
		maxParallel = 10
	}

	job := &deploy.Job{
		ID:            jobID,
		ClusterID:     clusterID,
		ClusterName:   clusterName,
		SourceAgentID: req.SourceAgentID,
		SourceNodeID:  sourceNodeID,
		OrgID:         resolveTenantOrgID(r),
		Status:        deploy.JobQueued,
		MaxParallel:   maxParallel,
		RetryMax:      0, // preflights don't retry
		CreatedAt:     now,
		UpdatedAt:     now,
	}

	ctx := r.Context()
	if err := h.store.CreateJob(ctx, job); err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to create preflight job")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to create preflight job", nil)
		return
	}

	// Persist one Target per requested node; unknown or IP-less nodes are
	// silently skipped (a target-store failure is logged and skipped too).
	var targets []agentexec.DeployPreflightTarget
	for _, nodeID := range req.TargetNodeIDs {
		nodeID = strings.TrimSpace(nodeID)
		node, ok := nodesByID[nodeID]
		if !ok {
			continue // skip nodes not in cluster
		}
		ip := nodeIP(node.HostURL())
		if ip == "" {
			continue // skip nodes without IP
		}

		targetID := generateID("tgt")
		target := &deploy.Target{
			ID:        targetID,
			JobID:     jobID,
			NodeID:    nodeID,
			NodeName:  nodeName(node),
			NodeIP:    ip,
			Status:    deploy.TargetPending,
			CreatedAt: now,
			UpdatedAt: now,
		}
		if err := h.store.CreateTarget(ctx, target); err != nil {
			log.Error().Err(err).Str("target_id", targetID).Msg("Failed to create preflight target")
			continue
		}
		targets = append(targets, agentexec.DeployPreflightTarget{
			TargetID: targetID,
			NodeName: nodeName(node),
			NodeIP:   ip,
		})
	}

	if len(targets) == 0 {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		writeErrorResponse(w, http.StatusBadRequest, "no_valid_targets",
			"None of the requested nodes are valid deployment targets", nil)
		return
	}

	// Resolve Pulse URL for agent reachability.
	pulseURL := h.resolvePublicURL(r)
	if pulseURL == "" {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		writeErrorResponse(w, http.StatusInternalServerError, "no_pulse_url",
			"Cannot determine Pulse URL for agent reachability", nil)
		return
	}

	requestID := generateID("req")
	payload := agentexec.DeployPreflightPayload{
		RequestID:   requestID,
		JobID:       jobID,
		Targets:     targets,
		PulseURL:    pulseURL,
		MaxParallel: maxParallel,
		Timeout:     120, // seconds, presumably — TODO confirm unit against agentexec docs
	}

	// Transition to running. Status-update/event errors are deliberately
	// ignored here: the job has been created and the agent will still run.
	_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobRunning)

	// Append job-created event.
	_ = h.store.AppendEvent(ctx, &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventJobCreated,
		Message:   fmt.Sprintf("Preflight started for %d targets", len(targets)),
		CreatedAt: now,
	})

	// Subscribe to progress before sending command to avoid race.
	progressCh := h.execServer.SubscribeDeployProgress(req.SourceAgentID, jobID, 64)

	// Send command to agent; on failure roll back the subscription and mark
	// the job failed so it doesn't linger as "running".
	if err := h.execServer.SendDeployPreflight(ctx, req.SourceAgentID, payload); err != nil {
		h.execServer.UnsubscribeDeployProgress(req.SourceAgentID, jobID)
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send preflight command")
		writeErrorResponse(w, http.StatusInternalServerError, "send_failed",
			"Failed to send preflight command to agent", nil)
		return
	}

	// Start background goroutine to process progress events.
	go h.processPreflightProgress(jobID, req.SourceAgentID, progressCh)

	resp := createPreflightResponse{
		PreflightID: jobID,
		Status:      string(deploy.JobRunning),
		EventsURL:   fmt.Sprintf("/api/agent-deploy/preflights/%s/events", jobID),
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	json.NewEncoder(w).Encode(resp)
}
|
|
|
|
// HandleGetPreflight returns the current status of a preflight job.
|
|
// GET /api/agent-deploy/preflights/{preflightId}
|
|
func (h *DeployHandlers) HandleGetPreflight(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodGet {
|
|
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
preflightID := extractPathSuffix(r.URL.Path, "/api/agent-deploy/preflights/")
|
|
if preflightID == "" {
|
|
writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Preflight ID is required", nil)
|
|
return
|
|
}
|
|
// Strip /events suffix if present (shouldn't happen via routing, but be safe).
|
|
preflightID = strings.TrimSuffix(preflightID, "/events")
|
|
|
|
job, err := h.store.GetJob(r.Context(), preflightID)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("id", preflightID).Msg("Failed to get preflight job")
|
|
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight", nil)
|
|
return
|
|
}
|
|
if job == nil {
|
|
writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
|
|
return
|
|
}
|
|
|
|
// Tenant isolation: verify the job belongs to the caller's org.
|
|
orgID := resolveTenantOrgID(r)
|
|
if job.OrgID != orgID {
|
|
writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
|
|
return
|
|
}
|
|
|
|
targets, err := h.store.GetTargetsForJob(r.Context(), preflightID)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("id", preflightID).Msg("Failed to get preflight targets")
|
|
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get targets", nil)
|
|
return
|
|
}
|
|
|
|
resp := struct {
|
|
*deploy.Job
|
|
Targets []deploy.Target `json:"targets"`
|
|
}{
|
|
Job: job,
|
|
Targets: targets,
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(resp)
|
|
}
|
|
|
|
// HandlePreflightEvents streams SSE events for a preflight job.
// GET /api/agent-deploy/preflights/{preflightId}/events
//
// Behavior: verifies the job exists and belongs to the caller's org, replays
// all persisted events for the job, then (unless the job is already in a
// terminal state) streams live events until the client disconnects or the
// subscription channel is closed by processPreflightProgress. A comment-line
// heartbeat is sent every 15s to keep intermediaries from timing out the
// connection.
func (h *DeployHandlers) HandlePreflightEvents(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Extract preflight ID: /api/agent-deploy/preflights/{id}/events
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/preflights/")
	preflightID := strings.TrimSuffix(path, "/events")
	// preflightID == path means the "/events" suffix was absent entirely.
	if preflightID == "" || preflightID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Preflight ID is required", nil)
		return
	}

	// Verify the preflight exists.
	job, err := h.store.GetJob(r.Context(), preflightID)
	if err != nil {
		log.Error().Err(err).Str("id", preflightID).Msg("Failed to get preflight job for SSE")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
		return
	}

	// Tenant isolation: verify the job belongs to the caller's org.
	// 404 (not 403) so the job's existence is not leaked across tenants.
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Preflight not found", nil)
		return
	}

	// SSE requires the ability to flush partial responses.
	flusher, ok := w.(http.Flusher)
	if !ok {
		writeErrorResponse(w, http.StatusInternalServerError, "streaming_unsupported", "Streaming not supported", nil)
		return
	}

	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// Register SSE client. Registration happens before replay so live events
	// arriving during replay are buffered rather than lost.
	clientID := generateID("sse")
	eventCh := h.addSSEClient(preflightID, clientID)
	defer h.removeSSEClient(preflightID, clientID)

	// Send existing events first (replay).
	events, replayErr := h.store.GetEventsForJob(r.Context(), preflightID)
	if replayErr != nil {
		log.Error().Err(replayErr).Str("id", preflightID).Msg("Failed to load events for SSE replay")
		// Send an error event so the client knows replay is incomplete.
		fmt.Fprintf(w, "event: error\ndata: {\"message\":\"failed to load event history\"}\n\n")
	}
	// events is nil on replay error; ranging a nil slice is a no-op.
	for _, evt := range events {
		data, _ := json.Marshal(evt)
		fmt.Fprintf(w, "data: %s\n\n", data)
	}
	flusher.Flush()

	// If job is already terminal, send final status and close.
	if isDeployJobTerminal(job.Status) {
		data, _ := json.Marshal(map[string]string{
			"type":   "job_complete",
			"status": string(job.Status),
		})
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
		return
	}

	// Stream new events until client disconnect or channel close.
	heartbeat := time.NewTicker(15 * time.Second)
	defer heartbeat.Stop()

	for {
		select {
		case <-r.Context().Done():
			return
		case eventData, ok := <-eventCh:
			// Channel closed by closeSSESub when the job completes.
			if !ok {
				return
			}
			fmt.Fprintf(w, "data: %s\n\n", eventData)
			flusher.Flush()
		case <-heartbeat.C:
			// SSE comment line: ignored by clients, keeps the stream alive.
			fmt.Fprint(w, ": heartbeat\n\n")
			flusher.Flush()
		}
	}
}
|
|
|
|
// --- Progress processing ---
|
|
|
|
// processPreflightProgress reads deploy progress events from the agent and
// persists them as deploy events, also broadcasting to SSE clients.
//
// Runs as a background goroutine started by HandleCreatePreflight. It owns
// the progress subscription (unsubscribed on return) and is the single
// writer of the job's terminal status: either a derived final status when a
// progress message arrives with Final set, or JobFailed if the channel is
// closed first (agent disconnect). In both cases it closes the job's SSE
// subscription, which ends any connected event streams.
func (h *DeployHandlers) processPreflightProgress(jobID, agentID string, ch <-chan agentexec.DeployProgressPayload) {
	defer h.execServer.UnsubscribeDeployProgress(agentID, jobID)

	// Background context: this goroutine outlives the originating request.
	ctx := context.Background()

	for progress := range ch {
		// Persist as event. A store failure is logged but does not stop
		// processing — the in-memory broadcast below still happens.
		evt := &deploy.Event{
			ID:        generateID("evt"),
			JobID:     jobID,
			TargetID:  progress.TargetID,
			Type:      deploy.EventPreflightResult,
			Message:   progress.Message,
			Data:      progress.Data,
			CreatedAt: time.Now().UTC(),
		}
		if err := h.store.AppendEvent(ctx, evt); err != nil {
			log.Error().Err(err).Str("job_id", jobID).Msg("Failed to persist deploy event")
		}

		// Update target status based on progress phase.
		if progress.TargetID != "" {
			h.updateTargetFromProgress(ctx, progress)
		}

		// Broadcast to SSE clients.
		h.broadcastSSE(jobID, evt)

		if progress.Final {
			// Derive final job status from target statuses.
			// For preflights, TargetReady means "passed" (not active).
			finalStatus := derivePreflightJobStatus(ctx, h.store, jobID)
			_ = h.store.UpdateJobStatus(ctx, jobID, finalStatus)

			// Broadcast final status.
			finalEvt := &deploy.Event{
				ID:        generateID("evt"),
				JobID:     jobID,
				Type:      deploy.EventJobStatusChanged,
				Message:   fmt.Sprintf("Preflight completed: %s", finalStatus),
				CreatedAt: time.Now().UTC(),
			}
			_ = h.store.AppendEvent(ctx, finalEvt)
			h.broadcastSSE(jobID, finalEvt)

			// Close SSE channels for this job.
			h.closeSSESub(jobID)
			return
		}
	}

	// Channel closed without final — agent disconnected.
	_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
	finalEvt := &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventError,
		Message:   "Source agent disconnected during preflight",
		CreatedAt: time.Now().UTC(),
	}
	_ = h.store.AppendEvent(ctx, finalEvt)
	h.broadcastSSE(jobID, finalEvt)
	h.closeSSESub(jobID)
}
|
|
|
|
// updateTargetFromProgress maps progress phases to target status transitions.
|
|
func (h *DeployHandlers) updateTargetFromProgress(ctx context.Context, p agentexec.DeployProgressPayload) {
|
|
var newStatus deploy.TargetStatus
|
|
var errMsg string
|
|
|
|
switch {
|
|
case p.Phase == agentexec.DeployPhasePreflightComplete && p.Status == agentexec.DeployStepOK:
|
|
newStatus = deploy.TargetReady
|
|
case p.Phase == agentexec.DeployPhasePreflightComplete && p.Status == agentexec.DeployStepFailed:
|
|
newStatus = deploy.TargetFailedPermanent
|
|
errMsg = p.Message
|
|
case p.Phase == agentexec.DeployPhasePreflightComplete && p.Status == agentexec.DeployStepSkipped:
|
|
newStatus = deploy.TargetSkippedAgent
|
|
case p.Phase == agentexec.DeployPhasePreflightSSH && p.Status == agentexec.DeployStepStarted:
|
|
newStatus = deploy.TargetPreflighting
|
|
case p.Phase == agentexec.DeployPhasePreflightSSH && p.Status == agentexec.DeployStepFailed:
|
|
newStatus = deploy.TargetFailedPermanent
|
|
errMsg = p.Message
|
|
case p.Phase == agentexec.DeployPhaseCanceled:
|
|
newStatus = deploy.TargetCanceled
|
|
default:
|
|
return // intermediate step, no status change
|
|
}
|
|
|
|
if err := h.store.UpdateTargetStatus(ctx, p.TargetID, newStatus, errMsg); err != nil {
|
|
log.Error().Err(err).
|
|
Str("target_id", p.TargetID).
|
|
Str("new_status", string(newStatus)).
|
|
Msg("Failed to update target status from progress")
|
|
}
|
|
}
|
|
|
|
// --- SSE subscription management ---
|
|
|
|
func (h *DeployHandlers) addSSEClient(jobID, clientID string) chan []byte {
|
|
h.sseMu.Lock()
|
|
defer h.sseMu.Unlock()
|
|
|
|
sub, ok := h.sseSubs[jobID]
|
|
if !ok {
|
|
sub = &deploySSESub{clients: make(map[string]chan []byte)}
|
|
h.sseSubs[jobID] = sub
|
|
}
|
|
|
|
ch := make(chan []byte, 64)
|
|
sub.mu.Lock()
|
|
sub.clients[clientID] = ch
|
|
sub.mu.Unlock()
|
|
return ch
|
|
}
|
|
|
|
func (h *DeployHandlers) removeSSEClient(jobID, clientID string) {
|
|
h.sseMu.Lock()
|
|
sub, ok := h.sseSubs[jobID]
|
|
h.sseMu.Unlock()
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
sub.mu.Lock()
|
|
if ch, exists := sub.clients[clientID]; exists {
|
|
close(ch)
|
|
delete(sub.clients, clientID)
|
|
}
|
|
sub.mu.Unlock()
|
|
}
|
|
|
|
func (h *DeployHandlers) broadcastSSE(jobID string, evt *deploy.Event) {
|
|
h.sseMu.Lock()
|
|
sub, ok := h.sseSubs[jobID]
|
|
h.sseMu.Unlock()
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
data, err := json.Marshal(evt)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
sub.mu.Lock()
|
|
defer sub.mu.Unlock()
|
|
for _, ch := range sub.clients {
|
|
select {
|
|
case ch <- data:
|
|
default:
|
|
// Drop if client is slow.
|
|
}
|
|
}
|
|
}
|
|
|
|
func (h *DeployHandlers) closeSSESub(jobID string) {
|
|
h.sseMu.Lock()
|
|
sub, ok := h.sseSubs[jobID]
|
|
if ok {
|
|
delete(h.sseSubs, jobID)
|
|
}
|
|
h.sseMu.Unlock()
|
|
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
sub.mu.Lock()
|
|
for id, ch := range sub.clients {
|
|
close(ch)
|
|
delete(sub.clients, id)
|
|
}
|
|
sub.mu.Unlock()
|
|
}
|
|
|
|
// --- Bootstrap Enrollment ---
|
|
|
|
// MintBootstrapTokenForTarget creates a single-use bootstrap token for a deploy target.
// Used by the deploy job creation flow to issue per-target tokens.
//
// The token is scoped to agent enrollment only, expires after req.TTL, and
// carries job/target metadata that HandleEnroll later validates. The new
// token is added to the in-memory config under the global config lock and
// then persisted best-effort (a persistence failure is logged, not fatal,
// since the in-memory copy is already authoritative for this process).
// Returns the raw token string (shown once), the record ID, or an error.
func (h *DeployHandlers) MintBootstrapTokenForTarget(req deploy.BootstrapTokenRequest) (rawToken string, tokenID string, err error) {
	if req.TTL <= 0 {
		return "", "", fmt.Errorf("TTL must be positive, got %v", req.TTL)
	}

	raw, err := auth.GenerateAPIToken()
	if err != nil {
		return "", "", fmt.Errorf("generate token: %w", err)
	}

	// Record name encodes the job/target binding for human inspection.
	record, err := config.NewAPITokenRecord(raw,
		fmt.Sprintf("deploy-bootstrap:%s:%s", req.JobID, req.TargetID),
		[]string{config.ScopeAgentEnroll})
	if err != nil {
		return "", "", fmt.Errorf("create token record: %w", err)
	}

	exp := time.Now().UTC().Add(req.TTL)
	record.ExpiresAt = &exp
	record.OrgID = req.OrgID
	record.Metadata = req.BuildMetadata()

	// Snapshot the token list while holding the lock so persistence below
	// can run outside the critical section.
	config.Mu.Lock()
	h.config.UpsertAPIToken(*record)
	tokens := make([]config.APITokenRecord, len(h.config.APITokens))
	copy(tokens, h.config.APITokens)
	config.Mu.Unlock()

	if h.persistence != nil {
		if err := h.persistence.SaveAPITokens(tokens); err != nil {
			log.Warn().Err(err).Msg("Failed to persist bootstrap token")
		}
	}

	return raw, record.ID, nil
}
|
|
|
|
// enrollRequest matches the design doc Section 3 enrollment payload.
type enrollRequest struct {
	Hostname        string `json:"hostname"` // required; checked against the token's expected-node binding
	FQDN            string `json:"fqdn,omitempty"`
	MachineID       string `json:"machineId,omitempty"`
	OS              string `json:"os"`
	Arch            string `json:"arch"`
	AgentVersion    string `json:"agentVersion"`
	CommandsEnabled bool   `json:"commandsEnabled,omitempty"` // grants the exec scope on the runtime token
	// Proxmox identity, when the agent runs on a Proxmox node; NodeName is an
	// alternate match for the token's expected-node binding.
	Proxmox *struct {
		ClusterName string `json:"clusterName,omitempty"`
		NodeName    string `json:"nodeName,omitempty"`
	} `json:"proxmox,omitempty"`
	DeployJobID string `json:"deployJobId,omitempty"`
}
|
|
|
|
// HandleEnroll processes bootstrap token enrollment from freshly-deployed agents.
// POST /api/agents/agent/enroll
//
// Security flow: the agent authenticates with a single-use bootstrap token
// (validated upstream by auth middleware). This handler checks the token's
// job/target/node bindings, atomically consumes the token (so a replayed
// request gets 409), then mints and returns a long-lived runtime token
// scoped to normal agent operation.
func (h *DeployHandlers) HandleEnroll(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// 1. Decode request body (capped at 1 MiB).
	var req enrollRequest
	if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
		return
	}
	req.Hostname = strings.TrimSpace(req.Hostname)
	if req.Hostname == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_hostname", "hostname is required", nil)
		return
	}

	// 2. Get bootstrap token from context (set by RequireAuth middleware).
	bootstrapToken := getAPITokenRecordFromRequest(r)
	if bootstrapToken == nil {
		writeErrorResponse(w, http.StatusUnauthorized, "no_token", "Bootstrap token required", nil)
		return
	}

	// 3. Validate token binding metadata.
	meta := bootstrapToken.Metadata
	if meta == nil {
		writeErrorResponse(w, http.StatusForbidden, "invalid_token", "Token is not a bootstrap deploy token", nil)
		return
	}
	jobID := meta[deploy.MetaKeyJobID]
	targetID := meta[deploy.MetaKeyTargetID]
	expectedNode := meta[deploy.MetaKeyExpectedNode]

	if jobID == "" || targetID == "" {
		writeErrorResponse(w, http.StatusForbidden, "invalid_token", "Token missing deploy binding", nil)
		return
	}

	// 4. Validate node name binding (if set). Either the OS hostname or the
	// Proxmox node name may satisfy the binding.
	if expectedNode != "" && req.Hostname != expectedNode {
		proxmoxMatch := false
		if req.Proxmox != nil && req.Proxmox.NodeName == expectedNode {
			proxmoxMatch = true
		}
		if !proxmoxMatch {
			writeErrorResponse(w, http.StatusForbidden, "binding_mismatch",
				fmt.Sprintf("Token bound to node %q, got hostname %q", expectedNode, req.Hostname), nil)
			return
		}
	}

	// 5. Verify deploy target exists and is in correct state.
	ctx := r.Context()
	target, err := h.store.GetTarget(ctx, targetID)
	if err != nil || target == nil {
		writeErrorResponse(w, http.StatusNotFound, "target_not_found", "Deploy target not found", nil)
		return
	}
	if target.Status != deploy.TargetEnrolling && target.Status != deploy.TargetInstalling {
		writeErrorResponse(w, http.StatusConflict, "invalid_target_state",
			fmt.Sprintf("Target is in state %q, expected enrolling or installing", target.Status), nil)
		return
	}

	// 6. Verify target belongs to the job referenced in the token.
	if target.JobID != jobID {
		writeErrorResponse(w, http.StatusForbidden, "binding_mismatch",
			"Token job binding does not match target", nil)
		return
	}

	// 7. Invalidate bootstrap token (single-use) BEFORE minting runtime token.
	// Check return value to prevent concurrent replay: RemoveAPIToken under
	// the global lock makes exactly one concurrent request win.
	config.Mu.Lock()
	removed := h.config.RemoveAPIToken(bootstrapToken.ID)
	tokensAfterRemove := make([]config.APITokenRecord, len(h.config.APITokens))
	copy(tokensAfterRemove, h.config.APITokens)
	config.Mu.Unlock()
	if removed == nil {
		writeErrorResponse(w, http.StatusConflict, "token_already_consumed",
			"Bootstrap token has already been used", nil)
		return
	}
	// Persistence is best-effort; the in-memory removal already blocks replay
	// within this process.
	if h.persistence != nil {
		if err := h.persistence.SaveAPITokens(tokensAfterRemove); err != nil {
			log.Warn().Err(err).Msg("Failed to persist token removal during enroll")
		}
	}

	// 8. Mint runtime token (long-lived, host-bound).
	runtimeRaw, err := auth.GenerateAPIToken()
	if err != nil {
		log.Error().Err(err).Msg("Failed to generate runtime token during enroll")
		writeErrorResponse(w, http.StatusInternalServerError, "token_error", "Failed to generate runtime token", nil)
		return
	}
	runtimeScopes := []string{
		config.ScopeAgentReport, config.ScopeAgentConfigRead, config.ScopeAgentManage,
		config.ScopeDockerReport, config.ScopeKubernetesReport,
	}
	// Exec scope only when the agent opted in to remote commands.
	if req.CommandsEnabled {
		runtimeScopes = append(runtimeScopes, config.ScopeAgentExec)
	}
	runtimeRecord, err := config.NewAPITokenRecord(runtimeRaw,
		fmt.Sprintf("agent:%s", req.Hostname),
		runtimeScopes)
	if err != nil {
		log.Error().Err(err).Msg("Failed to create runtime token record during enroll")
		writeErrorResponse(w, http.StatusInternalServerError, "token_error", "Failed to create runtime token", nil)
		return
	}
	runtimeRecord.OrgID = bootstrapToken.OrgID
	runtimeRecord.Metadata = map[string]string{
		"bound_hostname": req.Hostname,
		"deploy_job_id":  jobID,
	}

	config.Mu.Lock()
	h.config.UpsertAPIToken(*runtimeRecord)
	tokensAfterMint := make([]config.APITokenRecord, len(h.config.APITokens))
	copy(tokensAfterMint, h.config.APITokens)
	config.Mu.Unlock()
	if h.persistence != nil {
		if err := h.persistence.SaveAPITokens(tokensAfterMint); err != nil {
			log.Warn().Err(err).Msg("Failed to persist runtime token during enroll")
		}
	}

	// 9. Update target status to VERIFYING.
	_ = h.store.UpdateTargetStatus(ctx, targetID, deploy.TargetVerifying, "")

	// 10. Append enroll event.
	enrollEvt := &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		TargetID:  targetID,
		Type:      deploy.EventEnrollComplete,
		Message:   fmt.Sprintf("Agent enrolled from %s", req.Hostname),
		CreatedAt: time.Now().UTC(),
	}
	_ = h.store.AppendEvent(ctx, enrollEvt)

	// 11. Broadcast to SSE clients.
	h.broadcastSSE(jobID, enrollEvt)

	// 12. Return runtime token + config to agent. The raw token is returned
	// exactly once; only its record (hash) is stored.
	canonicalAgentID := fmt.Sprintf("agent-%s", req.Hostname)
	resp := map[string]any{
		"agentId":        canonicalAgentID,
		"runtimeToken":   runtimeRaw,
		"runtimeTokenId": runtimeRecord.ID,
		"reportInterval": "30s",
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	json.NewEncoder(w).Encode(resp)
}
|
|
|
|
// --- Deploy Jobs ---
|
|
|
|
// createJobRequest is the body of POST .../agent-deploy/jobs.
type createJobRequest struct {
	SourceAgentID string   `json:"sourceAgentId"` // connected agent that performs the installs
	PreflightID   string   `json:"preflightId"`   // must reference a passed (or partially passed) preflight
	TargetNodeIDs []string `json:"targetNodeIds"`
	Mode          string   `json:"mode"`
	MaxParallel   int      `json:"maxParallel"`
	// RetryPolicy optionally overrides per-target retry attempts.
	RetryPolicy *struct {
		MaxAttempts int `json:"maxAttempts"`
	} `json:"retryPolicy,omitempty"`
}
|
|
|
|
// createJobSkip reports a requested node that was excluded from the job and why.
type createJobSkip struct {
	NodeID string `json:"nodeId"`
	Reason string `json:"reason"`
}
|
|
|
|
// createJobResponse is returned after a deploy install job is created.
type createJobResponse struct {
	JobID                string          `json:"jobId"`
	AcceptedTargets      []string        `json:"acceptedTargets"`
	SkippedTargets       []createJobSkip `json:"skippedTargets"`
	ReservedLicenseSlots int             `json:"reservedLicenseSlots"`
	EventsURL            string          `json:"eventsUrl"` // SSE endpoint to follow progress
}
|
|
|
|
// HandleCreateJob creates a deploy install job from preflight results.
// POST /api/clusters/{clusterId}/agent-deploy/jobs
//
// Flow: validate the request against the referenced preflight job, filter
// the requested nodes down to those that passed preflight and fit within
// the license limit, mint a short-lived bootstrap token per accepted
// target, dispatch the install command to the connected source agent, and
// start a background goroutine that relays install progress into the event
// store and to SSE subscribers. Responds 202 Accepted with the job ID and
// the accepted/skipped target breakdown.
func (h *DeployHandlers) HandleCreateJob(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	clusterID := extractClusterID(r.URL.Path, "/api/clusters/", "/agent-deploy/jobs")
	if clusterID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_cluster_id", "Cluster ID is required", nil)
		return
	}

	// Cap the request body at 1 MiB to bound memory usage.
	var req createJobRequest
	if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
		writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
		return
	}

	req.SourceAgentID = strings.TrimSpace(req.SourceAgentID)
	if req.SourceAgentID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_source_agent", "sourceAgentId is required", nil)
		return
	}
	req.PreflightID = strings.TrimSpace(req.PreflightID)
	if req.PreflightID == "" {
		writeErrorResponse(w, http.StatusBadRequest, "missing_preflight_id", "preflightId is required", nil)
		return
	}
	if len(req.TargetNodeIDs) == 0 {
		writeErrorResponse(w, http.StatusBadRequest, "missing_targets", "At least one targetNodeIds entry is required", nil)
		return
	}

	// Verify source agent is connected.
	if !h.execServer.IsAgentConnected(req.SourceAgentID) {
		writeErrorResponse(w, http.StatusConflict, "source_agent_offline", "Source agent is not connected", nil)
		return
	}

	ctx := r.Context()
	orgID := resolveTenantOrgID(r)

	// Verify preflight exists and belongs to same org. A cross-org hit is
	// reported as 404 to avoid leaking job existence across tenants.
	pfJob, err := h.store.GetJob(ctx, req.PreflightID)
	if err != nil {
		log.Error().Err(err).Str("preflight_id", req.PreflightID).Msg("Failed to get preflight job")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight job", nil)
		return
	}
	if pfJob == nil {
		writeErrorResponse(w, http.StatusNotFound, "preflight_not_found", "Preflight job not found", nil)
		return
	}
	if pfJob.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "preflight_not_found", "Preflight job not found", nil)
		return
	}
	if pfJob.Status != deploy.JobSucceeded && pfJob.Status != deploy.JobPartialSuccess {
		writeErrorResponse(w, http.StatusConflict, "preflight_not_passed",
			fmt.Sprintf("Preflight is in state %q, expected succeeded or partial_success", pfJob.Status), nil)
		return
	}

	// Verify cluster and source agent consistency.
	if pfJob.ClusterID != clusterID {
		writeErrorResponse(w, http.StatusBadRequest, "cluster_mismatch",
			"Preflight cluster does not match request cluster", nil)
		return
	}
	if pfJob.SourceAgentID != req.SourceAgentID {
		writeErrorResponse(w, http.StatusBadRequest, "source_agent_mismatch",
			"Preflight source agent does not match request source agent", nil)
		return
	}

	// Get preflight targets — filter requested nodeIDs against Ready targets.
	pfTargets, err := h.store.GetTargetsForJob(ctx, req.PreflightID)
	if err != nil {
		log.Error().Err(err).Str("preflight_id", req.PreflightID).Msg("Failed to get preflight targets")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get preflight targets", nil)
		return
	}

	// Build lookup of preflight targets by node ID.
	pfTargetByNode := make(map[string]*deploy.Target)
	for i := range pfTargets {
		pfTargetByNode[pfTargets[i].NodeID] = &pfTargets[i]
	}

	// Deduplicate requested nodes while preserving request order.
	seen := make(map[string]bool, len(req.TargetNodeIDs))
	var orderedNodeIDs []string
	for _, nid := range req.TargetNodeIDs {
		nid = strings.TrimSpace(nid)
		if nid != "" && !seen[nid] {
			seen[nid] = true
			orderedNodeIDs = append(orderedNodeIDs, nid)
		}
	}

	// Filter: only accept targets that passed preflight (Ready state).
	// Order is preserved from the request so license truncation is deterministic.
	var acceptedPfTargets []*deploy.Target
	var skipped []createJobSkip
	for _, nodeID := range orderedNodeIDs {
		pfTgt, ok := pfTargetByNode[nodeID]
		if !ok {
			skipped = append(skipped, createJobSkip{NodeID: nodeID, Reason: "not_in_preflight"})
			continue
		}
		if pfTgt.Status != deploy.TargetReady {
			skipped = append(skipped, createJobSkip{NodeID: nodeID, Reason: fmt.Sprintf("preflight_status_%s", pfTgt.Status)})
			continue
		}
		acceptedPfTargets = append(acceptedPfTargets, pfTgt)
	}

	// License slot check: when a limit is configured, truncate the accepted
	// list to the remaining capacity and record the rest as skipped.
	maxLimit := maxMonitoredSystemsLimitForContext(ctx)
	if maxLimit > 0 {
		decision := monitoredSystemLimitDecisionForAdditionalSlots(ctx, h.monitor, 0)
		if !decision.usageAvailable {
			writeMonitoredSystemUsageUnavailable(w, decision.usageUnavailableReason)
			return
		}
		available := decision.limit - decision.current
		if available < 0 {
			available = 0
		}
		if available < len(acceptedPfTargets) {
			// Accept only what fits; skip the rest.
			for i := available; i < len(acceptedPfTargets); i++ {
				skipped = append(skipped, createJobSkip{
					NodeID: acceptedPfTargets[i].NodeID,
					Reason: "skipped_license",
				})
			}
			acceptedPfTargets = acceptedPfTargets[:available]
		}
	}

	if len(acceptedPfTargets) == 0 {
		writeErrorResponse(w, http.StatusConflict, "no_eligible_targets",
			"No targets are eligible for deployment", nil)
		return
	}

	// Create deploy job. MaxParallel is clamped to [1, 10] (default 2).
	now := time.Now().UTC()
	jobID := generateID("dep")
	maxParallel := req.MaxParallel
	if maxParallel <= 0 {
		maxParallel = 2
	}
	if maxParallel > 10 {
		maxParallel = 10
	}

	retryMax := 3
	if req.RetryPolicy != nil && req.RetryPolicy.MaxAttempts > 0 {
		retryMax = req.RetryPolicy.MaxAttempts
	}

	job := &deploy.Job{
		ID:            jobID,
		ClusterID:     clusterID,
		ClusterName:   pfJob.ClusterName,
		SourceAgentID: req.SourceAgentID,
		SourceNodeID:  pfJob.SourceNodeID,
		OrgID:         orgID,
		Status:        deploy.JobQueued,
		MaxParallel:   maxParallel,
		RetryMax:      retryMax,
		CreatedAt:     now,
		UpdatedAt:     now,
	}

	if err := h.store.CreateJob(ctx, job); err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to create deploy job")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to create deploy job", nil)
		return
	}

	// Resolve Pulse URL for install commands; without it the freshly
	// created job is marked failed.
	pulseURL := h.resolvePublicURL(r)
	if pulseURL == "" {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		writeErrorResponse(w, http.StatusInternalServerError, "no_pulse_url",
			"Cannot determine Pulse URL for agent installation", nil)
		return
	}

	// Create targets and mint bootstrap tokens. Per-target failures are
	// non-fatal: the failing target is recorded/skipped and the loop
	// continues with the remaining targets.
	var installTargets []agentexec.DeployInstallTarget
	var acceptedNodeIDs []string
	for _, pfTgt := range acceptedPfTargets {
		targetID := generateID("tgt")
		arch := h.getTargetArchFromPreflight(ctx, req.PreflightID, pfTgt.NodeID)

		target := &deploy.Target{
			ID:        targetID,
			JobID:     jobID,
			NodeID:    pfTgt.NodeID,
			NodeName:  pfTgt.NodeName,
			NodeIP:    pfTgt.NodeIP,
			Arch:      arch,
			Status:    deploy.TargetPending,
			CreatedAt: now,
			UpdatedAt: now,
		}
		if err := h.store.CreateTarget(ctx, target); err != nil {
			log.Error().Err(err).Str("target_id", targetID).Msg("Failed to create deploy target")
			continue
		}

		// Mint bootstrap token for this target (30-minute TTL).
		rawToken, _, err := h.MintBootstrapTokenForTarget(deploy.BootstrapTokenRequest{
			ClusterID:     clusterID,
			NodeID:        pfTgt.NodeID,
			ExpectedNode:  pfTgt.NodeName,
			JobID:         jobID,
			TargetID:      targetID,
			SourceAgentID: req.SourceAgentID,
			OrgID:         orgID,
			TTL:           30 * time.Minute,
		})
		if err != nil {
			log.Error().Err(err).Str("target_id", targetID).Msg("Failed to mint bootstrap token")
			_ = h.store.UpdateTargetStatus(ctx, targetID, deploy.TargetFailedPermanent, "failed to mint bootstrap token")
			continue
		}

		installTargets = append(installTargets, agentexec.DeployInstallTarget{
			TargetID:       targetID,
			NodeName:       pfTgt.NodeName,
			NodeIP:         pfTgt.NodeIP,
			Arch:           arch,
			BootstrapToken: rawToken,
		})
		acceptedNodeIDs = append(acceptedNodeIDs, pfTgt.NodeID)
	}

	if len(installTargets) == 0 {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		writeErrorResponse(w, http.StatusInternalServerError, "target_setup_failed",
			"Failed to set up any deployment targets", nil)
		return
	}

	// Reserve license slots based on actual dispatched target count.
	if err := h.reservation.Reserve(jobID, orgID, len(installTargets), 1*time.Hour); err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to reserve license slots")
		// Non-fatal — continue. The reservation is for proactive slot tracking.
	}

	// Transition to running.
	_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobRunning)

	// Append job-created event.
	_ = h.store.AppendEvent(ctx, &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventJobCreated,
		Message:   fmt.Sprintf("Deploy started for %d targets", len(installTargets)),
		CreatedAt: now,
	})

	requestID := generateID("req")
	payload := agentexec.DeployInstallPayload{
		RequestID:   requestID,
		JobID:       jobID,
		Targets:     installTargets,
		PulseURL:    pulseURL,
		MaxParallel: maxParallel,
		Timeout:     300,
	}

	// Subscribe to progress before sending command so no early progress
	// message is lost.
	progressCh := h.execServer.SubscribeDeployProgress(req.SourceAgentID, jobID, 64)

	// Send install command to agent. On failure, unwind: unsubscribe,
	// fail the job, release the reservation, and mark targets retryable.
	if err := h.execServer.SendDeployInstall(ctx, req.SourceAgentID, payload); err != nil {
		h.execServer.UnsubscribeDeployProgress(req.SourceAgentID, jobID)
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		h.reservation.Release(jobID)
		// Mark pending targets as failed so they're eligible for retry.
		for _, it := range installTargets {
			_ = h.store.UpdateTargetStatus(ctx, it.TargetID, deploy.TargetFailedRetryable, "dispatch failed")
		}
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send install command")
		writeErrorResponse(w, http.StatusInternalServerError, "send_failed",
			"Failed to send install command to agent", nil)
		return
	}

	// Start background goroutine to process progress events.
	go h.processInstallProgress(jobID, req.SourceAgentID, job.RetryMax, progressCh)

	resp := createJobResponse{
		JobID:                jobID,
		AcceptedTargets:      acceptedNodeIDs,
		SkippedTargets:       skipped,
		ReservedLicenseSlots: len(installTargets),
		EventsURL:            fmt.Sprintf("/api/agent-deploy/jobs/%s/events", jobID),
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	json.NewEncoder(w).Encode(resp)
}
|
|
|
|
// HandleGetJob returns the current status of a deploy job.
|
|
// GET /api/agent-deploy/jobs/{jobId}
|
|
func (h *DeployHandlers) HandleGetJob(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodGet {
|
|
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
jobID := extractPathSuffix(r.URL.Path, "/api/agent-deploy/jobs/")
|
|
if jobID == "" {
|
|
writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
|
|
return
|
|
}
|
|
jobID = strings.TrimSuffix(jobID, "/events")
|
|
|
|
job, err := h.store.GetJob(r.Context(), jobID)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job")
|
|
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
|
|
return
|
|
}
|
|
if job == nil {
|
|
writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
|
|
return
|
|
}
|
|
|
|
orgID := resolveTenantOrgID(r)
|
|
if job.OrgID != orgID {
|
|
writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
|
|
return
|
|
}
|
|
|
|
targets, err := h.store.GetTargetsForJob(r.Context(), jobID)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("id", jobID).Msg("Failed to get job targets")
|
|
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get targets", nil)
|
|
return
|
|
}
|
|
|
|
resp := struct {
|
|
*deploy.Job
|
|
Targets []deploy.Target `json:"targets"`
|
|
}{
|
|
Job: job,
|
|
Targets: targets,
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(resp)
|
|
}
|
|
|
|
// HandleJobEvents streams SSE events for a deploy job.
// GET /api/agent-deploy/jobs/{jobId}/events
//
// The stream first replays all persisted events for the job, then either
// closes immediately (terminal job) or stays open relaying live events
// pushed by processInstallProgress via broadcastSSE. A comment-line
// heartbeat is written every 15 seconds while idle.
func (h *DeployHandlers) HandleJobEvents(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Extract job ID: /api/agent-deploy/jobs/{id}/events
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/jobs/")
	jobID := strings.TrimSuffix(path, "/events")
	// jobID == path means the "/events" suffix was absent entirely.
	if jobID == "" || jobID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
		return
	}

	job, err := h.store.GetJob(r.Context(), jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job for SSE")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	// Cross-org access is reported as 404, same as a missing job.
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	// SSE requires per-message flushing.
	flusher, ok := w.(http.Flusher)
	if !ok {
		writeErrorResponse(w, http.StatusInternalServerError, "streaming_unsupported", "Streaming not supported", nil)
		return
	}

	w.Header().Set("Content-Type", "text/event-stream")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")

	// Register for live events before replaying history so events emitted
	// during the replay are not lost.
	clientID := generateID("sse")
	eventCh := h.addSSEClient(jobID, clientID)
	defer h.removeSSEClient(jobID, clientID)

	// Replay existing events. On load failure an SSE error event is sent
	// and streaming continues (events is empty in that case).
	events, replayErr := h.store.GetEventsForJob(r.Context(), jobID)
	if replayErr != nil {
		log.Error().Err(replayErr).Str("id", jobID).Msg("Failed to load events for SSE replay")
		fmt.Fprintf(w, "event: error\ndata: {\"message\":\"failed to load event history\"}\n\n")
	}
	for _, evt := range events {
		data, _ := json.Marshal(evt)
		fmt.Fprintf(w, "data: %s\n\n", data)
	}
	flusher.Flush()

	// If job is terminal, send final and close.
	if isDeployJobTerminal(job.Status) {
		data, _ := json.Marshal(map[string]string{
			"type":   "job_complete",
			"status": string(job.Status),
		})
		fmt.Fprintf(w, "data: %s\n\n", data)
		flusher.Flush()
		return
	}

	// Stream new events until the client disconnects or the broadcaster
	// closes the channel (job completion).
	heartbeat := time.NewTicker(15 * time.Second)
	defer heartbeat.Stop()

	for {
		select {
		case <-r.Context().Done():
			return
		case eventData, ok := <-eventCh:
			if !ok {
				return
			}
			fmt.Fprintf(w, "data: %s\n\n", eventData)
			flusher.Flush()
		case <-heartbeat.C:
			// SSE comment line; ignored by clients but keeps the
			// connection active through intermediaries.
			fmt.Fprint(w, ": heartbeat\n\n")
			flusher.Flush()
		}
	}
}
|
|
|
|
// HandleCancelJob cancels a running deploy job.
|
|
// POST /api/agent-deploy/jobs/{jobId}/cancel
|
|
func (h *DeployHandlers) HandleCancelJob(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodPost {
|
|
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
|
|
return
|
|
}
|
|
|
|
// Extract job ID: /api/agent-deploy/jobs/{id}/cancel
|
|
path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/jobs/")
|
|
jobID := strings.TrimSuffix(path, "/cancel")
|
|
if jobID == "" || jobID == path {
|
|
writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
|
|
return
|
|
}
|
|
|
|
ctx := r.Context()
|
|
job, err := h.store.GetJob(ctx, jobID)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job for cancel")
|
|
writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
|
|
return
|
|
}
|
|
if job == nil {
|
|
writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
|
|
return
|
|
}
|
|
|
|
orgID := resolveTenantOrgID(r)
|
|
if job.OrgID != orgID {
|
|
writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
|
|
return
|
|
}
|
|
|
|
if job.Status != deploy.JobRunning {
|
|
writeErrorResponse(w, http.StatusConflict, "not_running",
|
|
fmt.Sprintf("Job is in state %q, only running jobs can be canceled", job.Status), nil)
|
|
return
|
|
}
|
|
|
|
// Transition to canceling.
|
|
_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobCanceling)
|
|
|
|
// Send cancel to source agent.
|
|
cancelPayload := agentexec.DeployCancelPayload{
|
|
RequestID: generateID("req"),
|
|
JobID: jobID,
|
|
}
|
|
if err := h.execServer.SendDeployCancel(ctx, job.SourceAgentID, cancelPayload); err != nil {
|
|
log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send cancel command")
|
|
// Don't fail the request — the agent may have already disconnected.
|
|
// processInstallProgress will handle the channel close.
|
|
}
|
|
|
|
// Append cancel event.
|
|
_ = h.store.AppendEvent(ctx, &deploy.Event{
|
|
ID: generateID("evt"),
|
|
JobID: jobID,
|
|
Type: deploy.EventJobStatusChanged,
|
|
Message: "Cancel requested",
|
|
CreatedAt: time.Now().UTC(),
|
|
})
|
|
|
|
// Return current state.
|
|
job.Status = deploy.JobCanceling
|
|
w.Header().Set("Content-Type", "application/json")
|
|
json.NewEncoder(w).Encode(job)
|
|
}
|
|
|
|
// retryJobRequest is the optional JSON body for HandleRetryJob. When
// TargetIDs is empty (or the body is absent), every eligible failed target
// in the job is retried; otherwise only the listed target IDs are.
type retryJobRequest struct {
	TargetIDs []string `json:"targetIds,omitempty"`
}
|
|
|
|
// HandleRetryJob retries failed targets in a terminal deploy job.
// POST /api/agent-deploy/jobs/{jobId}/retry
//
// Eligible targets are those in a failed state (retryable or permanent)
// that have not exhausted the job's RetryMax attempts, optionally narrowed
// by the request body's targetIds. Eligible targets are reset to pending,
// fresh bootstrap tokens are minted, and a new install command is sent to
// the source agent. Responds 202 Accepted on dispatch.
func (h *DeployHandlers) HandleRetryJob(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Extract job ID: /api/agent-deploy/jobs/{id}/retry
	path := strings.TrimPrefix(r.URL.Path, "/api/agent-deploy/jobs/")
	jobID := strings.TrimSuffix(path, "/retry")
	// jobID == path means the "/retry" suffix was absent entirely.
	if jobID == "" || jobID == path {
		writeErrorResponse(w, http.StatusBadRequest, "missing_id", "Job ID is required", nil)
		return
	}

	// Body is optional; an absent or empty body means "retry all".
	var req retryJobRequest
	if r.Body != nil && r.ContentLength != 0 {
		if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil {
			// EOF means empty body (e.g. chunked with no data) — treat as "retry all".
			if !errors.Is(err, io.EOF) {
				writeErrorResponse(w, http.StatusBadRequest, "invalid_body", "Invalid request body", nil)
				return
			}
		}
	}

	ctx := r.Context()
	job, err := h.store.GetJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get deploy job for retry")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get job", nil)
		return
	}
	if job == nil {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	// Cross-org access is reported as 404, same as a missing job.
	orgID := resolveTenantOrgID(r)
	if job.OrgID != orgID {
		writeErrorResponse(w, http.StatusNotFound, "not_found", "Job not found", nil)
		return
	}

	// Only terminal jobs may be retried; a running job's targets are still
	// in flight.
	if !isDeployJobTerminal(job.Status) {
		writeErrorResponse(w, http.StatusConflict, "not_terminal",
			fmt.Sprintf("Job is in state %q, only terminal jobs can be retried", job.Status), nil)
		return
	}

	// Verify source agent is connected.
	if !h.execServer.IsAgentConnected(job.SourceAgentID) {
		writeErrorResponse(w, http.StatusConflict, "source_agent_offline", "Source agent is not connected", nil)
		return
	}

	// Load targets.
	targets, err := h.store.GetTargetsForJob(ctx, jobID)
	if err != nil {
		log.Error().Err(err).Str("id", jobID).Msg("Failed to get job targets for retry")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to get targets", nil)
		return
	}

	// Filter to retryable failed targets. An empty requestedIDs set means
	// no per-target narrowing was requested.
	requestedIDs := make(map[string]bool, len(req.TargetIDs))
	for _, id := range req.TargetIDs {
		requestedIDs[strings.TrimSpace(id)] = true
	}

	var retryTargets []deploy.Target
	for _, t := range targets {
		if t.Status != deploy.TargetFailedRetryable && t.Status != deploy.TargetFailedPermanent {
			continue
		}
		if t.Attempts >= job.RetryMax {
			continue
		}
		if len(requestedIDs) > 0 && !requestedIDs[t.ID] {
			continue
		}
		retryTargets = append(retryTargets, t)
	}

	if len(retryTargets) == 0 {
		writeErrorResponse(w, http.StatusConflict, "nothing_to_retry",
			"No eligible targets to retry (all succeeded or exceeded max attempts)", nil)
		return
	}

	// License slot re-check. Unlike HandleCreateJob, targets dropped here
	// by truncation are not reported back to the caller.
	maxLimit := maxMonitoredSystemsLimitForContext(ctx)
	if maxLimit > 0 {
		decision := monitoredSystemLimitDecisionForAdditionalSlots(ctx, h.monitor, 0)
		if !decision.usageAvailable {
			writeMonitoredSystemUsageUnavailable(w, decision.usageUnavailableReason)
			return
		}
		available := decision.limit - decision.current
		if available < 0 {
			available = 0
		}
		if available < len(retryTargets) {
			retryTargets = retryTargets[:available]
		}
		if len(retryTargets) == 0 {
			writeErrorResponse(w, http.StatusConflict, "license_limit",
				"No license slots available for retry", nil)
			return
		}
	}

	// Reset targets to pending.
	retryIDs := make([]string, len(retryTargets))
	for i, t := range retryTargets {
		retryIDs[i] = t.ID
	}
	resetCount, err := h.store.ResetTargetsForRetry(ctx, retryIDs)
	if err != nil {
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to reset targets for retry")
		writeErrorResponse(w, http.StatusInternalServerError, "store_error", "Failed to reset targets", nil)
		return
	}
	if resetCount == 0 {
		writeErrorResponse(w, http.StatusConflict, "nothing_to_retry",
			"No targets were in a retryable state", nil)
		return
	}

	// Transition job back to running.
	_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobRunning)

	// Resolve Pulse URL.
	pulseURL := h.resolvePublicURL(r)
	if pulseURL == "" {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		// Mark reset targets back to failed_retryable so they remain retryable.
		for _, id := range retryIDs {
			_ = h.store.UpdateTargetStatus(ctx, id, deploy.TargetFailedRetryable, "dispatch failed: no Pulse URL")
		}
		// NOTE(review): the retry reservation has not been taken yet at
		// this point, so this Release looks like a defensive no-op.
		h.reservation.Release(jobID + "-retry")
		writeErrorResponse(w, http.StatusInternalServerError, "no_pulse_url",
			"Cannot determine Pulse URL for agent installation", nil)
		return
	}

	// Mint fresh bootstrap tokens and build install targets. Per-target
	// minting failures mark that target permanently failed and continue.
	var installTargets []agentexec.DeployInstallTarget
	for _, t := range retryTargets {
		rawToken, _, err := h.MintBootstrapTokenForTarget(deploy.BootstrapTokenRequest{
			ClusterID:     job.ClusterID,
			NodeID:        t.NodeID,
			ExpectedNode:  t.NodeName,
			JobID:         jobID,
			TargetID:      t.ID,
			SourceAgentID: job.SourceAgentID,
			OrgID:         orgID,
			TTL:           30 * time.Minute,
		})
		if err != nil {
			log.Error().Err(err).Str("target_id", t.ID).Msg("Failed to mint retry bootstrap token")
			_ = h.store.UpdateTargetStatus(ctx, t.ID, deploy.TargetFailedPermanent, "failed to mint bootstrap token for retry")
			continue
		}

		// Fall back to amd64 when the stored target has no arch recorded.
		arch := t.Arch
		if arch == "" {
			arch = "amd64"
		}
		installTargets = append(installTargets, agentexec.DeployInstallTarget{
			TargetID:       t.ID,
			NodeName:       t.NodeName,
			NodeIP:         t.NodeIP,
			Arch:           arch,
			BootstrapToken: rawToken,
		})
	}

	if len(installTargets) == 0 {
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		writeErrorResponse(w, http.StatusInternalServerError, "retry_setup_failed",
			"Failed to set up any retry targets", nil)
		return
	}

	// Reserve license slots based on actual dispatch count (after token
	// minting). Reservation failure is non-fatal, mirroring create.
	if err := h.reservation.Reserve(jobID+"-retry", orgID, len(installTargets), 1*time.Hour); err != nil {
		log.Warn().Err(err).Str("job_id", jobID).Msg("Failed to reserve license slots for retry")
	}

	// Append retry event.
	_ = h.store.AppendEvent(ctx, &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventJobStatusChanged,
		Message:   fmt.Sprintf("Retry started for %d targets", len(installTargets)),
		CreatedAt: time.Now().UTC(),
	})

	requestID := generateID("req")
	payload := agentexec.DeployInstallPayload{
		RequestID:   requestID,
		JobID:       jobID,
		Targets:     installTargets,
		PulseURL:    pulseURL,
		MaxParallel: job.MaxParallel,
		Timeout:     300,
	}

	// Subscribe and send. Subscribing first ensures no early progress
	// message is lost.
	progressCh := h.execServer.SubscribeDeployProgress(job.SourceAgentID, jobID, 64)
	if err := h.execServer.SendDeployInstall(ctx, job.SourceAgentID, payload); err != nil {
		h.execServer.UnsubscribeDeployProgress(job.SourceAgentID, jobID)
		_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
		h.reservation.Release(jobID + "-retry")
		// Mark retried targets back to failed so they can be retried again.
		for _, it := range installTargets {
			_ = h.store.UpdateTargetStatus(ctx, it.TargetID, deploy.TargetFailedRetryable, "dispatch failed")
		}
		log.Error().Err(err).Str("job_id", jobID).Msg("Failed to send retry install command")
		writeErrorResponse(w, http.StatusInternalServerError, "send_failed",
			"Failed to send retry command to agent", nil)
		return
	}

	go h.processInstallProgress(jobID, job.SourceAgentID, job.RetryMax, progressCh)

	resp := map[string]any{
		"jobId":        jobID,
		"retryTargets": len(installTargets),
		"status":       "running",
		"eventsUrl":    fmt.Sprintf("/api/agent-deploy/jobs/%s/events", jobID),
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusAccepted)
	json.NewEncoder(w).Encode(resp)
}
|
|
|
|
// processInstallProgress reads install progress events from the agent and
// persists them as deploy events, also broadcasting to SSE clients.
//
// Runs as a background goroutine, one per dispatched install. It owns the
// job's terminal transition: on a Final progress message the job status is
// derived from target statuses, license reservations are released, and SSE
// subscribers are closed. If the channel closes without a Final message,
// the source agent disconnected and the job is marked failed instead.
// Uses context.Background() because the originating HTTP request's context
// is canceled once the handler returns.
func (h *DeployHandlers) processInstallProgress(jobID, agentID string, retryMax int, ch <-chan agentexec.DeployProgressPayload) {
	defer h.execServer.UnsubscribeDeployProgress(agentID, jobID)

	ctx := context.Background()

	for progress := range ch {
		// Persist as event.
		evt := &deploy.Event{
			ID:        generateID("evt"),
			JobID:     jobID,
			TargetID:  progress.TargetID,
			Type:      deploy.EventInstallOutput,
			Message:   progress.Message,
			Data:      progress.Data,
			CreatedAt: time.Now().UTC(),
		}
		if err := h.store.AppendEvent(ctx, evt); err != nil {
			log.Error().Err(err).Str("job_id", jobID).Msg("Failed to persist install event")
		}

		// Update target status based on install phase. Job-level progress
		// messages carry no target ID and skip this step.
		if progress.TargetID != "" {
			h.updateTargetFromInstallProgress(ctx, progress, retryMax)
		}

		// Broadcast to SSE clients.
		h.broadcastSSE(jobID, evt)

		if progress.Final {
			// Derive final job status from target statuses.
			// Use install-specific derivation that treats failed_retryable as terminal.
			targets, err := h.store.GetTargetsForJob(ctx, jobID)
			if err != nil {
				log.Error().Err(err).Str("job_id", jobID).Msg("Failed to get targets for job status derivation")
				_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
			} else {
				finalStatus := deriveInstallJobStatus(targets)
				_ = h.store.UpdateJobStatus(ctx, jobID, finalStatus)

				// Broadcast final status.
				finalEvt := &deploy.Event{
					ID:        generateID("evt"),
					JobID:     jobID,
					Type:      deploy.EventJobStatusChanged,
					Message:   fmt.Sprintf("Deploy completed: %s", finalStatus),
					CreatedAt: time.Now().UTC(),
				}
				_ = h.store.AppendEvent(ctx, finalEvt)
				h.broadcastSSE(jobID, finalEvt)
			}

			// Release license reservation. Both keys are released because
			// this goroutine serves create and retry dispatches alike.
			h.reservation.Release(jobID)
			h.reservation.Release(jobID + "-retry") // in case of retry

			// Close SSE channels.
			h.closeSSESub(jobID)
			return
		}
	}

	// Channel closed without final — agent disconnected.
	_ = h.store.UpdateJobStatus(ctx, jobID, deploy.JobFailed)
	h.reservation.Release(jobID)
	h.reservation.Release(jobID + "-retry")

	finalEvt := &deploy.Event{
		ID:        generateID("evt"),
		JobID:     jobID,
		Type:      deploy.EventError,
		Message:   "Source agent disconnected during install",
		CreatedAt: time.Now().UTC(),
	}
	_ = h.store.AppendEvent(ctx, finalEvt)
	h.broadcastSSE(jobID, finalEvt)
	h.closeSSESub(jobID)
}
|
|
|
|
// updateTargetFromInstallProgress maps install progress phases to target status transitions.
//
// Only phase/status combinations listed in the switch change the target;
// all other messages are intermediate and leave the status untouched. For
// failures the attempt counter is incremented after the retryable-vs-
// permanent decision is made, so the comparison uses Attempts+1.
func (h *DeployHandlers) updateTargetFromInstallProgress(ctx context.Context, p agentexec.DeployProgressPayload, retryMax int) {
	var newStatus deploy.TargetStatus
	var errMsg string

	switch {
	case p.Phase == agentexec.DeployPhaseInstallTransfer && p.Status == agentexec.DeployStepStarted:
		newStatus = deploy.TargetInstalling
	case p.Phase == agentexec.DeployPhaseInstallExecute && p.Status == agentexec.DeployStepFailed:
		// Check attempt count to decide retryable vs permanent. If the
		// target cannot be loaded, err on the side of retryable.
		target, err := h.store.GetTarget(ctx, p.TargetID)
		if err != nil || target == nil {
			newStatus = deploy.TargetFailedRetryable
		} else if target.Attempts+1 >= retryMax {
			newStatus = deploy.TargetFailedPermanent
		} else {
			newStatus = deploy.TargetFailedRetryable
		}
		errMsg = p.Message
		// Increment attempts.
		_ = h.store.IncrementTargetAttempts(ctx, p.TargetID)
	case p.Phase == agentexec.DeployPhaseInstallEnrollWait && p.Status == agentexec.DeployStepStarted:
		newStatus = deploy.TargetEnrolling
	case p.Phase == agentexec.DeployPhaseInstallEnrollWait && p.Status == agentexec.DeployStepFailed:
		newStatus = deploy.TargetFailedRetryable
		errMsg = p.Message
		_ = h.store.IncrementTargetAttempts(ctx, p.TargetID)
	case p.Phase == agentexec.DeployPhaseInstallComplete && p.Status == agentexec.DeployStepFailed:
		newStatus = deploy.TargetFailedRetryable
		errMsg = p.Message
		_ = h.store.IncrementTargetAttempts(ctx, p.TargetID)
	case p.Phase == agentexec.DeployPhaseInstallComplete && p.Status == agentexec.DeployStepOK:
		// Don't change status here — target remains in 'enrolling' until
		// the enrollment endpoint transitions it to 'succeeded'. The source
		// agent fires this event immediately without waiting for enrollment.
		return
	case p.Phase == agentexec.DeployPhaseCanceled:
		newStatus = deploy.TargetCanceled
	default:
		return // intermediate step, no status change
	}

	if err := h.store.UpdateTargetStatus(ctx, p.TargetID, newStatus, errMsg); err != nil {
		log.Error().Err(err).
			Str("target_id", p.TargetID).
			Str("new_status", string(newStatus)).
			Msg("Failed to update target status from install progress")
	}
}
|
|
|
|
// getTargetArchFromPreflight extracts the architecture from preflight events for a given node.
|
|
func (h *DeployHandlers) getTargetArchFromPreflight(ctx context.Context, preflightJobID string, nodeID string) string {
|
|
events, err := h.store.GetEventsForJob(ctx, preflightJobID)
|
|
if err != nil {
|
|
return "amd64"
|
|
}
|
|
|
|
// Get preflight targets to map target IDs to node IDs.
|
|
pfTargets, err := h.store.GetTargetsForJob(ctx, preflightJobID)
|
|
if err != nil {
|
|
return "amd64"
|
|
}
|
|
|
|
targetIDForNode := ""
|
|
for _, t := range pfTargets {
|
|
if t.NodeID == nodeID {
|
|
targetIDForNode = t.ID
|
|
// Also check if arch was stored on the target directly.
|
|
if t.Arch != "" {
|
|
return t.Arch
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
if targetIDForNode == "" {
|
|
return "amd64"
|
|
}
|
|
|
|
// Look through events for preflight_complete with arch data.
|
|
for _, evt := range events {
|
|
if evt.TargetID != targetIDForNode {
|
|
continue
|
|
}
|
|
if evt.Type != deploy.EventPreflightResult || evt.Data == "" {
|
|
continue
|
|
}
|
|
var result agentexec.PreflightResultData
|
|
if err := json.Unmarshal([]byte(evt.Data), &result); err == nil && result.Arch != "" {
|
|
return result.Arch
|
|
}
|
|
}
|
|
|
|
return "amd64"
|
|
}
|
|
|
|
// --- Helpers ---
|
|
|
|
// extractClusterID extracts a cluster ID from a path like /api/clusters/{id}/agent-deploy/...
// The ID is the segment after prefix, terminated by suffix (or, failing that, the next "/").
func extractClusterID(path, prefix, suffix string) string {
	trimmed := strings.TrimPrefix(path, prefix)

	// Prefer the known suffix as the terminator; fall back to the next path separator.
	end := strings.Index(trimmed, suffix)
	if end < 0 {
		end = strings.Index(trimmed, "/")
	}
	if end < 0 {
		// No terminator at all — the remainder is the ID.
		return strings.TrimSpace(trimmed)
	}
	return strings.TrimSpace(trimmed[:end])
}
|
|
|
|
// extractPathSuffix extracts the single path segment after a prefix,
// e.g. /api/foo/bar -> bar. Anything past the next "/" is discarded.
func extractPathSuffix(path, prefix string) string {
	segment, _, _ := strings.Cut(strings.TrimPrefix(path, prefix), "/")
	return strings.TrimSpace(segment)
}
|
|
|
|
// nodeIP extracts the hostname/IP from a node host URL
// (e.g. "https://198.51.100.2:8006" -> "198.51.100.2").
// Returns "" for empty, unparseable, or host-less input.
func nodeIP(hostURL string) string {
	trimmed := strings.TrimSpace(hostURL)
	if trimmed == "" {
		return ""
	}
	u, err := url.Parse(trimmed)
	if err != nil || u.Host == "" {
		return ""
	}
	// Hostname strips any :port from the host component.
	return u.Hostname()
}
|
|
|
|
func nodeName(node *unifiedresources.NodeView) string {
|
|
if node == nil {
|
|
return ""
|
|
}
|
|
if name := strings.TrimSpace(node.NodeName()); name != "" {
|
|
return name
|
|
}
|
|
return strings.TrimSpace(node.Name())
|
|
}
|
|
|
|
// generateID creates a prefixed unique ID from the current nanosecond timestamp.
func generateID(prefix string) string {
	now := time.Now().UnixNano()
	return fmt.Sprintf("%s_%d", prefix, now)
}
|
|
|
|
// isDeployJobTerminal returns true if the job status is terminal.
|
|
func isDeployJobTerminal(s deploy.JobStatus) bool {
|
|
switch s {
|
|
case deploy.JobSucceeded, deploy.JobPartialSuccess, deploy.JobFailed, deploy.JobCanceled:
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// deriveInstallJobStatus computes the final install job status from target statuses.
|
|
// Unlike DeriveStatus, this treats TargetFailedRetryable as terminal (the agent
|
|
// has finished its work and signaled Final=true, so all targets are settled).
|
|
// TargetEnrolling is treated as succeeded because the source agent fires
|
|
// install_complete/ok immediately without waiting for async enrollment.
|
|
func deriveInstallJobStatus(targets []deploy.Target) deploy.JobStatus {
|
|
if len(targets) == 0 {
|
|
return deploy.JobSucceeded
|
|
}
|
|
|
|
var succeeded, failed int
|
|
for _, t := range targets {
|
|
switch t.Status {
|
|
case deploy.TargetSucceeded, deploy.TargetVerifying, deploy.TargetEnrolling:
|
|
// enrolling = install completed, enrollment is async and expected to succeed
|
|
succeeded++
|
|
case deploy.TargetFailedPermanent, deploy.TargetFailedRetryable,
|
|
deploy.TargetSkippedAgent, deploy.TargetSkippedLicense, deploy.TargetCanceled:
|
|
failed++
|
|
// pending/installing — shouldn't happen at Final but treat as incomplete
|
|
default:
|
|
failed++
|
|
}
|
|
}
|
|
|
|
total := len(targets)
|
|
if succeeded == total {
|
|
return deploy.JobSucceeded
|
|
}
|
|
if succeeded > 0 {
|
|
return deploy.JobPartialSuccess
|
|
}
|
|
return deploy.JobFailed
|
|
}
|
|
|
|
// derivePreflightJobStatus computes the final job status from target statuses.
|
|
// Unlike DeriveStatus, this treats TargetReady as "passed" (preflight success)
|
|
// and TargetSkippedAgent as neutral success (not a failure).
|
|
func derivePreflightJobStatus(ctx context.Context, store *deploy.Store, jobID string) deploy.JobStatus {
|
|
targets, err := store.GetTargetsForJob(ctx, jobID)
|
|
if err != nil {
|
|
log.Error().Err(err).Str("job_id", jobID).Msg("Failed to get targets for job status derivation")
|
|
return deploy.JobFailed
|
|
}
|
|
if len(targets) == 0 {
|
|
return deploy.JobSucceeded
|
|
}
|
|
|
|
var succeeded, failed int
|
|
for _, t := range targets {
|
|
switch t.Status {
|
|
case deploy.TargetReady, deploy.TargetSucceeded, deploy.TargetSkippedAgent:
|
|
succeeded++
|
|
case deploy.TargetFailedPermanent, deploy.TargetFailedRetryable:
|
|
failed++
|
|
}
|
|
}
|
|
|
|
total := len(targets)
|
|
if succeeded == total {
|
|
return deploy.JobSucceeded
|
|
}
|
|
if failed == total {
|
|
return deploy.JobFailed
|
|
}
|
|
if succeeded > 0 {
|
|
return deploy.JobPartialSuccess
|
|
}
|
|
return deploy.JobFailed
|
|
}
|