package kubernetesagent

import (
	"bytes"
	"context"
	"crypto/sha256"
	"crypto/tls"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/IGLOU-EU/go-wildcard/v2"
	"github.com/rcourtman/pulse-go-rewrite/internal/utils"
	agentsk8s "github.com/rcourtman/pulse-go-rewrite/pkg/agents/kubernetes"
	"github.com/rs/zerolog"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	k8sresource "k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/clientcmd"
)

// Config holds all configuration needed to run the Kubernetes agent.
// It specifies how to connect to the Pulse backend, the Kubernetes cluster,
// and what resources to include in reports.
type Config struct {
	PulseURL           string
	APIToken           string
	Interval           time.Duration
	AgentID            string
	AgentType          string // "unified" when running as part of pulse-agent
	AgentVersion       string // Version to report; if empty, uses kubernetesagent.Version
	InsecureSkipVerify bool
	LogLevel           zerolog.Level
	Logger             *zerolog.Logger

	// Kubernetes connection
	KubeconfigPath string
	KubeContext    string

	// Report shaping
	IncludeNamespaces     []string
	ExcludeNamespaces     []string
	IncludeAllPods        bool // Include all non-succeeded pods (still capped)
	IncludeAllDeployments bool // Include all deployments, not just problem ones
	MaxPods               int  // Max pods included in the report
}

// Agent collects and reports Kubernetes cluster state to Pulse.
// It periodically gathers pod, deployment, and node metrics, then sends
// them to the configured Pulse URL. The agent handles authentication,
// Kubernetes API connection, and automatic retry on failure.
type Agent struct {
	cfg        Config
	logger     zerolog.Logger
	httpClient *http.Client

	kubeClient kubernetes.Interface
	restCfg    *rest.Config

	agentID      string
	agentVersion string
	interval     time.Duration
	pulseURL     string

	clusterID      string
	clusterName    string
	clusterServer  string
	clusterContext string
	clusterVersion string

	includeNamespaces []string
	excludeNamespaces []string

	reportBuffer *utils.Queue[agentsk8s.Report]
}

const (
	defaultInterval             = 30 * time.Second
	defaultMaxPods              = 200
	defaultMaxDeployments       = 1000
	requestTimeout              = 20 * time.Second
	collectReportTimeout        = 45 * time.Second
	listPageSize          int64 = 250
	maxKubeAPIRetries           = 3
	initialRetryBackoff         = 300 * time.Millisecond
	maxRetryBackoff             = 3 * time.Second
	maxSummaryMetricNodes       = 200
	summaryMetricsWorkers       = 8
	reportUserAgent             = "pulse-kubernetes-agent/"

	maxMetricsResponseBodyBytes  int64 = 32 * 1024 * 1024 // 32 MB
	maxRecoveryResponseBodyBytes int64 = 8 * 1024 * 1024  // 8 MB (recovery APIs can be large)
)
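
// New validates cfg, applies defaults (interval, pod cap, Pulse URL), builds
// the Kubernetes and HTTP clients, and derives a stable cluster identity.
// It fails fast on a missing API token, an invalid Pulse URL, or a
// Kubernetes configuration that cannot be loaded.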
func New(cfg Config) (*Agent, error) {
	if cfg.Interval <= 0 {
		cfg.Interval = defaultInterval
	}
	if cfg.MaxPods <= 0 {
		cfg.MaxPods = defaultMaxPods
	}

	// Only lower the global level when it is still at the debug default;
	// otherwise respect whatever level the host process configured.
	if zerolog.GlobalLevel() == zerolog.DebugLevel && cfg.LogLevel != zerolog.DebugLevel {
		zerolog.SetGlobalLevel(cfg.LogLevel)
	}

	logger := cfg.Logger
	if logger == nil {
		defaultLogger := zerolog.New(os.Stdout).Level(cfg.LogLevel).With().Timestamp().Str("component", "pulse-kubernetes-agent").Logger()
		logger = &defaultLogger
	} else {
		scoped := logger.With().Str("component", "pulse-kubernetes-agent").Logger()
		logger = &scoped
	}

	if strings.TrimSpace(cfg.APIToken) == "" {
		return nil, fmt.Errorf("api token is required")
	}

	pulseURL := strings.TrimSpace(cfg.PulseURL)
	if pulseURL == "" {
		pulseURL = "http://localhost:7655"
	}
	pulseURL, err := normalizePulseURL(pulseURL)
	if err != nil {
		return nil, fmt.Errorf("invalid pulse URL: %w", err)
	}
	cfg.PulseURL = pulseURL

	restCfg, contextName, err := buildRESTConfig(cfg.KubeconfigPath, cfg.KubeContext)
	if err != nil {
		return nil, fmt.Errorf("build Kubernetes REST config: %w", err)
	}
	if restCfg.Timeout <= 0 {
		restCfg.Timeout = requestTimeout
	}

	kubeClient, err := kubernetes.NewForConfig(restCfg)
	if err != nil {
		return nil, fmt.Errorf("create kubernetes client: %w", err)
	}

	agentVersion := strings.TrimSpace(cfg.AgentVersion)
	if agentVersion == "" {
		agentVersion = Version
	}

	tlsConfig := &tls.Config{MinVersion: tls.VersionTLS12}
	if cfg.InsecureSkipVerify {
		//nolint:gosec // Insecure mode is explicitly user-controlled.
		tlsConfig.InsecureSkipVerify = true
	}
	httpClient := &http.Client{
		Timeout: 15 * time.Second,
		Transport: &http.Transport{
			Proxy:           http.ProxyFromEnvironment,
			TLSClientConfig: tlsConfig,
		},
		// Disallow redirects for agent API calls. If a reverse proxy redirects
		// HTTP to HTTPS, Go's default behavior converts POST to GET (per HTTP spec),
		// causing 405 errors. Return an error with guidance instead.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return fmt.Errorf("server returned redirect to %s - if using a reverse proxy, ensure you use the correct protocol (https:// instead of http://) in your --url flag", req.URL)
		},
	}

	clusterServer := strings.TrimSpace(restCfg.Host)
	clusterContext := strings.TrimSpace(contextName)
	clusterName := clusterContext
	clusterID := computeClusterID(clusterServer, clusterContext, clusterName)

	agentID := strings.TrimSpace(cfg.AgentID)
	if agentID == "" {
		agentID = clusterID
	}

	agent := &Agent{
		cfg:               cfg,
		logger:            *logger,
		httpClient:        httpClient,
		kubeClient:        kubeClient,
		restCfg:           restCfg,
		agentID:           agentID,
		agentVersion:      agentVersion,
		interval:          cfg.Interval,
		pulseURL:          pulseURL,
		clusterID:         clusterID,
		clusterName:       clusterName,
		clusterServer:     clusterServer,
		clusterContext:    clusterContext,
		includeNamespaces: cfg.IncludeNamespaces,
		excludeNamespaces: cfg.ExcludeNamespaces,
		reportBuffer:      utils.New[agentsk8s.Report](60),
	}

	if err := agent.discoverClusterMetadata(context.Background()); err != nil {
		agent.logger.Warn().Err(err).Str("cluster_id", agent.clusterID).Msg("failed to discover cluster metadata")
	}

	agent.logger.Info().
		Str("cluster_id", agent.clusterID).
		Str("cluster_name", agent.clusterName).
		Str("server", agent.clusterServer).
		Str("context", agent.clusterContext).
		Msg("kubernetes agent initialized")

	return agent, nil
}
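
// normalizePulseURL validates and canonicalizes the Pulse backend URL: the
// scheme and host are lowercased and trailing slashes are trimmed. https URLs
// are always accepted; plain http is accepted only for loopback/private hosts
// (e.g. "http://localhost:7655" passes, "http://pulse.example.com" is
// rejected). Userinfo, query parameters, and fragments are rejected outright.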
func normalizePulseURL(rawURL string) (string, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("pulse URL %q is invalid: %w", rawURL, err)
	}

	if parsed.Scheme == "" {
		return "", fmt.Errorf("pulse URL %q must include http:// or https:// scheme", rawURL)
	}
	if parsed.Host == "" {
		return "", fmt.Errorf("pulse URL %q must include host", rawURL)
	}
	if parsed.User != nil {
		return "", fmt.Errorf("pulse URL %q: userinfo is not supported", rawURL)
	}
	if parsed.RawQuery != "" {
		return "", fmt.Errorf("pulse URL %q: query parameters are not supported", rawURL)
	}
	if parsed.Fragment != "" {
		return "", fmt.Errorf("pulse URL %q: fragments are not supported", rawURL)
	}

	scheme := strings.ToLower(parsed.Scheme)
	switch scheme {
	case "https":
		// Always allowed.
	case "http":
		if !isLoopbackOrPrivateHost(parsed.Hostname()) {
			return "", fmt.Errorf("pulse URL %q must use https unless host is loopback or private network", rawURL)
		}
	default:
		return "", fmt.Errorf("pulse URL %q has unsupported scheme %q", rawURL, parsed.Scheme)
	}

	if port := parsed.Port(); port != "" {
		portNum, err := strconv.Atoi(port)
		if err != nil || portNum < 1 || portNum > 65535 {
			return "", fmt.Errorf("invalid port %q: must be between 1 and 65535", port)
		}
	}

	parsed.Scheme = scheme
	parsed.Host = strings.ToLower(parsed.Host)
	parsed.Path = strings.TrimRight(parsed.Path, "/")
	parsed.RawPath = strings.TrimRight(parsed.RawPath, "/")

	return parsed.String(), nil
}

// isLoopbackOrPrivateHost returns true for loopback, RFC 1918 private, and
// link-local unicast addresses. HTTP (non-TLS) is safe over a local/private
// network; the scheme guard only needs to prevent plaintext over the public
// internet.
func isLoopbackOrPrivateHost(host string) bool {
	if strings.EqualFold(host, "localhost") {
		return true
	}

	ip := net.ParseIP(host)
	if ip == nil {
		return false
	}
	return ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast()
}
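
// buildRESTConfig resolves a Kubernetes client configuration in priority
// order: an explicitly supplied kubeconfig path, then in-cluster service
// account credentials, then the default kubeconfig loading rules. It returns
// the REST config together with the context name that was selected.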
func buildRESTConfig(kubeconfigPath, kubeContext string) (*rest.Config, string, error) {
	kubeconfigPath = strings.TrimSpace(kubeconfigPath)
	kubeContext = strings.TrimSpace(kubeContext)

	// Prefer explicit kubeconfig.
	if kubeconfigPath != "" {
		loadingRules := &clientcmd.ClientConfigLoadingRules{ExplicitPath: kubeconfigPath}
		overrides := &clientcmd.ConfigOverrides{}
		if kubeContext != "" {
			overrides.CurrentContext = kubeContext
		}

		cc := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, overrides)
		rawCfg, err := cc.RawConfig()
		if err != nil {
			return nil, "", fmt.Errorf("load kubeconfig: %w", err)
		}

		contextName := rawCfg.CurrentContext
		if kubeContext != "" {
			contextName = kubeContext
		}

		restCfg, err := cc.ClientConfig()
		if err != nil {
			return nil, "", fmt.Errorf("build kubeconfig rest config: %w", err)
		}
		return restCfg, contextName, nil
	}

	// Otherwise try in-cluster configuration.
	restCfg, err := rest.InClusterConfig()
	if err == nil {
		return restCfg, "in-cluster", nil
	}

	// Fallback: default kubeconfig path.
	cc := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
		clientcmd.NewDefaultClientConfigLoadingRules(),
		&clientcmd.ConfigOverrides{CurrentContext: kubeContext},
	)
	rawCfg, rawErr := cc.RawConfig()
	if rawErr != nil {
		return nil, "", fmt.Errorf("kubernetes config not available (in-cluster failed: %v; kubeconfig failed: %w)", err, rawErr)
	}

	contextName := rawCfg.CurrentContext
	if kubeContext != "" {
		contextName = kubeContext
	}

	restCfg, cfgErr := cc.ClientConfig()
	if cfgErr != nil {
		return nil, "", fmt.Errorf("build kubeconfig rest config: %w", cfgErr)
	}
	return restCfg, contextName, nil
}
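
// computeClusterID derives a stable, opaque cluster identifier by hashing the
// API server URL, context name, and cluster name, so the same cluster maps to
// the same ID across agent restarts.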
func computeClusterID(server, context, name string) string {
	payload := strings.TrimSpace(server) + "|" + strings.TrimSpace(context) + "|" + strings.TrimSpace(name)
	sum := sha256.Sum256([]byte(payload))
	return hex.EncodeToString(sum[:])
}

func (a *Agent) discoverClusterMetadata(_ context.Context) error {
	// Discovery().ServerVersion() does not accept a context, so the caller's
	// context cannot bound this call; the request is limited by restCfg.Timeout,
	// which New sets to requestTimeout when unset.
	version, err := a.kubeClient.Discovery().ServerVersion()
	if err != nil {
		return fmt.Errorf("discover cluster server version: %w", err)
	}
	if version != nil {
		a.clusterVersion = strings.TrimSpace(version.GitVersion)
	}
	return nil
}

func (a *Agent) closeIdleConnections() {
	if a.httpClient != nil {
		a.httpClient.CloseIdleConnections()
	}
}
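
// Run performs one immediate collect-and-send cycle, then repeats on the
// configured interval until ctx is cancelled. It always returns nil on
// shutdown; individual cycle failures are logged and buffered rather than
// propagated.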
func (a *Agent) Run(ctx context.Context) error {
	defer a.closeIdleConnections()

	ticker := time.NewTicker(a.interval)
	defer ticker.Stop()

	a.runOnce(ctx)

	for {
		select {
		case <-ticker.C:
			a.runOnce(ctx)
		case <-ctx.Done():
			return nil
		}
	}
}

func (a *Agent) bufferedReportCount() int {
	if a == nil || a.reportBuffer == nil {
		return 0
	}
	return a.reportBuffer.Len()
}
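
// runOnce drains any previously buffered reports, collects a fresh report,
// and sends it. On send failure the report is pushed onto the bounded buffer
// so it can be retried on a later cycle.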
func (a *Agent) runOnce(ctx context.Context) {
	a.flushReports(ctx)

	report, err := a.collectReport(ctx)
	if err != nil {
		a.logger.Warn().
			Err(err).
			Str("phase", "collect_report").
			Str("cluster_id", a.clusterID).
			Int("buffer_depth", a.bufferedReportCount()).
			Msg("Failed to collect Kubernetes report")
		return
	}

	if err := a.sendReport(ctx, report); err != nil {
		a.logger.Warn().
			Err(err).
			Str("phase", "send_report").
			Str("cluster_id", a.clusterID).
			Str("agent_id", a.agentID).
			Int("report_nodes", len(report.Nodes)).
			Int("report_pods", len(report.Pods)).
			Int("report_deployments", len(report.Deployments)).
			Int("buffer_depth_before", a.bufferedReportCount()).
			Msg("Failed to send Kubernetes report, buffering")
		a.reportBuffer.Push(report)
		a.logger.Debug().
			Str("phase", "buffer_report").
			Str("cluster_id", a.clusterID).
			Int("buffer_depth_after", a.bufferedReportCount()).
			Msg("Buffered Kubernetes report for retry")
	}
}

func (a *Agent) flushReports(ctx context.Context) {
	flushed := 0
	for {
		report, ok := a.reportBuffer.Peek()
		if !ok {
			if flushed > 0 {
				a.logger.Debug().
					Str("phase", "flush_buffered_reports").
					Str("cluster_id", a.clusterID).
					Str("agent_id", a.agentID).
					Int("flushed_reports", flushed).
					Int("buffer_depth_remaining", a.bufferedReportCount()).
					Msg("Flushed buffered Kubernetes reports")
			}
			return
		}
		if err := a.sendReport(ctx, report); err != nil {
			a.logger.Warn().
				Err(err).
				Str("phase", "flush_buffered_report").
				Str("cluster_id", a.clusterID).
				Str("agent_id", a.agentID).
				Int("report_nodes", len(report.Nodes)).
				Int("report_pods", len(report.Pods)).
				Int("report_deployments", len(report.Deployments)).
				Int("buffer_depth", a.bufferedReportCount()).
				Msg("Failed to flush buffered Kubernetes report")
			return
		}
		if _, ok := a.reportBuffer.Pop(); !ok {
			a.logger.Debug().Msg("Failed to remove buffered report after successful send")
			return
		}
		// Count the successfully flushed report; without this the summary
		// log above (guarded by flushed > 0) could never fire.
		flushed++
	}
}
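
// namespaceAllowed applies the exclude list first, then the include list,
// using shell-style wildcard patterns (e.g. "kube-*" matches "kube-system").
// An empty include list admits every namespace that is not excluded.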
func (a *Agent) namespaceAllowed(ns string) bool {
	ns = strings.TrimSpace(ns)
	if ns == "" {
		return false
	}
	for _, excludeNamespace := range a.excludeNamespaces {
		if wildcard.Match(excludeNamespace, ns) {
			return false
		}
	}
	if len(a.includeNamespaces) == 0 {
		return true
	}
	for _, includeNamespace := range a.includeNamespaces {
		if wildcard.Match(includeNamespace, ns) {
			return true
		}
	}
	return false
}
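
// collectReport gathers nodes, pods, and deployments under a single deadline,
// then best-effort enriches them with usage metrics and recovery artifacts.
// Inventory failures abort the report; metrics and recovery failures only
// downgrade it to an inventory-only payload.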
func (a *Agent) collectReport(ctx context.Context) (agentsk8s.Report, error) {
	ctx, cancel := context.WithTimeout(ctx, collectReportTimeout)
	defer cancel()

	nodes, err := a.collectNodes(ctx)
	if err != nil {
		return agentsk8s.Report{}, fmt.Errorf("collect nodes: %w", err)
	}

	pods, err := a.collectPods(ctx)
	if err != nil {
		return agentsk8s.Report{}, fmt.Errorf("collect pods: %w", err)
	}

	deployments, err := a.collectDeployments(ctx)
	if err != nil {
		return agentsk8s.Report{}, fmt.Errorf("collect deployments: %w", err)
	}

	nodeUsage, podUsage, usageErr := a.collectUsageMetrics(ctx, nodes)
	if usageErr != nil {
		a.logger.Debug().Err(usageErr).Str("cluster_id", a.clusterID).Msg("kubernetes usage metrics unavailable, continuing with inventory-only report")
	}
	applyNodeUsage(nodes, nodeUsage)
	applyPodUsage(pods, podUsage)

	recoveryReport, recoveryErr := a.collectRecovery(ctx)
	if recoveryErr != nil {
		a.logger.Debug().Err(recoveryErr).Str("cluster_id", a.clusterID).Msg("kubernetes recovery artifacts unavailable, continuing without recovery report")
	}

	return agentsk8s.Report{
		Agent: agentsk8s.AgentInfo{
			ID:              a.agentID,
			Version:         a.agentVersion,
			Type:            strings.TrimSpace(a.cfg.AgentType),
			IntervalSeconds: int(a.interval / time.Second),
		},
		Cluster: agentsk8s.ClusterInfo{
			ID:      a.clusterID,
			Name:    a.clusterName,
			Server:  a.clusterServer,
			Context: a.clusterContext,
			Version: a.clusterVersion,
		},
		Nodes:       nodes,
		Pods:        pods,
		Deployments: deployments,
		Recovery:    recoveryReport,
		Timestamp:   time.Now().UTC(),
	}, nil
}
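
// collectRecovery gathers optional backup-related resources (CSI
// VolumeSnapshots and Velero backups) via their raw API groups. A nil report
// with a nil error means the CRDs are absent or inaccessible, which is
// expected on clusters without those operators installed.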
func (a *Agent) collectRecovery(ctx context.Context) (*agentsk8s.RecoveryReport, error) {
	restClient := a.getDiscoveryRESTClient()
	if restClient == nil {
		return nil, nil
	}

	volumeSnapshots, _ := a.collectVolumeSnapshots(ctx, restClient)
	veleroBackups, _ := a.collectVeleroBackups(ctx, restClient)

	if len(volumeSnapshots) == 0 && len(veleroBackups) == 0 {
		return nil, nil
	}

	return &agentsk8s.RecoveryReport{
		VolumeSnapshots: volumeSnapshots,
		VeleroBackups:   veleroBackups,
	}, nil
}

func (a *Agent) collectVolumeSnapshots(ctx context.Context, restClient rest.Interface) ([]agentsk8s.VolumeSnapshot, error) {
	raw, ok, err := a.doOptionalRawPath(ctx, restClient, "list volumesnapshots", "/apis/snapshot.storage.k8s.io/v1/volumesnapshots?limit=200")
	if err != nil || !ok || len(raw) == 0 {
		return nil, err
	}

	type vsError struct {
		Message string `json:"message"`
	}
	type vsItem struct {
		Metadata struct {
			UID               string    `json:"uid"`
			Name              string    `json:"name"`
			Namespace         string    `json:"namespace"`
			CreationTimestamp time.Time `json:"creationTimestamp"`
		} `json:"metadata"`
		Spec struct {
			VolumeSnapshotClassName string `json:"volumeSnapshotClassName"`
			Source                  struct {
				PersistentVolumeClaimName string `json:"persistentVolumeClaimName"`
			} `json:"source"`
		} `json:"spec"`
		Status struct {
			ReadyToUse                     *bool      `json:"readyToUse"`
			CreationTime                   *time.Time `json:"creationTime"`
			CompletionTime                 *time.Time `json:"completionTime"`
			BoundVolumeSnapshotContentName string     `json:"boundVolumeSnapshotContentName"`
			RestoreSize                    string     `json:"restoreSize"`
			Error                          *vsError   `json:"error"`
		} `json:"status"`
	}
	type vsList struct {
		Items []vsItem `json:"items"`
	}

	var parsed vsList
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return nil, fmt.Errorf("parse volumesnapshots: %w", err)
	}

	out := make([]agentsk8s.VolumeSnapshot, 0, len(parsed.Items))
	for _, item := range parsed.Items {
		createdAt := item.Metadata.CreationTimestamp.UTC()
		name := strings.TrimSpace(item.Metadata.Name)
		namespace := strings.TrimSpace(item.Metadata.Namespace)
		if name == "" || namespace == "" {
			continue
		}

		var restoreSizeBytes *int64
		if strings.TrimSpace(item.Status.RestoreSize) != "" {
			if q, err := k8sresource.ParseQuantity(strings.TrimSpace(item.Status.RestoreSize)); err == nil {
				v := q.Value()
				restoreSizeBytes = &v
			}
		}

		errText := ""
		if item.Status.Error != nil {
			errText = strings.TrimSpace(item.Status.Error.Message)
		}

		creationTime := item.Status.CreationTime
		if creationTime == nil {
			creationTime = &createdAt
		}

		out = append(out, agentsk8s.VolumeSnapshot{
			UID:              strings.TrimSpace(item.Metadata.UID),
			Name:             name,
			Namespace:        namespace,
			SnapshotClass:    strings.TrimSpace(item.Spec.VolumeSnapshotClassName),
			SourcePVC:        strings.TrimSpace(item.Spec.Source.PersistentVolumeClaimName),
			ReadyToUse:       item.Status.ReadyToUse,
			RestoreSizeBytes: restoreSizeBytes,
			CreationTime:     creationTime,
			CompletionTime:   item.Status.CompletionTime,
			ContentName:      strings.TrimSpace(item.Status.BoundVolumeSnapshotContentName),
			Error:            errText,
		})
	}

	// Sort newest-first to keep payload stable.
	sort.SliceStable(out, func(i, j int) bool {
		a := out[i].CompletionTime
		b := out[j].CompletionTime
		if a == nil && b == nil {
			return out[i].Name > out[j].Name
		}
		if a == nil {
			return false
		}
		if b == nil {
			return true
		}
		return a.After(*b)
	})

	const maxItems = 200
	if len(out) > maxItems {
		out = out[:maxItems]
	}
	return out, nil
}

func (a *Agent) collectVeleroBackups(ctx context.Context, restClient rest.Interface) ([]agentsk8s.VeleroBackup, error) {
	raw, ok, err := a.doOptionalRawPath(ctx, restClient, "list velero backups", "/apis/velero.io/v1/backups?limit=200")
	if err != nil || !ok || len(raw) == 0 {
		return nil, err
	}

	type vbItem struct {
		Metadata struct {
			UID       string    `json:"uid"`
			Name      string    `json:"name"`
			Namespace string    `json:"namespace"`
			CreatedAt time.Time `json:"creationTimestamp"`
		} `json:"metadata"`
		Spec struct {
			StorageLocation string `json:"storageLocation"`
		} `json:"spec"`
		Status struct {
			Phase               string     `json:"phase"`
			StartTimestamp      *time.Time `json:"startTimestamp"`
			CompletionTimestamp *time.Time `json:"completionTimestamp"`
			Expiration          *time.Time `json:"expiration"`
		} `json:"status"`
	}
	type vbList struct {
		Items []vbItem `json:"items"`
	}

	var parsed vbList
	if err := json.Unmarshal(raw, &parsed); err != nil {
		return nil, fmt.Errorf("parse velero backups: %w", err)
	}

	out := make([]agentsk8s.VeleroBackup, 0, len(parsed.Items))
	for _, item := range parsed.Items {
		name := strings.TrimSpace(item.Metadata.Name)
		if name == "" {
			continue
		}

		namespace := strings.TrimSpace(item.Metadata.Namespace)
		if namespace == "" {
			namespace = "velero"
		}

		out = append(out, agentsk8s.VeleroBackup{
			UID:             strings.TrimSpace(item.Metadata.UID),
			Name:            name,
			Namespace:       namespace,
			Phase:           strings.TrimSpace(item.Status.Phase),
			StartedAt:       item.Status.StartTimestamp,
			CompletedAt:     item.Status.CompletionTimestamp,
			Expiration:      item.Status.Expiration,
			StorageLocation: strings.TrimSpace(item.Spec.StorageLocation),
		})
	}

	sort.SliceStable(out, func(i, j int) bool {
		a := out[i].CompletedAt
		b := out[j].CompletedAt
		if a == nil && b == nil {
			return out[i].Name > out[j].Name
		}
		if a == nil {
			return false
		}
		if b == nil {
			return true
		}
		return a.After(*b)
	})
	const maxItems = 200
	if len(out) > maxItems {
		out = out[:maxItems]
	}
	return out, nil
}
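
// doOptionalRawPath fetches a raw API path that may legitimately not exist on
// this cluster. NotFound, Forbidden, and Unauthorized responses are treated
// as "feature absent" (ok=false, nil error) rather than failures.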
func (a *Agent) doOptionalRawPath(ctx context.Context, restClient rest.Interface, action, path string) ([]byte, bool, error) {
	if restClient == nil {
		return nil, false, nil
	}

	callCtx, cancel := context.WithTimeout(ctx, requestTimeout)
	payload, err := readKubernetesResponseBody(callCtx, restClient, path, maxRecoveryResponseBodyBytes)
	cancel()
	if err == nil {
		return payload, true, nil
	}

	var statusErr *apierrors.StatusError
	if errors.As(err, &statusErr) {
		if apierrors.IsForbidden(statusErr) || apierrors.IsNotFound(statusErr) || apierrors.IsUnauthorized(statusErr) {
			return nil, false, nil
		}
	}

	msg := strings.ToLower(err.Error())
	if strings.Contains(msg, "not found") || strings.Contains(msg, "forbidden") {
		return nil, false, nil
	}

	return nil, false, fmt.Errorf("%s: %w", action, err)
}

func (a *Agent) getDiscoveryRESTClient() rest.Interface {
	if a == nil || a.kubeClient == nil {
		return nil
	}
	discovery := a.kubeClient.Discovery()
	if discovery == nil || discovery.RESTClient() == nil {
		return nil
	}
	return discovery.RESTClient()
}
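
// collectUsageMetrics reads CPU/memory usage from the metrics.k8s.io API and
// merges in network and ephemeral-storage figures scraped from the kubelet
// stats/summary endpoints. Each source is optional; an error is returned only
// when every source is unavailable.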
func (a *Agent) collectUsageMetrics(ctx context.Context, nodes []agentsk8s.Node) (map[string]agentsk8s.NodeUsage, map[string]agentsk8s.PodUsage, error) {
	restClient := a.getDiscoveryRESTClient()
	if restClient == nil {
		return nil, nil, nil
	}

	nodeRaw, nodeErr := readKubernetesResponseBody(ctx, restClient, "/apis/metrics.k8s.io/v1beta1/nodes", maxMetricsResponseBodyBytes)
	podRaw, podErr := readKubernetesResponseBody(ctx, restClient, "/apis/metrics.k8s.io/v1beta1/pods", maxMetricsResponseBodyBytes)

	nodeUsage := map[string]agentsk8s.NodeUsage{}
	if nodeErr == nil {
		parsed, err := parseNodeMetricsPayload(nodeRaw)
		if err != nil {
			a.logger.Debug().
				Err(err).
				Int("payload_bytes", len(nodeRaw)).
				Msg("Failed to parse Kubernetes node metrics payload")
		} else {
			nodeUsage = parsed
		}
	}

	podUsage := map[string]agentsk8s.PodUsage{}
	if podErr == nil {
		parsed, err := parsePodMetricsPayload(podRaw)
		if err != nil {
			a.logger.Debug().
				Err(err).
				Int("payload_bytes", len(podRaw)).
				Msg("Failed to parse Kubernetes pod metrics payload")
		} else {
			podUsage = parsed
		}
	}

	summaryUsage, summaryErr := a.collectPodSummaryMetrics(ctx, nodes)
	if summaryErr != nil {
		a.logger.Debug().
			Err(summaryErr).
			Int("node_count", len(nodes)).
			Msg("Failed to collect Kubernetes pod summary metrics")
	}
	mergePodSummaryUsage(podUsage, summaryUsage)

	if nodeErr != nil && podErr != nil && len(podUsage) == 0 && len(nodeUsage) == 0 {
		if summaryErr != nil {
			return nil, nil, fmt.Errorf("metrics.k8s.io unavailable (nodes: %w; pods: %v); summary unavailable: %v", nodeErr, podErr, summaryErr)
		}
		return nil, nil, fmt.Errorf("metrics.k8s.io unavailable (nodes: %w; pods: %v)", nodeErr, podErr)
	}

	return nodeUsage, podUsage, nil
}

type podSummaryUsage struct {
	NetworkRxBytes                int64
	NetworkTxBytes                int64
	EphemeralStorageUsedBytes     int64
	EphemeralStorageCapacityBytes int64
}

type resourceUsage struct {
	CPU    string `json:"cpu"`
	Memory string `json:"memory"`
}

func summaryNodeNames(nodes []agentsk8s.Node, max int) ([]string, int) {
	names := make([]string, 0, len(nodes))
	seen := make(map[string]struct{}, len(nodes))
	for _, node := range nodes {
		nodeName := strings.TrimSpace(node.Name)
		if nodeName == "" {
			continue
		}
		if _, ok := seen[nodeName]; ok {
			continue
		}
		seen[nodeName] = struct{}{}
		names = append(names, nodeName)
	}
	total := len(names)
	if max > 0 && len(names) > max {
		names = names[:max]
	}
	return names, total
}
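
// collectPodSummaryMetrics fans out over up to maxSummaryMetricNodes nodes
// with a small worker pool, querying each node's /proxy/stats/summary
// endpoint and keeping the per-pod maxima across nodes. Partial failures are
// tolerated as long as at least one node responds.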
func (a *Agent) collectPodSummaryMetrics(ctx context.Context, nodes []agentsk8s.Node) (map[string]podSummaryUsage, error) {
	restClient := a.getDiscoveryRESTClient()
	if restClient == nil {
		return nil, nil
	}
	result := make(map[string]podSummaryUsage)
	nodeNames, totalNodeNames := summaryNodeNames(nodes, maxSummaryMetricNodes)
	if totalNodeNames > len(nodeNames) {
		a.logger.Debug().
			Int("total_nodes", totalNodeNames).
			Int("queried_nodes", len(nodeNames)).
			Msg("Limiting pod summary metrics collection to avoid overloading large clusters")
	}

	workerCount := summaryMetricsWorkers
	if workerCount > len(nodeNames) {
		workerCount = len(nodeNames)
	}
	if workerCount == 0 {
		return result, nil
	}

	jobs := make(chan string)
	var wg sync.WaitGroup
	var lock sync.Mutex
	var failed int
	var succeeded int

	collectNode := func(nodeName string) {
		path := "/api/v1/nodes/" + url.PathEscape(nodeName) + "/proxy/stats/summary"
		raw, err := a.doRawPathWithRetry(ctx, restClient, fmt.Sprintf("fetch pod summary metrics from node %q", nodeName), path)
		if err != nil {
			lock.Lock()
			failed++
			lock.Unlock()
			return
		}

		parsed, parseErr := parsePodSummaryMetricsPayload(raw)
		if parseErr != nil {
			lock.Lock()
			failed++
			a.logger.Debug().
				Err(parseErr).
				Str("node", nodeName).
				Int("payload_bytes", len(raw)).
				Msg("Failed to parse Kubernetes pod summary metrics payload")
			lock.Unlock()
			return
		}

		lock.Lock()
		succeeded++
		for key, usage := range parsed {
			existing := result[key]
			if usage.NetworkRxBytes > existing.NetworkRxBytes {
				existing.NetworkRxBytes = usage.NetworkRxBytes
			}
			if usage.NetworkTxBytes > existing.NetworkTxBytes {
				existing.NetworkTxBytes = usage.NetworkTxBytes
			}
			if usage.EphemeralStorageUsedBytes > existing.EphemeralStorageUsedBytes {
				existing.EphemeralStorageUsedBytes = usage.EphemeralStorageUsedBytes
			}
			if usage.EphemeralStorageCapacityBytes > existing.EphemeralStorageCapacityBytes {
				existing.EphemeralStorageCapacityBytes = usage.EphemeralStorageCapacityBytes
			}
			result[key] = existing
		}
		lock.Unlock()
	}

	for i := 0; i < workerCount; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for nodeName := range jobs {
				if ctx.Err() != nil {
					return
				}
				collectNode(nodeName)
			}
		}()
	}

dispatchLoop:
	for _, nodeName := range nodeNames {
		select {
		case <-ctx.Done():
			break dispatchLoop
		case jobs <- nodeName:
		}
	}
	close(jobs)
	wg.Wait()

	if succeeded == 0 && failed > 0 {
		return nil, fmt.Errorf("no node summary metrics endpoints available")
	}
	if succeeded > 0 && failed > 0 {
		a.logger.Debug().
			Int("nodes_attempted", len(nodeNames)).
			Int("summary_success_nodes", succeeded).
			Int("summary_failed_nodes", failed).
			Int("pods_with_summary_usage", len(result)).
			Msg("Collected Kubernetes pod summary metrics with partial availability")
	}
	return result, nil
}

func readKubernetesResponseBody(ctx context.Context, restClient rest.Interface, path string, maxBytes int64) ([]byte, error) {
	stream, err := restClient.Get().AbsPath(path).Stream(ctx)
	if err != nil {
		return nil, err
	}
	defer stream.Close()

	body, err := readBoundedBody(stream, maxBytes)
	if err != nil {
		return nil, fmt.Errorf("read %s response: %w", path, err)
	}
	return body, nil
}
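
// readBoundedBody reads at most maxBytes from reader, using a limit of
// maxBytes+1 so that an oversized body can be detected and rejected instead
// of being silently truncated.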
func readBoundedBody(reader io.Reader, maxBytes int64) ([]byte, error) {
	if maxBytes <= 0 {
		return nil, fmt.Errorf("invalid max bytes %d", maxBytes)
	}

	body, err := io.ReadAll(io.LimitReader(reader, maxBytes+1))
	if err != nil {
		return nil, err
	}
	if int64(len(body)) > maxBytes {
		return nil, fmt.Errorf("response body exceeds %d bytes", maxBytes)
	}
	return body, nil
}

func parseNodeMetricsPayload(raw []byte) (map[string]agentsk8s.NodeUsage, error) {
	var payload struct {
		Items []struct {
			Metadata struct {
				Name string `json:"name"`
			} `json:"metadata"`
			Usage resourceUsage `json:"usage"`
		} `json:"items"`
	}

	if err := json.Unmarshal(raw, &payload); err != nil {
		return nil, fmt.Errorf("unmarshal node metrics payload: %w", err)
	}

	result := make(map[string]agentsk8s.NodeUsage, len(payload.Items))
	for _, item := range payload.Items {
		name := strings.TrimSpace(item.Metadata.Name)
		if name == "" {
			continue
		}

		cpuMilli := parseCPUMilli(item.Usage.CPU)
		memBytes := parseBytes(item.Usage.Memory)
		if cpuMilli <= 0 && memBytes <= 0 {
			continue
		}

		result[name] = agentsk8s.NodeUsage{
			CPUMilliCores: cpuMilli,
			MemoryBytes:   memBytes,
		}
	}
	return result, nil
}

func parsePodMetricsPayload(raw []byte) (map[string]agentsk8s.PodUsage, error) {
	var payload struct {
		Items []struct {
			Metadata struct {
				Name      string `json:"name"`
				Namespace string `json:"namespace"`
			} `json:"metadata"`
			Containers []struct {
				Usage resourceUsage `json:"usage"`
			} `json:"containers"`
		} `json:"items"`
	}

	if err := json.Unmarshal(raw, &payload); err != nil {
		return nil, fmt.Errorf("unmarshal pod metrics payload: %w", err)
	}

	result := make(map[string]agentsk8s.PodUsage, len(payload.Items))
	for _, item := range payload.Items {
		key := podUsageKey(item.Metadata.Namespace, item.Metadata.Name)
		if key == "" {
			continue
		}

		var cpuMilli int64
		var memBytes int64
		for _, container := range item.Containers {
			cpuMilli += parseCPUMilli(container.Usage.CPU)
			memBytes += parseBytes(container.Usage.Memory)
		}
		if cpuMilli <= 0 && memBytes <= 0 {
			continue
		}
		result[key] = agentsk8s.PodUsage{
			CPUMilliCores: cpuMilli,
			MemoryBytes:   memBytes,
		}
	}

	return result, nil
}

func parsePodSummaryMetricsPayload(raw []byte) (map[string]podSummaryUsage, error) {
	var payload struct {
		Pods []struct {
			PodRef struct {
				Name      string `json:"name"`
				Namespace string `json:"namespace"`
			} `json:"podRef"`
			Network struct {
				RxBytes *uint64 `json:"rxBytes"`
				TxBytes *uint64 `json:"txBytes"`
			} `json:"network"`
			EphemeralStorage struct {
				UsedBytes     *uint64 `json:"usedBytes"`
				CapacityBytes *uint64 `json:"capacityBytes"`
			} `json:"ephemeral-storage"`
			Volume []struct {
				UsedBytes     *uint64 `json:"usedBytes"`
				CapacityBytes *uint64 `json:"capacityBytes"`
			} `json:"volume"`
		} `json:"pods"`
	}

	if err := json.Unmarshal(raw, &payload); err != nil {
		return nil, fmt.Errorf("unmarshal pod summary metrics payload: %w", err)
	}

	result := make(map[string]podSummaryUsage, len(payload.Pods))
	for _, pod := range payload.Pods {
		key := podUsageKey(pod.PodRef.Namespace, pod.PodRef.Name)
		if key == "" {
			continue
		}

		usedBytes := int64FromUint64Ptr(pod.EphemeralStorage.UsedBytes)
		capacityBytes := int64FromUint64Ptr(pod.EphemeralStorage.CapacityBytes)
		if usedBytes <= 0 || capacityBytes <= 0 {
			var volumeUsed int64
			var volumeCapacity int64
			for _, volume := range pod.Volume {
				volumeUsed += int64FromUint64Ptr(volume.UsedBytes)
				volumeCapacity += int64FromUint64Ptr(volume.CapacityBytes)
			}
			if usedBytes <= 0 && volumeUsed > 0 {
				usedBytes = volumeUsed
			}
			if capacityBytes <= 0 && volumeCapacity > 0 {
				capacityBytes = volumeCapacity
			}
		}

		result[key] = podSummaryUsage{
			NetworkRxBytes:                int64FromUint64Ptr(pod.Network.RxBytes),
			NetworkTxBytes:                int64FromUint64Ptr(pod.Network.TxBytes),
			EphemeralStorageUsedBytes:     usedBytes,
			EphemeralStorageCapacityBytes: capacityBytes,
		}
	}

	return result, nil
}

func mergePodSummaryUsage(podUsage map[string]agentsk8s.PodUsage, summary map[string]podSummaryUsage) {
	if len(summary) == 0 {
		return
	}
	for key, usage := range summary {
		merged := podUsage[key]
		if usage.NetworkRxBytes > 0 {
			merged.NetworkRxBytes = usage.NetworkRxBytes
		}
		if usage.NetworkTxBytes > 0 {
			merged.NetworkTxBytes = usage.NetworkTxBytes
		}
		if usage.EphemeralStorageUsedBytes > 0 {
			merged.EphemeralStorageUsedBytes = usage.EphemeralStorageUsedBytes
		}
		if usage.EphemeralStorageCapacityBytes > 0 {
			merged.EphemeralStorageCapacityBytes = usage.EphemeralStorageCapacityBytes
		}
		if hasPodUsage(merged) {
			podUsage[key] = merged
		}
	}
}

func hasPodUsage(usage agentsk8s.PodUsage) bool {
	return usage.CPUMilliCores > 0 ||
		usage.MemoryBytes > 0 ||
		usage.NetworkRxBytes > 0 ||
		usage.NetworkTxBytes > 0 ||
		usage.EphemeralStorageUsedBytes > 0 ||
		usage.EphemeralStorageCapacityBytes > 0
}

func copyStringMap(m map[string]string) map[string]string {
	c := make(map[string]string, len(m))
	for k, v := range m {
		c[k] = v
	}
	return c
}

func parseQuantity(value string, convert func(k8sresource.Quantity) int64) int64 {
	value = strings.TrimSpace(value)
	if value == "" {
		return 0
	}
	quantity, err := k8sresource.ParseQuantity(value)
	if err != nil {
		return 0
	}
	return convert(quantity)
}

func parseCPUMilli(value string) int64 {
	return parseQuantity(value, func(q k8sresource.Quantity) int64 { return q.MilliValue() })
}

func int64FromUint64Ptr(value *uint64) int64 {
	if value == nil {
		return 0
	}
	const maxInt64 = ^uint64(0) >> 1
	if *value > maxInt64 {
		return int64(maxInt64)
	}
	return int64(*value)
}

func parseBytes(value string) int64 {
	return parseQuantity(value, func(q k8sresource.Quantity) int64 { return q.Value() })
}
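
// podUsageKey builds the "namespace/name" key (e.g. "kube-system/coredns-xyz")
// used to join pod inventory entries with their usage samples; it returns ""
// when either component is blank so malformed entries are skipped.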
func podUsageKey(namespace, name string) string {
	namespace = strings.TrimSpace(namespace)
	name = strings.TrimSpace(name)
	if namespace == "" || name == "" {
		return ""
	}
	return namespace + "/" + name
}

func applyNodeUsage(nodes []agentsk8s.Node, usage map[string]agentsk8s.NodeUsage) {
	if len(nodes) == 0 || len(usage) == 0 {
		return
	}
	for i := range nodes {
		if nodeUsage, ok := usage[strings.TrimSpace(nodes[i].Name)]; ok {
			nodeUsageCopy := nodeUsage
			nodes[i].Usage = &nodeUsageCopy
		}
	}
}

func applyPodUsage(pods []agentsk8s.Pod, usage map[string]agentsk8s.PodUsage) {
	if len(pods) == 0 || len(usage) == 0 {
		return
	}
	for i := range pods {
		key := podUsageKey(pods[i].Namespace, pods[i].Name)
		if key == "" {
			continue
		}
		if podUsage, ok := usage[key]; ok {
			podUsageCopy := podUsage
			pods[i].Usage = &podUsageCopy
		}
	}
}
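
// collectNodes pages through the node list, flattening each node's status,
// capacity, and role labels into the report shape, and returns the result
// sorted by name for a stable payload.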
func (a *Agent) collectNodes(ctx context.Context) ([]agentsk8s.Node, error) {
	opts := metav1.ListOptions{Limit: listPageSize}
	nodes := make([]agentsk8s.Node, 0, int(listPageSize))

	for {
		list, err := a.listNodesPage(ctx, opts)
		if err != nil {
			return nil, err
		}

		for _, node := range list.Items {
			ready := isNodeReady(node)
			nodes = append(nodes, agentsk8s.Node{
				UID:                     string(node.UID),
				Name:                    node.Name,
				Ready:                   ready,
				Unschedulable:           node.Spec.Unschedulable,
				KubeletVersion:          node.Status.NodeInfo.KubeletVersion,
				ContainerRuntimeVersion: node.Status.NodeInfo.ContainerRuntimeVersion,
				OSImage:                 node.Status.NodeInfo.OSImage,
				KernelVersion:           node.Status.NodeInfo.KernelVersion,
				Architecture:            node.Status.NodeInfo.Architecture,
				Capacity:                toNodeResources(node.Status.Capacity),
				Allocatable:             toNodeResources(node.Status.Allocatable),
				Roles:                   rolesFromNodeLabels(node.Labels),
			})
		}

		if list.Continue == "" {
			break
		}
		opts.Continue = list.Continue
	}

	sort.Slice(nodes, func(i, j int) bool { return nodes[i].Name < nodes[j].Name })
	return nodes, nil
}

func isNodeReady(node corev1.Node) bool {
	for _, cond := range node.Status.Conditions {
		if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}

func rolesFromNodeLabels(nodeLabels map[string]string) []string {
	roles := make([]string, 0, 4)
	for k := range nodeLabels {
		if strings.HasPrefix(k, "node-role.kubernetes.io/") {
			role := strings.TrimPrefix(k, "node-role.kubernetes.io/")
			role = strings.TrimSpace(role)
			if role == "" {
				// A bare "node-role.kubernetes.io/" label has no suffix to
				// name the role; fall back to the legacy control-plane name.
				role = "master"
			}
			roles = append(roles, role)
		}
	}
	if v := strings.TrimSpace(nodeLabels["kubernetes.io/role"]); v != "" {
		roles = append(roles, v)
	}
	roles = dedupeStrings(roles)
	sort.Strings(roles)
	return roles
}

func dedupeStrings(values []string) []string {
	out := make([]string, 0, len(values))
	seen := make(map[string]struct{}, len(values))
	for _, v := range values {
		v = strings.TrimSpace(v)
		if v == "" {
			continue
		}
		if _, ok := seen[v]; ok {
			continue
		}
		seen[v] = struct{}{}
		out = append(out, v)
	}
	return out
}

func (a *Agent) effectiveMaxPods() int {
	if a.cfg.MaxPods > 0 {
		return a.cfg.MaxPods
	}
	return defaultMaxPods
}

func (a *Agent) effectiveMaxDeployments() int {
	return defaultMaxDeployments
}
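
// explicitNamespaces returns the include patterns as a sorted, deduplicated
// namespace list when none of them contain wildcard metacharacters; in that
// case the agent can list per-namespace instead of listing the whole cluster.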
func explicitNamespaces(patterns []string) ([]string, bool) {
	if len(patterns) == 0 {
		return nil, false
	}

	namespaces := make([]string, 0, len(patterns))
	for _, pattern := range patterns {
		pattern = strings.TrimSpace(pattern)
		if pattern == "" {
			continue
		}
		if strings.ContainsAny(pattern, "*?[]") {
			return nil, false
		}
		namespaces = append(namespaces, pattern)
	}

	namespaces = dedupeStrings(namespaces)
	if len(namespaces) == 0 {
		return nil, false
	}
	sort.Strings(namespaces)
	return namespaces, true
}

func comparePodKey(left, right agentsk8s.Pod) int {
	if left.Namespace < right.Namespace {
		return -1
	}
	if left.Namespace > right.Namespace {
		return 1
	}
	if left.Name < right.Name {
		return -1
	}
	if left.Name > right.Name {
		return 1
	}
	return 0
}
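
// insertPodSortedBounded inserts pod into items while keeping the slice
// sorted by namespace/name and capped at max entries, so across all pages the
// report retains the first max pods in key order regardless of arrival order.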
func insertPodSortedBounded(items []agentsk8s.Pod, pod agentsk8s.Pod, max int) []agentsk8s.Pod {
	if max <= 0 {
		return items
	}

	idx := sort.Search(len(items), func(i int) bool {
		return comparePodKey(items[i], pod) >= 0
	})

	if len(items) >= max && idx >= max {
		return items
	}

	if len(items) < max {
		items = append(items, agentsk8s.Pod{})
		copy(items[idx+1:], items[idx:])
		items[idx] = pod
		return items
	}

	copy(items[idx+1:], items[idx:max-1])
	items[idx] = pod
	return items
}

func compareDeploymentKey(left, right agentsk8s.Deployment) int {
	if left.Namespace < right.Namespace {
		return -1
	}
	if left.Namespace > right.Namespace {
		return 1
	}
	if left.Name < right.Name {
		return -1
	}
	if left.Name > right.Name {
		return 1
	}
	return 0
}

func insertDeploymentSortedBounded(items []agentsk8s.Deployment, deployment agentsk8s.Deployment, max int) []agentsk8s.Deployment {
	if max <= 0 {
		return items
	}

	idx := sort.Search(len(items), func(i int) bool {
		return compareDeploymentKey(items[i], deployment) >= 0
	})

	if len(items) >= max && idx >= max {
		return items
	}

	if len(items) < max {
		items = append(items, agentsk8s.Deployment{})
		copy(items[idx+1:], items[idx:])
		items[idx] = deployment
		return items
	}

	copy(items[idx+1:], items[idx:max-1])
	items[idx] = deployment
	return items
}
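
// isRetryableKubernetesError reports whether an API error is transient
// (timeouts, throttling, server unavailability, flaky connections) and
// therefore worth retrying; context cancellation is never retried.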
func isRetryableKubernetesError(err error) bool {
	if err == nil {
		return false
	}
	if errors.Is(err, context.Canceled) {
		return false
	}
	if errors.Is(err, context.DeadlineExceeded) {
		return true
	}
	if apierrors.IsTimeout(err) || apierrors.IsServerTimeout(err) || apierrors.IsTooManyRequests(err) || apierrors.IsServiceUnavailable(err) || apierrors.IsInternalError(err) {
		return true
	}

	var netErr net.Error
	if errors.As(err, &netErr) && netErr.Timeout() {
		return true
	}

	msg := strings.ToLower(err.Error())
	return strings.Contains(msg, "connection refused") ||
		strings.Contains(msg, "connection reset") ||
		strings.Contains(msg, "tls handshake timeout") ||
		strings.Contains(msg, "i/o timeout") ||
		strings.Contains(msg, "unexpected eof") ||
		strings.Contains(msg, "server closed idle connection")
}

func retryAfterForError(err error) (time.Duration, bool) {
	var statusErr *apierrors.StatusError
	if !errors.As(err, &statusErr) {
		return 0, false
	}

	details := statusErr.Status().Details
	if details == nil || details.RetryAfterSeconds <= 0 {
		return 0, false
	}
	delay := time.Duration(details.RetryAfterSeconds) * time.Second
	if delay > maxRetryBackoff {
		delay = maxRetryBackoff
	}
	return delay, true
}

func waitForRetry(ctx context.Context, delay time.Duration) error {
	timer := time.NewTimer(delay)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}

func wrapKubernetesError(action string, err error) error {
	if err == nil {
		return nil
	}
	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || apierrors.IsTimeout(err) || apierrors.IsServerTimeout(err) {
		return fmt.Errorf("%s: kubernetes API timeout or unreachable control plane: %w", action, err)
	}
	if apierrors.IsUnauthorized(err) {
		return fmt.Errorf("%s: kubernetes authentication failed (unauthorized); verify kubeconfig credentials or service account token: %w", action, err)
	}
	if apierrors.IsForbidden(err) {
		return fmt.Errorf("%s: kubernetes access forbidden (RBAC); verify Role/ClusterRole permissions for this agent: %w", action, err)
	}
	if apierrors.IsTooManyRequests(err) {
		return fmt.Errorf("%s: kubernetes API rate limited (429); reduce scope or increase collection interval: %w", action, err)
	}
	if apierrors.IsServiceUnavailable(err) {
		return fmt.Errorf("%s: kubernetes API server unavailable: %w", action, err)
	}
	return fmt.Errorf("%s: %w", action, err)
}
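
// runKubernetesCallWithRetry invokes fn with a per-attempt timeout, retrying
// transient failures up to maxKubeAPIRetries times with exponential backoff
// (doubling from initialRetryBackoff, capped at maxRetryBackoff) and honoring
// any server-provided Retry-After hint.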
func (a *Agent) runKubernetesCallWithRetry(ctx context.Context, action string, fn func(context.Context) error) error {
	backoff := initialRetryBackoff
	var lastErr error

	for attempt := 1; attempt <= maxKubeAPIRetries; attempt++ {
		if ctx.Err() != nil {
			return wrapKubernetesError(action, ctx.Err())
		}

		callCtx, cancel := context.WithTimeout(ctx, requestTimeout)
		err := fn(callCtx)
		cancel()
		if err == nil {
			return nil
		}
		lastErr = err

		if attempt == maxKubeAPIRetries || !isRetryableKubernetesError(err) {
			return wrapKubernetesError(action, err)
		}

		delay, ok := retryAfterForError(err)
		if !ok {
			delay = backoff
		}
		if delay > maxRetryBackoff {
			delay = maxRetryBackoff
		}

		a.logger.Debug().
			Int("attempt", attempt).
			Dur("backoff", delay).
			Err(err).
			Str("action", action).
			Msg("Kubernetes API call failed; retrying")

		if waitErr := waitForRetry(ctx, delay); waitErr != nil {
			return wrapKubernetesError(action, waitErr)
		}

		backoff *= 2
		if backoff > maxRetryBackoff {
			backoff = maxRetryBackoff
		}
	}

	return wrapKubernetesError(action, lastErr)
}

func (a *Agent) listNodesPage(ctx context.Context, listOpts metav1.ListOptions) (*corev1.NodeList, error) {
	if listOpts.Limit <= 0 {
		listOpts.Limit = listPageSize
	}

	var list *corev1.NodeList
	err := a.runKubernetesCallWithRetry(ctx, "list nodes", func(callCtx context.Context) error {
		var err error
		list, err = a.kubeClient.CoreV1().Nodes().List(callCtx, listOpts)
		return err
	})
	if err != nil {
		return nil, err
	}
	return list, nil
}

func (a *Agent) listPodsPage(ctx context.Context, namespace string, listOpts metav1.ListOptions) (*corev1.PodList, error) {
	if listOpts.Limit <= 0 {
		listOpts.Limit = listPageSize
	}

	action := "list pods"
	if namespace != metav1.NamespaceAll {
		action = fmt.Sprintf("list pods in namespace %q", namespace)
	}

	var list *corev1.PodList
	err := a.runKubernetesCallWithRetry(ctx, action, func(callCtx context.Context) error {
		var err error
		list, err = a.kubeClient.CoreV1().Pods(namespace).List(callCtx, listOpts)
		return err
	})
	if err != nil {
		return nil, err
	}
	return list, nil
}

func (a *Agent) listDeploymentsPage(ctx context.Context, namespace string, listOpts metav1.ListOptions) (*appsv1.DeploymentList, error) {
	if listOpts.Limit <= 0 {
		listOpts.Limit = listPageSize
	}

	action := "list deployments"
	if namespace != metav1.NamespaceAll {
		action = fmt.Sprintf("list deployments in namespace %q", namespace)
	}

	var list *appsv1.DeploymentList
	err := a.runKubernetesCallWithRetry(ctx, action, func(callCtx context.Context) error {
		var err error
		list, err = a.kubeClient.AppsV1().Deployments(namespace).List(callCtx, listOpts)
		return err
	})
	if err != nil {
		return nil, err
	}
	return list, nil
}

func (a *Agent) doRawPathWithRetry(ctx context.Context, restClient rest.Interface, action, path string) ([]byte, error) {
	var payload []byte
	err := a.runKubernetesCallWithRetry(ctx, action, func(callCtx context.Context) error {
		var err error
		payload, err = restClient.Get().AbsPath(path).DoRaw(callCtx)
		return err
	})
	if err != nil {
		return nil, err
	}
	return payload, nil
}

func toNodeResources(list corev1.ResourceList) agentsk8s.NodeResources {
	cpu := list[corev1.ResourceCPU]
	mem := list[corev1.ResourceMemory]
	pods := list[corev1.ResourcePods]

	return agentsk8s.NodeResources{
		CPUCores:    cpu.MilliValue() / 1000,
		MemoryBytes: mem.Value(),
		Pods:        pods.Value(),
	}
}
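
// collectPods pages through pods (skipping Succeeded ones via a field
// selector), applies namespace filtering and the problem-pod filter unless
// IncludeAllPods is set, and returns at most effectiveMaxPods entries in
// stable namespace/name order.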
func (a *Agent) collectPods(ctx context.Context) ([]agentsk8s.Pod, error) {
	// Default: focus on non-succeeded pods to reduce payload size.
	selector := fields.OneTermNotEqualSelector("status.phase", string(corev1.PodSucceeded))
	maxPods := a.effectiveMaxPods()
	listOpts := metav1.ListOptions{
		FieldSelector: selector.String(),
		Limit:         listPageSize,
	}

	namespaces, explicit := explicitNamespaces(a.includeNamespaces)
	if !explicit {
		namespaces = []string{metav1.NamespaceAll}
	}

	items := make([]agentsk8s.Pod, 0, maxPods)
	for _, namespace := range namespaces {
		opts := listOpts
		opts.Continue = ""

		for {
			podList, err := a.listPodsPage(ctx, namespace, opts)
			if err != nil {
				return nil, err
			}

			for _, pod := range podList.Items {
				if !a.namespaceAllowed(pod.Namespace) {
					continue
				}
				if !a.cfg.IncludeAllPods && !isProblemPod(pod) {
					continue
				}

				labelsCopy := copyStringMap(pod.Labels)

				containers := make([]agentsk8s.PodContainer, 0, len(pod.Status.ContainerStatuses))
				restarts := 0
				for _, cs := range pod.Status.ContainerStatuses {
					restarts += int(cs.RestartCount)
					state, reason, message := summarizeContainerState(cs)
					containers = append(containers, agentsk8s.PodContainer{
						Name:         cs.Name,
						Image:        cs.Image,
						Ready:        cs.Ready,
						RestartCount: cs.RestartCount,
						State:        state,
						Reason:       reason,
						Message:      message,
					})
				}

				ownerKind, ownerName := ownerRef(pod.OwnerReferences)
				createdAt := pod.CreationTimestamp.Time
				var startTime *time.Time
				if pod.Status.StartTime != nil {
					t := pod.Status.StartTime.Time
					startTime = &t
				}

				items = insertPodSortedBounded(items, agentsk8s.Pod{
					UID:        string(pod.UID),
					Name:       pod.Name,
					Namespace:  pod.Namespace,
					NodeName:   pod.Spec.NodeName,
					Phase:      string(pod.Status.Phase),
					Reason:     pod.Status.Reason,
					Message:    pod.Status.Message,
					QoSClass:   string(pod.Status.QOSClass),
					CreatedAt:  createdAt,
					StartTime:  startTime,
					Restarts:   restarts,
					Labels:     labelsCopy,
					OwnerKind:  ownerKind,
					OwnerName:  ownerName,
					Containers: containers,
				}, maxPods)
			}

			if podList.Continue == "" {
				break
			}
			opts.Continue = podList.Continue
		}
	}

	return items, nil
}
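
// isProblemPod flags pods that warrant attention by default: any non-running
// phase, failed or waiting init containers, and containers that are waiting,
// terminated, or simply not ready.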
func isProblemPod(pod corev1.Pod) bool {
	switch pod.Status.Phase {
	case corev1.PodPending, corev1.PodFailed, corev1.PodUnknown:
		return true
	}

	for _, cs := range pod.Status.InitContainerStatuses {
		if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
			return true
		}
		if cs.State.Waiting != nil {
			return true
		}
		if !cs.Ready && (cs.State.Running == nil) {
			return true
		}
	}

	for _, cs := range pod.Status.ContainerStatuses {
		if cs.State.Waiting != nil || cs.State.Terminated != nil {
			return true
		}
		if !cs.Ready {
			return true
		}
	}

	return false
}

func summarizeContainerState(cs corev1.ContainerStatus) (string, string, string) {
	switch {
	case cs.State.Running != nil:
		return "running", "", ""
	case cs.State.Waiting != nil:
		return "waiting", strings.TrimSpace(cs.State.Waiting.Reason), strings.TrimSpace(cs.State.Waiting.Message)
	case cs.State.Terminated != nil:
		return "terminated", strings.TrimSpace(cs.State.Terminated.Reason), strings.TrimSpace(cs.State.Terminated.Message)
	default:
		return "unknown", "", ""
	}
}

func ownerRef(refs []metav1.OwnerReference) (string, string) {
	for _, r := range refs {
		if r.Controller != nil && *r.Controller {
			return r.Kind, r.Name
		}
	}
	if len(refs) > 0 {
		return refs[0].Kind, refs[0].Name
	}
	return "", ""
}

func (a *Agent) collectDeployments(ctx context.Context) ([]agentsk8s.Deployment, error) {
	maxDeployments := a.effectiveMaxDeployments()
	listOpts := metav1.ListOptions{Limit: listPageSize}

	namespaces, explicit := explicitNamespaces(a.includeNamespaces)
	if !explicit {
		namespaces = []string{metav1.NamespaceAll}
	}

	items := make([]agentsk8s.Deployment, 0, maxDeployments)
	for _, namespace := range namespaces {
		opts := listOpts
		opts.Continue = ""

		for {
			depList, err := a.listDeploymentsPage(ctx, namespace, opts)
			if err != nil {
				return nil, err
			}

			for _, dep := range depList.Items {
				if !a.namespaceAllowed(dep.Namespace) {
					continue
				}
				if !a.cfg.IncludeAllDeployments && !isProblemDeployment(dep) {
					continue
				}

				labelsCopy := copyStringMap(dep.Labels)

				items = insertDeploymentSortedBounded(items, agentsk8s.Deployment{
					UID:               string(dep.UID),
					Name:              dep.Name,
					Namespace:         dep.Namespace,
					DesiredReplicas:   desiredReplicas(dep),
					UpdatedReplicas:   dep.Status.UpdatedReplicas,
					ReadyReplicas:     dep.Status.ReadyReplicas,
					AvailableReplicas: dep.Status.AvailableReplicas,
					Labels:            labelsCopy,
				}, maxDeployments)
			}

			if depList.Continue == "" {
				break
			}
			opts.Continue = depList.Continue
		}
	}

	return items, nil
}

func desiredReplicas(dep appsv1.Deployment) int32 {
	if dep.Spec.Replicas == nil {
		return 0
	}
	return *dep.Spec.Replicas
}

func isProblemDeployment(dep appsv1.Deployment) bool {
	desired := desiredReplicas(dep)
	if desired <= 0 {
		return false
	}
	return dep.Status.AvailableReplicas < desired || dep.Status.ReadyReplicas < desired || dep.Status.UpdatedReplicas < desired
}
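
// sendReport gzips the JSON report and POSTs it to the Pulse
// /api/agents/kubernetes/report endpoint, authenticating via both the
// Authorization bearer header and X-API-Token. Any status of 300 or above is
// returned as an error, including up to 4 KiB of the response body for
// context.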
func (a *Agent) sendReport(ctx context.Context, report agentsk8s.Report) (retErr error) {
	payload, err := json.Marshal(report)
	if err != nil {
		return fmt.Errorf("marshal report: %w", err)
	}

	compressed, err := utils.CompressJSON(payload)
	if err != nil {
		return fmt.Errorf("compress report: %w", err)
	}

	reportURL := fmt.Sprintf("%s/api/agents/kubernetes/report", a.pulseURL)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, reportURL, bytes.NewReader(compressed))
	if err != nil {
		return fmt.Errorf("create request for %s: %w", reportURL, err)
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Content-Encoding", "gzip")
	req.Header.Set("Authorization", "Bearer "+a.cfg.APIToken)
	req.Header.Set("X-API-Token", a.cfg.APIToken)
	req.Header.Set("User-Agent", reportUserAgent+a.agentVersion)

	resp, err := a.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("send request to %s: %w", reportURL, err)
	}
	defer func() {
		if closeErr := resp.Body.Close(); closeErr != nil {
			wrappedCloseErr := fmt.Errorf("close response body: %w", closeErr)
			if retErr != nil {
				retErr = errors.Join(retErr, wrappedCloseErr)
				return
			}
			retErr = wrappedCloseErr
		}
	}()

	if resp.StatusCode >= 300 {
		body, readErr := io.ReadAll(io.LimitReader(resp.Body, 4*1024))
		if readErr != nil {
			return fmt.Errorf("read error response body for status %s: %w", resp.Status, readErr)
		}
		msg := strings.TrimSpace(string(body))
		if msg != "" {
			return fmt.Errorf("pulse responded with status %s for %s: %s", resp.Status, reportURL, msg)
		}
		return fmt.Errorf("pulse responded with status %s for %s", resp.Status, reportURL)
	}

	return nil
}