From 08c04b78aea421c2eff2f11f97b977e77786ba9d Mon Sep 17 00:00:00 2001 From: rcourtman Date: Thu, 25 Dec 2025 21:14:12 +0000 Subject: [PATCH] feat: add power consumption monitoring (Intel RAPL + AMD Energy) - Add power.go with Intel RAPL and AMD energy driver support - Read CPU package, core, and DRAM power consumption in watts - Sample energy counters over 100ms interval to calculate power - Add PowerWatts field to Sensors struct for API reporting - Integrate power collection into host agent sensor gathering - Add comprehensive tests for power collection module Supports Intel CPUs (Sandy Bridge+) via RAPL and AMD Ryzen/EPYC via the amd_energy kernel module. Closes community-scripts/ProxmoxVE#9575 --- internal/hostagent/agent.go | 20 ++ internal/sensors/power.go | 348 +++++++++++++++++++++++++++++++++ internal/sensors/power_test.go | 280 ++++++++++++++++++++++++++ pkg/agents/host/report.go | 1 + 4 files changed, 649 insertions(+) create mode 100644 internal/sensors/power.go create mode 100644 internal/sensors/power_test.go diff --git a/internal/hostagent/agent.go b/internal/hostagent/agent.go index 55575c0ca..d5bb6ce44 100644 --- a/internal/hostagent/agent.go +++ b/internal/hostagent/agent.go @@ -88,6 +88,7 @@ var ( hostmetricsCollect = hostmetrics.Collect sensorsCollectLocal = sensors.CollectLocal sensorsParse = sensors.Parse + sensorsCollectPower = sensors.CollectPower mdadmCollectArrays = mdadm.CollectArrays cephCollect = ceph.Collect smartctlCollectLocal = smartctl.CollectLocal @@ -561,9 +562,28 @@ func (a *Agent) collectTemperatures(ctx context.Context) agentshost.Sensors { } } + // Collect power consumption data (Intel RAPL, etc.) 
+ if powerData, err := sensorsCollectPower(ctx); err == nil && powerData.Available { + result.PowerWatts = make(map[string]float64) + if powerData.PackageWatts > 0 { + result.PowerWatts["cpu_package"] = powerData.PackageWatts + } + if powerData.CoreWatts > 0 { + result.PowerWatts["cpu_core"] = powerData.CoreWatts + } + if powerData.DRAMWatts > 0 { + result.PowerWatts["dram"] = powerData.DRAMWatts + } + a.logger.Debug(). + Float64("packageWatts", powerData.PackageWatts). + Str("source", powerData.Source). + Msg("Collected power data") + } + a.logger.Debug(). Int("temperatureCount", len(result.TemperatureCelsius)). Int("fanCount", len(result.FanRPM)). + Int("powerCount", len(result.PowerWatts)). Int("additionalCount", len(result.Additional)). Msg("Collected sensor data") diff --git a/internal/sensors/power.go b/internal/sensors/power.go new file mode 100644 index 000000000..876b2c5b3 --- /dev/null +++ b/internal/sensors/power.go @@ -0,0 +1,348 @@ +package sensors + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/rs/zerolog/log" +) + +// PowerData contains power consumption readings from the system. +type PowerData struct { + // PackageWatts is the CPU package power consumption in watts. + // This is the total power for the CPU socket (all cores + uncore). + PackageWatts float64 + + // CoreWatts is the CPU cores-only power consumption in watts (if available). + CoreWatts float64 + + // DRAMWatts is the DRAM power consumption in watts (if available). + // Note: Not all platforms support DRAM power measurement. + DRAMWatts float64 + + // Available indicates whether any power data was successfully collected. + Available bool + + // Source indicates the method used: "rapl", "amd_energy", or empty. + Source string +} + +// raplBasePath is the base path for Intel RAPL (Running Average Power Limit) readings. +// RAPL provides energy counters that we sample to calculate power. 
+const raplBasePath = "/sys/class/powercap/intel-rapl" + +// sampleInterval is the time between energy counter readings. +// Shorter intervals are less accurate; longer intervals add latency. +const sampleInterval = 100 * time.Millisecond + +// CollectPower reads power consumption data from the system. +// Supports Intel RAPL and AMD energy driver. +// Returns nil if no power data is available. +func CollectPower(ctx context.Context) (*PowerData, error) { + // Try Intel RAPL first (most common on Intel) + if data, err := collectRALP(ctx); err == nil && data.Available { + return data, nil + } + + // Try AMD energy driver (for AMD Ryzen/EPYC) + if data, err := collectAMDEnergy(ctx); err == nil && data.Available { + return data, nil + } + + // TODO: Add IPMI support for server BMCs + + return nil, fmt.Errorf("no power monitoring available") +} + +// collectRALP reads power data from Intel RAPL sysfs interface. +// RAPL provides energy counters in microjoules that we sample twice +// to calculate instantaneous power in watts. +func collectRALP(ctx context.Context) (*PowerData, error) { + // Check if RAPL is available + if _, err := os.Stat(raplBasePath); os.IsNotExist(err) { + return nil, fmt.Errorf("RAPL not available: %w", err) + } + + data := &PowerData{Source: "rapl"} + + // Find all RAPL domains (packages) + // Typically: intel-rapl:0 (package), intel-rapl:0:0 (core), intel-rapl:0:1 (uncore), etc. 
+ packages, err := filepath.Glob(filepath.Join(raplBasePath, "intel-rapl:*")) + if err != nil || len(packages) == 0 { + return nil, fmt.Errorf("no RAPL packages found") + } + + // Sample energy counters + sample1, err := readRAPLEnergy(packages) + if err != nil { + return nil, fmt.Errorf("first RAPL sample failed: %w", err) + } + + // Wait for sample interval + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(sampleInterval): + } + + // Second sample + sample2, err := readRAPLEnergy(packages) + if err != nil { + return nil, fmt.Errorf("second RAPL sample failed: %w", err) + } + + // Calculate power from energy delta + // Power (W) = Energy delta (J) / Time delta (s) + duration := sampleInterval.Seconds() + + for domain, energy1 := range sample1 { + energy2, ok := sample2[domain] + if !ok { + continue + } + + // Handle counter wraparound (energy counters are typically 32-bit) + var deltaUJ uint64 + if energy2 >= energy1 { + deltaUJ = energy2 - energy1 + } else { + // Counter wrapped around + deltaUJ = (^uint64(0) - energy1) + energy2 + 1 + } + + // Convert microjoules to watts + watts := float64(deltaUJ) / 1e6 / duration + + // Categorize by domain name + domainLower := strings.ToLower(domain) + switch { + case strings.Contains(domainLower, "package") || strings.HasSuffix(domain, ":0"): + // Package-level (total CPU socket power) + data.PackageWatts += watts + case strings.Contains(domainLower, "core"): + data.CoreWatts += watts + case strings.Contains(domainLower, "dram"): + data.DRAMWatts += watts + } + + data.Available = true + } + + if data.Available { + log.Debug(). + Float64("packageWatts", data.PackageWatts). + Float64("coreWatts", data.CoreWatts). + Float64("dramWatts", data.DRAMWatts). + Msg("Collected RAPL power data") + } + + return data, nil +} + +// readRAPLEnergy reads energy counters from all RAPL domains. +// Returns a map of domain name -> energy in microjoules. 
//
// packages lists the top-level RAPL domain directories (intel-rapl:N); the
// nested subdomain directories (intel-rapl:N:M) of each are read as well.
// Domains whose energy_uj file cannot be read are skipped. Counters of domains
// that share a name across packages (for example the "core" or "dram"
// subdomain of every socket) are summed, so multi-socket hosts are not reduced
// to the last socket's reading. An error is returned only when no counter at
// all could be read.
func readRAPLEnergy(packages []string) (map[string]uint64, error) {
	result := make(map[string]uint64)

	for _, pkgPath := range packages {
		// The package's own counter.
		addRAPLDomain(result, pkgPath)

		// Subdomain counters (core, uncore, dram). Glob only reports a
		// malformed pattern; a package that cannot be globbed simply
		// contributes no subdomains.
		subdomains, _ := filepath.Glob(filepath.Join(pkgPath, "intel-rapl:*"))
		for _, subPath := range subdomains {
			addRAPLDomain(result, subPath)
		}
	}

	if len(result) == 0 {
		return nil, fmt.Errorf("no RAPL energy readings available")
	}

	return result, nil
}

// addRAPLDomain adds the energy_uj counter found in dir to result, keyed by
// the contents of dir's "name" file, or by the directory name when that file
// is missing or empty. Directories without a readable counter are ignored.
func addRAPLDomain(result map[string]uint64, dir string) {
	energy, err := readUint64File(filepath.Join(dir, "energy_uj"))
	if err != nil {
		return
	}

	name := filepath.Base(dir)
	if domainName, err := readStringFile(filepath.Join(dir, "name")); err == nil && domainName != "" {
		name = domainName
	}

	// Accumulate rather than assign: names such as "core" repeat per package.
	result[name] += energy
}

// readUint64File reads a file containing a single base-10 uint64 value.
// Surrounding whitespace (including the trailing newline sysfs emits) is
// ignored. It returns the read or parse error unchanged.
func readUint64File(path string) (uint64, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
}

// readStringFile reads a file containing a single string value and returns it
// with leading and trailing whitespace removed.
func readStringFile(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(data)), nil
}

// hwmonBasePath is the base path for hwmon devices (used by AMD energy driver).
const hwmonBasePath = "/sys/class/hwmon"

// collectAMDEnergy reads power data from AMD energy driver via hwmon.
// The amd_energy module exposes energy counters similar to Intel RAPL.
+func collectAMDEnergy(ctx context.Context) (*PowerData, error) { + // Find hwmon device with amd_energy driver + hwmonPath, err := findAMDEnergyHwmon() + if err != nil { + return nil, err + } + + data := &PowerData{Source: "amd_energy"} + + // Sample energy counters + sample1, err := readAMDEnergy(hwmonPath) + if err != nil { + return nil, fmt.Errorf("first AMD energy sample failed: %w", err) + } + + // Wait for sample interval + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(sampleInterval): + } + + // Second sample + sample2, err := readAMDEnergy(hwmonPath) + if err != nil { + return nil, fmt.Errorf("second AMD energy sample failed: %w", err) + } + + // Calculate power from energy delta + duration := sampleInterval.Seconds() + + for label, energy1 := range sample1 { + energy2, ok := sample2[label] + if !ok { + continue + } + + // Handle counter wraparound + var deltaUJ uint64 + if energy2 >= energy1 { + deltaUJ = energy2 - energy1 + } else { + deltaUJ = (^uint64(0) - energy1) + energy2 + 1 + } + + // Convert microjoules to watts + watts := float64(deltaUJ) / 1e6 / duration + + // Categorize by label + labelLower := strings.ToLower(label) + switch { + case strings.Contains(labelLower, "socket") || strings.Contains(labelLower, "package"): + data.PackageWatts += watts + case strings.Contains(labelLower, "core"): + data.CoreWatts += watts + default: + // Default to package power for unlabeled readings + if data.PackageWatts == 0 { + data.PackageWatts = watts + } + } + + data.Available = true + } + + if data.Available { + log.Debug(). + Float64("packageWatts", data.PackageWatts). + Float64("coreWatts", data.CoreWatts). + Msg("Collected AMD energy power data") + } + + return data, nil +} + +// findAMDEnergyHwmon finds the hwmon device path for amd_energy driver. 
+func findAMDEnergyHwmon() (string, error) { + entries, err := os.ReadDir(hwmonBasePath) + if err != nil { + return "", fmt.Errorf("cannot read hwmon: %w", err) + } + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + hwmonDir := filepath.Join(hwmonBasePath, entry.Name()) + namePath := filepath.Join(hwmonDir, "name") + + name, err := readStringFile(namePath) + if err != nil { + continue + } + + if name == "amd_energy" { + return hwmonDir, nil + } + } + + return "", fmt.Errorf("amd_energy hwmon device not found") +} + +// readAMDEnergy reads energy counters from AMD energy hwmon device. +// Returns a map of label -> energy in microjoules. +func readAMDEnergy(hwmonPath string) (map[string]uint64, error) { + result := make(map[string]uint64) + + // AMD energy exposes energy*_input files (in microjoules) + energyFiles, err := filepath.Glob(filepath.Join(hwmonPath, "energy*_input")) + if err != nil || len(energyFiles) == 0 { + return nil, fmt.Errorf("no AMD energy files found") + } + + for _, energyPath := range energyFiles { + energy, err := readUint64File(energyPath) + if err != nil { + continue + } + + // Try to get the label for this energy reading + // energy1_input -> energy1_label + labelPath := strings.Replace(energyPath, "_input", "_label", 1) + label, err := readStringFile(labelPath) + if err != nil { + // Use filename as fallback + label = filepath.Base(energyPath) + } + + result[label] = energy + } + + if len(result) == 0 { + return nil, fmt.Errorf("no AMD energy readings available") + } + + return result, nil +} diff --git a/internal/sensors/power_test.go b/internal/sensors/power_test.go new file mode 100644 index 000000000..f2c85c88a --- /dev/null +++ b/internal/sensors/power_test.go @@ -0,0 +1,280 @@ +package sensors + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" +) + +func TestCollectPower_NoRAPL(t *testing.T) { + // On most CI systems, RAPL won't be available + ctx, cancel := 
context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + + data, err := CollectPower(ctx) + + // Either should fail (no RAPL) or succeed (has RAPL) + if err != nil { + // Expected on systems without RAPL + t.Logf("Power collection unavailable (expected in CI): %v", err) + return + } + + // If we got here, RAPL is available + if data == nil { + t.Fatal("Expected non-nil data when no error returned") + } + + if !data.Available { + t.Error("Expected data.Available to be true") + } + + if data.Source != "rapl" && data.Source != "amd_energy" { + t.Errorf("Expected source 'rapl' or 'amd_energy', got '%s'", data.Source) + } + + t.Logf("Power data: Package=%.2fW, Core=%.2fW, DRAM=%.2fW (source: %s)", + data.PackageWatts, data.CoreWatts, data.DRAMWatts, data.Source) +} + +func TestPowerData_StructInitialization(t *testing.T) { + data := &PowerData{} + + if data.Available { + t.Error("Expected Available to be false by default") + } + + if data.PackageWatts != 0 { + t.Error("Expected PackageWatts to be 0 by default") + } + + if data.Source != "" { + t.Error("Expected Source to be empty by default") + } +} + +func TestReadUint64File(t *testing.T) { + // Create temp file with a value + tmpDir := t.TempDir() + testFile := filepath.Join(tmpDir, "energy_uj") + + // Test valid value + if err := os.WriteFile(testFile, []byte("123456789\n"), 0644); err != nil { + t.Fatalf("Failed to write test file: %v", err) + } + + val, err := readUint64File(testFile) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if val != 123456789 { + t.Errorf("Expected 123456789, got %d", val) + } + + // Test with whitespace + if err := os.WriteFile(testFile, []byte(" 987654321 \n"), 0644); err != nil { + t.Fatalf("Failed to write test file: %v", err) + } + + val, err = readUint64File(testFile) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if val != 987654321 { + t.Errorf("Expected 987654321, got %d", val) + } + + // Test non-existent file + _, err = 
readUint64File(filepath.Join(tmpDir, "nonexistent")) + if err == nil { + t.Error("Expected error for non-existent file") + } + + // Test invalid content + if err := os.WriteFile(testFile, []byte("not a number"), 0644); err != nil { + t.Fatalf("Failed to write test file: %v", err) + } + + _, err = readUint64File(testFile) + if err == nil { + t.Error("Expected error for invalid content") + } +} + +func TestReadStringFile(t *testing.T) { + tmpDir := t.TempDir() + testFile := filepath.Join(tmpDir, "name") + + // Test valid value + if err := os.WriteFile(testFile, []byte("package-0\n"), 0644); err != nil { + t.Fatalf("Failed to write test file: %v", err) + } + + val, err := readStringFile(testFile) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if val != "package-0" { + t.Errorf("Expected 'package-0', got '%s'", val) + } + + // Test with whitespace + if err := os.WriteFile(testFile, []byte(" core \n"), 0644); err != nil { + t.Fatalf("Failed to write test file: %v", err) + } + + val, err = readStringFile(testFile) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if val != "core" { + t.Errorf("Expected 'core', got '%s'", val) + } + + // Test non-existent file + _, err = readStringFile(filepath.Join(tmpDir, "nonexistent")) + if err == nil { + t.Error("Expected error for non-existent file") + } +} + +func TestReadRAPLEnergy(t *testing.T) { + // Create mock RAPL structure + tmpDir := t.TempDir() + pkg0 := filepath.Join(tmpDir, "intel-rapl:0") + if err := os.MkdirAll(pkg0, 0755); err != nil { + t.Fatalf("Failed to create mock RAPL dir: %v", err) + } + + // Write energy and name files + if err := os.WriteFile(filepath.Join(pkg0, "energy_uj"), []byte("1000000"), 0644); err != nil { + t.Fatalf("Failed to write energy file: %v", err) + } + if err := os.WriteFile(filepath.Join(pkg0, "name"), []byte("package-0"), 0644); err != nil { + t.Fatalf("Failed to write name file: %v", err) + } + + // Create subdomain (core) + core := filepath.Join(pkg0, 
"intel-rapl:0:0") + if err := os.MkdirAll(core, 0755); err != nil { + t.Fatalf("Failed to create mock core dir: %v", err) + } + if err := os.WriteFile(filepath.Join(core, "energy_uj"), []byte("500000"), 0644); err != nil { + t.Fatalf("Failed to write core energy file: %v", err) + } + if err := os.WriteFile(filepath.Join(core, "name"), []byte("core"), 0644); err != nil { + t.Fatalf("Failed to write core name file: %v", err) + } + + // Test reading + result, err := readRAPLEnergy([]string{pkg0}) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + if len(result) != 2 { + t.Errorf("Expected 2 readings, got %d", len(result)) + } + + if val, ok := result["package-0"]; !ok || val != 1000000 { + t.Errorf("Expected package-0=1000000, got %v", result) + } + + if val, ok := result["core"]; !ok || val != 500000 { + t.Errorf("Expected core=500000, got %v", result) + } +} + +func TestReadRAPLEnergy_NoFiles(t *testing.T) { + tmpDir := t.TempDir() + pkg0 := filepath.Join(tmpDir, "intel-rapl:0") + if err := os.MkdirAll(pkg0, 0755); err != nil { + t.Fatalf("Failed to create mock RAPL dir: %v", err) + } + + // No energy files + _, err := readRAPLEnergy([]string{pkg0}) + if err == nil { + t.Error("Expected error when no energy files exist") + } +} + +func TestFindAMDEnergyHwmon_NotFound(t *testing.T) { + // This should fail on systems without amd_energy + _, err := findAMDEnergyHwmon() + if err == nil { + // If it succeeds, that's fine - means we're on an AMD system + t.Log("AMD energy hwmon found (running on AMD system)") + } +} + +func TestCollectPower_ContextCancellation(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately + + _, err := CollectPower(ctx) + // Should fail quickly due to cancelled context or no power available + if err == nil { + t.Log("CollectPower succeeded despite cancelled context (power data was cached or instant)") + } +} + +func TestPowerCalculation(t *testing.T) { + // Test that power 
calculation is correct + // Power (W) = Energy delta (µJ) / 1e6 / time (s) + + // 1,000,000 µJ over 100ms = 10W + deltaUJ := uint64(1000000) + duration := 0.1 // 100ms + expectedWatts := 10.0 + + watts := float64(deltaUJ) / 1e6 / duration + if watts != expectedWatts { + t.Errorf("Expected %.2f W, got %.2f W", expectedWatts, watts) + } + + // 5,000,000 µJ over 100ms = 50W + deltaUJ = 5000000 + expectedWatts = 50.0 + watts = float64(deltaUJ) / 1e6 / duration + if watts != expectedWatts { + t.Errorf("Expected %.2f W, got %.2f W", expectedWatts, watts) + } +} + +func TestCounterWraparound(t *testing.T) { + // Test normal (no wraparound) case + energy1 := uint64(1000000) + energy2 := uint64(2000000) + + var deltaUJ uint64 + if energy2 >= energy1 { + deltaUJ = energy2 - energy1 + } else { + deltaUJ = (^uint64(0) - energy1) + energy2 + 1 + } + + expectedDelta := uint64(1000000) + if deltaUJ != expectedDelta { + t.Errorf("Normal case: Expected delta %d, got %d", expectedDelta, deltaUJ) + } + + // Test wraparound case (energy2 < energy1 means counter wrapped) + energy1 = uint64(18446744073709551610) // Close to max uint64 + energy2 = uint64(100) // After wrap + + if energy2 >= energy1 { + deltaUJ = energy2 - energy1 + } else { + deltaUJ = (^uint64(0) - energy1) + energy2 + 1 + } + + // Max uint64 is 18446744073709551615 + // So delta should be: (max - 18446744073709551610) + 100 + 1 = 5 + 100 + 1 = 106 + expectedDelta = uint64(106) + if deltaUJ != expectedDelta { + t.Errorf("Wraparound case: Expected delta %d, got %d", expectedDelta, deltaUJ) + } +} diff --git a/pkg/agents/host/report.go b/pkg/agents/host/report.go index c1c8d77da..6aa1e29e0 100644 --- a/pkg/agents/host/report.go +++ b/pkg/agents/host/report.go @@ -101,6 +101,7 @@ type NetworkInterface struct { type Sensors struct { TemperatureCelsius map[string]float64 `json:"temperatureCelsius,omitempty"` FanRPM map[string]float64 `json:"fanRpm,omitempty"` + PowerWatts map[string]float64 `json:"powerWatts,omitempty"` // 
Power consumption (e.g., cpu_package, dram) Additional map[string]float64 `json:"additional,omitempty"` SMART []DiskSMART `json:"smart,omitempty"` // S.M.A.R.T. disk data }