feat: add power consumption monitoring (Intel RAPL + AMD Energy)

- Add power.go with Intel RAPL and AMD energy driver support
- Read CPU package, core, and DRAM power consumption in watts
- Sample energy counters over a 100ms interval to calculate power
- Add PowerWatts field to Sensors struct for API reporting
- Integrate power collection into host agent sensor gathering
- Add comprehensive tests for power collection module

Supports Intel CPUs (Sandy Bridge+) via RAPL and AMD Ryzen/EPYC
via the amd_energy kernel module.

Closes community-scripts/ProxmoxVE#9575
This commit is contained in:
rcourtman 2025-12-25 21:14:12 +00:00
parent 1329c08305
commit 08c04b78ae
4 changed files with 649 additions and 0 deletions

View file

@ -88,6 +88,7 @@ var (
hostmetricsCollect = hostmetrics.Collect
sensorsCollectLocal = sensors.CollectLocal
sensorsParse = sensors.Parse
sensorsCollectPower = sensors.CollectPower
mdadmCollectArrays = mdadm.CollectArrays
cephCollect = ceph.Collect
smartctlCollectLocal = smartctl.CollectLocal
@ -561,9 +562,28 @@ func (a *Agent) collectTemperatures(ctx context.Context) agentshost.Sensors {
}
}
// Collect power consumption data (Intel RAPL, etc.)
if powerData, err := sensorsCollectPower(ctx); err == nil && powerData.Available {
result.PowerWatts = make(map[string]float64)
if powerData.PackageWatts > 0 {
result.PowerWatts["cpu_package"] = powerData.PackageWatts
}
if powerData.CoreWatts > 0 {
result.PowerWatts["cpu_core"] = powerData.CoreWatts
}
if powerData.DRAMWatts > 0 {
result.PowerWatts["dram"] = powerData.DRAMWatts
}
a.logger.Debug().
Float64("packageWatts", powerData.PackageWatts).
Str("source", powerData.Source).
Msg("Collected power data")
}
a.logger.Debug().
Int("temperatureCount", len(result.TemperatureCelsius)).
Int("fanCount", len(result.FanRPM)).
Int("powerCount", len(result.PowerWatts)).
Int("additionalCount", len(result.Additional)).
Msg("Collected sensor data")

348
internal/sensors/power.go Normal file
View file

@ -0,0 +1,348 @@
package sensors
import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/rs/zerolog/log"
)
// PowerData contains power consumption readings from the system.
//
// The wattage fields are derived from two energy-counter samples taken
// sampleInterval apart, so each value is an average over that short window
// rather than a long-term figure. A field left at zero means no matching
// domain contributed a reading.
type PowerData struct {
// PackageWatts is the CPU package power consumption in watts.
// This is the total power for the CPU socket (all cores + uncore).
// Readings from several package/socket domains are added together.
PackageWatts float64
// CoreWatts is the CPU cores-only power consumption in watts (if available).
CoreWatts float64
// DRAMWatts is the DRAM power consumption in watts (if available).
// Note: Not all platforms support DRAM power measurement.
// Only the RAPL collector populates this field; the AMD energy collector never sets it.
DRAMWatts float64
// Available indicates whether any power data was successfully collected.
// CollectPower only returns a PowerData whose Available flag is true.
Available bool
// Source indicates the method used: "rapl", "amd_energy", or empty.
// The empty string is the zero value, i.e. no collector filled the struct.
Source string
}
// raplBasePath is the base path for Intel RAPL (Running Average Power Limit) readings.
// RAPL provides energy counters that we sample to calculate power.
// Package-level domains are discovered by globbing "intel-rapl:*" below this directory.
const raplBasePath = "/sys/class/powercap/intel-rapl"
// sampleInterval is the time between energy counter readings.
// Shorter intervals are less accurate; longer intervals add latency.
// Each collector waits this long between its two samples, so a power
// collection takes at least this long to complete.
const sampleInterval = 100 * time.Millisecond
// CollectPower reads power consumption data from the system.
//
// It tries Intel RAPL first and falls back to the AMD energy driver. On
// success the returned PowerData is non-nil and has Available set to true.
//
// When no source yields a reading it returns a nil PowerData together with a
// non-nil error. If ctx was cancelled or timed out, that error is ctx.Err(),
// so callers can tell an interrupted collection apart from hardware that
// simply has no supported power interface.
func CollectPower(ctx context.Context) (*PowerData, error) {
	// Try Intel RAPL first (most common on Intel)
	if data, err := collectRALP(ctx); err == nil && data.Available {
		return data, nil
	}

	// Do not start a second sampling window (or hide the real cause behind a
	// generic "not available" error) once the caller has given up.
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	// Try AMD energy driver (for AMD Ryzen/EPYC)
	if data, err := collectAMDEnergy(ctx); err == nil && data.Available {
		return data, nil
	}
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	// TODO: Add IPMI support for server BMCs
	return nil, fmt.Errorf("no power monitoring available")
}
// collectRALP reads power data from the Intel RAPL sysfs interface.
//
// RAPL exposes cumulative energy counters in microjoules. The counters are
// sampled twice, roughly sampleInterval apart, and the energy delta divided by
// the measured elapsed time gives the average power in watts for that window.
//
// It returns an error when RAPL is absent, when no counter can be read, or
// when ctx is done before the second sample. A nil error with
// Available == false means counters were read but none of them produced a
// package, core or DRAM reading.
func collectRALP(ctx context.Context) (*PowerData, error) {
	// Check if RAPL is available
	if _, err := os.Stat(raplBasePath); os.IsNotExist(err) {
		return nil, fmt.Errorf("RAPL not available: %w", err)
	}

	// Find the top-level RAPL domains, e.g. intel-rapl:0 (package).
	// Subdomains such as intel-rapl:0:0 (core) are picked up by readRAPLEnergy.
	packages, err := filepath.Glob(filepath.Join(raplBasePath, "intel-rapl:*"))
	if err != nil || len(packages) == 0 {
		return nil, fmt.Errorf("no RAPL packages found")
	}

	// First sample
	sample1, err := readRAPLEnergy(packages)
	if err != nil {
		return nil, fmt.Errorf("first RAPL sample failed: %w", err)
	}
	start := time.Now()

	// Wait for the sample interval, but give up as soon as ctx is done.
	timer := time.NewTimer(sampleInterval)
	defer timer.Stop()
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-timer.C:
	}

	// Second sample
	sample2, err := readRAPLEnergy(packages)
	if err != nil {
		return nil, fmt.Errorf("second RAPL sample failed: %w", err)
	}

	// Power (W) = Energy delta (J) / Time delta (s).
	// Use the measured time: the timer may fire late and reading sysfs takes
	// time too, so dividing by the nominal interval would overstate power.
	elapsed := time.Since(start).Seconds()
	if elapsed <= 0 {
		elapsed = sampleInterval.Seconds()
	}

	data := &PowerData{Source: "rapl"}
	for domain, energy1 := range sample1 {
		energy2, ok := sample2[domain]
		if !ok {
			continue
		}

		// A smaller second reading means the counter wrapped or was reset.
		// Per the powercap sysfs ABI the wrap point is the domain's
		// max_energy_range_uj, not the uint64 maximum, and that range is not
		// available here; skip the domain for this cycle instead of reporting
		// a wildly wrong wattage.
		if energy2 < energy1 {
			continue
		}

		// Convert microjoules to watts
		watts := float64(energy2-energy1) / 1e6 / elapsed

		// Categorize by domain name
		domainLower := strings.ToLower(domain)
		switch {
		case strings.Contains(domainLower, "package"),
			// Fallback name when the "name" file was unreadable: a top-level
			// directory looks like "intel-rapl:N" (exactly one colon), while
			// subdomains look like "intel-rapl:N:M" and must not count here.
			strings.HasPrefix(domainLower, "intel-rapl:") && strings.Count(domainLower, ":") == 1:
			// Package-level (total CPU socket power)
			data.PackageWatts += watts
		case strings.Contains(domainLower, "core") && !strings.Contains(domainLower, "uncore"):
			// "uncore" contains "core" but is a separate domain.
			data.CoreWatts += watts
		case strings.Contains(domainLower, "dram"):
			data.DRAMWatts += watts
		default:
			// Domains that map to no PowerData field (e.g. uncore) contribute
			// nothing, so they must not mark the result as available.
			continue
		}
		data.Available = true
	}

	if data.Available {
		log.Debug().
			Float64("packageWatts", data.PackageWatts).
			Float64("coreWatts", data.CoreWatts).
			Float64("dramWatts", data.DRAMWatts).
			Msg("Collected RAPL power data")
	}
	return data, nil
}
// readRAPLEnergy reads energy counters from all RAPL domains.
//
// packages lists the top-level RAPL domain directories; for each of them the
// directory's own energy_uj is read, followed by every "intel-rapl:*"
// subdomain directory inside it (core, uncore, dram).
//
// Returns a map of domain name -> energy in microjoules. The name comes from
// the domain's "name" file, falling back to the directory basename. Domains
// that share a name (for example the "core" or "dram" subdomain of each socket
// on a multi-socket host) are summed rather than overwriting one another, so
// the delta between two calls covers every socket. Unreadable domains are
// skipped; an error is returned only when nothing at all could be read.
func readRAPLEnergy(packages []string) (map[string]uint64, error) {
	result := make(map[string]uint64)

	// addDomain folds the counter of one RAPL domain directory into result.
	addDomain := func(dir string) {
		energy, err := readUint64File(filepath.Join(dir, "energy_uj"))
		if err != nil {
			// Best effort: a missing or unreadable counter only drops this domain.
			return
		}
		name := filepath.Base(dir)
		if domainName, err := readStringFile(filepath.Join(dir, "name")); err == nil && domainName != "" {
			name = domainName
		}
		result[name] += energy
	}

	for _, pkgPath := range packages {
		// The package-level counter itself.
		addDomain(pkgPath)

		// Subdomain counters (core, uncore, dram). Glob's only error is a
		// malformed pattern; treating that as "no subdomains" keeps the
		// package reading usable.
		subdomains, _ := filepath.Glob(filepath.Join(pkgPath, "intel-rapl:*"))
		for _, subPath := range subdomains {
			addDomain(subPath)
		}
	}

	if len(result) == 0 {
		return nil, fmt.Errorf("no RAPL energy readings available")
	}
	return result, nil
}
// readUint64File loads the file at path and parses its content as a base-10
// unsigned 64-bit integer, ignoring surrounding whitespace.
// The read error or the parse error is returned unchanged.
func readUint64File(path string) (uint64, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}
	text := strings.TrimSpace(string(raw))
	return strconv.ParseUint(text, 10, 64)
}
// readStringFile loads the file at path and returns its content with leading
// and trailing whitespace removed. A read failure yields "" and the error.
func readStringFile(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	trimmed := strings.TrimSpace(string(raw))
	return trimmed, nil
}
// hwmonBasePath is the base path for hwmon devices (used by AMD energy driver).
// Each entry below it is probed for a "name" file to identify its driver.
const hwmonBasePath = "/sys/class/hwmon"
// collectAMDEnergy reads power data from the AMD energy driver via hwmon.
//
// The amd_energy module exposes cumulative energy counters similar to Intel
// RAPL. They are sampled twice, roughly sampleInterval apart, and the energy
// delta divided by the measured elapsed time gives the average power in watts.
//
// Readings whose label mentions "socket" or "package" are summed into
// PackageWatts and those mentioning "core" into CoreWatts. When no
// socket/package reading exists, a single reading with any other label is used
// as the package figure instead.
//
// It returns an error when the hwmon device is missing, when no counter can be
// read, or when ctx is done before the second sample.
func collectAMDEnergy(ctx context.Context) (*PowerData, error) {
	// Find hwmon device with amd_energy driver
	hwmonPath, err := findAMDEnergyHwmon()
	if err != nil {
		return nil, err
	}

	// First sample
	sample1, err := readAMDEnergy(hwmonPath)
	if err != nil {
		return nil, fmt.Errorf("first AMD energy sample failed: %w", err)
	}
	start := time.Now()

	// Wait for the sample interval, but give up as soon as ctx is done.
	timer := time.NewTimer(sampleInterval)
	defer timer.Stop()
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-timer.C:
	}

	// Second sample
	sample2, err := readAMDEnergy(hwmonPath)
	if err != nil {
		return nil, fmt.Errorf("second AMD energy sample failed: %w", err)
	}

	// Use the measured time: the timer may fire late and reading sysfs takes
	// time too, so dividing by the nominal interval would overstate power.
	elapsed := time.Since(start).Seconds()
	if elapsed <= 0 {
		elapsed = sampleInterval.Seconds()
	}

	data := &PowerData{Source: "amd_energy"}

	// Map iteration order is random, so the fallback for readings without a
	// recognised label is chosen after the loop rather than by whichever
	// entry happens to be visited first.
	var (
		havePackage   bool
		haveFallback  bool
		fallbackLabel string
		fallbackWatts float64
	)

	for label, energy1 := range sample1 {
		energy2, ok := sample2[label]
		if !ok {
			continue
		}

		// A smaller second reading means the counter wrapped or was reset
		// (presumably e.g. a driver reload). The wrap point is not known
		// here, so skip the reading for this cycle rather than report a
		// wildly wrong wattage.
		if energy2 < energy1 {
			continue
		}

		// Convert microjoules to watts
		watts := float64(energy2-energy1) / 1e6 / elapsed

		// Categorize by label
		labelLower := strings.ToLower(label)
		switch {
		case strings.Contains(labelLower, "socket") || strings.Contains(labelLower, "package"):
			data.PackageWatts += watts
			havePackage = true
		case strings.Contains(labelLower, "core"):
			data.CoreWatts += watts
		default:
			// Remember one unrecognised reading; the lexicographically
			// smallest label wins so the choice is stable between runs.
			if !haveFallback || label < fallbackLabel {
				fallbackLabel, fallbackWatts, haveFallback = label, watts, true
			}
		}
		data.Available = true
	}

	// Default to an unlabeled reading for package power only when no
	// socket/package reading was found, so the two are never added together.
	if !havePackage && haveFallback {
		data.PackageWatts = fallbackWatts
	}

	if data.Available {
		log.Debug().
			Float64("packageWatts", data.PackageWatts).
			Float64("coreWatts", data.CoreWatts).
			Msg("Collected AMD energy power data")
	}
	return data, nil
}
// findAMDEnergyHwmon finds the hwmon device path for the amd_energy driver.
//
// It scans every entry below hwmonBasePath and returns the first one whose
// "name" file reads "amd_energy". An error is returned when the hwmon
// directory cannot be listed or no such device exists.
func findAMDEnergyHwmon() (string, error) {
	entries, err := os.ReadDir(hwmonBasePath)
	if err != nil {
		return "", fmt.Errorf("cannot read hwmon: %w", err)
	}

	for _, entry := range entries {
		// Do not filter on entry.IsDir(): entries under /sys/class/hwmon are
		// symlinks to the device directories, and DirEntry.IsDir reports false
		// for a symlink, which would skip every device. Reading the "name"
		// file follows the link and fails harmlessly for anything that is not
		// a hwmon device.
		hwmonDir := filepath.Join(hwmonBasePath, entry.Name())
		name, err := readStringFile(filepath.Join(hwmonDir, "name"))
		if err != nil {
			continue
		}
		if name == "amd_energy" {
			return hwmonDir, nil
		}
	}

	return "", fmt.Errorf("amd_energy hwmon device not found")
}
// readAMDEnergy reads energy counters from an AMD energy hwmon device.
//
// hwmonPath is the device directory; every "energy*_input" file in it is read
// as one counter. Returns a map of label -> energy in microjoules, where the
// label comes from the matching "energy*_label" file and falls back to the
// input file's name. Unreadable counters are skipped; an error is returned
// when the directory has no energy files or none of them could be read.
func readAMDEnergy(hwmonPath string) (map[string]uint64, error) {
	// AMD energy exposes energy*_input files (in microjoules)
	energyFiles, err := filepath.Glob(filepath.Join(hwmonPath, "energy*_input"))
	if err != nil || len(energyFiles) == 0 {
		return nil, fmt.Errorf("no AMD energy files found")
	}

	result := make(map[string]uint64, len(energyFiles))
	for _, energyPath := range energyFiles {
		energy, err := readUint64File(energyPath)
		if err != nil {
			continue
		}

		// energy1_input -> energy1_label. Swap the suffix only: replacing the
		// first "_input" anywhere in the path would corrupt it if a parent
		// directory name happened to contain that text.
		labelPath := strings.TrimSuffix(energyPath, "_input") + "_label"
		label, err := readStringFile(labelPath)
		if err != nil {
			// Use filename as fallback
			label = filepath.Base(energyPath)
		}
		result[label] = energy
	}

	if len(result) == 0 {
		return nil, fmt.Errorf("no AMD energy readings available")
	}
	return result, nil
}

View file

@ -0,0 +1,280 @@
package sensors
import (
"context"
"os"
"path/filepath"
"testing"
"time"
)
// TestCollectPower_NoRAPL exercises CollectPower against the real host.
// The outcome depends on the hardware: an error is accepted (and logged),
// while a successful call must return consistent, populated data.
func TestCollectPower_NoRAPL(t *testing.T) {
	// On most CI systems no power interface is exposed.
	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
	defer cancel()

	data, err := CollectPower(ctx)
	switch {
	case err != nil:
		// Expected on systems without RAPL
		t.Logf("Power collection unavailable (expected in CI): %v", err)
		return
	case data == nil:
		t.Fatal("Expected non-nil data when no error returned")
	}

	// From here on a power source was found.
	if !data.Available {
		t.Error("Expected data.Available to be true")
	}
	switch data.Source {
	case "rapl", "amd_energy":
		// known source
	default:
		t.Errorf("Expected source 'rapl' or 'amd_energy', got '%s'", data.Source)
	}
	t.Logf("Power data: Package=%.2fW, Core=%.2fW, DRAM=%.2fW (source: %s)",
		data.PackageWatts, data.CoreWatts, data.DRAMWatts, data.Source)
}
// TestPowerData_StructInitialization checks the zero value of PowerData:
// not available, no package power and no source.
func TestPowerData_StructInitialization(t *testing.T) {
	var zero PowerData
	data := &zero

	if data.Available {
		t.Error("Expected Available to be false by default")
	}
	if data.PackageWatts != 0 {
		t.Error("Expected PackageWatts to be 0 by default")
	}
	if data.Source != "" {
		t.Error("Expected Source to be empty by default")
	}
}
// TestReadUint64File covers readUint64File for a plain value, a value padded
// with whitespace, a missing file and non-numeric content.
func TestReadUint64File(t *testing.T) {
	dir := t.TempDir()
	target := filepath.Join(dir, "energy_uj")

	// put overwrites the fixture file, aborting the test if the write fails.
	put := func(content string) {
		if err := os.WriteFile(target, []byte(content), 0644); err != nil {
			t.Fatalf("Failed to write test file: %v", err)
		}
	}

	// Plain value with a trailing newline.
	put("123456789\n")
	got, err := readUint64File(target)
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if got != 123456789 {
		t.Errorf("Expected 123456789, got %d", got)
	}

	// Value surrounded by whitespace.
	put(" 987654321 \n")
	got, err = readUint64File(target)
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if got != 987654321 {
		t.Errorf("Expected 987654321, got %d", got)
	}

	// Missing file.
	if _, err = readUint64File(filepath.Join(dir, "nonexistent")); err == nil {
		t.Error("Expected error for non-existent file")
	}

	// Content that is not a number.
	put("not a number")
	if _, err = readUint64File(target); err == nil {
		t.Error("Expected error for invalid content")
	}
}
// TestReadStringFile covers readStringFile for a plain value, a value padded
// with whitespace and a missing file.
func TestReadStringFile(t *testing.T) {
	dir := t.TempDir()
	target := filepath.Join(dir, "name")

	// put overwrites the fixture file, aborting the test if the write fails.
	put := func(content string) {
		if err := os.WriteFile(target, []byte(content), 0644); err != nil {
			t.Fatalf("Failed to write test file: %v", err)
		}
	}

	// Plain value with a trailing newline.
	put("package-0\n")
	got, err := readStringFile(target)
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if got != "package-0" {
		t.Errorf("Expected 'package-0', got '%s'", got)
	}

	// Value surrounded by whitespace.
	put(" core \n")
	got, err = readStringFile(target)
	if err != nil {
		t.Errorf("Unexpected error: %v", err)
	}
	if got != "core" {
		t.Errorf("Expected 'core', got '%s'", got)
	}

	// Missing file.
	if _, err = readStringFile(filepath.Join(dir, "nonexistent")); err == nil {
		t.Error("Expected error for non-existent file")
	}
}
// TestReadRAPLEnergy builds a mock RAPL tree (one package with one "core"
// subdomain) in a temp directory and checks that readRAPLEnergy returns both
// counters keyed by the content of their "name" files.
func TestReadRAPLEnergy(t *testing.T) {
	root := t.TempDir()
	pkg0 := filepath.Join(root, "intel-rapl:0")
	core := filepath.Join(pkg0, "intel-rapl:0:0")

	// Directory layout: package first, then its core subdomain.
	if err := os.MkdirAll(pkg0, 0755); err != nil {
		t.Fatalf("Failed to create mock RAPL dir: %v", err)
	}
	if err := os.MkdirAll(core, 0755); err != nil {
		t.Fatalf("Failed to create mock core dir: %v", err)
	}

	// Energy and name files for both domains.
	fixtures := []struct {
		path       string
		content    string
		failFormat string
	}{
		{filepath.Join(pkg0, "energy_uj"), "1000000", "Failed to write energy file: %v"},
		{filepath.Join(pkg0, "name"), "package-0", "Failed to write name file: %v"},
		{filepath.Join(core, "energy_uj"), "500000", "Failed to write core energy file: %v"},
		{filepath.Join(core, "name"), "core", "Failed to write core name file: %v"},
	}
	for _, f := range fixtures {
		if err := os.WriteFile(f.path, []byte(f.content), 0644); err != nil {
			t.Fatalf(f.failFormat, err)
		}
	}

	result, err := readRAPLEnergy([]string{pkg0})
	if err != nil {
		t.Fatalf("Unexpected error: %v", err)
	}

	if count := len(result); count != 2 {
		t.Errorf("Expected 2 readings, got %d", count)
	}
	if got, found := result["package-0"]; !found || got != 1000000 {
		t.Errorf("Expected package-0=1000000, got %v", result)
	}
	if got, found := result["core"]; !found || got != 500000 {
		t.Errorf("Expected core=500000, got %v", result)
	}
}
// TestReadRAPLEnergy_NoFiles checks that a package directory without any
// energy files makes readRAPLEnergy return an error.
func TestReadRAPLEnergy_NoFiles(t *testing.T) {
	emptyPkg := filepath.Join(t.TempDir(), "intel-rapl:0")
	if err := os.MkdirAll(emptyPkg, 0755); err != nil {
		t.Fatalf("Failed to create mock RAPL dir: %v", err)
	}

	// The directory exists but holds no energy_uj file.
	if _, err := readRAPLEnergy([]string{emptyPkg}); err == nil {
		t.Error("Expected error when no energy files exist")
	}
}
// TestFindAMDEnergyHwmon_NotFound calls findAMDEnergyHwmon against the real
// host. Both outcomes are accepted; a successful lookup is only logged.
func TestFindAMDEnergyHwmon_NotFound(t *testing.T) {
	// An error is the usual result on systems without amd_energy.
	if _, err := findAMDEnergyHwmon(); err != nil {
		return
	}
	// If it succeeds, that's fine - means we're on an AMD system
	t.Log("AMD energy hwmon found (running on AMD system)")
}
// TestCollectPower_ContextCancellation calls CollectPower with a context that
// is already cancelled. An error is the expected result; an unexpected success
// is only logged, not treated as a failure.
func TestCollectPower_ContextCancellation(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // Cancel immediately

	// Should fail quickly due to cancelled context or no power available
	if _, err := CollectPower(ctx); err != nil {
		return
	}
	t.Log("CollectPower succeeded despite cancelled context (power data was cached or instant)")
}
// TestPowerCalculation checks the energy-to-power conversion formula:
//
//	Power (W) = Energy delta (µJ) / 1e6 / time (s)
//
// Results are compared with a small tolerance instead of exact equality,
// because 0.1 has no exact binary representation and a floating-point
// division may be off by a rounding error.
func TestPowerCalculation(t *testing.T) {
	const (
		duration  = 0.1  // seconds (100ms sampling window)
		tolerance = 1e-9 // watts
	)

	cases := []struct {
		deltaUJ       uint64
		expectedWatts float64
	}{
		{deltaUJ: 1000000, expectedWatts: 10.0}, // 1,000,000 µJ over 100ms = 10W
		{deltaUJ: 5000000, expectedWatts: 50.0}, // 5,000,000 µJ over 100ms = 50W
	}

	for _, tc := range cases {
		watts := float64(tc.deltaUJ) / 1e6 / duration
		if diff := watts - tc.expectedWatts; diff < -tolerance || diff > tolerance {
			t.Errorf("Expected %.2f W, got %.2f W", tc.expectedWatts, watts)
		}
	}
}
// TestCounterWraparound checks the delta arithmetic for a cumulative counter
// that is assumed to wrap at the uint64 maximum, for both the normal case and
// the wrapped case.
func TestCounterWraparound(t *testing.T) {
	// delta returns after-before, treating after < before as one wrap
	// around the uint64 range.
	delta := func(before, after uint64) uint64 {
		if after < before {
			return (^uint64(0) - before) + after + 1
		}
		return after - before
	}

	// Normal (no wraparound) case.
	var expectedDelta uint64 = 1000000
	if got := delta(1000000, 2000000); got != expectedDelta {
		t.Errorf("Normal case: Expected delta %d, got %d", expectedDelta, got)
	}

	// Wraparound case: the second reading is smaller than the first.
	// Max uint64 is 18446744073709551615, so the delta is
	// (max - 18446744073709551610) + 100 + 1 = 5 + 100 + 1 = 106.
	expectedDelta = 106
	if got := delta(18446744073709551610, 100); got != expectedDelta {
		t.Errorf("Wraparound case: Expected delta %d, got %d", expectedDelta, got)
	}
}

View file

@ -101,6 +101,7 @@ type NetworkInterface struct {
// Sensors groups the hardware sensor readings reported by the host agent.
// Every field is tagged omitempty, so empty collections are left out of the JSON.
type Sensors struct {
// TemperatureCelsius holds temperature readings; presumably keyed by sensor name — verify against the collector.
TemperatureCelsius map[string]float64 `json:"temperatureCelsius,omitempty"`
// FanRPM holds fan speed readings; presumably keyed by fan name — verify against the collector.
FanRPM map[string]float64 `json:"fanRpm,omitempty"`
// PowerWatts holds power readings in watts. The host agent fills the keys
// "cpu_package", "cpu_core" and "dram", each only when its value is above zero.
PowerWatts map[string]float64 `json:"powerWatts,omitempty"` // Power consumption (e.g., cpu_package, dram)
// Additional holds further numeric readings (NOTE(review): exact content is not visible here).
Additional map[string]float64 `json:"additional,omitempty"`
SMART []DiskSMART `json:"smart,omitempty"` // S.M.A.R.T. disk data
}