// Source: Pulse/internal/sensors/power.go
// Snapshot: 2026-03-18 16:06:30 +00:00 (443 lines, 12 KiB, Go)

package sensors
import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/rs/zerolog/log"
)
// PowerData contains power consumption readings from the system.
// Each wattage field is derived from the difference between two energy
// counter samples; a field stays at zero when no matching domain was read.
type PowerData struct {
// PackageWatts is the CPU package power consumption in watts.
// This is the total power for the CPU socket (all cores + uncore).
// Readings from every package-level domain are added together.
PackageWatts float64
// CoreWatts is the CPU cores-only power consumption in watts (if available).
// Readings from every core-level domain are added together.
CoreWatts float64
// DRAMWatts is the DRAM power consumption in watts (if available).
// Note: Not all platforms support DRAM power measurement.
// Only the RAPL collector fills this field; the AMD collector leaves it zero.
DRAMWatts float64
// Available indicates whether any power data was successfully collected,
// i.e. at least one domain was present in both energy samples.
Available bool
// Source indicates the method used: "rapl", "amd_energy", or empty.
Source string
}
// raplBasePath is the base path for Intel RAPL (Running Average Power Limit) readings.
// RAPL provides energy counters that we sample to calculate power.
// It is declared as a variable rather than a constant — presumably so tests
// can point it at a fake sysfs tree; confirm against the package's tests.
var raplBasePath = "/sys/class/powercap/intel-rapl"
// sampleInterval is the pause between the two energy counter readings.
// Shorter intervals are less accurate; longer intervals add latency to
// every collection.
const sampleInterval = 100 * time.Millisecond

// waitForSample blocks for sampleInterval or until ctx is done, whichever
// happens first. It returns nil after a full interval and ctx.Err() when the
// context ended the wait early.
func waitForSample(ctx context.Context) error {
	pause := time.NewTimer(sampleInterval)

	var result error
	select {
	case <-pause.C:
		// Full interval elapsed; result stays nil.
	case <-ctx.Done():
		result = ctx.Err()
	}

	// Release the timer. If it already fired and its value was not consumed
	// above, take it off the channel without blocking.
	if stopped := pause.Stop(); !stopped {
		select {
		case <-pause.C:
		default:
		}
	}
	return result
}
// CollectPower samples the host's CPU power consumption.
// It tries the Intel RAPL interface first and then the AMD energy driver;
// the first backend that yields an available reading wins.
// When neither backend produces data it returns a nil *PowerData together
// with an error whose message embeds both backend errors.
func CollectPower(ctx context.Context) (*PowerData, error) {
	ctx = normalizeCollectionContext(ctx)

	// Intel RAPL (most common on Intel).
	intel, intelErr := collectRAPL(ctx)
	switch {
	case intelErr != nil:
		log.Debug().
			Str("component", "sensors_power").
			Str("action", "collect_rapl_failed").
			Err(intelErr).
			Msg("Failed to collect Intel RAPL power data")
	case intel != nil && intel.Available:
		return intel, nil
	}

	// AMD energy driver (for AMD Ryzen/EPYC).
	amd, amdErr := collectAMDEnergy(ctx)
	switch {
	case amdErr != nil:
		log.Debug().
			Str("component", "sensors_power").
			Str("action", "collect_amd_energy_failed").
			Err(amdErr).
			Msg("Failed to collect AMD energy power data")
	case amd != nil && amd.Available:
		return amd, nil
	}

	// TODO: Add IPMI support for server BMCs
	return nil, fmt.Errorf("no power monitoring available (rapl: %v, amd_energy: %v)", intelErr, amdErr)
}
// collectRAPL reads power data from the Intel RAPL sysfs interface.
// RAPL exposes energy counters in microjoules; the counters are sampled
// twice, one sampleInterval apart, and the energy difference is divided by
// the time that actually elapsed between the two samples to obtain watts.
//
// It returns an error when the RAPL tree is missing or unreadable, when no
// package directory is found, when either sample fails, or when ctx ends
// during the wait. On success the returned PowerData has Source "rapl";
// Available is true only if at least one domain appeared in both samples.
func collectRAPL(ctx context.Context) (*PowerData, error) {
	ctx = normalizeCollectionContext(ctx)

	// Check if RAPL is available.
	if _, err := os.Stat(raplBasePath); err != nil {
		if os.IsNotExist(err) {
			return nil, fmt.Errorf("RAPL not available: %w", err)
		}
		return nil, fmt.Errorf("check RAPL availability: %w", err)
	}

	// Find all RAPL package directories.
	// Typically: intel-rapl:0 (package), intel-rapl:0:0 (core), intel-rapl:0:1 (uncore), etc.
	packages, err := filepath.Glob(filepath.Join(raplBasePath, "intel-rapl:*"))
	if err != nil {
		return nil, fmt.Errorf("find RAPL packages: %w", err)
	}
	if len(packages) == 0 {
		return nil, fmt.Errorf("no RAPL packages found")
	}

	// First sample. The timestamp is taken right before the read pass so the
	// two passes are measured start-to-start.
	firstAt := time.Now()
	sample1, err := readRAPLEnergy(packages)
	if err != nil {
		return nil, fmt.Errorf("first RAPL sample failed: %w", err)
	}

	// Wait for the sample interval, honoring cancellation.
	if err := waitForSample(ctx); err != nil {
		return nil, fmt.Errorf("RAPL sampling canceled: %w", err)
	}

	// Second sample.
	secondAt := time.Now()
	sample2, err := readRAPLEnergy(packages)
	if err != nil {
		return nil, fmt.Errorf("second RAPL sample failed: %w", err)
	}

	// Power (W) = Energy delta (J) / Time delta (s).
	// Use the measured gap rather than the nominal interval: the wait can
	// overshoot under load, which would otherwise inflate the result.
	duration := secondAt.Sub(firstAt).Seconds()
	if duration <= 0 {
		duration = sampleInterval.Seconds()
	}

	data := &PowerData{Source: "rapl"}
	for domain, energy1 := range sample1 {
		energy2, ok := sample2[domain]
		if !ok {
			continue
		}

		// Convert the microjoule delta to watts.
		watts := float64(energyDelta(energy1, energy2)) / 1e6 / duration

		// When a domain's name file could not be read, the key is the sysfs
		// directory name: "intel-rapl:N" for a package and "intel-rapl:N:M"
		// for a subdomain. Only the single-colon form is a package.
		name := strings.ToLower(domain)
		packageByPath := strings.HasPrefix(name, "intel-rapl:") && strings.Count(name, ":") == 1

		switch {
		case strings.Contains(name, "package") || packageByPath:
			// Package-level (total CPU socket power).
			data.PackageWatts += watts
		case strings.Contains(name, "uncore"):
			// "uncore" contains "core" but is not cores-only power, and
			// PowerData has no field for it, so it is not attributed.
		case strings.Contains(name, "core"):
			data.CoreWatts += watts
		case strings.Contains(name, "dram"):
			data.DRAMWatts += watts
		}
		data.Available = true
	}

	if data.Available {
		log.Debug().
			Float64("packageWatts", data.PackageWatts).
			Float64("coreWatts", data.CoreWatts).
			Float64("dramWatts", data.DRAMWatts).
			Msg("Collected RAPL power data")
	}
	return data, nil
}
// readRAPLEnergy reads the energy counters of the given RAPL package
// directories and of the "intel-rapl:*" subdomain directories inside each.
// It returns a map of domain name -> energy in microjoules.
//
// The key is the content of the domain's "name" file, or the directory's base
// name when that file cannot be read. A domain whose energy_uj file cannot be
// read is skipped with a debug log. An error is returned only when listing
// subdomains fails or when no reading at all could be collected.
//
// NOTE(review): keys are not qualified by package, so if two domains report
// the same name the later reading overwrites the earlier one.
func readRAPLEnergy(packages []string) (map[string]uint64, error) {
	result := make(map[string]uint64)

	for _, pkgPath := range packages {
		// Package-level energy. A failure here does not stop the subdomains
		// of the same package from being read below.
		energyPath := filepath.Join(pkgPath, "energy_uj")
		energy, err := readUint64File(energyPath)
		if err != nil {
			log.Debug().
				Str("path", energyPath).
				Err(err).
				Msg("Skipping RAPL package energy reading")
		} else {
			name := filepath.Base(pkgPath)
			// Prefer the human-readable domain name if available.
			namePath := filepath.Join(pkgPath, "name")
			if domainName, nameErr := readStringFile(namePath); nameErr == nil {
				name = domainName
			} else {
				// Log the name-read error itself; err is nil in this branch.
				log.Debug().
					Str("component", "sensors_power").
					Str("action", "read_rapl_domain_name_failed").
					Str("name_path", namePath).
					Err(nameErr).
					Msg("Failed to read RAPL domain name, using path fallback")
			}
			result[name] = energy
		}

		// Subdomain energy (core, uncore, dram).
		subdomains, err := filepath.Glob(filepath.Join(pkgPath, "intel-rapl:*"))
		if err != nil {
			return nil, fmt.Errorf("find RAPL subdomains for %s: %w", pkgPath, err)
		}
		for _, subPath := range subdomains {
			energyPath := filepath.Join(subPath, "energy_uj")
			energy, err := readUint64File(energyPath)
			if err != nil {
				log.Debug().
					Str("path", energyPath).
					Err(err).
					Msg("Skipping RAPL subdomain energy reading")
				continue
			}

			name := filepath.Base(subPath)
			namePath := filepath.Join(subPath, "name")
			if domainName, nameErr := readStringFile(namePath); nameErr == nil {
				name = domainName
			} else {
				log.Debug().
					Str("path", namePath).
					Err(nameErr).
					Msg("Using fallback RAPL subdomain name")
			}
			result[name] = energy
		}
	}

	if len(result) == 0 {
		return nil, fmt.Errorf("no RAPL energy readings available")
	}
	return result, nil
}
// energyDelta returns how far an energy counter advanced from before to
// after. When after is smaller than before the counter is taken to have
// wrapped once past the top of the uint64 range.
//
// NOTE(review): this only corrects for a wrap at the 64-bit boundary. If the
// underlying counter wraps at a smaller maximum, a wrap between two samples
// yields a very large delta — confirm whether callers need to handle that.
func energyDelta(before, after uint64) uint64 {
	// Unsigned subtraction is performed modulo 2^64, so this single
	// expression gives after-before normally and the wrapped distance
	// (2^64 - before + after) when after < before.
	return after - before
}
// readUint64File loads the file at path and parses its content, with
// surrounding whitespace removed, as a base-10 unsigned 64-bit integer.
// It returns 0 and a wrapped error if the file cannot be read or if the
// content is not a valid uint64.
func readUint64File(path string) (uint64, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return 0, fmt.Errorf("read %s: %w", path, readErr)
	}

	text := strings.TrimSpace(string(raw))
	parsed, parseErr := strconv.ParseUint(text, 10, 64)
	if parseErr != nil {
		return 0, fmt.Errorf("parse uint64 from %s: %w", path, parseErr)
	}
	return parsed, nil
}
// readStringFile loads the file at path and returns its content as a string
// with leading and trailing whitespace removed. It returns an empty string
// and a wrapped error if the file cannot be read.
func readStringFile(path string) (string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", fmt.Errorf("read %s: %w", path, readErr)
	}

	trimmed := strings.TrimSpace(string(raw))
	return trimmed, nil
}
// hwmonBasePath is the base path for hwmon devices (used by AMD energy driver).
// findAMDEnergyHwmon lists this directory and looks for the device whose
// "name" file reads "amd_energy".
// It is declared as a variable rather than a constant — presumably so tests
// can point it at a fake sysfs tree; confirm against the package's tests.
var hwmonBasePath = "/sys/class/hwmon"
// collectAMDEnergy reads power data from the AMD energy driver via hwmon.
// The driver's energy counters are sampled twice, one sampleInterval apart,
// and the energy difference is divided by the time that actually elapsed
// between the two samples to obtain watts.
//
// It returns an error when no amd_energy hwmon device is found, when either
// sample fails, or when ctx ends during the wait. On success the returned
// PowerData has Source "amd_energy"; Available is true only if at least one
// counter appeared in both samples.
func collectAMDEnergy(ctx context.Context) (*PowerData, error) {
	ctx = normalizeCollectionContext(ctx)

	// Find the hwmon device backed by the amd_energy driver.
	hwmonPath, err := findAMDEnergyHwmon()
	if err != nil {
		return nil, fmt.Errorf("find AMD energy hwmon: %w", err)
	}

	// First sample. The timestamp is taken right before the read pass so the
	// two passes are measured start-to-start.
	firstAt := time.Now()
	sample1, err := readAMDEnergy(hwmonPath)
	if err != nil {
		return nil, fmt.Errorf("first AMD energy sample failed: %w", err)
	}

	// Wait for the sample interval, honoring cancellation.
	if err := waitForSample(ctx); err != nil {
		return nil, fmt.Errorf("AMD energy sampling canceled: %w", err)
	}

	// Second sample.
	secondAt := time.Now()
	sample2, err := readAMDEnergy(hwmonPath)
	if err != nil {
		return nil, fmt.Errorf("second AMD energy sample failed: %w", err)
	}

	// Use the measured gap rather than the nominal interval: the wait can
	// overshoot under load, which would otherwise inflate the result.
	duration := secondAt.Sub(firstAt).Seconds()
	if duration <= 0 {
		duration = sampleInterval.Seconds()
	}

	data := &PowerData{Source: "amd_energy"}

	// Readings whose label names neither a socket/package nor a core are
	// only a fallback for package power. Map iteration order is random, so
	// the candidate is chosen by smallest label and applied after the loop;
	// that keeps the result independent of iteration order.
	var (
		fallbackLabel string
		fallbackWatts float64
		haveFallback  bool
	)

	for label, energy1 := range sample1 {
		energy2, ok := sample2[label]
		if !ok {
			continue
		}

		// Convert the microjoule delta to watts.
		watts := float64(energyDelta(energy1, energy2)) / 1e6 / duration

		labelLower := strings.ToLower(label)
		switch {
		case strings.Contains(labelLower, "socket") || strings.Contains(labelLower, "package"):
			data.PackageWatts += watts
		case strings.Contains(labelLower, "core"):
			data.CoreWatts += watts
		default:
			if !haveFallback || label < fallbackLabel {
				fallbackLabel = label
				fallbackWatts = watts
				haveFallback = true
			}
		}
		data.Available = true
	}

	// Default to the fallback reading only when no package power was found.
	if haveFallback && data.PackageWatts == 0 {
		data.PackageWatts = fallbackWatts
	}

	if data.Available {
		log.Debug().
			Float64("packageWatts", data.PackageWatts).
			Float64("coreWatts", data.CoreWatts).
			Msg("Collected AMD energy power data")
	}
	return data, nil
}
// findAMDEnergyHwmon returns the path of the hwmon device under hwmonBasePath
// whose "name" file reads "amd_energy". It returns an error if hwmonBasePath
// cannot be listed or if no such device exists. Devices whose name file
// cannot be read are skipped with a debug log.
func findAMDEnergyHwmon() (string, error) {
	entries, err := os.ReadDir(hwmonBasePath)
	if err != nil {
		return "", fmt.Errorf("cannot read hwmon: %w", err)
	}

	for _, entry := range entries {
		// Entries in the sysfs hwmon class directory are symbolic links to
		// the device directories, and IsDir reports false for a symlink.
		// Accept symlinks as well; one that does not lead to a device
		// directory is rejected below when its name file cannot be read.
		if !entry.IsDir() && entry.Type()&os.ModeSymlink == 0 {
			continue
		}

		hwmonDir := filepath.Join(hwmonBasePath, entry.Name())
		namePath := filepath.Join(hwmonDir, "name")
		name, err := readStringFile(namePath)
		if err != nil {
			log.Debug().
				Str("path", namePath).
				Err(err).
				Msg("Skipping hwmon device without readable name")
			continue
		}
		if name == "amd_energy" {
			return hwmonDir, nil
		}
	}
	return "", fmt.Errorf("amd_energy hwmon device not found")
}
// readAMDEnergy reads every energy*_input counter of the AMD energy hwmon
// device at hwmonPath. It returns a map of label -> energy in microjoules.
//
// The key is the content of the matching energy*_label file, or the counter's
// file name when the label is unreadable or empty. A counter that cannot be
// read is skipped with a debug log. An error is returned when the file
// listing fails, when the device exposes no energy files, or when none of
// them could be read.
func readAMDEnergy(hwmonPath string) (map[string]uint64, error) {
	// AMD energy exposes energy*_input files (in microjoules).
	energyFiles, err := filepath.Glob(filepath.Join(hwmonPath, "energy*_input"))
	if err != nil {
		return nil, fmt.Errorf("find AMD energy files in %s: %w", hwmonPath, err)
	}
	if len(energyFiles) == 0 {
		return nil, fmt.Errorf("no AMD energy files found")
	}

	result := make(map[string]uint64, len(energyFiles))
	for _, energyPath := range energyFiles {
		energy, err := readUint64File(energyPath)
		if err != nil {
			log.Debug().
				Str("component", "sensors_power").
				Str("action", "read_amd_energy_failed").
				Str("energy_path", energyPath).
				Err(err).
				Msg("Failed to read AMD energy counter")
			continue
		}

		// energy1_input -> energy1_label. Only the trailing "_input" is
		// swapped, so an "_input" elsewhere in the path is left alone.
		labelPath := strings.TrimSuffix(energyPath, "_input") + "_label"
		fileName := filepath.Base(energyPath)

		label, err := readStringFile(labelPath)
		switch {
		case err != nil:
			log.Debug().
				Str("component", "sensors_power").
				Str("action", "read_amd_label_failed").
				Str("label_path", labelPath).
				Err(err).
				Msg("Failed to read AMD energy label, using filename fallback")
			label = fileName
		case label == "":
			// An empty label would make distinct counters share one key.
			label = fileName
		}
		result[label] = energy
	}

	if len(result) == 0 {
		return nil, fmt.Errorf("no AMD energy readings available")
	}
	return result, nil
}