mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-07 08:57:12 +00:00
443 lines
12 KiB
Go
443 lines
12 KiB
Go
package sensors
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// PowerData holds one set of power readings, expressed in watts.
type PowerData struct {
	// PackageWatts is the power drawn by the CPU package, i.e. the whole
	// socket: all cores plus the uncore.
	PackageWatts float64

	// CoreWatts is the power drawn by the CPU cores alone, when the
	// platform reports it.
	CoreWatts float64

	// DRAMWatts is the power drawn by DRAM, when the platform reports it.
	// Some platforms do not expose a DRAM measurement.
	DRAMWatts float64

	// Available reports whether at least one power reading was collected.
	Available bool

	// Source names the collection method: "rapl", "amd_energy", or the
	// empty string when none applies.
	Source string
}
|
|
|
|
// raplBasePath is the sysfs directory under which the Intel RAPL (Running
// Average Power Limit) zones are looked up. RAPL exposes energy counters;
// power is derived by sampling them twice.
var raplBasePath = "/sys/class/powercap/intel-rapl"
|
|
|
|
// sampleInterval is how long to wait between the two energy counter reads.
// A shorter wait makes the derived power less accurate; a longer wait makes
// every collection slower.
const sampleInterval = 100 * time.Millisecond
|
|
|
|
func waitForSample(ctx context.Context) error {
|
|
timer := time.NewTimer(sampleInterval)
|
|
defer func() {
|
|
if !timer.Stop() {
|
|
select {
|
|
case <-timer.C:
|
|
default:
|
|
}
|
|
}
|
|
}()
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case <-timer.C:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// CollectPower reads power consumption data from the system.
|
|
// Supports Intel RAPL and AMD energy driver.
|
|
// Returns nil if no power data is available.
|
|
func CollectPower(ctx context.Context) (*PowerData, error) {
|
|
ctx = normalizeCollectionContext(ctx)
|
|
|
|
// Try Intel RAPL first (most common on Intel)
|
|
raplData, raplErr := collectRAPL(ctx)
|
|
if raplErr == nil && raplData != nil && raplData.Available {
|
|
return raplData, nil
|
|
}
|
|
if raplErr != nil {
|
|
log.Debug().
|
|
Str("component", "sensors_power").
|
|
Str("action", "collect_rapl_failed").
|
|
Err(raplErr).
|
|
Msg("Failed to collect Intel RAPL power data")
|
|
}
|
|
|
|
// Try AMD energy driver (for AMD Ryzen/EPYC)
|
|
amdData, amdErr := collectAMDEnergy(ctx)
|
|
if amdErr == nil && amdData != nil && amdData.Available {
|
|
return amdData, nil
|
|
}
|
|
if amdErr != nil {
|
|
log.Debug().
|
|
Str("component", "sensors_power").
|
|
Str("action", "collect_amd_energy_failed").
|
|
Err(amdErr).
|
|
Msg("Failed to collect AMD energy power data")
|
|
}
|
|
|
|
// TODO: Add IPMI support for server BMCs
|
|
|
|
return nil, fmt.Errorf("no power monitoring available (rapl: %v, amd_energy: %v)", raplErr, amdErr)
|
|
}
|
|
|
|
// collectRAPL reads power data from Intel RAPL sysfs interface.
|
|
// RAPL provides energy counters in microjoules that we sample twice
|
|
// to calculate instantaneous power in watts.
|
|
func collectRAPL(ctx context.Context) (*PowerData, error) {
|
|
ctx = normalizeCollectionContext(ctx)
|
|
|
|
// Check if RAPL is available
|
|
if _, err := os.Stat(raplBasePath); err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, fmt.Errorf("RAPL not available: %w", err)
|
|
}
|
|
return nil, fmt.Errorf("check RAPL availability: %w", err)
|
|
}
|
|
|
|
data := &PowerData{Source: "rapl"}
|
|
|
|
// Find all RAPL domains (packages)
|
|
// Typically: intel-rapl:0 (package), intel-rapl:0:0 (core), intel-rapl:0:1 (uncore), etc.
|
|
packages, err := filepath.Glob(filepath.Join(raplBasePath, "intel-rapl:*"))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("find RAPL packages: %w", err)
|
|
}
|
|
if len(packages) == 0 {
|
|
return nil, fmt.Errorf("no RAPL packages found")
|
|
}
|
|
|
|
// Sample energy counters
|
|
sample1, err := readRAPLEnergy(packages)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("first RAPL sample failed: %w", err)
|
|
}
|
|
|
|
// Wait for sample interval
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, fmt.Errorf("RAPL sampling canceled: %w", ctx.Err())
|
|
case <-time.After(sampleInterval):
|
|
}
|
|
|
|
// Second sample
|
|
sample2, err := readRAPLEnergy(packages)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("second RAPL sample failed: %w", err)
|
|
}
|
|
|
|
// Calculate power from energy delta
|
|
// Power (W) = Energy delta (J) / Time delta (s)
|
|
duration := sampleInterval.Seconds()
|
|
|
|
for domain, energy1 := range sample1 {
|
|
energy2, ok := sample2[domain]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
deltaUJ := energyDelta(energy1, energy2)
|
|
|
|
// Convert microjoules to watts
|
|
watts := float64(deltaUJ) / 1e6 / duration
|
|
|
|
// Categorize by domain name
|
|
domainLower := strings.ToLower(domain)
|
|
switch {
|
|
case strings.Contains(domainLower, "package") || strings.HasSuffix(domain, ":0"):
|
|
// Package-level (total CPU socket power)
|
|
data.PackageWatts += watts
|
|
case strings.Contains(domainLower, "core"):
|
|
data.CoreWatts += watts
|
|
case strings.Contains(domainLower, "dram"):
|
|
data.DRAMWatts += watts
|
|
}
|
|
|
|
data.Available = true
|
|
}
|
|
|
|
if data.Available {
|
|
log.Debug().
|
|
Float64("packageWatts", data.PackageWatts).
|
|
Float64("coreWatts", data.CoreWatts).
|
|
Float64("dramWatts", data.DRAMWatts).
|
|
Msg("Collected RAPL power data")
|
|
}
|
|
|
|
return data, nil
|
|
}
|
|
|
|
// readRAPLEnergy reads energy counters from all RAPL domains.
|
|
// Returns a map of domain name -> energy in microjoules.
|
|
func readRAPLEnergy(packages []string) (map[string]uint64, error) {
|
|
result := make(map[string]uint64)
|
|
|
|
for _, pkgPath := range packages {
|
|
// Read the package energy
|
|
energyPath := filepath.Join(pkgPath, "energy_uj")
|
|
energy, err := readUint64File(energyPath)
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("path", energyPath).
|
|
Err(err).
|
|
Msg("Skipping RAPL package energy reading")
|
|
} else {
|
|
name := filepath.Base(pkgPath)
|
|
// Also read the domain name if available
|
|
namePath := filepath.Join(pkgPath, "name")
|
|
if domainName, nameErr := readStringFile(namePath); nameErr == nil {
|
|
name = domainName
|
|
} else {
|
|
log.Debug().
|
|
Str("component", "sensors_power").
|
|
Str("action", "read_rapl_domain_name_failed").
|
|
Str("name_path", namePath).
|
|
Err(err).
|
|
Msg("Failed to read RAPL domain name, using path fallback")
|
|
}
|
|
result[name] = energy
|
|
}
|
|
// Also read subdomain energy (core, uncore, dram)
|
|
subdomains, err := filepath.Glob(filepath.Join(pkgPath, "intel-rapl:*"))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("find RAPL subdomains for %s: %w", pkgPath, err)
|
|
}
|
|
for _, subPath := range subdomains {
|
|
energyPath := filepath.Join(subPath, "energy_uj")
|
|
energy, err := readUint64File(energyPath)
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("path", energyPath).
|
|
Err(err).
|
|
Msg("Skipping RAPL subdomain energy reading")
|
|
continue
|
|
}
|
|
|
|
name := filepath.Base(subPath)
|
|
// Read subdomain name
|
|
namePath := filepath.Join(subPath, "name")
|
|
if domainName, nameErr := readStringFile(namePath); nameErr == nil {
|
|
name = domainName
|
|
} else {
|
|
log.Debug().
|
|
Str("path", namePath).
|
|
Err(nameErr).
|
|
Msg("Using fallback RAPL subdomain name")
|
|
}
|
|
result[name] = energy
|
|
}
|
|
}
|
|
|
|
if len(result) == 0 {
|
|
return nil, fmt.Errorf("no RAPL energy readings available")
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// energyDelta returns how far an energy counter advanced between the reading
// before and the later reading after.
//
// The difference is taken modulo 2^64: when after is smaller than before the
// counter is assumed to have wrapped once past the top of the uint64 range,
// and Go's wrapping unsigned subtraction yields exactly that distance.
//
// NOTE(review): a counter that wraps at a smaller maximum than 2^64 is not
// accounted for here — confirm against the counter's actual range.
func energyDelta(before, after uint64) uint64 {
	return after - before
}
|
|
|
|
// readUint64File reads the file at path and parses its contents, with
// surrounding whitespace removed, as a base-10 unsigned 64-bit integer.
// It returns 0 and a wrapped error when the file cannot be read or its
// contents do not parse.
func readUint64File(path string) (uint64, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return 0, fmt.Errorf("read %s: %w", path, readErr)
	}

	text := strings.TrimSpace(string(raw))
	parsed, parseErr := strconv.ParseUint(text, 10, 64)
	if parseErr != nil {
		return 0, fmt.Errorf("parse uint64 from %s: %w", path, parseErr)
	}

	return parsed, nil
}
|
|
|
|
// readStringFile reads the file at path and returns its contents with
// leading and trailing whitespace removed. It returns the empty string and a
// wrapped error when the file cannot be read.
func readStringFile(path string) (string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", fmt.Errorf("read %s: %w", path, readErr)
	}

	trimmed := strings.TrimSpace(string(raw))
	return trimmed, nil
}
|
|
|
|
// hwmonBasePath is the sysfs directory that is scanned for hwmon devices when
// looking for the AMD energy driver.
var hwmonBasePath = "/sys/class/hwmon"
|
|
|
|
// collectAMDEnergy reads power data from AMD energy driver via hwmon.
|
|
// The amd_energy module exposes energy counters similar to Intel RAPL.
|
|
func collectAMDEnergy(ctx context.Context) (*PowerData, error) {
|
|
ctx = normalizeCollectionContext(ctx)
|
|
|
|
// Find hwmon device with amd_energy driver
|
|
hwmonPath, err := findAMDEnergyHwmon()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("find AMD energy hwmon: %w", err)
|
|
}
|
|
|
|
data := &PowerData{Source: "amd_energy"}
|
|
|
|
// Sample energy counters
|
|
sample1, err := readAMDEnergy(hwmonPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("first AMD energy sample failed: %w", err)
|
|
}
|
|
|
|
// Wait for sample interval
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, fmt.Errorf("AMD energy sampling canceled: %w", ctx.Err())
|
|
case <-time.After(sampleInterval):
|
|
}
|
|
|
|
// Second sample
|
|
sample2, err := readAMDEnergy(hwmonPath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("second AMD energy sample failed: %w", err)
|
|
}
|
|
|
|
// Calculate power from energy delta
|
|
duration := sampleInterval.Seconds()
|
|
|
|
for label, energy1 := range sample1 {
|
|
energy2, ok := sample2[label]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
deltaUJ := energyDelta(energy1, energy2)
|
|
|
|
// Convert microjoules to watts
|
|
watts := float64(deltaUJ) / 1e6 / duration
|
|
|
|
// Categorize by label
|
|
labelLower := strings.ToLower(label)
|
|
switch {
|
|
case strings.Contains(labelLower, "socket") || strings.Contains(labelLower, "package"):
|
|
data.PackageWatts += watts
|
|
case strings.Contains(labelLower, "core"):
|
|
data.CoreWatts += watts
|
|
default:
|
|
// Default to package power for unlabeled readings
|
|
if data.PackageWatts == 0 {
|
|
data.PackageWatts = watts
|
|
}
|
|
}
|
|
|
|
data.Available = true
|
|
}
|
|
|
|
if data.Available {
|
|
log.Debug().
|
|
Float64("packageWatts", data.PackageWatts).
|
|
Float64("coreWatts", data.CoreWatts).
|
|
Msg("Collected AMD energy power data")
|
|
}
|
|
|
|
return data, nil
|
|
}
|
|
|
|
// findAMDEnergyHwmon finds the hwmon device path for amd_energy driver.
|
|
func findAMDEnergyHwmon() (string, error) {
|
|
entries, err := os.ReadDir(hwmonBasePath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cannot read hwmon: %w", err)
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if !entry.IsDir() {
|
|
continue
|
|
}
|
|
|
|
hwmonDir := filepath.Join(hwmonBasePath, entry.Name())
|
|
namePath := filepath.Join(hwmonDir, "name")
|
|
|
|
name, err := readStringFile(namePath)
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("path", namePath).
|
|
Err(err).
|
|
Msg("Skipping hwmon device without readable name")
|
|
continue
|
|
}
|
|
|
|
if name == "amd_energy" {
|
|
return hwmonDir, nil
|
|
}
|
|
}
|
|
|
|
return "", fmt.Errorf("amd_energy hwmon device not found")
|
|
}
|
|
|
|
// readAMDEnergy reads energy counters from AMD energy hwmon device.
|
|
// Returns a map of label -> energy in microjoules.
|
|
func readAMDEnergy(hwmonPath string) (map[string]uint64, error) {
|
|
result := make(map[string]uint64)
|
|
|
|
// AMD energy exposes energy*_input files (in microjoules)
|
|
energyFiles, err := filepath.Glob(filepath.Join(hwmonPath, "energy*_input"))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("find AMD energy files in %s: %w", hwmonPath, err)
|
|
}
|
|
if len(energyFiles) == 0 {
|
|
return nil, fmt.Errorf("no AMD energy files found")
|
|
}
|
|
|
|
for _, energyPath := range energyFiles {
|
|
energy, err := readUint64File(energyPath)
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("component", "sensors_power").
|
|
Str("action", "read_amd_energy_failed").
|
|
Str("energy_path", energyPath).
|
|
Err(err).
|
|
Msg("Failed to read AMD energy counter")
|
|
continue
|
|
}
|
|
|
|
// Try to get the label for this energy reading
|
|
// energy1_input -> energy1_label
|
|
labelPath := strings.Replace(energyPath, "_input", "_label", 1)
|
|
label, err := readStringFile(labelPath)
|
|
if err != nil {
|
|
log.Debug().
|
|
Str("component", "sensors_power").
|
|
Str("action", "read_amd_label_failed").
|
|
Str("label_path", labelPath).
|
|
Err(err).
|
|
Msg("Failed to read AMD energy label, using filename fallback")
|
|
|
|
// Use filename as fallback
|
|
label = filepath.Base(energyPath)
|
|
}
|
|
|
|
result[label] = energy
|
|
}
|
|
|
|
if len(result) == 0 {
|
|
return nil, fmt.Errorf("no AMD energy readings available")
|
|
}
|
|
|
|
return result, nil
|
|
}
|