diff --git a/internal/hostagent/agent.go b/internal/hostagent/agent.go
index 603ed165f..8bd509145 100644
--- a/internal/hostagent/agent.go
+++ b/internal/hostagent/agent.go
@@ -21,6 +21,7 @@ import (
 	"github.com/rcourtman/pulse-go-rewrite/internal/hostmetrics"
 	"github.com/rcourtman/pulse-go-rewrite/internal/mdadm"
 	"github.com/rcourtman/pulse-go-rewrite/internal/sensors"
+	"github.com/rcourtman/pulse-go-rewrite/internal/smartctl"
 	agentshost "github.com/rcourtman/pulse-go-rewrite/pkg/agents/host"
 	"github.com/rs/zerolog"
 	gohost "github.com/shirou/gopsutil/v4/host"
@@ -79,14 +80,15 @@ var readFile = os.ReadFile
 var netInterfaces = net.Interfaces
 
 var (
-	hostInfoWithContext   = gohost.InfoWithContext
-	hostUptimeWithContext = gohost.UptimeWithContext
-	hostmetricsCollect    = hostmetrics.Collect
-	sensorsCollectLocal   = sensors.CollectLocal
-	sensorsParse          = sensors.Parse
-	mdadmCollectArrays    = mdadm.CollectArrays
-	cephCollect           = ceph.Collect
-	nowUTC                = func() time.Time { return time.Now().UTC() }
+	hostInfoWithContext   = gohost.InfoWithContext
+	hostUptimeWithContext = gohost.UptimeWithContext
+	hostmetricsCollect    = hostmetrics.Collect
+	sensorsCollectLocal   = sensors.CollectLocal
+	sensorsParse          = sensors.Parse
+	mdadmCollectArrays    = mdadm.CollectArrays
+	cephCollect           = ceph.Collect
+	smartctlCollectLocal  = smartctl.CollectLocal
+	nowUTC                = func() time.Time { return time.Now().UTC() }
 )
 
 // New constructs a fully initialised host Agent.
@@ -346,6 +348,12 @@ func (a *Agent) buildReport(ctx context.Context) (agentshost.Report, error) {
 	// Collect temperature data (best effort - don't fail if unavailable)
 	sensorData := a.collectTemperatures(collectCtx)
 
+	// Collect S.M.A.R.T. disk data (best effort - don't fail if unavailable)
+	smartData := a.collectSMARTData(collectCtx)
+	if len(smartData) > 0 {
+		sensorData.SMART = smartData
+	}
+
 	// Collect RAID array data (best effort - don't fail if unavailable)
 	raidData := a.collectRAIDArrays(collectCtx)
 
@@ -682,6 +690,46 @@ func (a *Agent) collectCephStatus(ctx context.Context) *agentshost.CephCluster {
 	return result
 }
 
+// collectSMARTData collects S.M.A.R.T. data from local disks.
+// Returns nil on non-Linux hosts, when collection fails, or when no disks are found.
+func (a *Agent) collectSMARTData(ctx context.Context) []agentshost.DiskSMART {
+	// Only collect on Linux (smartctl works on other platforms but disk paths differ)
+	if runtime.GOOS != "linux" {
+		return nil
+	}
+
+	smartData, err := smartctlCollectLocal(ctx)
+	if err != nil {
+		a.logger.Debug().Err(err).Msg("Failed to collect S.M.A.R.T. data (smartctl may not be installed)")
+		return nil
+	}
+
+	if len(smartData) == 0 {
+		return nil
+	}
+
+	// Convert internal smartctl types to agent report types (LastUpdated is not part of the report type)
+	result := make([]agentshost.DiskSMART, 0, len(smartData))
+	for _, disk := range smartData {
+		result = append(result, agentshost.DiskSMART{
+			Device:      disk.Device,
+			Model:       disk.Model,
+			Serial:      disk.Serial,
+			WWN:         disk.WWN,
+			Type:        disk.Type,
+			Temperature: disk.Temperature,
+			Health:      disk.Health,
+			Standby:     disk.Standby,
+		})
+	}
+
+	a.logger.Debug().
+		Int("diskCount", len(result)).
+		Msg("Collected S.M.A.R.T. disk data")
+
+	return result
+}
+
 // runProxmoxSetup performs one-time Proxmox API token setup and node registration.
func (a *Agent) runProxmoxSetup(ctx context.Context) {
 	a.logger.Info().Msg("Proxmox mode enabled, checking setup...")
diff --git a/internal/models/models.go b/internal/models/models.go
index 3e4211bb5..110f9fb2f 100644
--- a/internal/models/models.go
+++ b/internal/models/models.go
@@ -210,6 +210,19 @@ type HostSensorSummary struct {
 	TemperatureCelsius map[string]float64 `json:"temperatureCelsius,omitempty"`
 	FanRPM             map[string]float64 `json:"fanRpm,omitempty"`
 	Additional         map[string]float64 `json:"additional,omitempty"`
+	SMART              []HostDiskSMART    `json:"smart,omitempty"` // S.M.A.R.T. disk data
+}
+
+// HostDiskSMART represents S.M.A.R.T. data for a disk from a host agent.
+type HostDiskSMART struct {
+	Device      string `json:"device"`            // Device name without the /dev/ prefix (e.g., sda)
+	Model       string `json:"model,omitempty"`   // Disk model
+	Serial      string `json:"serial,omitempty"`  // Serial number
+	WWN         string `json:"wwn,omitempty"`     // World Wide Name
+	Type        string `json:"type,omitempty"`    // Transport type: sata, sas, nvme
+	Temperature int    `json:"temperature"`       // Temperature in Celsius (always serialized, no omitempty)
+	Health      string `json:"health,omitempty"`  // PASSED, FAILED, UNKNOWN
+	Standby     bool   `json:"standby,omitempty"` // True if disk was in standby
 }
 
 // HostRAIDArray represents an mdadm RAID array on a host.
diff --git a/internal/monitoring/host_agent_temps.go b/internal/monitoring/host_agent_temps.go
index b91461724..19bfb1e81 100644
--- a/internal/monitoring/host_agent_temps.go
+++ b/internal/monitoring/host_agent_temps.go
@@ -208,8 +208,29 @@ func convertHostSensorsToTemperature(sensors models.HostSensorSummary, lastSeen
 		}
 	}
 
+	// Convert S.M.A.R.T. data from host agent into per-disk DiskTemp entries
+	if len(sensors.SMART) > 0 {
+		temp.SMART = make([]models.DiskTemp, 0, len(sensors.SMART))
+		for _, disk := range sensors.SMART {
+			// Skip disks in standby (no temperature data)
+			if disk.Standby {
+				continue
+			}
+			temp.SMART = append(temp.SMART, models.DiskTemp{
+				Device:      "/dev/" + disk.Device,
+				Serial:      disk.Serial,
+				WWN:         disk.WWN,
+				Model:       disk.Model,
+				Type:        disk.Type,
+				Temperature: disk.Temperature,
+				LastUpdated: lastSeen,
+			})
+		}
+		temp.HasSMART = len(temp.SMART) > 0
+	}
+
 	// Validate we have at least some data
-	if !temp.HasCPU && !temp.HasGPU && !temp.HasNVMe {
+	if !temp.HasCPU && !temp.HasGPU && !temp.HasNVMe && !temp.HasSMART {
 		return nil
 	}
 
@@ -220,6 +241,7 @@ func convertHostSensorsToTemperature(sensors models.HostSensorSummary, lastSeen
 		Int("coreCount", len(temp.Cores)).
 		Int("nvmeCount", len(temp.NVMe)).
 		Int("gpuCount", len(temp.GPU)).
+		Int("smartCount", len(temp.SMART)).
 		Msg("Converted host agent sensors to temperature data")
 
 	return temp
@@ -258,7 +280,7 @@ func mergeTemperatureData(hostAgentTemp, proxyTemp *models.Temperature) *models.
 		HasCPU:     hostAgentTemp.HasCPU,
 		HasGPU:     hostAgentTemp.HasGPU,
 		HasNVMe:    hostAgentTemp.HasNVMe,
-		HasSMART:   proxyTemp.HasSMART, // SMART data only comes from proxy
+		HasSMART:   hostAgentTemp.HasSMART || proxyTemp.HasSMART,
 		LastUpdate: hostAgentTemp.LastUpdate,
 	}
 
@@ -270,8 +292,10 @@ func mergeTemperatureData(hostAgentTemp, proxyTemp *models.Temperature) *models.
result.HasCPU = true
 	}
 
-	// Keep proxy SMART data (host agent doesn't have smartctl access currently)
-	if proxyTemp.HasSMART {
+	// Pick one SMART source (lists are not combined): host agent if it has data, else proxy
+	if hostAgentTemp.HasSMART {
+		result.SMART = hostAgentTemp.SMART
+	} else if proxyTemp.HasSMART {
 		result.SMART = proxyTemp.SMART
 	}
 
diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go
index 1af148635..f440f8272 100644
--- a/internal/monitoring/monitor.go
+++ b/internal/monitoring/monitor.go
@@ -2226,6 +2226,7 @@ func (m *Monitor) ApplyHostReport(report agentshost.Report, tokenRecord *config.
 			TemperatureCelsius: cloneStringFloatMap(report.Sensors.TemperatureCelsius),
 			FanRPM:             cloneStringFloatMap(report.Sensors.FanRPM),
 			Additional:         cloneStringFloatMap(report.Sensors.Additional),
+			SMART:              convertAgentSMARTToModels(report.Sensors.SMART),
 		},
 		RAID: raid,
 		Ceph: cephData,
@@ -9605,6 +9606,27 @@ func isLegacyDockerAgent(agentType string) bool {
 	return agentType != "unified"
 }
 
+// convertAgentSMARTToModels copies agent report S.M.A.R.T. data into models.HostDiskSMART values; empty input yields nil.
+func convertAgentSMARTToModels(smart []agentshost.DiskSMART) []models.HostDiskSMART {
+	if len(smart) == 0 {
+		return nil
+	}
+	result := make([]models.HostDiskSMART, 0, len(smart))
+	for _, disk := range smart {
+		result = append(result, models.HostDiskSMART{
+			Device:      disk.Device,
+			Model:       disk.Model,
+			Serial:      disk.Serial,
+			WWN:         disk.WWN,
+			Type:        disk.Type,
+			Temperature: disk.Temperature,
+			Health:      disk.Health,
+			Standby:     disk.Standby,
+		})
+	}
+	return result
+}
+
 // convertAgentCephToModels converts agent report Ceph data to the models.HostCephCluster format.
func convertAgentCephToModels(ceph *agentshost.CephCluster) *models.HostCephCluster {
 	if ceph == nil {
diff --git a/internal/smartctl/collector.go b/internal/smartctl/collector.go
new file mode 100644
index 000000000..3828c566b
--- /dev/null
+++ b/internal/smartctl/collector.go
@@ -0,0 +1,238 @@
+// Package smartctl provides S.M.A.R.T. data collection from local disks.
+package smartctl
+
+import (
+	"context"
+	"encoding/json"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/rs/zerolog/log"
+)
+
+// DiskSMART represents S.M.A.R.T. data for a single disk.
+type DiskSMART struct {
+	Device      string    `json:"device"`            // Device name without the /dev/ prefix (e.g., sda)
+	Model       string    `json:"model,omitempty"`   // Disk model
+	Serial      string    `json:"serial,omitempty"`  // Serial number
+	WWN         string    `json:"wwn,omitempty"`     // World Wide Name
+	Type        string    `json:"type,omitempty"`    // Transport type: sata, sas, nvme
+	Temperature int       `json:"temperature"`       // Temperature in Celsius (0 if not reported)
+	Health      string    `json:"health,omitempty"`  // PASSED, FAILED, UNKNOWN (empty for standby disks)
+	Standby     bool      `json:"standby,omitempty"` // True if disk was in standby
+	LastUpdated time.Time `json:"lastUpdated"`       // When this reading was taken
+}
+
+// smartctlJSON holds the fields that are read from the JSON output of smartctl --json.
+type smartctlJSON struct {
+	Device struct {
+		Name     string `json:"name"`
+		Type     string `json:"type"`
+		Protocol string `json:"protocol"`
+	} `json:"device"`
+	ModelFamily  string `json:"model_family"`
+	ModelName    string `json:"model_name"`
+	SerialNumber string `json:"serial_number"`
+	WWN          struct {
+		NAA uint64 `json:"naa"`
+		OUI uint64 `json:"oui"`
+		ID  uint64 `json:"id"`
+	} `json:"wwn"`
+	// SmartStatus is a pointer so that a missing smart_status object (nil) can
+	// be told apart from an explicit "passed": false.
+	SmartStatus *struct {
+		Passed bool `json:"passed"`
+	} `json:"smart_status"`
+	Temperature struct {
+		Current int `json:"current"`
+	} `json:"temperature"`
+	// NVMe-specific temperature
+	NVMeSmartHealthInformationLog struct {
+		Temperature int `json:"temperature"`
+	} `json:"nvme_smart_health_information_log"`
+	PowerMode string `json:"power_mode"`
+}
+
+// CollectLocal collects S.M.A.R.T. data from all local block devices.
+// It returns an error if smartctl is not in PATH or the block devices cannot
+// be listed; devices whose individual query fails are logged and skipped.
+func CollectLocal(ctx context.Context) ([]DiskSMART, error) {
+	// Fail fast when smartctl is missing instead of failing once per device
+	// and handing the caller an empty result with a nil error.
+	if _, err := exec.LookPath("smartctl"); err != nil {
+		return nil, err
+	}
+
+	// List block devices
+	devices, err := listBlockDevices(ctx)
+	if err != nil {
+		log.Debug().Err(err).Msg("Failed to list block devices for SMART collection")
+		return nil, err
+	}
+
+	// results stays nil when there are no devices or every query fails.
+	var results []DiskSMART
+	for _, dev := range devices {
+		smart, err := collectDeviceSMART(ctx, dev)
+		if err != nil {
+			log.Debug().Err(err).Str("device", dev).Msg("Failed to collect SMART data for device")
+			continue
+		}
+		if smart != nil {
+			results = append(results, *smart)
+		}
+	}
+
+	return results, nil
+}
+
+// listBlockDevices returns the /dev paths of block devices suitable for SMART queries.
+func listBlockDevices(ctx context.Context) ([]string, error) {
+	// Use lsblk to find disks (not partitions)
+	output, err := exec.CommandContext(ctx, "lsblk", "-d", "-n", "-o", "NAME,TYPE").Output()
+	if err != nil {
+		return nil, err
+	}
+
+	var devices []string
+	for _, line := range strings.Split(string(output), "\n") {
+		fields := strings.Fields(line)
+		// Only include disk types (not loop, rom, partition); lines without
+		// both columns, such as the trailing empty line, are ignored.
+		if len(fields) >= 2 && fields[1] == "disk" {
+			devices = append(devices, "/dev/"+fields[0])
+		}
+	}
+
+	return devices, nil
+}
+
+// collectDeviceSMART runs smartctl on a single device and parses the result.
+// A drive that smartctl skipped because it is in standby is returned with only
+// Device, Standby and LastUpdated set.
+func collectDeviceSMART(ctx context.Context, device string) (*DiskSMART, error) {
+	// Use timeout to avoid hanging on slow/unresponsive disks
+	cmdCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
+	// Check if smartctl is available
+	smartctlPath, err := exec.LookPath("smartctl")
+	if err != nil {
+		return nil, err
+	}
+
+	// Run smartctl with standby check to avoid waking sleeping drives
+	// -n standby: skip the drive if it is in standby (smartctl then exits with status 2)
+	// -i: device info
+	// -A: attributes (for temperature)
+	// -H: overall health assessment
+	// --json=o: JSON output; per smartctl(8) the "o" also embeds the original text output
+	cmd := exec.CommandContext(cmdCtx, smartctlPath, "-n", "standby", "-i", "-A", "-H", "--json=o", device)
+	output, err := cmd.Output()
+
+	// smartctl's exit status is a bitmask, so a non-zero status does not by
+	// itself mean the JSON output is unusable.
+	if err != nil {
+		exitErr, ok := err.(*exec.ExitError)
+		if !ok {
+			return nil, err
+		}
+		exitCode := exitErr.ExitCode()
+		// ExitCode is -1 when smartctl was killed by a signal (e.g. on
+		// timeout); -1&2 is non-zero, so rule it out before the bit test.
+		if exitCode < 0 {
+			return nil, err
+		}
+		// Check for standby (bit 1 set in exit status)
+		// NOTE(review): smartctl(8) lists device-open failure under the same bit.
+		if exitCode&2 != 0 {
+			return &DiskSMART{
+				Device:      filepath.Base(device),
+				Standby:     true,
+				LastUpdated: time.Now(),
+			}, nil
+		}
+		// Other exit codes might still have valid JSON output
+		// Continue parsing if we got output
+		if len(output) == 0 {
+			return nil, err
+		}
+	}
+
+	// Parse JSON output
+	var smartData smartctlJSON
+	if err := json.Unmarshal(output, &smartData); err != nil {
+		return nil, err
+	}
+
+	result := &DiskSMART{
+		Device:      filepath.Base(device),
+		Model:       smartData.ModelName,
+		Serial:      smartData.SerialNumber,
+		Type:        detectDiskType(smartData),
+		LastUpdated: time.Now(),
+	}
+
+	// Build WWN string if available
+	if smartData.WWN.NAA != 0 {
+		result.WWN = formatWWN(smartData.WWN.NAA, smartData.WWN.OUI, smartData.WWN.ID)
+	}
+
+	// Get temperature (different location for NVMe vs SATA)
+	if smartData.Temperature.Current > 0 {
+		result.Temperature = smartData.Temperature.Current
+	} else if smartData.NVMeSmartHealthInformationLog.Temperature > 0 {
+		result.Temperature = smartData.NVMeSmartHealthInformationLog.Temperature
+	}
+
+	// Get health status; UNKNOWN when the output carries no smart_status object
+	switch {
+	case smartData.SmartStatus == nil:
+		result.Health = "UNKNOWN"
+	case smartData.SmartStatus.Passed:
+		result.Health = "PASSED"
+	default:
+		result.Health = "FAILED"
+	}
+
+	log.Debug().
+		Str("device", result.Device).
+		Str("model", result.Model).
+		Int("temperature", result.Temperature).
+		Str("health", result.Health).
+		Msg("Collected SMART data")
+
+	return result, nil
+}
+
+// detectDiskType determines the disk transport type from smartctl output.
+// Devices that match no known protocol or type are reported as "sata".
+func detectDiskType(data smartctlJSON) string {
+	protocol := strings.ToLower(data.Device.Protocol)
+	switch {
+	case strings.Contains(protocol, "nvme"):
+		return "nvme"
+	// smartctl reports SCSI-family drives, SAS included, with protocol "SCSI".
+	case strings.Contains(protocol, "sas"), strings.Contains(protocol, "scsi"):
+		return "sas"
+	case strings.Contains(protocol, "ata"): // also matches "sata"
+		return "sata"
+	}
+
+	// Try to infer from device type
+	if strings.Contains(strings.ToLower(data.Device.Type), "nvme") {
+		return "nvme"
+	}
+	return "sata" // default
+}
+
+// formatWWN joins the WWN components as hex strings in the form naa-oui-id.
+func formatWWN(naa, oui, id uint64) string {
+	// Each component is rendered as unpadded lowercase hex and joined as naa-oui-id
+	return strconv.FormatUint(naa, 16) + "-" +
+		strconv.FormatUint(oui, 16) + "-" +
+		strconv.FormatUint(id, 16)
+}
diff --git a/pkg/agents/host/report.go b/pkg/agents/host/report.go
index d6111f449..c1c8d77da 100644
--- a/pkg/agents/host/report.go
+++ b/pkg/agents/host/report.go
@@ -102,6 +102,19 @@ type Sensors struct {
 	TemperatureCelsius map[string]float64 `json:"temperatureCelsius,omitempty"`
 	FanRPM             map[string]float64 `json:"fanRpm,omitempty"`
 	Additional         map[string]float64 `json:"additional,omitempty"`
+	SMART              []DiskSMART        `json:"smart,omitempty"` // S.M.A.R.T. disk data
+}
+
+// DiskSMART represents S.M.A.R.T. data for a single disk.
+type DiskSMART struct {
+	Device      string `json:"device"`            // Device name without the /dev/ prefix (e.g., sda)
+	Model       string `json:"model,omitempty"`   // Disk model
+	Serial      string `json:"serial,omitempty"`  // Serial number
+	WWN         string `json:"wwn,omitempty"`     // World Wide Name
+	Type        string `json:"type,omitempty"`    // Transport type: sata, sas, nvme
+	Temperature int    `json:"temperature"`       // Temperature in Celsius (always serialized, no omitempty)
+	Health      string `json:"health,omitempty"`  // PASSED, FAILED, UNKNOWN
+	Standby     bool   `json:"standby,omitempty"` // True if disk was in standby
 }
 
 // RAIDArray represents an mdadm RAID array.