Pulse/internal/mdadm/mdadm.go
rcourtman bb7ca93c18 feat: Add mdadm RAID monitoring support for host agents
Implements comprehensive mdadm RAID array monitoring for Linux hosts
via pulse-host-agent. Arrays are automatically detected and monitored
with real-time status updates, rebuild progress tracking, and automatic
alerting for degraded or failed arrays.

Key changes:

**Backend:**
- Add mdadm package for parsing mdadm --detail output
- Extend host agent report structure with RAID array data
- Integrate mdadm collection into host agent (Linux-only, best-effort)
- Add RAID array processing in monitoring system
- Implement automatic alerting:
  - Critical alerts for degraded arrays or arrays with failed devices
  - Warning alerts for rebuilding/resyncing arrays with progress tracking
  - Auto-clear alerts when arrays return to healthy state

**Frontend:**
- Add TypeScript types for RAID arrays and devices
- Display RAID arrays in host details drawer with:
  - Array status (clean/degraded/recovering) with color-coded indicators
  - Device counts (active/total/failed/spare)
  - Rebuild progress percentage and speed when applicable
  - Green for healthy, amber for rebuilding, red for degraded

**Documentation:**
- Document mdadm monitoring feature in HOST_AGENT.md
- Explain requirements (Linux, mdadm installed, root access)
- Clarify scope (software RAID only, hardware RAID not supported)

**Testing:**
- Add comprehensive tests for mdadm output parsing
- Test parsing of healthy, degraded, and rebuilding arrays
- Verify proper extraction of device states and rebuild progress

All builds pass successfully. RAID monitoring is automatic and best-effort:
if mdadm is not installed or no arrays exist, the host agent continues
reporting other metrics normally.

Related to #676
2025-11-09 16:36:33 +00:00

259 lines
6.9 KiB
Go

package mdadm
import (
"context"
"fmt"
"os/exec"
"regexp"
"strconv"
"strings"
"time"
"github.com/rcourtman/pulse-go-rewrite/pkg/agents/host"
)
// CollectArrays gathers the status of every mdadm RAID array found on this host.
//
// It returns (nil, nil) when the mdadm binary cannot be run or when
// /proc/mdstat lists no md devices. An error is returned only when the device
// list itself cannot be obtained. Arrays whose `mdadm --detail` call fails are
// silently left out of the result so that one bad array does not hide the rest.
func CollectArrays(ctx context.Context) ([]host.RAIDArray, error) {
	// Nothing to report on hosts without a working mdadm.
	if available := isMdadmAvailable(ctx); !available {
		return nil, nil
	}

	// Enumerate md devices via /proc/mdstat.
	paths, listErr := listArrayDevices(ctx)
	switch {
	case listErr != nil:
		return nil, fmt.Errorf("list array devices: %w", listErr)
	case len(paths) == 0:
		return nil, nil
	}

	// Query each array in turn; collection is best-effort per array.
	var collected []host.RAIDArray
	for i := range paths {
		detail, detailErr := collectArrayDetail(ctx, paths[i])
		if detailErr == nil {
			collected = append(collected, detail)
		}
	}
	return collected, nil
}
// isMdadmAvailable reports whether `mdadm --version` can be executed
// successfully within two seconds.
func isMdadmAvailable(ctx context.Context) bool {
	probeCtx, stop := context.WithTimeout(ctx, 2*time.Second)
	defer stop()

	if err := exec.CommandContext(probeCtx, "mdadm", "--version").Run(); err != nil {
		return false
	}
	return true
}
// listArrayDevices returns the /dev paths of all md devices named in
// /proc/mdstat, in the order they appear there.
//
// The file is read by running `cat /proc/mdstat` with a two second timeout;
// a failure of that command is returned wrapped as "read /proc/mdstat".
func listArrayDevices(ctx context.Context) ([]string, error) {
	readCtx, stop := context.WithTimeout(ctx, 2*time.Second)
	defer stop()

	raw, err := exec.CommandContext(readCtx, "cat", "/proc/mdstat").Output()
	if err != nil {
		return nil, fmt.Errorf("read /proc/mdstat: %w", err)
	}

	// Array header lines start with the device name followed by a colon,
	// e.g. "md0 : active raid1 sdb1[1] sda1[0]".
	namePattern := regexp.MustCompile(`^(md\d+)\s*:`)

	var found []string
	for _, row := range strings.Split(string(raw), "\n") {
		if m := namePattern.FindStringSubmatch(row); len(m) > 1 {
			found = append(found, "/dev/"+m[1])
		}
	}
	return found, nil
}
// collectArrayDetail executes `mdadm --detail <device>` with a five second
// timeout and hands the captured output to parseDetail.
//
// If the command fails, a zero host.RAIDArray is returned together with an
// error that names the device.
func collectArrayDetail(ctx context.Context, device string) (host.RAIDArray, error) {
	detailCtx, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	raw, runErr := exec.CommandContext(detailCtx, "mdadm", "--detail", device).Output()
	if runErr != nil {
		var none host.RAIDArray
		return none, fmt.Errorf("mdadm --detail %s: %w", device, runErr)
	}
	return parseDetail(device, string(raw))
}
// parseDetail converts the text printed by `mdadm --detail <device>` into a
// host.RAIDArray.
//
// The output is read in two phases. Before the member table header (the line
// containing "Number", "Major" and "Minor") every "Key : Value" line is
// inspected and the recognised keys are copied into the array. After the
// header, each line is treated as a member-device row.
//
// Parsing is best-effort: unrecognised lines are skipped, malformed numbers
// leave the corresponding field at zero, and the returned error is always nil.
// When a rebuild/resync/reshape percentage greater than zero was found, the
// current speed is additionally looked up through getRebuildSpeed.
func parseDetail(device, output string) (host.RAIDArray, error) {
	array := host.RAIDArray{
		Device:  device,
		Devices: []host.RAIDDevice{},
	}

	// Member rows: Number Major Minor RaidDevice State Device
	// Example: "0 8 1 0 active sync /dev/sda1"
	slotRe := regexp.MustCompile(`^\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+?)\s+(/dev/.+)$`)

	inDeviceSection := false
	for _, line := range strings.Split(output, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		// The table header marks the switch from key/value lines to member rows.
		if strings.Contains(line, "Number") && strings.Contains(line, "Major") && strings.Contains(line, "Minor") {
			inDeviceSection = true
			continue
		}

		if inDeviceSection {
			if member, ok := parseDetailDeviceLine(slotRe, line); ok {
				array.Devices = append(array.Devices, member)
			}
			continue
		}

		// Key/value lines are split on the first colon only, because values
		// such as UUIDs and names contain colons themselves.
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		// Atoi errors are deliberately ignored below: a malformed count
		// simply yields 0 rather than aborting the whole report.
		switch key {
		case "Name":
			array.Name = value
		case "Raid Level":
			array.Level = strings.ToLower(value)
		case "State":
			array.State = strings.ToLower(value)
		case "Total Devices":
			array.TotalDevices, _ = strconv.Atoi(value)
		case "Active Devices":
			array.ActiveDevices, _ = strconv.Atoi(value)
		case "Working Devices":
			array.WorkingDevices, _ = strconv.Atoi(value)
		case "Failed Devices":
			array.FailedDevices, _ = strconv.Atoi(value)
		case "Spare Devices":
			array.SpareDevices, _ = strconv.Atoi(value)
		case "UUID":
			array.UUID = value
		case "Rebuild Status", "Resync Status", "Reshape Status":
			// mdadm prints "<Action> Status : NN% complete" for rebuild,
			// resync and reshape alike; all are surfaced as rebuild progress.
			if pct, ok := parseStatusPercent(value); ok {
				array.RebuildPercent = pct
			}
		}
	}

	// Speed is only available from /proc/mdstat and only matters while a
	// rebuild is actually progressing.
	if array.RebuildPercent > 0 {
		if speed := getRebuildSpeed(device); speed != "" {
			array.RebuildSpeed = speed
		}
	}

	return array, nil
}

// parseDetailDeviceLine interprets one row of the member table of
// `mdadm --detail`. It reports false for rows that do not describe a device
// with a /dev path (for example "removed" placeholders).
func parseDetailDeviceLine(slotRe *regexp.Regexp, line string) (host.RAIDDevice, bool) {
	if m := slotRe.FindStringSubmatch(line); m != nil {
		// NOTE(review): Slot is taken from the first ("Number") column, not
		// the fourth ("RaidDevice") column — confirm this is intended.
		// The pattern only admits digits, so Atoi can only fail on overflow.
		slot, _ := strconv.Atoi(m[1])
		return host.RAIDDevice{
			Device: strings.TrimSpace(m[6]),
			State:  strings.TrimSpace(m[5]),
			Slot:   slot,
		}, true
	}

	// Spare and faulty members may carry "-" instead of a numeric column and
	// therefore miss the pattern above; they are recorded with Slot -1.
	faulty := strings.Contains(line, "faulty")
	if !faulty && !strings.Contains(line, "spare") {
		return host.RAIDDevice{}, false
	}
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return host.RAIDDevice{}, false
	}
	path := fields[len(fields)-1]
	if !strings.HasPrefix(path, "/dev/") {
		// The row ends in a state word rather than a device path; recording
		// it would invent a device that does not exist.
		return host.RAIDDevice{}, false
	}

	state := "spare"
	if faulty {
		state = "faulty"
	}
	return host.RAIDDevice{Device: path, State: state, Slot: -1}, true
}

// parseStatusPercent extracts the number in front of the first '%' of a
// status value such as "50% complete". It reports false when the value has
// no '%' or the text before it is not a number.
func parseStatusPercent(value string) (float64, bool) {
	idx := strings.Index(value, "%")
	if idx < 0 {
		return 0, false
	}
	pct, err := strconv.ParseFloat(strings.TrimSpace(value[:idx]), 64)
	if err != nil {
		return 0, false
	}
	return pct, true
}
// getRebuildSpeed returns the current rebuild/resync speed of the given md
// device as printed in /proc/mdstat (for example "33440K/sec").
//
// device may be given with or without the "/dev/" prefix. The empty string is
// returned when /proc/mdstat cannot be read (via `cat`, two second timeout),
// when the device is not listed there, or when its section carries no
// "speed=" field.
func getRebuildSpeed(device string) string {
	// /proc/mdstat lists arrays by bare name, e.g. "md0".
	deviceName := strings.TrimPrefix(device, "/dev/")
	if deviceName == "" {
		return ""
	}

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	output, err := exec.CommandContext(ctx, "cat", "/proc/mdstat").Output()
	if err != nil {
		return ""
	}
	return findRebuildSpeed(string(output), deviceName)
}

// findRebuildSpeed scans mdstat text for the section belonging to deviceName
// and returns the value of its "speed=" field, or "" if there is none.
//
// Progress lines look like:
//
//	[==>..................]  recovery = 12.6% (37043392/293039104) finish=127.5min speed=33440K/sec
func findRebuildSpeed(mdstat, deviceName string) string {
	speedRe := regexp.MustCompile(`speed=(\S+)`)

	inSection := false
	for _, line := range strings.Split(mdstat, "\n") {
		trimmed := strings.TrimSpace(line)

		if !inSection {
			inSection = isMdstatHeaderFor(trimmed, deviceName)
			continue
		}

		if m := speedRe.FindStringSubmatch(trimmed); m != nil {
			return m[1]
		}
		// A blank line or the next array header ends this device's section.
		if trimmed == "" || (strings.HasPrefix(trimmed, "md") && strings.Contains(trimmed, ":")) {
			break
		}
	}
	return ""
}

// isMdstatHeaderFor reports whether line is the "<name> : ..." header of
// exactly deviceName. A plain prefix test is not enough: "md1" must not match
// the header of "md10".
func isMdstatHeaderFor(line, deviceName string) bool {
	colon := strings.Index(line, ":")
	if colon < 0 {
		return false
	}
	return strings.TrimSpace(line[:colon]) == deviceName
}