NAS 磁盘健康监控：从 smartctl 到自定义 Agent 实战

2022-08-15 · 约 2141 字 · 预计阅读 5 分钟

NAS 最重要的职责是保护用户的数据。磁盘故障前往往有预警信号，SMART 技术能提前发现问题。这篇文章记录了如何开发一个磁盘健康监控 Agent。

一、SMART 技术简介

1.1 什么是 SMART？

S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) 是硬盘内置的自检技术。硬盘会持续收集自身的健康指标，供操作系统读取。

1.2 关键指标

ID	属性名	含义	危险阈值
5	Reallocated Sectors Count	重映射扇区数	>0 需要关注
187	Reported Uncorrectable Errors	无法纠正的错误	>0 危险
188	Command Timeout	命令超时次数	快速增长 = 危险
197	Current Pending Sector Count	待重映射扇区	>0 = 潜在故障
198	Offline Uncorrectable	离线无法纠正	>0 = 即将故障
194	Temperature	温度	>55°C 告警

1.3 硬盘寿命的常见规律

根据 Backblaze 的统计数据：

第 1 年：年故障率约 5%（新硬盘的"婴儿期死亡"）
第 2-3 年：年故障率约 1.5%（最稳定）
第 4 年起：故障率快速上升
Reallocated Sectors > 0 后，1 年内故障概率飙升

二、smartctl 使用

2.1 基本命令

1
2
3
4
5
6
7
8
# Print all SMART information for the device
smartctl -a /dev/sda

# Print only the overall health status
smartctl -H /dev/sda

# Emit the report as JSON (easier to parse programmatically)
smartctl -a /dev/sda -j

2.2 输出示例

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
{
  "device": {
    "name": "/dev/sda",
    "type": "sat"
  },
  "model_name": "WDC WD40EFRX-68N32N0",
  "serial_number": "WD-WCC7K1234567",
  "ata_smart_attributes": {
    "table": [
      {
        "id": 5,
        "name": "Reallocated_Sector_Ct",
        "value": 200,
        "worst": 200,
        "thresh": 140,
        "raw": {"value": 0}
      },
      {
        "id": 194,
        "name": "Temperature_Celsius",
        "value": 117,
        "raw": {"value": 33}
      }
    ]
  }
}

三、Go 实现

3.1 数据结构定义

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
package diskmon

// SmartctlOutput mirrors the parts of the JSON document printed by
// `smartctl -j` that this package decodes. Only the keys named in the struct
// tags below are captured.
type SmartctlOutput struct {
    // Device is decoded from the "device" object (device name and type).
    Device struct {
        Name string `json:"name"`
        Type string `json:"type"`
    } `json:"device"`
    ModelName    string `json:"model_name"`
    SerialNumber string `json:"serial_number"`
    // SmartStatus is decoded from "smart_status"; Passed stays false when
    // the key is absent from the report.
    SmartStatus  struct {
        Passed bool `json:"passed"`
    } `json:"smart_status"`
    // ATASmartAttributes carries the per-attribute table.
    ATASmartAttributes struct {
        Table []SmartAttribute `json:"table"`
    } `json:"ata_smart_attributes"`
    // Temperature is decoded from "temperature.current".
    Temperature struct {
        Current int `json:"current"`
    } `json:"temperature"`
    // PowerOnTime is decoded from "power_on_time.hours".
    PowerOnTime struct {
        Hours int `json:"hours"`
    } `json:"power_on_time"`
}

// SmartAttribute is one entry of the "ata_smart_attributes.table" array in
// smartctl's JSON output.
type SmartAttribute struct {
    ID     int    `json:"id"`
    Name   string `json:"name"`
    Value  int    `json:"value"`
    Worst  int    `json:"worst"`
    Thresh int    `json:"thresh"`
    // Raw holds the number reported under "raw.value".
    // NOTE(review): decoded into int, so a raw value wider than the platform
    // int would make decoding fail - confirm the target is 64-bit.
    Raw    struct {
        Value int `json:"value"`
    } `json:"raw"`
}

// DiskHealth is the condensed health summary of one disk, derived from a
// SmartctlOutput by analyzeHealth.
type DiskHealth struct {
    Device             string
    Model              string
    Serial             string
    // Passed is copied from the report's smart_status.passed.
    Passed             bool
    Temperature        int
    PowerOnHours       int
    // ReallocatedSectors is the raw value of attribute 5.
    ReallocatedSectors int
    // PendingSectors is the raw value of attribute 197.
    PendingSectors     int
    // UncorrectableErrors is the sum of the raw values of attributes 187 and 198.
    UncorrectableErrors int
    // Alerts holds the messages produced by generateAlerts.
    Alerts             []string
}

3.2 执行 smartctl 并解析

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
// GetDiskHealth runs `smartctl -a -j` against device, decodes the JSON report
// and returns the condensed summary produced by analyzeHealth.
//
// smartctl's exit status is a bit mask, not an ordinal error code:
//
//	bit 0 (1)   command line did not parse
//	bit 1 (2)   device could not be opened
//	bit 2 (4)   a SMART/ATA command failed or a checksum error occurred
//	bit 3 (8)   SMART status reports DISK FAILING
//	bit 4 (16)  prefail attributes are at or below threshold
//	bit 5 (32)  attributes were at or below threshold in the past
//	bit 6 (64)  the device error log contains records
//	bit 7 (128) the self-test log contains errors
//
// Only bits 0 and 1 mean that no report could be obtained, so only they are
// turned into an error. Every higher bit describes the condition of the disk;
// those runs still print a report, and it is exactly the report the alerting
// needs, so it is parsed as usual.
//
// The command is limited to 30 seconds. An error is returned when smartctl
// cannot be started, is killed (for example by the timeout), sets a fatal exit
// bit, or prints output that is not valid JSON.
func GetDiskHealth(device string) (*DiskHealth, error) {
    // Exit-status bits that mean smartctl never obtained a report.
    const fatalExitBits = 1<<0 | 1<<1

    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    output, err := exec.CommandContext(ctx, "smartctl", "-a", "-j", device).Output()
    if err != nil {
        exitErr, ok := err.(*exec.ExitError)
        if !ok {
            // The process never ran (binary missing, permission denied, ...).
            return nil, fmt.Errorf("run smartctl on %s: %w", device, err)
        }
        // ExitCode is -1 when the process was terminated by a signal, which
        // is what happens when the context deadline kills it.
        if code := exitErr.ExitCode(); code < 0 || code&fatalExitBits != 0 {
            return nil, fmt.Errorf("smartctl failed: %w", err)
        }
        // Remaining bits are health findings; fall through and parse.
    }

    var raw SmartctlOutput
    if err := json.Unmarshal(output, &raw); err != nil {
        return nil, fmt.Errorf("parse smartctl output: %w", err)
    }

    return analyzeHealth(&raw), nil
}

// analyzeHealth condenses a decoded smartctl report into a DiskHealth.
//
// Identity, overall status, temperature and power-on hours are copied
// straight from the report. The sector and error counters come from the raw
// values of attributes 5, 197, 187 and 198 (187 and 198 are summed). The
// alert list is filled in last by generateAlerts.
func analyzeHealth(raw *SmartctlOutput) *DiskHealth {
    var h DiskHealth
    h.Device = raw.Device.Name
    h.Model = raw.ModelName
    h.Serial = raw.SerialNumber
    h.Passed = raw.SmartStatus.Passed
    h.Temperature = raw.Temperature.Current
    h.PowerOnHours = raw.PowerOnTime.Hours

    table := raw.ATASmartAttributes.Table
    for i := range table {
        switch id, n := table[i].ID, table[i].Raw.Value; id {
        case 5:
            // Reallocated sectors.
            h.ReallocatedSectors = n
        case 197:
            // Current pending sectors.
            h.PendingSectors = n
        case 187, 198:
            // Reported uncorrectable + offline uncorrectable.
            h.UncorrectableErrors += n
        }
    }

    h.Alerts = generateAlerts(&h)
    return &h
}

3.3 告警策略

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
// AlertLevel is an integer severity for an alert.
type AlertLevel int

// Severity levels in ascending order: AlertNone is 0, AlertWarning is 1 and
// AlertCritical is 2.
const (
    AlertNone AlertLevel = iota
    AlertWarning
    AlertCritical
)

// generateAlerts turns the counters in h into human-readable alert messages.
//
// Messages are emitted in a fixed order: failed SMART status, reallocated
// sectors (warning below 10, critical from 10 upwards), pending sectors,
// uncorrectable errors, temperature above 55, and power-on time above 35000
// hours. The result is nil when nothing is wrong.
func generateAlerts(h *DiskHealth) []string {
    var msgs []string

    // A failed overall status means the disk should be replaced right away.
    if !h.Passed {
        msgs = append(msgs, "[CRITICAL] SMART 自检失败！请立即备份数据并更换硬盘")
    }

    // Reallocated sectors: a handful is a warning, ten or more is critical.
    switch n := h.ReallocatedSectors; {
    case n >= 10:
        msgs = append(msgs, fmt.Sprintf("[CRITICAL] 重映射扇区数 %d 过多，请尽快更换", n))
    case n > 0:
        msgs = append(msgs, fmt.Sprintf("[WARNING] 发现 %d 个重映射扇区，建议密切监控", n))
    }

    // Sectors still waiting to be remapped are treated as more dangerous.
    if n := h.PendingSectors; n > 0 {
        msgs = append(msgs, fmt.Sprintf("[CRITICAL] 存在 %d 个待重映射扇区，数据可能丢失", n))
    }

    if n := h.UncorrectableErrors; n > 0 {
        msgs = append(msgs, fmt.Sprintf("[CRITICAL] 检测到 %d 个无法纠正的错误", n))
    }

    if t := h.Temperature; t > 55 {
        msgs = append(msgs, fmt.Sprintf("[WARNING] 磁盘温度过高: %d°C，请检查散热", t))
    }

    // 35000 hours is roughly four years of continuous operation.
    if hours := h.PowerOnHours; hours > 35000 {
        years := float64(hours) / 8760
        msgs = append(msgs, fmt.Sprintf("[INFO] 硬盘已运行 %d 小时（约 %.1f 年），建议考虑更换",
            hours, years))
    }

    return msgs
}

3.4 定时检查

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// DiskMonitor periodically checks a fixed list of devices and publishes the
// resulting alerts on an internal channel.
type DiskMonitor struct {
    // devices are the device paths handed to GetDiskHealth on every round.
    devices   []string
    // interval is the period of the ticker driving Run.
    interval  time.Duration
    // alertChan carries alerts from checkAllDisks to consumeAlerts.
    alertChan chan Alert
    // ctx and cancel control the lifetime of Run.
    // NOTE(review): no exported method calling cancel is visible in this
    // excerpt - confirm how the monitor is stopped.
    ctx       context.Context
    cancel    context.CancelFunc
}

// NewDiskMonitor returns a monitor for devices that checks them every
// interval once Run is called. The monitor owns a cancellable context derived
// from context.Background and an alert channel buffered to 100 entries. The
// devices slice is stored as given, not copied.
func NewDiskMonitor(devices []string, interval time.Duration) *DiskMonitor {
    m := &DiskMonitor{
        devices:   devices,
        interval:  interval,
        alertChan: make(chan Alert, 100),
    }
    m.ctx, m.cancel = context.WithCancel(context.Background())
    return m
}

// Run checks all disks once immediately and then again on every tick of the
// configured interval. It blocks until the monitor's context is cancelled.
func (m *DiskMonitor) Run() {
    tick := time.NewTicker(m.interval)
    defer tick.Stop()

    // First round right at start-up, without waiting for the first tick.
    m.checkAllDisks()

    done := m.ctx.Done()
    for {
        select {
        case <-tick.C:
            m.checkAllDisks()
        case <-done:
            return
        }
    }
}

// checkAllDisks runs one monitoring round: for every configured device it
// fetches the health summary, updates the Prometheus gauges and queues one
// Alert per alert message.
//
// A device whose check fails is logged and skipped. The round is abandoned as
// soon as the monitor's context is cancelled, both between devices and while
// waiting for room in the alert channel. Without that, a full channel (slow or
// absent consumer) would block Run forever and make the monitor unstoppable.
func (m *DiskMonitor) checkAllDisks() {
    for _, device := range m.devices {
        // Each smartctl run can take a while; do not start another one
        // after shutdown has been requested.
        if m.ctx.Err() != nil {
            return
        }

        health, err := GetDiskHealth(device)
        if err != nil {
            log.Printf("检查 %s 失败: %v", device, err)
            continue
        }

        // Export to Prometheus.
        diskTemperature.WithLabelValues(device).Set(float64(health.Temperature))
        diskReallocatedSectors.WithLabelValues(device).Set(float64(health.ReallocatedSectors))

        // Queue alerts, giving up if the monitor is cancelled while the
        // channel is full.
        for _, msg := range health.Alerts {
            alert := Alert{
                Device:  device,
                Message: msg,
                Time:    time.Now(),
            }
            select {
            case m.alertChan <- alert:
            case <-m.ctx.Done():
                return
            }
        }
    }
}

四、告警通知

4.1 集成邮件通知

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
// consumeAlerts receives alerts from the monitor's channel and e-mails them.
// It returns only when the channel is closed.
//
// To avoid alert storms an alert that isRecentlySent reports as already
// delivered is dropped (the article describes the window as one hour per
// device and message; the helper itself is not shown in this excerpt).
//
// An alert is marked as sent only after sendEmail succeeded. Recording it
// after a failed delivery would suppress every retry for the whole dedup
// window, and the alert would never reach the user.
func (m *DiskMonitor) consumeAlerts() {
    for alert := range m.alertChan {
        if m.isRecentlySent(alert) {
            continue
        }

        if err := m.sendEmail(alert); err != nil {
            log.Printf("发送邮件失败: %v", err)
            // Not recorded: the next occurrence of this alert retries.
            continue
        }

        m.recordSent(alert)
    }
}

// sendEmail formats alert as an e-mail (subject naming the device, body with
// device, timestamp and message) and hands it to smtp.SendMail, returning
// that call's error.
//
// NOTE(review): the smtp.SendMail arguments are elided in this excerpt, so
// subject and body are not visibly passed anywhere - the server address,
// credentials, recipients and message assembly must be filled in before this
// compiles.
func (m *DiskMonitor) sendEmail(alert Alert) error {
    subject := fmt.Sprintf("[NAS告警] %s 磁盘异常", alert.Device)
    body := fmt.Sprintf(`
设备: %s
时间: %s
详情: %s

请尽快登录 NAS 管理界面查看详情。
`, alert.Device, alert.Time.Format("2006-01-02 15:04:05"), alert.Message)
    
    return smtp.SendMail(/* ... */)
}

4.2 Prometheus 指标

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// Prometheus gauges exported by the disk monitor. Each one carries a single
// "device" label and is registered through promauto at package initialisation.
var (
    // diskTemperature is the disk temperature in degrees Celsius.
    diskTemperature = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "disk_temperature_celsius",
            Help: "Disk temperature in Celsius",
        },
        []string{"device"},
    )
    
    // diskReallocatedSectors is the number of reallocated sectors.
    // NOTE(review): Prometheus naming guidelines reserve the "_total" suffix
    // for counters, while this metric is a gauge - confirm with dashboard
    // owners before renaming.
    diskReallocatedSectors = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "disk_reallocated_sectors_total",
            Help: "Number of reallocated sectors",
        },
        []string{"device"},
    )
    
    // diskSmartPassed is 1 when the SMART self-test passed and 0 when it failed.
    // NOTE(review): no code in this excerpt sets this gauge - confirm it is
    // updated alongside the other two.
    diskSmartPassed = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "disk_smart_passed",
            Help: "SMART self-test passed (1) or failed (0)",
        },
        []string{"device"},
    )
)

五、生产环境注意事项

5.1 避免频繁查询

smartctl 执行时会让磁盘做 I/O，过于频繁会影响性能：

1
2
3
4
5
// Recommended polling intervals.
const (
    CheckInterval = 15 * time.Minute // routine checks
    QuickInterval = 5 * time.Minute  // tighter polling once a problem has been detected
)

5.2 处理 SSD

SSD 的 SMART 属性和 HDD 不同：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
// analyzeSSDHealth builds a DiskHealth for a solid-state drive from a decoded
// smartctl report.
//
// Identity, overall status, temperature and power-on hours are copied from
// the report. The attribute table is then scanned by attribute name for
// SSD-specific indicators:
//
//   - Wear_Leveling_Count: a warning is added when the value drops below 20
//     (per the original note, 100 is a new drive and 0 is end of life).
//   - Available_Reservd_Space: recognised but not evaluated yet.
//
// The previous version declared a *DiskHealth result but never returned one
// and left the wear-level branch empty, so it neither compiled nor alerted.
func analyzeSSDHealth(raw *SmartctlOutput) *DiskHealth {
    health := &DiskHealth{
        Device:       raw.Device.Name,
        Model:        raw.ModelName,
        Serial:       raw.SerialNumber,
        Passed:       raw.SmartStatus.Passed,
        Temperature:  raw.Temperature.Current,
        PowerOnHours: raw.PowerOnTime.Hours,
    }

    for _, attr := range raw.ATASmartAttributes.Table {
        switch attr.Name {
        case "Wear_Leveling_Count":
            if attr.Value < 20 {
                health.Alerts = append(health.Alerts, fmt.Sprintf(
                    "[WARNING] SSD 磨损均衡计数仅剩 %d，寿命即将耗尽，建议尽快更换", attr.Value))
            }
        case "Available_Reservd_Space":
            // Reserved spare space: no threshold defined yet.
        }
    }

    return health
}

5.3 RAID 场景

如果使用软 RAID，需要检查底层磁盘：

1
2
3
4
5
6
# List the software RAID arrays and their member disks
cat /proc/mdstat

# Run smartctl against every member disk
smartctl -a /dev/sda
smartctl -a /dev/sdb

六、总结

组件	技术选型
数据采集	smartctl -j
解析	Go json.Unmarshal
定时调度	time.Ticker + context
告警去重	内存缓存 + TTL
可观测性	Prometheus metrics
通知	Email / Webhook

核心原则：磁盘故障不可避免，但可以提前发现。一个好的监控系统能给用户足够的时间备份数据和更换硬盘。

相关文章
Go Daemon 开发实战：优雅处理 SIGTERM 与热重载配置
实战 OOM Killer：一次容器内存泄漏的排查全过程

目录