KISS server monitoring tool with email alerts
go monitoring

feat: add disk usage

+57 -17
+4 -2
README.md
··· 1 - # Servmond 1 + # Servmon 2 2 3 3 KISS server monitoring tool with email alerts. 4 + For those who want to keep it simple instead of using Prometheus, Grafana, and Alertmanager. 4 5 5 6 Monitors: 6 7 7 8 - [x] CPU 8 9 - [x] Memory 9 10 - [x] HTTP Health check 10 - - [ ] Disk 11 + - [x] Disk Usage 12 + - [ ] Disk Write/Read 11 13 - [ ] Docker 12 14 13 15 ## Installation
+12 -12
config.go
··· 15 15 } 16 16 17 17 type Thresholds struct { 18 - CPU CPU `yaml:"cpu"` 19 - Memory Memory `yaml:"memory"` 20 - HTTP HTTP `yaml:"http"` 21 - } 22 - 23 - type CPU struct { 24 - Threshold float64 `yaml:"threshold"` 25 - Duration time.Duration `yaml:"duration"` 26 - Cooldown time.Duration `yaml:"cooldown"` 18 + CPU ThresholdConfig `yaml:"cpu"` 19 + Memory ThresholdConfig `yaml:"memory"` 20 + Disk ThresholdConfig `yaml:"disk"` 21 + HTTP HTTP `yaml:"http"` 27 22 } 28 23 29 - type Memory struct { 24 + type ThresholdConfig struct { 30 25 Threshold float64 `yaml:"threshold"` 26 + Duration time.Duration `yaml:"duration,omitempty"` 31 27 Cooldown time.Duration `yaml:"cooldown"` 32 28 } 33 29 ··· 65 61 func defaultConfig() *Config { 66 62 return &Config{ 67 63 AlertThresholds: Thresholds{ 68 - CPU: CPU{ 64 + CPU: ThresholdConfig{ 69 65 Threshold: 90, 70 66 Duration: 5 * time.Minute, 71 67 Cooldown: 30 * time.Minute, 72 68 }, 73 - Memory: Memory{ 69 + Memory: ThresholdConfig{ 74 70 Threshold: 80, 75 71 Cooldown: 30 * time.Minute, 72 + }, 73 + Disk: ThresholdConfig{ 74 + Threshold: 90, 75 + Cooldown: 4 * time.Hour, 76 76 }, 77 77 HTTP: HTTP{ 78 78 URL: "http://localhost:8080/health",
+1
main.go
··· 50 50 51 51 go monitorCPU(cfg) 52 52 go monitorMemory(cfg) 53 + go monitorDisk(cfg) 53 54 go monitorHTTP(cfg) 54 55 55 56 select {} // keep alive
+40 -3
monitor.go
··· 8 8 "time" 9 9 10 10 "github.com/shirou/gopsutil/v4/cpu" 11 + "github.com/shirou/gopsutil/v4/disk" 11 12 "github.com/shirou/gopsutil/v4/mem" 12 13 ) 13 14 ··· 16 17 17 18 alertCooldown := time.NewTimer(cfg.AlertThresholds.CPU.Cooldown) 18 19 for { 19 - percent, err := cpu.Percent(time.Duration(1)*time.Second, false) 20 + percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false) 20 21 if err != nil { 21 22 log.Printf("Error getting CPU usage: %v", err) 22 23 time.Sleep(1 * time.Second) ··· 28 29 for _, p := range percent { 29 30 total += p 30 31 } 31 - 32 32 avg := total / float64(len(percent)) 33 33 34 34 if avg > cfg.AlertThresholds.CPU.Threshold { ··· 91 91 } 92 92 } 93 93 94 + func monitorDisk(cfg *Config) { 95 + log.Printf("Monitoring disk usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Disk.Threshold, cfg.AlertThresholds.Disk.Cooldown) 96 + 97 + alertCooldown := time.NewTimer(cfg.AlertThresholds.Disk.Cooldown) 98 + for { 99 + usage, err := disk.Usage("/") 100 + if err != nil { 101 + log.Printf("Error getting disk usage: %v", err) 102 + time.Sleep(1 * time.Second) 103 + continue 104 + } 105 + 106 + usedPercent := usage.UsedPercent 107 + if usedPercent > cfg.AlertThresholds.Disk.Threshold { 108 + // Check if we're within the cooldown period 109 + select { 110 + case <-alertCooldown.C: 111 + // Cooldown expired, check again 112 + alertCooldown.Reset(cfg.AlertThresholds.Disk.Cooldown) 113 + default: 114 + // Within cooldown, skip alert 115 + time.Sleep(1 * time.Second) 116 + continue 117 + } 118 + 119 + err := sendEmail(fmt.Sprintf("Disk Usage Alert: %.2f%%", usedPercent), 120 + fmt.Sprintf("Disk usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Disk.Threshold), cfg) 121 + if err != nil { 122 + log.Printf("Error sending email: %v", err) 123 + } 124 + } 125 + 126 + time.Sleep(time.Duration(1) * time.Second) 127 + } 128 + } 129 + 94 130 func monitorHTTP(cfg *Config) { 95 131 
log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown) 96 132 ··· 113 149 } 114 150 115 151 ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout) 116 - defer cancel() 117 152 118 153 resp, err := client.Do(req.WithContext(ctx)) 119 154 if err != nil || resp.StatusCode >= 400 { 120 155 failureCount++ 121 156 } 157 + 158 + cancel() 122 159 } 123 160 124 161 // Calculate failure rate