kiss server monitoring tool with email alerts
go monitoring

Add graceful shutdown and multi-disk support

Introduce context- and signal-based graceful shutdown and start/stop
monitors with context cancellation. Support multiple disks with per-disk
intervals and cooldowns. Add check_interval to CPU/Memory and smtp_port
to Email config. Improve email logs/errors, update example config and
README. Bump module dependencies.

+304 -119
+27 -19
.servmon.example.yaml
··· 1 1 alert_thresholds: 2 - cpu: 3 - threshold: 90 4 - duration: 5m0s 5 - cooldown: 30m0s 6 - memory: 7 - threshold: 80 8 - cooldown: 30m0s 9 - http: 10 - url: http://localhost:8080/health 11 - timeout: 5s 12 - sample_rate: 10 13 - failure_threshold: 20 14 - check_interval: 1m0s 15 - cooldown: 15m0s 2 + cpu: 3 + threshold: 90 4 + duration: 5m0s 5 + cooldown: 30m0s 6 + check_interval: 10s 7 + memory: 8 + threshold: 80 9 + cooldown: 30m0s 10 + check_interval: 10s 11 + disks: 12 + - path: / 13 + threshold: 90 14 + cooldown: 4h0m0s 15 + check_interval: 1m0s 16 + http: 17 + url: http://localhost:8080/health 18 + timeout: 5s 19 + sample_rate: 10 20 + failure_threshold: 20 21 + check_interval: 1m0s 22 + cooldown: 15m0s 16 23 email: 17 - smtp_server: smtp.example.com 18 - from: alerts@example.com 19 - to: admin@example.com 20 - username: alertuser 21 - password: alertpassword 24 + smtp_server: smtp.example.com 25 + smtp_port: 587 26 + from: alerts@example.com 27 + to: admin@example.com 28 + username: alertuser 29 + password: alertpassword
+115 -12
config.go
··· 3 3 import ( 4 4 "fmt" 5 5 "os" 6 + "strings" 6 7 "time" 7 8 8 9 "gopkg.in/yaml.v3" ··· 17 18 type Thresholds struct { 18 19 CPU ThresholdConfig `yaml:"cpu"` 19 20 Memory ThresholdConfig `yaml:"memory"` 20 - Disk ThresholdConfig `yaml:"disk"` 21 + Disks []DiskConfig `yaml:"disks"` 21 22 HTTP HTTP `yaml:"http"` 22 23 } 23 24 24 25 type ThresholdConfig struct { 25 - Threshold float64 `yaml:"threshold"` 26 - Duration time.Duration `yaml:"duration,omitempty"` 27 - Cooldown time.Duration `yaml:"cooldown"` 26 + Threshold float64 `yaml:"threshold"` 27 + Duration time.Duration `yaml:"duration,omitempty"` 28 + Cooldown time.Duration `yaml:"cooldown"` 29 + CheckInterval time.Duration `yaml:"check_interval"` 30 + } 31 + 32 + type DiskConfig struct { 33 + Path string `yaml:"path"` 34 + Threshold float64 `yaml:"threshold"` 35 + Cooldown time.Duration `yaml:"cooldown"` 36 + CheckInterval time.Duration `yaml:"check_interval"` 28 37 } 29 38 30 39 type HTTP struct { ··· 38 47 39 48 type Email struct { 40 49 SMTPServer string `yaml:"smtp_server"` 50 + SMTPPort int `yaml:"smtp_port"` 41 51 From string `yaml:"from"` 42 52 To string `yaml:"to"` 43 53 Username string `yaml:"username"` ··· 62 72 return &Config{ 63 73 AlertThresholds: Thresholds{ 64 74 CPU: ThresholdConfig{ 65 - Threshold: 90, 66 - Duration: 5 * time.Minute, 67 - Cooldown: 30 * time.Minute, 75 + Threshold: 90, 76 + Duration: 5 * time.Minute, 77 + Cooldown: 30 * time.Minute, 78 + CheckInterval: 10 * time.Second, 68 79 }, 69 80 Memory: ThresholdConfig{ 70 - Threshold: 80, 71 - Cooldown: 30 * time.Minute, 81 + Threshold: 80, 82 + Cooldown: 30 * time.Minute, 83 + CheckInterval: 10 * time.Second, 72 84 }, 73 - Disk: ThresholdConfig{ 74 - Threshold: 90, 75 - Cooldown: 4 * time.Hour, 85 + Disks: []DiskConfig{ 86 + { 87 + Path: "/", 88 + Threshold: 90, 89 + Cooldown: 4 * time.Hour, 90 + CheckInterval: 1 * time.Minute, 91 + }, 76 92 }, 77 93 HTTP: HTTP{ 78 94 URL: "http://localhost:8080/health", ··· 85 101 }, 86 102 Email: 
Email{ 87 103 SMTPServer: "smtp.example.com", 104 + SMTPPort: 587, 88 105 From: "alerts@example.com", 89 106 To: "admin@example.com", 90 107 Username: "alertuser", ··· 105 122 return nil, fmt.Errorf("error unmarshaling config: %w", err) 106 123 } 107 124 125 + if err := cfg.Validate(); err != nil { 126 + return nil, fmt.Errorf("invalid configuration: %w", err) 127 + } 128 + 108 129 return &cfg, nil 109 130 } 131 + 132 + // Validate checks if the configuration is valid 133 + func (c *Config) Validate() error { 134 + // Validate CPU thresholds 135 + if c.AlertThresholds.CPU.Threshold <= 0 || c.AlertThresholds.CPU.Threshold > 100 { 136 + return fmt.Errorf("CPU threshold must be between 0 and 100") 137 + } 138 + if c.AlertThresholds.CPU.Cooldown <= 0 { 139 + return fmt.Errorf("CPU cooldown must be positive") 140 + } 141 + if c.AlertThresholds.CPU.CheckInterval <= 0 { 142 + return fmt.Errorf("CPU check interval must be positive") 143 + } 144 + 145 + // Validate Memory thresholds 146 + if c.AlertThresholds.Memory.Threshold <= 0 || c.AlertThresholds.Memory.Threshold > 100 { 147 + return fmt.Errorf("memory threshold must be between 0 and 100") 148 + } 149 + if c.AlertThresholds.Memory.Cooldown <= 0 { 150 + return fmt.Errorf("memory cooldown must be positive") 151 + } 152 + if c.AlertThresholds.Memory.CheckInterval <= 0 { 153 + return fmt.Errorf("memory check interval must be positive") 154 + } 155 + 156 + // Validate Disk thresholds 157 + if len(c.AlertThresholds.Disks) == 0 { 158 + return fmt.Errorf("at least one disk must be configured") 159 + } 160 + for i, disk := range c.AlertThresholds.Disks { 161 + if disk.Path == "" { 162 + return fmt.Errorf("disk[%d] path cannot be empty", i) 163 + } 164 + if disk.Threshold <= 0 || disk.Threshold > 100 { 165 + return fmt.Errorf("disk[%d] threshold must be between 0 and 100", i) 166 + } 167 + if disk.Cooldown <= 0 { 168 + return fmt.Errorf("disk[%d] cooldown must be positive", i) 169 + } 170 + if disk.CheckInterval <= 0 { 171 + 
return fmt.Errorf("disk[%d] check interval must be positive", i) 172 + } 173 + } 174 + 175 + // Validate HTTP thresholds 176 + if c.AlertThresholds.HTTP.URL != "" { 177 + if !strings.HasPrefix(c.AlertThresholds.HTTP.URL, "http://") && !strings.HasPrefix(c.AlertThresholds.HTTP.URL, "https://") { 178 + return fmt.Errorf("HTTP URL must start with http:// or https://") 179 + } 180 + if c.AlertThresholds.HTTP.Timeout <= 0 { 181 + return fmt.Errorf("HTTP timeout must be positive") 182 + } 183 + if c.AlertThresholds.HTTP.SampleRate <= 0 { 184 + return fmt.Errorf("HTTP sample rate must be positive") 185 + } 186 + if c.AlertThresholds.HTTP.FailureThreshold < 0 || c.AlertThresholds.HTTP.FailureThreshold > 100 { 187 + return fmt.Errorf("HTTP failure threshold must be between 0 and 100") 188 + } 189 + if c.AlertThresholds.HTTP.CheckInterval <= 0 { 190 + return fmt.Errorf("HTTP check interval must be positive") 191 + } 192 + if c.AlertThresholds.HTTP.Cooldown <= 0 { 193 + return fmt.Errorf("HTTP cooldown must be positive") 194 + } 195 + } 196 + 197 + // Validate Email configuration 198 + if c.Email.SMTPServer == "" { 199 + return fmt.Errorf("SMTP server cannot be empty") 200 + } 201 + if c.Email.SMTPPort <= 0 || c.Email.SMTPPort > 65535 { 202 + return fmt.Errorf("SMTP port must be between 1 and 65535") 203 + } 204 + if c.Email.From == "" { 205 + return fmt.Errorf("from email address cannot be empty") 206 + } 207 + if c.Email.To == "" { 208 + return fmt.Errorf("to email address cannot be empty") 209 + } 210 + 211 + return nil 212 + }
+8 -5
email.go
··· 9 9 10 10 // sendEmail sends an alert email using the configuration 11 11 func sendEmail(subject, body string, cfg *Config) error { 12 + log.Printf("Attempting to send email alert: %s", subject) 13 + 12 14 msg := mail.NewMsg() 13 15 if err := msg.From(cfg.Email.From); err != nil { 14 - return fmt.Errorf("failed to set FROM address: %w", err) 16 + return fmt.Errorf("failed to set FROM address '%s': %w", cfg.Email.From, err) 15 17 } 16 18 if err := msg.To(cfg.Email.To); err != nil { 17 - return fmt.Errorf("failed to set TO address: %w", err) 19 + return fmt.Errorf("failed to set TO address '%s': %w", cfg.Email.To, err) 18 20 } 19 21 20 22 msg.Subject(fmt.Sprintf("[ServMon Alert] %s", subject)) ··· 23 25 // Create SMTP client with configuration 24 26 client, err := mail.NewClient( 25 27 cfg.Email.SMTPServer, 28 + mail.WithPort(cfg.Email.SMTPPort), 26 29 mail.WithSMTPAuth(mail.SMTPAuthPlain), 27 30 mail.WithTLSPortPolicy(mail.TLSMandatory), 28 31 mail.WithUsername(cfg.Email.Username), 29 32 mail.WithPassword(cfg.Email.Password), 30 33 ) 31 34 if err != nil { 32 - return fmt.Errorf("failed to create SMTP client: %w", err) 35 + return fmt.Errorf("failed to create SMTP client for %s:%d: %w", cfg.Email.SMTPServer, cfg.Email.SMTPPort, err) 33 36 } 34 37 35 38 // Send the email 36 39 if err := client.DialAndSend(msg); err != nil { 37 - return fmt.Errorf("failed to send email: %w", err) 40 + return fmt.Errorf("failed to send email to %s via %s:%d: %w", cfg.Email.To, cfg.Email.SMTPServer, cfg.Email.SMTPPort, err) 38 41 } 39 42 40 - log.Printf("Email alert sent successfully: %s", subject) 43 + log.Printf("✓ Email alert sent successfully to %s: %s", cfg.Email.To, subject) 41 44 return nil 42 45 }
+4 -4
go.mod
··· 3 3 go 1.25.1 4 4 5 5 require ( 6 - github.com/shirou/gopsutil/v4 v4.25.10 7 - github.com/spf13/cobra v1.10.1 6 + github.com/shirou/gopsutil/v4 v4.25.11 7 + github.com/spf13/cobra v1.10.2 8 8 github.com/wneessen/go-mail v0.7.2 9 9 gopkg.in/yaml.v3 v3.0.1 10 10 ) ··· 19 19 github.com/tklauser/go-sysconf v0.3.16 // indirect 20 20 github.com/tklauser/numcpus v0.11.0 // indirect 21 21 github.com/yusufpapurcu/wmi v1.2.4 // indirect 22 - golang.org/x/sys v0.38.0 // indirect 23 - golang.org/x/text v0.31.0 // indirect 22 + golang.org/x/sys v0.39.0 // indirect 23 + golang.org/x/text v0.32.0 // indirect 24 24 )
+9 -8
go.sum
··· 17 17 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= 18 18 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= 19 19 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 20 - github.com/shirou/gopsutil/v4 v4.25.10 h1:at8lk/5T1OgtuCp+AwrDofFRjnvosn0nkN2OLQ6g8tA= 21 - github.com/shirou/gopsutil/v4 v4.25.10/go.mod h1:+kSwyC8DRUD9XXEHCAFjK+0nuArFJM0lva+StQAcskM= 22 - github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= 23 - github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= 20 + github.com/shirou/gopsutil/v4 v4.25.11 h1:X53gB7muL9Gnwwo2evPSE+SfOrltMoR6V3xJAXZILTY= 21 + github.com/shirou/gopsutil/v4 v4.25.11/go.mod h1:EivAfP5x2EhLp2ovdpKSozecVXn1TmuG7SMzs/Wh4PU= 22 + github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= 23 + github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= 24 24 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 25 25 github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= 26 26 github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= ··· 34 34 github.com/wneessen/go-mail v0.7.2/go.mod h1:+TkW6QP3EVkgTEqHtVmnAE/1MRhmzb8Y9/W3pweuS+k= 35 35 github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= 36 36 github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= 37 + go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= 37 38 golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 38 39 golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 39 40 golang.org/x/sys v0.1.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 40 - golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= 41 - golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 42 - golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= 43 - golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= 41 + golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= 42 + golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= 43 + golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= 44 + golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= 44 45 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 45 46 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 46 47 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+31 -6
main.go
··· 1 1 package main 2 2 3 3 import ( 4 + "context" 4 5 "errors" 5 6 "fmt" 6 7 "os" 7 8 "os/exec" 9 + "os/signal" 8 10 "path" 9 11 "runtime" 10 12 "runtime/debug" 11 13 "strings" 14 + "syscall" 12 15 13 16 "github.com/spf13/cobra" 14 17 ) ··· 75 78 return err 76 79 } 77 80 78 - go monitorCPU(cfg) 79 - go monitorMemory(cfg) 80 - go monitorDisk(cfg) 81 - go monitorHTTP(cfg) 81 + // Set up signal handling for graceful shutdown 82 + sigChan := make(chan os.Signal, 1) 83 + signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) 84 + 85 + // Create context for graceful shutdown 86 + ctx, cancel := context.WithCancel(context.Background()) 87 + defer cancel() 88 + 89 + // Start monitoring goroutines 90 + go monitorCPU(ctx, cfg) 91 + go monitorMemory(ctx, cfg) 92 + 93 + for _, diskCfg := range cfg.AlertThresholds.Disks { 94 + go monitorDisk(ctx, cfg, diskCfg) 95 + } 96 + 97 + if cfg.AlertThresholds.HTTP.URL != "" { 98 + go monitorHTTP(ctx, cfg) 99 + } 82 100 83 - select {} // keep alive 101 + cmd.Println("Servmon started successfully. Monitoring active.") 102 + cmd.Println("Press Ctrl+C to stop.") 103 + 104 + // Wait for shutdown signal 105 + sig := <-sigChan 106 + cmd.Printf("\nReceived signal %v, shutting down gracefully...\n", sig) 107 + cancel() 108 + return nil 84 109 }, 85 110 } 86 111 ··· 123 148 return pid, nil 124 149 } 125 150 126 - return 0, fmt.Errorf("daemon mode is not supported on %s", runtime.GOOS) 151 + return 0, fmt.Errorf("daemon mode is only supported on Linux and FreeBSD, not on %s", runtime.GOOS) 127 152 } 128 153 129 154 func getVersion() (string, error) {
+98 -56
monitor.go
··· 12 12 "github.com/shirou/gopsutil/v4/mem" 13 13 ) 14 14 15 - func monitorCPU(cfg *Config) { 16 - log.Printf("Monitoring CPU usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.CPU.Threshold, cfg.AlertThresholds.CPU.Cooldown) 15 + func monitorCPU(ctx context.Context, cfg *Config) { 16 + log.Printf("Monitoring CPU usage with threshold %.2f%%, check interval %v, and cooldown %v", 17 + cfg.AlertThresholds.CPU.Threshold, cfg.AlertThresholds.CPU.CheckInterval, cfg.AlertThresholds.CPU.Cooldown) 17 18 18 - alertCooldown := time.NewTimer(cfg.AlertThresholds.CPU.Cooldown) 19 + // Initialize cooldown timer in expired state so the first alert can fire immediately. 20 + // We create a timer with 0 duration and drain it right away, so the select case 21 + // <-alertCooldown.C will succeed on the first threshold breach. 22 + alertCooldown := time.NewTimer(0) 23 + <-alertCooldown.C 24 + 25 + ticker := time.NewTicker(cfg.AlertThresholds.CPU.CheckInterval) 26 + defer ticker.Stop() 27 + 19 28 for { 29 + select { 30 + case <-ctx.Done(): 31 + log.Println("CPU monitor shutting down") 32 + return 33 + case <-ticker.C: 34 + } 20 35 percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false) 21 36 if err != nil { 22 37 log.Printf("Error getting CPU usage: %v", err) 23 - time.Sleep(1 * time.Second) 24 38 continue 25 39 } 26 40 ··· 29 43 for _, p := range percent { 30 44 total += p 31 45 } 46 + 47 + // Safety check: prevent division by zero 48 + if len(percent) == 0 { 49 + log.Printf("Warning: CPU percentage returned empty array, skipping check") 50 + continue 51 + } 32 52 avg := total / float64(len(percent)) 33 53 34 54 if avg > cfg.AlertThresholds.CPU.Threshold { 35 - // Check if we're within the cooldown period 55 + // Check if we're within the cooldown period using non-blocking select. 56 + // If the timer has expired, we can send an alert and reset the timer. 57 + // If not, we skip the alert to prevent spam. 
36 58 select { 37 59 case <-alertCooldown.C: 38 - // Cooldown expired, check again 60 + // Cooldown expired, send alert 61 + err := sendEmail(fmt.Sprintf("CPU Usage Alert: %.2f%%", avg), 62 + fmt.Sprintf("CPU usage of %.2f%% has exceeded the threshold of %.2f%%", avg, cfg.AlertThresholds.CPU.Threshold), cfg) 63 + if err != nil { 64 + log.Printf("Error sending email: %v", err) 65 + } 66 + // Reset timer to start a new cooldown period 39 67 alertCooldown.Reset(cfg.AlertThresholds.CPU.Cooldown) 40 68 default: 41 69 // Within cooldown, skip alert 42 - time.Sleep(1 * time.Second) 43 - continue 44 - } 45 - 46 - err := sendEmail(fmt.Sprintf("CPU Usage Alert: %.2f%%", avg), 47 - fmt.Sprintf("CPU usage of %.2f%% has exceeded the threshold of %.2f%%", avg, cfg.AlertThresholds.CPU.Threshold), cfg) 48 - if err != nil { 49 - log.Printf("Error sending email: %v", err) 50 70 } 51 71 } 52 - 53 - time.Sleep(time.Duration(1) * time.Second) 54 72 } 55 73 } 56 74 57 - func monitorMemory(cfg *Config) { 58 - log.Printf("Monitoring memory usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Memory.Threshold, cfg.AlertThresholds.Memory.Cooldown) 75 + func monitorMemory(ctx context.Context, cfg *Config) { 76 + log.Printf("Monitoring memory usage with threshold %.2f%%, check interval %v, and cooldown %v", 77 + cfg.AlertThresholds.Memory.Threshold, cfg.AlertThresholds.Memory.CheckInterval, cfg.AlertThresholds.Memory.Cooldown) 59 78 60 - alertCooldown := time.NewTimer(cfg.AlertThresholds.Memory.Cooldown) 79 + alertCooldown := time.NewTimer(0) 80 + <-alertCooldown.C // Drain the initial timer immediately so first alert can fire 81 + 82 + ticker := time.NewTicker(cfg.AlertThresholds.Memory.CheckInterval) 83 + defer ticker.Stop() 84 + 61 85 for { 86 + select { 87 + case <-ctx.Done(): 88 + log.Println("Memory monitor shutting down") 89 + return 90 + case <-ticker.C: 91 + } 62 92 vm, err := mem.VirtualMemory() 63 93 if err != nil { 64 94 log.Printf("Error getting memory usage: %v", 
err) 65 - time.Sleep(1 * time.Second) 66 95 continue 67 96 } 68 97 ··· 72 101 // Check if we're within the cooldown period 73 102 select { 74 103 case <-alertCooldown.C: 75 - // Cooldown expired, check again 104 + // Cooldown expired, send alert 105 + err := sendEmail(fmt.Sprintf("Memory Usage Alert: %.2f%%", usedPercent), 106 + fmt.Sprintf("Memory usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Memory.Threshold), cfg) 107 + if err != nil { 108 + log.Printf("Error sending email: %v", err) 109 + } 76 110 alertCooldown.Reset(cfg.AlertThresholds.Memory.Cooldown) 77 111 default: 78 112 // Within cooldown, skip alert 79 - time.Sleep(1 * time.Second) 80 - continue 81 - } 82 - 83 - err := sendEmail(fmt.Sprintf("Memory Usage Alert: %.2f%%", usedPercent), 84 - fmt.Sprintf("Memory usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Memory.Threshold), cfg) 85 - if err != nil { 86 - log.Printf("Error sending email: %v", err) 87 113 } 88 114 } 89 - 90 - time.Sleep(time.Duration(1) * time.Second) 91 115 } 92 116 } 93 117 94 - func monitorDisk(cfg *Config) { 95 - log.Printf("Monitoring disk usage with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.Disk.Threshold, cfg.AlertThresholds.Disk.Cooldown) 118 + func monitorDisk(ctx context.Context, cfg *Config, diskCfg DiskConfig) { 119 + log.Printf("Monitoring disk %s usage with threshold %.2f%%, check interval %v, and cooldown %v", 120 + diskCfg.Path, diskCfg.Threshold, diskCfg.CheckInterval, diskCfg.Cooldown) 121 + 122 + alertCooldown := time.NewTimer(0) 123 + <-alertCooldown.C // Drain the initial timer immediately so first alert can fire 124 + 125 + ticker := time.NewTicker(diskCfg.CheckInterval) 126 + defer ticker.Stop() 96 127 97 - alertCooldown := time.NewTimer(cfg.AlertThresholds.Disk.Cooldown) 98 128 for { 99 - usage, err := disk.Usage("/") 129 + select { 130 + case <-ctx.Done(): 131 + log.Printf("Disk monitor for %s shutting down\n", 
diskCfg.Path) 132 + return 133 + case <-ticker.C: 134 + } 135 + usage, err := disk.Usage(diskCfg.Path) 100 136 if err != nil { 101 - log.Printf("Error getting disk usage: %v", err) 102 - time.Sleep(1 * time.Second) 137 + log.Printf("Error getting disk usage for %s: %v", diskCfg.Path, err) 103 138 continue 104 139 } 105 140 106 141 usedPercent := usage.UsedPercent 107 - if usedPercent > cfg.AlertThresholds.Disk.Threshold { 142 + if usedPercent > diskCfg.Threshold { 108 143 // Check if we're within the cooldown period 109 144 select { 110 145 case <-alertCooldown.C: 111 - // Cooldown expired, check again 112 - alertCooldown.Reset(cfg.AlertThresholds.Disk.Cooldown) 146 + // Cooldown expired, send alert 147 + err := sendEmail(fmt.Sprintf("Disk Usage Alert: %s %.2f%%", diskCfg.Path, usedPercent), 148 + fmt.Sprintf("Disk usage for %s of %.2f%% has exceeded the threshold of %.2f%%", diskCfg.Path, usedPercent, diskCfg.Threshold), cfg) 149 + if err != nil { 150 + log.Printf("Error sending email: %v", err) 151 + } 152 + alertCooldown.Reset(diskCfg.Cooldown) 113 153 default: 114 154 // Within cooldown, skip alert 115 - time.Sleep(1 * time.Second) 116 - continue 117 - } 118 - 119 - err := sendEmail(fmt.Sprintf("Disk Usage Alert: %.2f%%", usedPercent), 120 - fmt.Sprintf("Disk usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Disk.Threshold), cfg) 121 - if err != nil { 122 - log.Printf("Error sending email: %v", err) 123 155 } 124 156 } 125 - 126 - time.Sleep(time.Duration(1) * time.Second) 127 157 } 128 158 } 129 159 130 - func monitorHTTP(cfg *Config) { 160 + func monitorHTTP(ctx context.Context, cfg *Config) { 131 161 log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown) 132 162 133 - alertCooldown := time.NewTimer(cfg.AlertThresholds.HTTP.Cooldown) 163 + alertCooldown := time.NewTimer(0) 164 + 
<-alertCooldown.C // Drain the initial timer immediately so first alert can fire 134 165 client := &http.Client{ 135 166 Timeout: cfg.AlertThresholds.HTTP.Timeout, 136 167 } 137 168 138 169 for { 139 - // Wait for check interval 140 - time.Sleep(cfg.AlertThresholds.HTTP.CheckInterval) 170 + // Wait for check interval or context cancellation 171 + select { 172 + case <-ctx.Done(): 173 + log.Println("HTTP monitor shutting down") 174 + return 175 + case <-time.After(cfg.AlertThresholds.HTTP.CheckInterval): 176 + } 141 177 142 - // Perform HTTP checks 178 + // Perform HTTP checks in batch to calculate failure rate. 179 + // We make sample_rate number of requests and track how many fail. 143 180 failureCount := 0 144 181 for i := 0; i < cfg.AlertThresholds.HTTP.SampleRate; i++ { 145 182 req, err := http.NewRequest("GET", cfg.AlertThresholds.HTTP.URL, nil) ··· 151 188 ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout) 152 189 153 190 resp, err := client.Do(req.WithContext(ctx)) 154 - if err != nil || resp.StatusCode >= 400 { 191 + if err != nil { 155 192 failureCount++ 193 + } else { 194 + if resp.StatusCode >= 400 { 195 + failureCount++ 196 + } 197 + resp.Body.Close() 156 198 } 157 199 158 200 cancel()
+12 -9
readme.md
··· 1 1 # Servmon 2 2 3 - KISS server monitoring tool with email alerts. 4 - For those who want to keep it simple instead of using Prometheus, Grafana, and Alertmanager. 3 + KISS (Keep It Simple, Stupid) server monitoring tool with email alerts. 4 + 5 + For those who want to keep it simple instead of using complex setups like Prometheus, Grafana, and Alertmanager. 5 6 It uses the awesome [gopsutil](https://github.com/shirou/gopsutil) library to get system metrics. 6 7 7 - Monitors: 8 + ## Features 8 9 9 - - [x] CPU 10 - - [x] Memory 11 - - [x] HTTP Health check 12 - - [x] Disk Usage 13 - - [ ] Disk Write/Read 14 - - [ ] Docker 10 + - [x] **CPU Monitoring** - Monitor CPU usage with configurable thresholds and duration 11 + - [x] **Memory Monitoring** - Track memory usage with percentage-based alerts 12 + - [x] **Disk Monitoring** - Monitor multiple disk partitions independently 13 + - [x] **HTTP Health Checks** - Periodic health checks with failure rate monitoring 14 + - [x] **Email Alerts** - SMTP-based email notifications with configurable cooldowns 15 + - [x] **Graceful Shutdown** - Clean shutdown on SIGTERM/SIGINT 16 + - [x] **Config Validation** - Automatic validation of configuration parameters 17 + - [ ] Disk Write/Read performance monitoring 15 18 16 19 ## Installation 17 20