diff --git a/README.md b/README.md index 2d7e35e..3c37f81 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,47 @@ - **Multi-Channel Alerting System:** Get notified via **SMTP (Email)**, **SMS (IPPanel)**, and **Webhooks**. The architecture is extensible for adding new channels. - **Intelligent Periodic Checks:** Set custom intervals (`check_period`) for monitoring each service. - **Spam Prevention:** Define a cooldown period (`sleep_on_fail`) after a failure is detected to avoid repetitive alerts. -- **Customizable Health Conditions:** Specify the expected HTTP status code (`expected_status_code`) to define a "healthy" state for each service. +- **Customizable Health Conditions:** Define complex health rules using **AND**, **OR**, **NOT**, **Regex**, **Header**, and **Response Time** checks. +- **Recovery Notifications:** Get notified when a service comes back online after a failure. +- **Smart Templating:** Use custom message templates for different types of failures (Network, HTTP, Latency, etc.). - **Concurrent by Design:** Utilizes Goroutines to monitor all services concurrently without blocking. - **Easy Configuration:** All settings are managed through a single, human-readable `YAML` file. --- +## 🛠️ Advanced Features + +### Smart Metadata & Templating +You can use variables in your notification templates using Go's `text/template` syntax. 
+ +| Variable | Description | +| :--- | :--- | +| `{{.Metadata.ServiceName}}` | The name of the service | +| `{{.Metadata.ServiceURL}}` | The URL being checked | +| `{{.Metadata.Status}}` | Current status (UP or DOWN) | +| `{{.Metadata.Reason}}` | Detailed reason for failure (supports recursive reporting) | +| `{{.Metadata.StatusCode}}` | HTTP status code received | +| `{{.Metadata.ResponseTime}}`| Duration of the request | +| `{{.Metadata.Timestamp}}` | When the check occurred | +| `{{.Metadata.FailureCount}}`| Consecutive failures detected | + +### Template Groups +Each notifier accepts a `templates` group (`TemplateGroup`) with one message per notification type. The matching template is selected automatically: +- `network_error`: Network/Connection issues. +- `http_error`: Unexpected status codes. +- `slow_response`: Latency threshold exceeded. +- `condition_failed`: Logic/Regex/Header mismatch. +- `recovery`: Triggered when a service returns to a healthy state. +- `default`: Fallback template. + +### Recursive Error Reporting +When using nested `AND`/`OR` conditions, Healthy-API provides a detailed failure tree in the `Reason` field. This helps identify exactly which part of a complex condition caused the failure. + +### Recovery Notifications +Set `notify_on_recovery: true` in your service configuration to receive alerts when services come back online. + +--- + ## 🚀 Getting Started ### Prerequisites @@ -149,17 +184,19 @@ The project is designed with a modular architecture to easily accommodate new fe ## 🗺️ Roadmap -- [ ] Implement **Graceful Shutdown** using `context` for better Goroutine management. +- [x] Implement **Graceful Shutdown** using `context` for better Goroutine management. - [x] Add **Unit Tests** for the `healthcheck` and `notifier` modules. - [x] Support **Response Body Validation** using regular expressions (Regex). -- [ ] Add more notifiers (e.g., **Slack**, **Telegram**). -- [X] Persist logs to a file or database for historical analysis. 
+- [x] Add more notifiers (e.g., **Slack**, **Discord** via Webhooks). +- [x] Persist logs to a file or database for historical analysis. - [ ] Develop a simple **Web UI** to display the real-time status of services. -- [ ] Add cronjob insted of check_period. -- [X] enhance logging. -- [x] Add response time condition -- [ ] Add json path condition -- [x] Add retry policy +- [ ] Add cronjob instead of check_period. +- [x] Enhance logging with `slog`. +- [x] Add response time condition. +- [ ] Add json path condition. +- [x] Add retry policy (Threshold). +- [x] Add Recovery notifications. +- [x] Add Smart Template Groups. --- diff --git a/healthcheck/healthcheck.go b/healthcheck/healthcheck.go index 6991453..547f71b 100644 --- a/healthcheck/healthcheck.go +++ b/healthcheck/healthcheck.go @@ -18,16 +18,17 @@ type HealthChecker struct { ConditionRegistry *registry.Registry[model.Condition] Client *http.Client Logger *slog.Logger + isDown bool + failureCount int } func (h *HealthChecker) Start(ctx context.Context) { h.Logger.Info("checker_started", "service", h.Service.Name) - failureCount := 0 for { waitDuration := time.Duration(h.Service.CheckPeriod) * time.Second - h.performCheck(&failureCount, &waitDuration) + h.performCheck(&waitDuration) select { case <-ctx.Done(): @@ -38,7 +39,7 @@ func (h *HealthChecker) Start(ctx context.Context) { } } -func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration) { +func (h *HealthChecker) performCheck(nextWait *time.Duration) { start := time.Now() request, err := http.NewRequest("GET", h.Service.URL, nil) @@ -65,6 +66,7 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration) if err != nil { evaluationRes.Reason = fmt.Sprintf("Network/Connection Error: %v", err) + evaluationRes.Type = model.NotificationNetworkError } else if resp != nil { sCode = resp.StatusCode @@ -80,18 +82,19 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration) } if 
!evaluationRes.IsHealthy { - *failureCount++ + h.failureCount++ h.Logger.Warn("health_check_failed", "service", h.Service.Name, - "attempt", *failureCount, + "attempt", h.failureCount, "threshold", h.Service.Threshold, "status", sCode, "duration", requestDuration, "reason", evaluationRes.Reason) - if *failureCount >= h.Service.Threshold { + if h.failureCount >= h.Service.Threshold { h.Logger.Error("threshold_reached", "service", h.Service.Name, "action", "sending_notifications") + h.isDown = true metadata := model.NotificationMetadata{ ServiceName: h.Service.Name, @@ -100,8 +103,9 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration) StatusCode: sCode, ResponseTime: requestDuration.Round(time.Millisecond).String(), Timestamp: time.Now().Format(time.RFC3339), - FailureCount: *failureCount, + FailureCount: h.failureCount, Threshold: h.Service.Threshold, + Status: "DOWN", } for _, target := range h.Service.Targets { @@ -109,18 +113,44 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration) _ = n.Notify(model.Notification{ Metadata: metadata, Recipients: target.Recipients, + Type: evaluationRes.Type, }) } } *nextWait = time.Duration(h.Service.SleepOnFail) * time.Second - *failureCount = 0 // Reset after notification as per original logic + h.failureCount = 0 // Reset after notification as per original logic } } else { - if *failureCount > 0 { - h.Logger.Info("service_recovery", "service", h.Service.Name, "after_failures", *failureCount) + if h.isDown { + h.Logger.Info("service_recovery", "service", h.Service.Name) + if h.Service.NotifyOnRecovery { + metadata := model.NotificationMetadata{ + ServiceName: h.Service.Name, + ServiceURL: h.Service.URL, + Reason: "Service recovered", + StatusCode: sCode, + ResponseTime: requestDuration.Round(time.Millisecond).String(), + Timestamp: time.Now().Format(time.RFC3339), + FailureCount: 0, + Threshold: h.Service.Threshold, + Status: "UP", + } + for _, target := range 
h.Service.Targets { + if n, ok := h.NotifierRegistry.Get(target.NotifierID); ok { + _ = n.Notify(model.Notification{ + Metadata: metadata, + Recipients: target.Recipients, + Type: model.NotificationRecovery, + }) + } + } + } + h.isDown = false + } else if h.failureCount > 0 { + h.Logger.Info("service_recovered_before_threshold", "service", h.Service.Name, "after_failures", h.failureCount) } - *failureCount = 0 + h.failureCount = 0 h.Logger.Info("health_check_success", "service", h.Service.Name, "duration", requestDuration, "status_code", sCode) } } diff --git a/loader/notifier.go b/loader/notifier.go index 388d6a2..5a9bed9 100644 --- a/loader/notifier.go +++ b/loader/notifier.go @@ -48,11 +48,12 @@ func loadPayamakPanels(cfg *model.Config, reg *registry.Registry[notifier.Notifi continue } notifierInst := &notifier.PayamakNotifier{ - Username: pp.Username, - Password: pp.Password, - Sender: pp.Sender, - Template: pp.Template, - Logger: logger, + Username: pp.Username, + Password: pp.Password, + Sender: pp.Sender, + Template: pp.Template, + Templates: pp.Templates, + Logger: logger, } reg.Register(pp.ID, notifierInst) logger.Info("notifier_registered", "type", "meli_payamak", "id", pp.ID) @@ -69,11 +70,12 @@ func loadSMTPNotifiers(cfg *model.Config, reg *registry.Registry[notifier.Notifi continue } notifierInst := &notifier.MailNotifier{ - Sender: smtp.Sender, - Server: smtp.Server, - Port: smtp.Port, - Password: smtp.Password, - Logger: logger, + Sender: smtp.Sender, + Server: smtp.Server, + Port: smtp.Port, + Password: smtp.Password, + Templates: smtp.Templates, + Logger: logger, } reg.Register(smtp.ID, notifierInst) logger.Info("notifier_registered", "type", "smtp", "id", smtp.ID) diff --git a/model/condition.go b/model/condition.go index 3da6c77..96243a1 100644 --- a/model/condition.go +++ b/model/condition.go @@ -50,9 +50,21 @@ type HeaderCondition struct { type ResponseTimeCondition struct { MaxDuration string `yaml:"max_duration"` } +type NotificationType string + 
+const ( + NotificationNetworkError NotificationType = "network_error" + NotificationHttpError NotificationType = "http_error" + NotificationSlowResponse NotificationType = "slow_response" + NotificationConditionFailed NotificationType = "condition_failed" + NotificationRecovery NotificationType = "recovery" + NotificationDefault NotificationType = "default" +) + type EvaluationResult struct { IsHealthy bool Reason string + Type NotificationType } func (c *Condition) Validate(path string) error { @@ -113,6 +125,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur return EvaluationResult{ IsHealthy: false, Reason: fmt.Sprintf("AND condition failed (index %d): %s", i, res.Reason), + Type: res.Type, } } } @@ -127,11 +140,12 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur if res.IsHealthy { return EvaluationResult{IsHealthy: true} } - reasons = append(reasons, fmt.Sprintf("[%d: %s]", i, res.Reason)) + reasons = append(reasons, fmt.Sprintf("Sub-condition #%d failed: %s", i, res.Reason)) } return EvaluationResult{ IsHealthy: false, - Reason: fmt.Sprintf("All OR conditions failed: %s", strings.Join(reasons, ", ")), + Reason: fmt.Sprintf("All OR conditions failed:\n - %s", strings.Join(reasons, "\n - ")), + Type: NotificationConditionFailed, // Common case for OR } } @@ -142,6 +156,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur return EvaluationResult{ IsHealthy: false, Reason: "NOT condition failed: the forbidden condition matched successfully", + Type: NotificationConditionFailed, } } return EvaluationResult{IsHealthy: true} @@ -154,6 +169,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur return EvaluationResult{ IsHealthy: false, Reason: fmt.Sprintf("Regex pattern '%s' not found in body", c.Regex.Regex), + Type: NotificationConditionFailed, } } return EvaluationResult{IsHealthy: true} @@ -162,12 +178,13 @@ func (c *Condition) 
Evaluate(resp *http.Response, body []byte, duration time.Dur // 5. بررسی StatusCode if c.StatusCode != nil { if resp == nil { - return EvaluationResult{IsHealthy: false, Reason: "No response received"} + return EvaluationResult{IsHealthy: false, Reason: "No response received", Type: NotificationHttpError} } if resp.StatusCode != c.StatusCode.Code { return EvaluationResult{ IsHealthy: false, Reason: fmt.Sprintf("Expected status %d, but got %d", c.StatusCode.Code, resp.StatusCode), + Type: NotificationHttpError, } } return EvaluationResult{IsHealthy: true} @@ -176,7 +193,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur // 6. بررسی Headers if c.Header != nil { if resp == nil { - return EvaluationResult{IsHealthy: false, Reason: "No response headers available"} + return EvaluationResult{IsHealthy: false, Reason: "No response headers available", Type: NotificationConditionFailed} } for _, h := range *c.Header { actual := resp.Header.Get(h.Key) @@ -184,6 +201,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur return EvaluationResult{ IsHealthy: false, Reason: fmt.Sprintf("Header '%s' expected '%s', got '%s'", h.Key, h.Value, actual), + Type: NotificationConditionFailed, } } } @@ -197,12 +215,13 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur return EvaluationResult{ IsHealthy: false, Reason: fmt.Sprintf("Response time %v exceeded limit %v", duration, max), + Type: NotificationSlowResponse, } } return EvaluationResult{IsHealthy: true} } - return EvaluationResult{IsHealthy: false, Reason: "No valid condition defined"} + return EvaluationResult{IsHealthy: false, Reason: "No valid condition defined", Type: NotificationDefault} } func (r *RegexCondition) Evaluate(body []byte) bool { diff --git a/model/config.go b/model/config.go index 3cb6870..eca8419 100644 --- a/model/config.go +++ b/model/config.go @@ -6,9 +6,10 @@ type Service struct { Targets []Target 
`yaml:"targets"` CheckPeriod int `yaml:"check_period"` SleepOnFail int `yaml:"sleep_on_fail"` - ConditionName string `yaml:"condition_id"` - Threshold int `yaml:"threshold"` - UserAgent string `yaml:"user_agent"` + ConditionName string `yaml:"condition_id"` + Threshold int `yaml:"threshold"` + UserAgent string `yaml:"user_agent"` + NotifyOnRecovery bool `yaml:"notify_on_recovery"` } type Target struct { @@ -24,11 +25,12 @@ type Notifiers struct { } type SMTP struct { - ID string `yaml:"id"` - Sender string `yaml:"sender"` - Password string `yaml:"password"` - Server string `yaml:"server"` - Port string `yaml:"port"` + ID string `yaml:"id"` + Sender string `yaml:"sender"` + Password string `yaml:"password"` + Server string `yaml:"server"` + Port string `yaml:"port"` + Templates TemplateGroup `yaml:"templates"` } type Config struct { Services []Service `yaml:"services"` diff --git a/model/detailed_reason_test.go b/model/detailed_reason_test.go index 61a4b65..f7b8062 100644 --- a/model/detailed_reason_test.go +++ b/model/detailed_reason_test.go @@ -47,10 +47,10 @@ func TestEvaluate_DetailedReasons(t *testing.T) { if !strings.Contains(result.Reason, "All OR conditions failed") { t.Errorf("expected OR failure message, got: %s", result.Reason) } - if !strings.Contains(result.Reason, "[0: Expected status 200, but got 500]") { + if !strings.Contains(result.Reason, "Sub-condition #0 failed: Expected status 200, but got 500") { t.Errorf("expected reason 0, got: %s", result.Reason) } - if !strings.Contains(result.Reason, "[1: Regex pattern 'UP' not found in body]") { + if !strings.Contains(result.Reason, "Sub-condition #1 failed: Regex pattern 'UP' not found in body") { t.Errorf("expected reason 1, got: %s", result.Reason) } }) diff --git a/model/meli_payamak.go b/model/meli_payamak.go index c89989c..36b595f 100644 --- a/model/meli_payamak.go +++ b/model/meli_payamak.go @@ -3,9 +3,10 @@ package model type MeliPayamakPanel struct { - ID string `yaml:"id"` - Username string 
`yaml:"username"` - Password string `yaml:"password"` - Sender string `yaml:"sender"` - Template string `yaml:"template"` + ID string `yaml:"id"` + Username string `yaml:"username"` + Password string `yaml:"password"` + Sender string `yaml:"sender"` + Template string `yaml:"template"` // Fallback/Legacy + Templates TemplateGroup `yaml:"templates"` } \ No newline at end of file diff --git a/model/notification.go b/model/notification.go index 9e14b4f..982b7ee 100644 --- a/model/notification.go +++ b/model/notification.go @@ -9,9 +9,46 @@ type NotificationMetadata struct { Timestamp string FailureCount int Threshold int + Status string } type Notification struct { Metadata NotificationMetadata Recipients []string + Type NotificationType +} + +type TemplateGroup struct { + NetworkError string `yaml:"network_error"` + HttpError string `yaml:"http_error"` + SlowResponse string `yaml:"slow_response"` + ConditionFailed string `yaml:"condition_failed"` + Recovery string `yaml:"recovery"` + Default string `yaml:"default"` +} + +const ( + DefaultNetworkErrorTemplate = "[🔌 Network Alert] {{.Metadata.ServiceName}} - Connection failed at {{.Metadata.Timestamp}}. Error: {{.Metadata.Reason}}" + DefaultHttpErrorTemplate = "[❌ HTTP Alert] {{.Metadata.ServiceName}} returned {{.Metadata.StatusCode}} at {{.Metadata.Timestamp}}. URL: {{.Metadata.ServiceURL}}" + DefaultSlowResponseTemplate = "[⏱️ Latency Alert] {{.Metadata.ServiceName}} is slow! Response time: {{.Metadata.ResponseTime}} (Threshold exceeded) at {{.Metadata.Timestamp}}." + DefaultConditionFailedTemplate = "[🔍 Validation Alert] {{.Metadata.ServiceName}} failed health criteria at {{.Metadata.Timestamp}}. Detail: {{.Metadata.Reason}}" + DefaultRecoveryTemplate = "[✅ Recovery] {{.Metadata.ServiceName}} is back online! Status: Healthy. Restored at: {{.Metadata.Timestamp}}." + DefaultNotificationTemplate = "[🔔 Alert] {{.Metadata.ServiceName}} status is {{.Metadata.Status}} at {{.Metadata.Timestamp}}." 
+) + +func GetDefaultTemplate(t NotificationType) string { + switch t { + case NotificationNetworkError: + return DefaultNetworkErrorTemplate + case NotificationHttpError: + return DefaultHttpErrorTemplate + case NotificationSlowResponse: + return DefaultSlowResponseTemplate + case NotificationConditionFailed: + return DefaultConditionFailedTemplate + case NotificationRecovery: + return DefaultRecoveryTemplate + default: + return DefaultNotificationTemplate + } } diff --git a/model/webhook.go b/model/webhook.go index 7885be1..7a66640 100644 --- a/model/webhook.go +++ b/model/webhook.go @@ -1,10 +1,11 @@ package model type Webhook struct { - ID string `yaml:"id"` - Method string `yaml:"method"` - Headers map[string]interface{} `yaml:"headers"` - JSON map[string]interface{} `yaml:"json"` + ID string `yaml:"id"` + Method string `yaml:"method"` + Headers map[string]interface{} `yaml:"headers"` + JSON map[string]interface{} `yaml:"json"` + Templates TemplateGroup `yaml:"templates"` } type WebhookTemplate struct { diff --git a/notifier/mail.go b/notifier/mail.go index 960b4c1..fe66910 100644 --- a/notifier/mail.go +++ b/notifier/mail.go @@ -6,19 +6,60 @@ import ( "healthy-api/model" "log/slog" "net/smtp" + "text/template" ) type MailNotifier struct { - Sender string - Server string - Port string - Password string - Logger *slog.Logger + Sender string + Server string + Port string + Password string + Templates model.TemplateGroup + Logger *slog.Logger } -func (m *MailNotifier) CreateMessage(metadata model.NotificationMetadata, to string, subject string) string { - return fmt.Sprintf("From: %s\nTo: %s\nSubject: %s\n\nService **%s** (%s) is not working good.\nReason: %s\nStatus Code: %d\nResponse Time: %s\nTimestamp: %s\nFailure Count: %d\nThreshold: %d\nCheck it fast please.", - m.Sender, to, subject, metadata.ServiceName, metadata.ServiceURL, metadata.Reason, metadata.StatusCode, metadata.ResponseTime, metadata.Timestamp, metadata.FailureCount, metadata.Threshold) +func 
(m *MailNotifier) selectTemplate(n model.Notification) string { + t := m.Templates + var tmplStr string + + switch n.Type { + case model.NotificationNetworkError: + tmplStr = t.NetworkError + case model.NotificationHttpError: + tmplStr = t.HttpError + case model.NotificationSlowResponse: + tmplStr = t.SlowResponse + case model.NotificationConditionFailed: + tmplStr = t.ConditionFailed + case model.NotificationRecovery: + tmplStr = t.Recovery + default: + tmplStr = t.Default + } + + if tmplStr == "" { + return model.GetDefaultTemplate(n.Type) + } + + return tmplStr +} + +func (m *MailNotifier) CreateMessage(n model.Notification, to string, subject string) string { + tmplStr := m.selectTemplate(n) + if tmplStr != "" { + tmpl, err := template.New("mail").Parse(tmplStr) + if err == nil { + var tpl bytes.Buffer + if err := tmpl.Execute(&tpl, n); err == nil { + return fmt.Sprintf("From: %s\nTo: %s\nSubject: %s\n\n%s", m.Sender, to, subject, tpl.String()) + } + } + } + + // Fallback to legacy format + metadata := n.Metadata + return fmt.Sprintf("From: %s\nTo: %s\nSubject: %s\n\nService **%s** (%s) is now %s.\nReason: %s\nStatus Code: %d\nResponse Time: %s\nTimestamp: %s\nFailure Count: %d\nThreshold: %d\nCheck it fast please.", + m.Sender, to, subject, metadata.ServiceName, metadata.ServiceURL, metadata.Status, metadata.Reason, metadata.StatusCode, metadata.ResponseTime, metadata.Timestamp, metadata.FailureCount, metadata.Threshold) } func (m *MailNotifier) GetName() string { @@ -30,7 +71,7 @@ func (m *MailNotifier) Notify(n model.Notification) error { addr := fmt.Sprintf("%s:%s", m.Server, m.Port) for _, mail := range n.Recipients { go func(target string) { - msg := m.CreateMessage(n.Metadata, target, "Alert") + msg := m.CreateMessage(n, target, "Alert") err := smtp.SendMail(addr, auth, m.Sender, []string{target}, bytes.NewBufferString(msg).Bytes()) if err != nil { m.Logger.Error("email_send_failed", "target", target, "addr", addr, "error", err) diff --git 
a/notifier/meli_payamak.go b/notifier/meli_payamak.go index ef12ef3..4501240 100644 --- a/notifier/meli_payamak.go +++ b/notifier/meli_payamak.go @@ -13,22 +13,52 @@ import ( ) type PayamakNotifier struct { - Username string - Password string - Sender string - Template string - Logger *slog.Logger + Username string + Password string + Sender string + Template string + Templates model.TemplateGroup + Logger *slog.Logger +} + +func (p *PayamakNotifier) selectTemplate(n model.Notification) string { + t := p.Templates + var tmplStr string + + switch n.Type { + case model.NotificationNetworkError: + tmplStr = t.NetworkError + case model.NotificationHttpError: + tmplStr = t.HttpError + case model.NotificationSlowResponse: + tmplStr = t.SlowResponse + case model.NotificationConditionFailed: + tmplStr = t.ConditionFailed + case model.NotificationRecovery: + tmplStr = t.Recovery + default: + tmplStr = t.Default + } + + if tmplStr == "" { + if p.Template != "" { + return p.Template + } + return model.GetDefaultTemplate(n.Type) + } + return tmplStr } func (p *PayamakNotifier) Notify(notification model.Notification) error { baseURL := "https://rest.payamak-panel.com/api/SendSMS/SendSMS" // رندر کردن تمپلیت - tmpl, err := template.New("sms").Parse(p.Template) + tmplStr := p.selectTemplate(notification) + tmpl, err := template.New("sms").Parse(tmplStr) if err != nil { // اگر تمپلیت مشکل داشت، یک متن پیش‌فرض استفاده کن - p.Template = "Service {{.Metadata.ServiceName}} is DOWN!" - tmpl, _ = template.New("sms").Parse(p.Template) + tmplStr = "Service {{.Metadata.ServiceName}} is {{.Metadata.Status}}!" 
+ tmpl, _ = template.New("sms").Parse(tmplStr) } var tpl bytes.Buffer diff --git a/notifier/sms.go b/notifier/sms.go index beae50c..72f184a 100644 --- a/notifier/sms.go +++ b/notifier/sms.go @@ -61,6 +61,10 @@ func (s *SMSNotifier) Notify(n model.Notification) error { PatternCode: s.GetCodePattern(), InputData: []map[string]string{ {s.GetDataKey(): n.Metadata.ServiceName}, + {"status": n.Metadata.Status}, + {"url": n.Metadata.ServiceURL}, + {"reason": n.Metadata.Reason}, + {"response_time": n.Metadata.ResponseTime}, }, }) if err != nil { diff --git a/notifier/webhook.go b/notifier/webhook.go index 500efaf..cd4021c 100644 --- a/notifier/webhook.go +++ b/notifier/webhook.go @@ -7,6 +7,7 @@ import ( "healthy-api/model" "log/slog" "net/http" + "strings" "text/template" ) @@ -100,6 +101,53 @@ func (w *WebhookNotifier) sendRequest(url string, headers map[string]interface{} return nil } +func (w *WebhookNotifier) selectTemplate(n model.Notification) map[string]interface{} { + // Pick the template configured for this notification type. + // An empty entry falls back to the legacy JSON body or a built-in default. 
+ t := w.HookData.Templates + var tmplStr string + + switch n.Type { + case model.NotificationNetworkError: + tmplStr = t.NetworkError + case model.NotificationHttpError: + tmplStr = t.HttpError + case model.NotificationSlowResponse: + tmplStr = t.SlowResponse + case model.NotificationConditionFailed: + tmplStr = t.ConditionFailed + case model.NotificationRecovery: + tmplStr = t.Recovery + default: + tmplStr = t.Default + } + + if tmplStr == "" { + // If no specific template is defined, check if legacy JSON is provided and not empty + if len(w.HookData.JSON) > 0 { + // Check if it's just the default "text" field with generic message + if text, ok := w.HookData.JSON["text"].(string); ok && (text == "" || strings.Contains(text, "Alert for")) { + // It's probably a default/generic JSON, we can do better with our built-in templates + tmplStr = model.GetDefaultTemplate(n.Type) + return map[string]interface{}{"text": tmplStr} + } + return w.HookData.JSON + } + // Use built-in default + tmplStr = model.GetDefaultTemplate(n.Type) + return map[string]interface{}{"text": tmplStr} + } + + // If we have a template string, we assume it's a JSON string. + // We'll try to unmarshal it. 
+ var result map[string]interface{} + if err := json.Unmarshal([]byte(tmplStr), &result); err != nil { + // If it's not valid JSON, treat it as a simple text message for standard webhooks + return map[string]interface{}{"text": tmplStr} + } + return result +} + func (w *WebhookNotifier) Notify(n model.Notification) error { for _, recipient := range n.Recipients { @@ -107,11 +155,14 @@ func (w *WebhookNotifier) Notify(n model.Notification) error { Metadata: n.Metadata, URL: recipient, } + + jsonToUse := w.selectTemplate(n) + filledHeaders, err := FillTemplate(w.HookData.Headers, ctx) if err != nil { return fmt.Errorf("failed to fill headers template: %w", err) } - filledJSON, err := FillTemplate(w.HookData.JSON, ctx) + filledJSON, err := FillTemplate(jsonToUse, ctx) if err != nil { return fmt.Errorf("failed to fill JSON template: %w", err) } diff --git a/sample.yaml b/sample.yaml index 8816c68..451074d 100644 --- a/sample.yaml +++ b/sample.yaml @@ -1,158 +1,95 @@ -#=========================================== -# Services to Monitor -#=========================================== +#================================================================# +# Healthy-API God-Mode Configuration Example # +#================================================================# services: - # Service 1: Critical API that must be fully operational and fast. 
- - name: "Production API" - url: "https://api.my-company.com/v1/health" - # Complex AND condition - condition_id: critical-api-health + # Service 1: Critical API with complex nested conditions + - name: "Production Gateway" + url: "https://api.my-company.com/health" + condition_id: "complex-nested-check" check_period: 30 - sleep_on_fail: 120 - threshold: 3 # یعنی فقط بعد از ۳ بار خطای متوالی خبر بده + sleep_on_fail: 60 + threshold: 3 + notify_on_recovery: true targets: + - notifier_id: "slack-alerts" + recipients: ["https://hooks.slack.com/services/T000/B000/XXXX"] - notifier_id: "on-call-sms" - # Urgent SMS for the on-call engineer - recipients: - - "+15551234567" - - notifier_id: "slack-critical-alerts" - recipients: - # Detailed alert for the team - - "https://hooks.slack.com/services/CRITICAL_CHANNEL" + recipients: ["+1234567890"] - # Service 2: A public website that shouldn't show server errors to users. + # Service 2: A simple website check with custom user agent - name: "Main Website" url: "https://www.my-company.com" - # A NOT condition - condition_id: "no-server-error-text" - check_period: 300 - sleep_on_fail: 600 - targets: - - notifier_id: "dev-team-email" - # Non-urgent email to the whole team - recipients: - - "lead.dev@my-company.com" - - "backend.team@my-company.com" - - # Service 3: A service that can be either ready or in maintenance mode. 
- - name: "User Authentication Service" - url: "https://auth.my-company.com/status" - #A complex OR condition - condition_id: "ready-or-maintenance" + condition_id: "fast-200-ok" check_period: 60 - sleep_on_fail: 300 + user_agent: "HealthyAPI-Monitor/2.0" targets: - - notifier_id: "slack-info-alerts" - recipients: - # Informational-only alert - - "https://hooks.slack.com/services/INFO_CHANNEL" + - notifier_id: "team-email" + recipients: ["dev-team@my-company.com"] -#=========================================== -# Notification Channel Configuration -#=========================================== +#================================================================# +# Notification Channel Configuration # +#================================================================# notifiers: + # ------ Webhooks (Slack/Discord/Custom) ------ + webhook: + - id: "slack-alerts" + method: POST + headers: + Content-Type: "application/json" + json: + # Generic fallback + text: "⚠️ Alert for {{.Metadata.ServiceName}}" + templates: + network_error: '{"text": "🔌 *Network Error*: Cannot reach {{.Metadata.ServiceName}}! 
Reason: {{.Metadata.Reason}}"}' + http_error: '{"text": "❌ *HTTP Error*: {{.Metadata.ServiceName}} returned {{.Metadata.StatusCode}}."}' + slow_response: '{"text": "⏱️ *Latency Alert*: {{.Metadata.ServiceName}} took {{.Metadata.ResponseTime}}."}' + condition_failed: '{"text": "🔍 *Logic Failure*: {{.Metadata.ServiceName}} failed health criteria.\n```\n{{.Metadata.Reason}}\n```"}' + recovery: '{"text": "✅ *Service Restored*: {{.Metadata.ServiceName}} is back online!"}' + default: '{"text": "🔔 *Status Update*: {{.Metadata.ServiceName}} is {{.Metadata.Status}}"}' + # ------ Email (SMTP) ------ smtp: - - id: "dev-team-email" + - id: "team-email" sender: "monitoring@my-company.com" - password: "your-smtp-password" - server: "smtp.my-company.com" + password: "secret-password" + server: "smtp.gmail.com" port: "587" + templates: + recovery: "Subject: ✅ Service {{.Metadata.ServiceName}} Recovered\n\nGood news! Service is back up." + default: "Subject: 🚨 Alert: {{.Metadata.ServiceName}} is {{.Metadata.Status}}\n\nReason: {{.Metadata.Reason}}" - # ------ SMS (IPPanel) ------ - ippanel: - - id: "on-call-sms" - url: - user: - pass: - + # ------ SMS (MeliPayamak) ------ meli_payamak_panel: - id: "sms-admin" username: "myuser" password: "mypassword" sender: "50001234" - # استفاده از تمپلیت سفارشی - template: "هشدار! سرویس {{.ServiceName}} از دسترس خارج شد. لطفا بررسی کنید." - # ------ Webhooks ------ - webhook: - # A detailed, richly-formatted webhook for critical alerts using Slack's Block Kit - - id: "slack-critical-alerts" - method: POST - headers: - Content-Type: "application/json" - json: - # Fallback text for notifications - text: "🚨 CRITICAL ALERT: Service `{{ .ServiceName }}` is DOWN! 
🚨" - blocks: - - type: "header" - text: - type: "plain_text" - text: "🔴 Service `{{ .ServiceName }}` is Unhealthy" - - type: "section" - fields: - - type: "mrkdwn" - text: "*Timestamp:*\n{{ .TimeStamp }}" - - type: "mrkdwn" - text: "*Endpoint URL:*\n{{ .URL }}" - - type: "context" - elements: - - type: "plain_text" - text: "This alert was triggered by Healthy-API Monitoring." + templates: + recovery: "✅ سرویس {{.Metadata.ServiceName}} به وضعیت عادی بازگشت." + default: "⚠️ هشدار: {{.Metadata.ServiceName}} دچار اختلال شده است. وضعیت: {{.Metadata.Status}}" - # A simpler webhook for informational alerts - - id: "slack-info-alerts" - method: POST - headers: - Content-Type: "application/json" - json: - text: "ℹ️ INFO: Service `{{ .ServiceName }}` failed its health check. URL: {{ .URL }}" - -#=========================================== -# Health Check Conditions -#=========================================== +#================================================================# +# Health Check Conditions # +#================================================================# conditions: - # Condition for Service 1: Must be 200 OK, have the right header, AND contain "UP" in the body. - - id: "critical-api-health" + # Simple 200 OK + Fast response + - id: "fast-200-ok" condition: and: - - status_code: - code: 200 - - header: - - key: "Content-Type" - value: "application/health+json" - - regex: - # Checks for 'status': "UP" or 'status':"UP" - pattern: '"status": ?"UP"' - - # Condition for Service 2: Healthy if the body does NOT contain "Server Error" or "Database Connection Failed". - - id: "no-server-error-text" - condition: - not: - regex: - pattern: "Server Error|Database Connection Failed" + - status_code: { code: 200 } + - response_time: { max_duration: "1s" } - # Condition for Service 3: Handles maintenance mode gracefully. 
- - id: "ready-or-maintenance" + # Complex AND inside OR (Highly Extensible) + - id: "complex-nested-check" condition: or: - # Healthy if ready - - and: - - status_code: - code: 200 - - regex: - pattern: "READY" - # Also healthy if in planned maintenance - - and: - - status_code: - code: 503 - - regex: - pattern: "MAINTENANCE" - - id: "fast-response-only" - condition: - and: - - status_code: - code: 200 - - response_time: - max_duration: "500ms" \ No newline at end of file + # Path A: Normal Operational State + - and: + - status_code: { code: 200 } + - regex: { pattern: '"status": ?"UP"' } + # Path B: Maintenance Mode (Also considered Healthy if configured so) + - and: + - status_code: { code: 503 } + - regex: { pattern: "MAINTENANCE_ACTIVE" }