Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 46 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,47 @@
- **Multi-Channel Alerting System:** Get notified via **SMTP (Email)**, **SMS (IPPanel)**, and **Webhooks**. The architecture is extensible for adding new channels.
- **Intelligent Periodic Checks:** Set custom intervals (`check_period`) for monitoring each service.
- **Spam Prevention:** Define a cooldown period (`sleep_on_fail`) after a failure is detected to avoid repetitive alerts.
- **Customizable Health Conditions:** Specify the expected HTTP status code (`expected_status_code`) to define a "healthy" state for each service.
- **Customizable Health Conditions:** Define complex health rules using **AND**, **OR**, **NOT**, **Regex**, **Header**, and **Response Time** checks.
- **Recovery Notifications:** Get notified when a service comes back online after a failure.
- **Smart Templating:** Use custom message templates for different types of failures (Network, HTTP, Latency, etc.).
- **Concurrent by Design:** Utilizes Goroutines to monitor all services concurrently without blocking.
- **Easy Configuration:** All settings are managed through a single, human-readable `YAML` file.

---

## 🛠️ Advanced Features

### Smart Metadata & Templating
Notification templates can reference the variables below using Go's `text/template` syntax.

| Variable | Description |
| :--- | :--- |
| `{{.Metadata.ServiceName}}` | The name of the service |
| `{{.Metadata.ServiceURL}}` | The URL being checked |
| `{{.Metadata.Status}}` | Current status (UP or DOWN) |
| `{{.Metadata.Reason}}` | Detailed reason for failure (supports recursive reporting) |
| `{{.Metadata.StatusCode}}` | HTTP status code received |
| `{{.Metadata.ResponseTime}}`| Duration of the request |
| `{{.Metadata.Timestamp}}` | When the check occurred |
| `{{.Metadata.FailureCount}}`| Consecutive failures detected |

### Template Groups
Notifiers support `TemplateGroups`, which let you define a different message for each failure type. The matching template is selected automatically from the following keys:
- `network_error`: Network/Connection issues.
- `http_error`: Unexpected status codes.
- `slow_response`: Latency threshold exceeded.
- `condition_failed`: Logic/Regex/Header mismatch.
- `recovery`: Triggered when a service returns to healthy state.
- `default`: Fallback template.

### Recursive Error Reporting
When using nested `AND`/`OR` conditions, Healthy-API provides a detailed failure tree in the `Reason` field. This helps identify exactly which part of a complex condition caused the failure.

### Recovery Notifications
Set `notify_on_recovery: true` in your service configuration to receive alerts when services come back online.

---

## 🚀 Getting Started

### Prerequisites
Expand Down Expand Up @@ -149,17 +184,19 @@ The project is designed with a modular architecture to easily accommodate new fe

## 🗺️ Roadmap

- [ ] Implement **Graceful Shutdown** using `context` for better Goroutine management.
- [x] Implement **Graceful Shutdown** using `context` for better Goroutine management.
- [x] Add **Unit Tests** for the `healthcheck` and `notifier` modules.
- [x] Support **Response Body Validation** using regular expressions (Regex).
- [ ] Add more notifiers (e.g., **Slack**, **Telegram**).
- [X] Persist logs to a file or database for historical analysis.
- [x] Add more notifiers (e.g., **Slack**, **Discord** via Webhooks).
- [x] Persist logs to a file or database for historical analysis.
- [ ] Develop a simple **Web UI** to display the real-time status of services.
- [ ] Add cronjob insted of check_period.
- [X] enhance logging.
- [x] Add response time condition
- [ ] Add json path condition
- [x] Add retry policy
- [ ] Add cronjob instead of check_period.
- [x] Enhance logging with `slog`.
- [x] Add response time condition.
- [ ] Add JSON path condition.
- [x] Add retry policy (Threshold).
- [x] Add Recovery notifications.
- [x] Add Smart Template Groups.

---

Expand Down
52 changes: 41 additions & 11 deletions healthcheck/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,17 @@ type HealthChecker struct {
ConditionRegistry *registry.Registry[model.Condition]
Client *http.Client
Logger *slog.Logger
isDown bool
failureCount int
}

func (h *HealthChecker) Start(ctx context.Context) {
h.Logger.Info("checker_started", "service", h.Service.Name)
failureCount := 0

for {
waitDuration := time.Duration(h.Service.CheckPeriod) * time.Second

h.performCheck(&failureCount, &waitDuration)
h.performCheck(&waitDuration)

select {
case <-ctx.Done():
Expand All @@ -38,7 +39,7 @@ func (h *HealthChecker) Start(ctx context.Context) {
}
}

func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration) {
func (h *HealthChecker) performCheck(nextWait *time.Duration) {
start := time.Now()
request, err := http.NewRequest("GET", h.Service.URL, nil)

Expand All @@ -65,6 +66,7 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration)

if err != nil {
evaluationRes.Reason = fmt.Sprintf("Network/Connection Error: %v", err)
evaluationRes.Type = model.NotificationNetworkError
} else if resp != nil {
sCode = resp.StatusCode

Expand All @@ -80,18 +82,19 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration)
}

if !evaluationRes.IsHealthy {
*failureCount++
h.failureCount++

h.Logger.Warn("health_check_failed",
"service", h.Service.Name,
"attempt", *failureCount,
"attempt", h.failureCount,
"threshold", h.Service.Threshold,
"status", sCode,
"duration", requestDuration,
"reason", evaluationRes.Reason)

if *failureCount >= h.Service.Threshold {
if h.failureCount >= h.Service.Threshold {
h.Logger.Error("threshold_reached", "service", h.Service.Name, "action", "sending_notifications")
h.isDown = true

metadata := model.NotificationMetadata{
ServiceName: h.Service.Name,
Expand All @@ -100,27 +103,54 @@ func (h *HealthChecker) performCheck(failureCount *int, nextWait *time.Duration)
StatusCode: sCode,
ResponseTime: requestDuration.Round(time.Millisecond).String(),
Timestamp: time.Now().Format(time.RFC3339),
FailureCount: *failureCount,
FailureCount: h.failureCount,
Threshold: h.Service.Threshold,
Status: "DOWN",
}

for _, target := range h.Service.Targets {
if n, ok := h.NotifierRegistry.Get(target.NotifierID); ok {
_ = n.Notify(model.Notification{
Metadata: metadata,
Recipients: target.Recipients,
Type: evaluationRes.Type,
})
}
}

*nextWait = time.Duration(h.Service.SleepOnFail) * time.Second
*failureCount = 0 // Reset after notification as per original logic
h.failureCount = 0 // Reset after notification as per original logic
}
} else {
if *failureCount > 0 {
h.Logger.Info("service_recovery", "service", h.Service.Name, "after_failures", *failureCount)
if h.isDown {
h.Logger.Info("service_recovery", "service", h.Service.Name)
if h.Service.NotifyOnRecovery {
metadata := model.NotificationMetadata{
ServiceName: h.Service.Name,
ServiceURL: h.Service.URL,
Reason: "Service recovered",
StatusCode: sCode,
ResponseTime: requestDuration.Round(time.Millisecond).String(),
Timestamp: time.Now().Format(time.RFC3339),
FailureCount: 0,
Threshold: h.Service.Threshold,
Status: "UP",
}
for _, target := range h.Service.Targets {
if n, ok := h.NotifierRegistry.Get(target.NotifierID); ok {
_ = n.Notify(model.Notification{
Metadata: metadata,
Recipients: target.Recipients,
Type: model.NotificationRecovery,
})
}
}
}
h.isDown = false
} else if h.failureCount > 0 {
h.Logger.Info("service_recovered_before_threshold", "service", h.Service.Name, "after_failures", h.failureCount)
}
*failureCount = 0
h.failureCount = 0
h.Logger.Info("health_check_success", "service", h.Service.Name, "duration", requestDuration, "status_code", sCode)
}
}
Expand Down
22 changes: 12 additions & 10 deletions loader/notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@ func loadPayamakPanels(cfg *model.Config, reg *registry.Registry[notifier.Notifi
continue
}
notifierInst := &notifier.PayamakNotifier{
Username: pp.Username,
Password: pp.Password,
Sender: pp.Sender,
Template: pp.Template,
Logger: logger,
Username: pp.Username,
Password: pp.Password,
Sender: pp.Sender,
Template: pp.Template,
Templates: pp.Templates,
Logger: logger,
}
reg.Register(pp.ID, notifierInst)
logger.Info("notifier_registered", "type", "meli_payamak", "id", pp.ID)
Expand All @@ -69,11 +70,12 @@ func loadSMTPNotifiers(cfg *model.Config, reg *registry.Registry[notifier.Notifi
continue
}
notifierInst := &notifier.MailNotifier{
Sender: smtp.Sender,
Server: smtp.Server,
Port: smtp.Port,
Password: smtp.Password,
Logger: logger,
Sender: smtp.Sender,
Server: smtp.Server,
Port: smtp.Port,
Password: smtp.Password,
Templates: smtp.Templates,
Logger: logger,
}
reg.Register(smtp.ID, notifierInst)
logger.Info("notifier_registered", "type", "smtp", "id", smtp.ID)
Expand Down
29 changes: 24 additions & 5 deletions model/condition.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,21 @@ type HeaderCondition struct {
// ResponseTimeCondition holds the YAML settings for a response-time condition.
type ResponseTimeCondition struct {
// MaxDuration is read from the `max_duration` YAML key.
// NOTE(review): presumably a Go duration string (e.g. "500ms") parsed by the
// evaluation code, which is not visible here — confirm the accepted format.
MaxDuration string `yaml:"max_duration"`
}
// NotificationType classifies an evaluation result or notification so that a
// matching message can be chosen for it. Its values are the string constants
// declared below.
type NotificationType string

// Known notification types.
const (
// NotificationNetworkError marks a network/connection error while performing the request.
NotificationNetworkError NotificationType = "network_error"
// NotificationHttpError marks a missing response or an unexpected HTTP status code.
NotificationHttpError NotificationType = "http_error"
// NotificationSlowResponse marks a response time that exceeded the configured limit.
NotificationSlowResponse NotificationType = "slow_response"
// NotificationConditionFailed marks a failed OR, NOT, regex or header condition.
NotificationConditionFailed NotificationType = "condition_failed"
// NotificationRecovery marks a service that is healthy again after being reported down.
NotificationRecovery NotificationType = "recovery"
// NotificationDefault is the fallback type, used when no valid condition is defined.
NotificationDefault NotificationType = "default"
)

// EvaluationResult is the outcome of evaluating a condition against a response.
type EvaluationResult struct {
// IsHealthy reports whether the condition passed.
IsHealthy bool
// Reason is a human-readable explanation of the failure; successful
// evaluations leave it empty.
Reason string
// Type classifies the failure; successful evaluations leave it at its
// zero value.
Type NotificationType
}

func (c *Condition) Validate(path string) error {
Expand Down Expand Up @@ -113,6 +125,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
return EvaluationResult{
IsHealthy: false,
Reason: fmt.Sprintf("AND condition failed (index %d): %s", i, res.Reason),
Type: res.Type,
}
}
}
Expand All @@ -127,11 +140,12 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
if res.IsHealthy {
return EvaluationResult{IsHealthy: true}
}
reasons = append(reasons, fmt.Sprintf("[%d: %s]", i, res.Reason))
reasons = append(reasons, fmt.Sprintf("Sub-condition #%d failed: %s", i, res.Reason))
}
return EvaluationResult{
IsHealthy: false,
Reason: fmt.Sprintf("All OR conditions failed: %s", strings.Join(reasons, ", ")),
Reason: fmt.Sprintf("All OR conditions failed:\n - %s", strings.Join(reasons, "\n - ")),
Type: NotificationConditionFailed, // Common case for OR
}
}

Expand All @@ -142,6 +156,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
return EvaluationResult{
IsHealthy: false,
Reason: "NOT condition failed: the forbidden condition matched successfully",
Type: NotificationConditionFailed,
}
}
return EvaluationResult{IsHealthy: true}
Expand All @@ -154,6 +169,7 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
return EvaluationResult{
IsHealthy: false,
Reason: fmt.Sprintf("Regex pattern '%s' not found in body", c.Regex.Regex),
Type: NotificationConditionFailed,
}
}
return EvaluationResult{IsHealthy: true}
Expand All @@ -162,12 +178,13 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
// 5. بررسی StatusCode
if c.StatusCode != nil {
if resp == nil {
return EvaluationResult{IsHealthy: false, Reason: "No response received"}
return EvaluationResult{IsHealthy: false, Reason: "No response received", Type: NotificationHttpError}
}
if resp.StatusCode != c.StatusCode.Code {
return EvaluationResult{
IsHealthy: false,
Reason: fmt.Sprintf("Expected status %d, but got %d", c.StatusCode.Code, resp.StatusCode),
Type: NotificationHttpError,
}
}
return EvaluationResult{IsHealthy: true}
Expand All @@ -176,14 +193,15 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
// 6. بررسی Headers
if c.Header != nil {
if resp == nil {
return EvaluationResult{IsHealthy: false, Reason: "No response headers available"}
return EvaluationResult{IsHealthy: false, Reason: "No response headers available", Type: NotificationConditionFailed}
}
for _, h := range *c.Header {
actual := resp.Header.Get(h.Key)
if actual != h.Value {
return EvaluationResult{
IsHealthy: false,
Reason: fmt.Sprintf("Header '%s' expected '%s', got '%s'", h.Key, h.Value, actual),
Type: NotificationConditionFailed,
}
}
}
Expand All @@ -197,12 +215,13 @@ func (c *Condition) Evaluate(resp *http.Response, body []byte, duration time.Dur
return EvaluationResult{
IsHealthy: false,
Reason: fmt.Sprintf("Response time %v exceeded limit %v", duration, max),
Type: NotificationSlowResponse,
}
}
return EvaluationResult{IsHealthy: true}
}

return EvaluationResult{IsHealthy: false, Reason: "No valid condition defined"}
return EvaluationResult{IsHealthy: false, Reason: "No valid condition defined", Type: NotificationDefault}
}

func (r *RegexCondition) Evaluate(body []byte) bool {
Expand Down
18 changes: 10 additions & 8 deletions model/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ type Service struct {
Targets []Target `yaml:"targets"`
CheckPeriod int `yaml:"check_period"`
SleepOnFail int `yaml:"sleep_on_fail"`
ConditionName string `yaml:"condition_id"`
Threshold int `yaml:"threshold"`
UserAgent string `yaml:"user_agent"`
ConditionName string `yaml:"condition_id"`
Threshold int `yaml:"threshold"`
UserAgent string `yaml:"user_agent"`
NotifyOnRecovery bool `yaml:"notify_on_recovery"`
}

type Target struct {
Expand All @@ -24,11 +25,12 @@ type Notifiers struct {
}

type SMTP struct {
ID string `yaml:"id"`
Sender string `yaml:"sender"`
Password string `yaml:"password"`
Server string `yaml:"server"`
Port string `yaml:"port"`
ID string `yaml:"id"`
Sender string `yaml:"sender"`
Password string `yaml:"password"`
Server string `yaml:"server"`
Port string `yaml:"port"`
Templates TemplateGroup `yaml:"templates"`
}
type Config struct {
Services []Service `yaml:"services"`
Expand Down
4 changes: 2 additions & 2 deletions model/detailed_reason_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ func TestEvaluate_DetailedReasons(t *testing.T) {
if !strings.Contains(result.Reason, "All OR conditions failed") {
t.Errorf("expected OR failure message, got: %s", result.Reason)
}
if !strings.Contains(result.Reason, "[0: Expected status 200, but got 500]") {
if !strings.Contains(result.Reason, "Sub-condition #0 failed: Expected status 200, but got 500") {
t.Errorf("expected reason 0, got: %s", result.Reason)
}
if !strings.Contains(result.Reason, "[1: Regex pattern 'UP' not found in body]") {
if !strings.Contains(result.Reason, "Sub-condition #1 failed: Regex pattern 'UP' not found in body") {
t.Errorf("expected reason 1, got: %s", result.Reason)
}
})
Expand Down
Loading