From bd69818b7d2991f92392c86d55cf17b42e817ee3 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Thu, 30 Apr 2026 14:28:01 +0000
Subject: [PATCH] feat: add AI Monitoring Agent with Azure Application Insights
integration
- New AIMonitoringAgent service (port 5006) under Services/Monitoring/
- Azure Application Insights telemetry: events, metrics, availability, exceptions
- AI model performance tracking: latency, token usage, success/failure rates
- Service health monitoring with background polling of all microservice endpoints
- Statistical anomaly detection using rolling-window standard deviation analysis
- Configurable alert rules with threshold conditions and severity levels
- REST API with dashboard, monitoring, and alerts controllers
- Swagger UI for development
- Dockerfile and docker-compose integration
- Default alert rules for response time, failures, and AI model metrics
---
README.md | 11 +-
.../AIMonitoringAgent.csproj | 17 ++
.../Configuration/MonitoringConfiguration.cs | 32 +++
.../Controllers/AlertsController.cs | 71 +++++++
.../Controllers/MonitoringController.cs | 113 ++++++++++
.../Monitoring/AIMonitoringAgent/Dockerfile | 22 ++
.../Models/AIModelMetrics.cs | 15 ++
.../AIMonitoringAgent/Models/AlertRule.cs | 45 ++++
.../Models/AnomalyDetectionResult.cs | 22 ++
.../Models/ServiceHealthStatus.cs | 21 ++
.../Monitoring/AIMonitoringAgent/Program.cs | 82 ++++++++
.../Monitoring/AIMonitoringAgent/README.md | 112 ++++++++++
.../Services/AIModelMonitoringService.cs | 133 ++++++++++++
.../Services/AlertingService.cs | 117 +++++++++++
.../Services/AnomalyDetectionService.cs | 141 +++++++++++++
.../Services/AppInsightsTelemetryService.cs | 197 ++++++++++++++++++
.../Services/HealthCheckBackgroundService.cs | 50 +++++
.../Services/ServiceHealthMonitor.cs | 105 ++++++++++
.../appsettings.Development.json | 9 +
.../AIMonitoringAgent/appsettings.json | 53 +++++
src/docker-compose.yml | 16 ++
21 files changed, 1380 insertions(+), 4 deletions(-)
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Dockerfile
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Program.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/README.md
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json
create mode 100644 src/Services/Monitoring/AIMonitoringAgent/appsettings.json
diff --git a/README.md b/README.md
index d6e9a9a..a25cee9 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ The monolith's bounded contexts are decomposed into the following independently
| `product-service` | 5004 | Product catalog management | `ProductsController`, product models |
| `notification-service` | 5005 | Email and in-app notifications | `NotificationService`, notification models |
| `api-gateway` | 5000 | YARP reverse proxy, request routing, rate limiting | New — replaces monolith's single entry point |
+| `ai-monitoring-agent` | 5006 | Azure App Insights AI monitoring, anomaly detection, alerting | New — cross-cutting observability service |
## Project Structure
@@ -68,10 +69,12 @@ src/
│ │ ├── Product.API/
│ │ ├── Product.Domain/
│ │ └── Product.Infrastructure/
-│ └── Notification/
-│ ├── Notification.API/
-│ ├── Notification.Domain/
-│ └── Notification.Infrastructure/
+│ ├── Notification/
+│ │ ├── Notification.API/
+│ │ ├── Notification.Domain/
+│ │ └── Notification.Infrastructure/
+│ └── Monitoring/
+│ └── AIMonitoringAgent/ # App Insights AI monitoring agent
├── Shared/
│ ├── Shared.Contracts/ # Shared DTOs, events, interfaces
│ └── Shared.Infrastructure/ # Common middleware, logging, health checks
diff --git a/src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj b/src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj
new file mode 100644
index 0000000..f33ecaa
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj
@@ -0,0 +1,17 @@
+
+
+ net10.0
+ enable
+ enable
+ AIMonitoringAgent
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs b/src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs
new file mode 100644
index 0000000..58d5c2c
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs
@@ -0,0 +1,32 @@
+namespace AIMonitoringAgent.Configuration;
+
+public class MonitoringConfiguration
+{
+ public const string SectionName = "Monitoring";
+
+ public string ApplicationInsightsConnectionString { get; set; } = string.Empty;
+ public int HealthCheckIntervalSeconds { get; set; } = 30;
+ public double AnomalySensitivityMultiplier { get; set; } = 2.0;
+
+ public Dictionary ServiceEndpoints { get; set; } = new()
+ {
+ ["identity-service"] = "http://localhost:5001/healthz",
+ ["customer-service"] = "http://localhost:5002/healthz",
+ ["order-service"] = "http://localhost:5003/healthz",
+ ["product-service"] = "http://localhost:5004/healthz",
+ ["notification-service"] = "http://localhost:5005/healthz",
+ ["api-gateway"] = "http://localhost:5000/healthz"
+ };
+
+ public List DefaultAlertRules { get; set; } = new();
+}
+
+public class DefaultAlertRule
+{
+ public string Name { get; set; } = string.Empty;
+ public string MetricName { get; set; } = string.Empty;
+ public string? ServiceName { get; set; }
+ public string Condition { get; set; } = "GreaterThan";
+ public double Threshold { get; set; }
+ public string Severity { get; set; } = "Warning";
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs b/src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs
new file mode 100644
index 0000000..7b59972
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs
@@ -0,0 +1,71 @@
+using AIMonitoringAgent.Models;
+using AIMonitoringAgent.Services;
+using Microsoft.AspNetCore.Mvc;
+
+namespace AIMonitoringAgent.Controllers;
+
+[ApiController]
+[Route("api/[controller]")]
+public class AlertsController : ControllerBase
+{
+ private readonly IAlertingService _alertingService;
+
+ public AlertsController(IAlertingService alertingService)
+ {
+ _alertingService = alertingService;
+ }
+
+ [HttpGet("rules")]
+ public IActionResult GetAllRules()
+ {
+ var rules = _alertingService.GetAllRules();
+ return Ok(rules);
+ }
+
+ [HttpGet("rules/{ruleId}")]
+ public IActionResult GetRule(string ruleId)
+ {
+ var rule = _alertingService.GetRule(ruleId);
+ if (rule == null) return NotFound();
+ return Ok(rule);
+ }
+
+ [HttpPost("rules")]
+ public IActionResult AddRule([FromBody] AlertRule rule)
+ {
+ var created = _alertingService.AddRule(rule);
+ return CreatedAtAction(nameof(GetRule), new { ruleId = created.Id }, created);
+ }
+
+ [HttpDelete("rules/{ruleId}")]
+ public IActionResult DeleteRule(string ruleId)
+ {
+ var removed = _alertingService.RemoveRule(ruleId);
+ if (!removed) return NotFound();
+ return NoContent();
+ }
+
+ [HttpPost("evaluate")]
+ public IActionResult EvaluateMetric([FromBody] MetricEvaluationRequest request)
+ {
+ var notification = _alertingService.EvaluateMetric(request.MetricName, request.ServiceName, request.Value);
+ if (notification == null)
+ return Ok(new { triggered = false, message = "No alert rules triggered" });
+
+ return Ok(new { triggered = true, alert = notification });
+ }
+
+ [HttpGet("recent")]
+ public IActionResult GetRecentAlerts([FromQuery] int count = 50)
+ {
+ var alerts = _alertingService.GetRecentAlerts(count);
+ return Ok(alerts);
+ }
+}
+
+public class MetricEvaluationRequest
+{
+ public string MetricName { get; set; } = string.Empty;
+ public string? ServiceName { get; set; }
+ public double Value { get; set; }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs b/src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs
new file mode 100644
index 0000000..db96626
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs
@@ -0,0 +1,113 @@
+using AIMonitoringAgent.Models;
+using AIMonitoringAgent.Services;
+using Microsoft.AspNetCore.Mvc;
+
+namespace AIMonitoringAgent.Controllers;
+
+[ApiController]
+[Route("api/[controller]")]
+public class MonitoringController : ControllerBase
+{
+ private readonly IAIModelMonitoringService _modelMonitoring;
+ private readonly IServiceHealthMonitor _healthMonitor;
+ private readonly IAnomalyDetectionService _anomalyDetection;
+ private readonly IAppInsightsTelemetryService _telemetryService;
+
+ public MonitoringController(
+ IAIModelMonitoringService modelMonitoring,
+ IServiceHealthMonitor healthMonitor,
+ IAnomalyDetectionService anomalyDetection,
+ IAppInsightsTelemetryService telemetryService)
+ {
+ _modelMonitoring = modelMonitoring;
+ _healthMonitor = healthMonitor;
+ _anomalyDetection = anomalyDetection;
+ _telemetryService = telemetryService;
+ }
+
+ [HttpPost("ai-model/track")]
+ public IActionResult TrackAIModelInvocation([FromBody] AIModelMetrics metrics)
+ {
+ _modelMonitoring.RecordInvocation(metrics);
+ return Accepted(new { message = "AI model invocation tracked successfully" });
+ }
+
+ [HttpGet("ai-model/summary")]
+ public IActionResult GetAllModelSummaries()
+ {
+ var summaries = _modelMonitoring.GetAllModelSummaries();
+ return Ok(summaries);
+ }
+
+ [HttpGet("ai-model/summary/{modelName}")]
+ public IActionResult GetModelSummary(string modelName)
+ {
+ var summary = _modelMonitoring.GetModelSummary(modelName);
+ return Ok(summary);
+ }
+
+ [HttpGet("ai-model/invocations")]
+ public IActionResult GetRecentInvocations([FromQuery] string? modelName = null, [FromQuery] int count = 50)
+ {
+ var invocations = _modelMonitoring.GetRecentInvocations(modelName, count);
+ return Ok(invocations);
+ }
+
+ [HttpGet("health/services")]
+ public IActionResult GetServiceHealthStatuses()
+ {
+ var statuses = _healthMonitor.GetLatestStatuses();
+ return Ok(statuses);
+ }
+
+ [HttpPost("health/check")]
+ public async Task TriggerHealthCheck()
+ {
+ var results = await _healthMonitor.CheckAllServicesAsync();
+ return Ok(results);
+ }
+
+ [HttpPost("health/check/{serviceName}")]
+ public async Task CheckServiceHealth(string serviceName, [FromQuery] string endpoint)
+ {
+ var result = await _healthMonitor.CheckServiceHealthAsync(serviceName, endpoint);
+ return Ok(result);
+ }
+
+ [HttpGet("anomalies")]
+ public IActionResult GetRecentAnomalies([FromQuery] int count = 50)
+ {
+ var anomalies = _anomalyDetection.GetRecentAnomalies(count);
+ return Ok(anomalies);
+ }
+
+ [HttpPost("metrics/custom")]
+ public IActionResult TrackCustomMetric([FromBody] CustomMetricRequest request)
+ {
+ _telemetryService.TrackCustomMetric(request.Name, request.Value, request.Properties);
+ return Accepted(new { message = "Custom metric tracked" });
+ }
+
+ [HttpGet("dashboard")]
+ public async Task GetDashboard()
+ {
+ var modelSummaries = _modelMonitoring.GetAllModelSummaries();
+ var serviceStatuses = _healthMonitor.GetLatestStatuses();
+ var recentAnomalies = _anomalyDetection.GetRecentAnomalies(10);
+
+ return Ok(new
+ {
+ timestamp = DateTime.UtcNow,
+ aiModels = modelSummaries,
+ services = serviceStatuses,
+ recentAnomalies = recentAnomalies
+ });
+ }
+}
+
+public class CustomMetricRequest
+{
+ public string Name { get; set; } = string.Empty;
+ public double Value { get; set; }
+ public Dictionary? Properties { get; set; }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Dockerfile b/src/Services/Monitoring/AIMonitoringAgent/Dockerfile
new file mode 100644
index 0000000..50eccc7
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Dockerfile
@@ -0,0 +1,22 @@
+FROM mcr.microsoft.com/dotnet/aspnet:10.0-preview AS base
+WORKDIR /app
+EXPOSE 5006
+
+FROM mcr.microsoft.com/dotnet/sdk:10.0-preview AS build
+WORKDIR /src
+COPY ["Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj", "Services/Monitoring/AIMonitoringAgent/"]
+COPY ["Shared/Shared.Contracts/Shared.Contracts.csproj", "Shared/Shared.Contracts/"]
+COPY ["Shared/Shared.Infrastructure/Shared.Infrastructure.csproj", "Shared/Shared.Infrastructure/"]
+RUN dotnet restore "Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj"
+COPY . .
+WORKDIR "/src/Services/Monitoring/AIMonitoringAgent"
+RUN dotnet build "AIMonitoringAgent.csproj" -c Release -o /app/build
+
+FROM build AS publish
+RUN dotnet publish "AIMonitoringAgent.csproj" -c Release -o /app/publish /p:UseAppHost=false
+
+FROM base AS final
+WORKDIR /app
+COPY --from=publish /app/publish .
+ENV ASPNETCORE_URLS=http://+:5006
+ENTRYPOINT ["dotnet", "AIMonitoringAgent.dll"]
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs
new file mode 100644
index 0000000..4dacf46
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs
@@ -0,0 +1,15 @@
+namespace AIMonitoringAgent.Models;
+
+public class AIModelMetrics
+{
+ public string ModelName { get; set; } = string.Empty;
+ public string ModelVersion { get; set; } = string.Empty;
+ public double Latency { get; set; }
+ public double TokensUsed { get; set; }
+ public double PromptTokens { get; set; }
+ public double CompletionTokens { get; set; }
+ public bool IsSuccessful { get; set; }
+ public string? ErrorMessage { get; set; }
+ public DateTime Timestamp { get; set; } = DateTime.UtcNow;
+ public Dictionary CustomProperties { get; set; } = new();
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs
new file mode 100644
index 0000000..0443ebd
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs
@@ -0,0 +1,45 @@
+namespace AIMonitoringAgent.Models;
+
+public class AlertRule
+{
+ public string Id { get; set; } = Guid.NewGuid().ToString();
+ public string Name { get; set; } = string.Empty;
+ public string MetricName { get; set; } = string.Empty;
+ public string? ServiceName { get; set; }
+ public AlertCondition Condition { get; set; } = AlertCondition.GreaterThan;
+ public double Threshold { get; set; }
+ public int EvaluationWindowMinutes { get; set; } = 5;
+ public AlertSeverityLevel Severity { get; set; } = AlertSeverityLevel.Warning;
+ public bool IsEnabled { get; set; } = true;
+ public DateTime CreatedAt { get; set; } = DateTime.UtcNow;
+}
+
+public enum AlertCondition
+{
+ GreaterThan,
+ LessThan,
+ EqualTo,
+ GreaterThanOrEqual,
+ LessThanOrEqual
+}
+
+public enum AlertSeverityLevel
+{
+ Information,
+ Warning,
+ Error,
+ Critical
+}
+
+public class AlertNotification
+{
+ public string AlertRuleId { get; set; } = string.Empty;
+ public string AlertRuleName { get; set; } = string.Empty;
+ public string MetricName { get; set; } = string.Empty;
+ public string? ServiceName { get; set; }
+ public double CurrentValue { get; set; }
+ public double Threshold { get; set; }
+ public AlertSeverityLevel Severity { get; set; }
+ public DateTime FiredAt { get; set; } = DateTime.UtcNow;
+ public string Message { get; set; } = string.Empty;
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs
new file mode 100644
index 0000000..9912322
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs
@@ -0,0 +1,22 @@
+namespace AIMonitoringAgent.Models;
+
+public class AnomalyDetectionResult
+{
+ public string MetricName { get; set; } = string.Empty;
+ public string ServiceName { get; set; } = string.Empty;
+ public double CurrentValue { get; set; }
+ public double ExpectedValue { get; set; }
+ public double DeviationPercentage { get; set; }
+ public AnomalySeverity Severity { get; set; }
+ public bool IsAnomaly { get; set; }
+ public DateTime DetectedAt { get; set; } = DateTime.UtcNow;
+ public string Description { get; set; } = string.Empty;
+}
+
+public enum AnomalySeverity
+{
+ Low,
+ Medium,
+ High,
+ Critical
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs
new file mode 100644
index 0000000..56483b9
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs
@@ -0,0 +1,21 @@
+namespace AIMonitoringAgent.Models;
+
+public class ServiceHealthStatus
+{
+ public string ServiceName { get; set; } = string.Empty;
+ public string Endpoint { get; set; } = string.Empty;
+ public HealthState State { get; set; } = HealthState.Unknown;
+ public double ResponseTimeMs { get; set; }
+ public int HttpStatusCode { get; set; }
+ public string? ErrorDetails { get; set; }
+ public DateTime LastCheckedAt { get; set; } = DateTime.UtcNow;
+ public int ConsecutiveFailures { get; set; }
+}
+
+public enum HealthState
+{
+ Healthy,
+ Degraded,
+ Unhealthy,
+ Unknown
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Program.cs b/src/Services/Monitoring/AIMonitoringAgent/Program.cs
new file mode 100644
index 0000000..3ae17da
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Program.cs
@@ -0,0 +1,82 @@
+using AIMonitoringAgent.Configuration;
+using AIMonitoringAgent.Models;
+using AIMonitoringAgent.Services;
+
+var builder = WebApplication.CreateBuilder(args);
+
+// Configuration
+builder.Services.Configure(
+ builder.Configuration.GetSection(MonitoringConfiguration.SectionName));
+
+// Application Insights
+var appInsightsConnectionString = builder.Configuration
+ .GetValue("Monitoring:ApplicationInsightsConnectionString");
+
+builder.Services.AddApplicationInsightsTelemetry(options =>
+{
+ if (!string.IsNullOrEmpty(appInsightsConnectionString))
+ options.ConnectionString = appInsightsConnectionString;
+});
+
+// Core services
+builder.Services.AddSingleton();
+builder.Services.AddSingleton();
+builder.Services.AddSingleton();
+builder.Services.AddSingleton();
+
+// Health monitor with HttpClient
+builder.Services.AddHttpClient();
+
+// Background health check service
+builder.Services.AddHostedService();
+
+// ASP.NET Core
+builder.Services.AddControllers();
+builder.Services.AddEndpointsApiExplorer();
+builder.Services.AddSwaggerGen(options =>
+{
+ options.SwaggerDoc("v1", new Microsoft.OpenApi.Models.OpenApiInfo
+ {
+ Title = "AI Monitoring Agent API",
+ Version = "v1",
+ Description = "Azure Application Insights AI Monitoring Agent for microservices observability"
+ });
+});
+builder.Services.AddHealthChecks();
+
+// Seed default alert rules
+var monitoringConfig = builder.Configuration
+ .GetSection(MonitoringConfiguration.SectionName)
+ .Get();
+
+var app = builder.Build();
+
+// Seed default alert rules from configuration
+if (monitoringConfig?.DefaultAlertRules.Count > 0)
+{
+ var alertingService = app.Services.GetRequiredService();
+ foreach (var ruleConfig in monitoringConfig.DefaultAlertRules)
+ {
+ var rule = new AlertRule
+ {
+ Name = ruleConfig.Name,
+ MetricName = ruleConfig.MetricName,
+ ServiceName = ruleConfig.ServiceName,
+ Condition = Enum.Parse(ruleConfig.Condition),
+ Threshold = ruleConfig.Threshold,
+ Severity = Enum.Parse(ruleConfig.Severity)
+ };
+ alertingService.AddRule(rule);
+ }
+}
+
+if (app.Environment.IsDevelopment())
+{
+ app.UseSwagger();
+ app.UseSwaggerUI();
+}
+
+app.MapControllers();
+app.MapHealthChecks("/healthz");
+
+app.Run();
diff --git a/src/Services/Monitoring/AIMonitoringAgent/README.md b/src/Services/Monitoring/AIMonitoringAgent/README.md
new file mode 100644
index 0000000..ce7076f
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/README.md
@@ -0,0 +1,112 @@
+# AI Monitoring Agent
+
+Azure Application Insights-powered monitoring agent for the microservices platform. Provides AI model performance tracking, service health monitoring, anomaly detection, and configurable alerting.
+
+## Features
+
+- **AI Model Performance Tracking** — Track latency, token usage, success/failure rates for AI/ML model invocations
+- **Service Health Monitoring** — Periodic health checks against all microservice endpoints with availability telemetry
+- **Anomaly Detection** — Statistical anomaly detection using rolling-window standard deviation analysis
+- **Configurable Alerting** — Define alert rules with thresholds, conditions, and severity levels
+- **Application Insights Integration** — All telemetry (events, metrics, availability, exceptions) sent to Azure App Insights
+- **Dashboard API** — Aggregated view of AI models, service health, and recent anomalies
+
+## API Endpoints
+
+### Monitoring
+
+| Method | Route | Description |
+|--------|-------|-------------|
+| `POST` | `/api/monitoring/ai-model/track` | Track an AI model invocation |
+| `GET` | `/api/monitoring/ai-model/summary` | Get summaries for all tracked models |
+| `GET` | `/api/monitoring/ai-model/summary/{modelName}` | Get summary for a specific model |
+| `GET` | `/api/monitoring/ai-model/invocations` | Get recent invocations |
+| `GET` | `/api/monitoring/health/services` | Get latest health statuses |
+| `POST` | `/api/monitoring/health/check` | Trigger health check for all services |
+| `POST` | `/api/monitoring/health/check/{serviceName}` | Check a specific service |
+| `GET` | `/api/monitoring/anomalies` | Get recent anomalies |
+| `POST` | `/api/monitoring/metrics/custom` | Track a custom metric |
+| `GET` | `/api/monitoring/dashboard` | Get aggregated dashboard data |
+
+### Alerts
+
+| Method | Route | Description |
+|--------|-------|-------------|
+| `GET` | `/api/alerts/rules` | List all alert rules |
+| `GET` | `/api/alerts/rules/{ruleId}` | Get a specific alert rule |
+| `POST` | `/api/alerts/rules` | Create an alert rule |
+| `DELETE` | `/api/alerts/rules/{ruleId}` | Delete an alert rule |
+| `POST` | `/api/alerts/evaluate` | Evaluate a metric against rules |
+| `GET` | `/api/alerts/recent` | Get recent alert notifications |
+
+## Configuration
+
+Set the App Insights connection string in `appsettings.json` or via environment variable:
+
+```json
+{
+ "Monitoring": {
+ "ApplicationInsightsConnectionString": "InstrumentationKey=...;IngestionEndpoint=...",
+ "HealthCheckIntervalSeconds": 30,
+ "AnomalySensitivityMultiplier": 2.0,
+ "ServiceEndpoints": {
+ "identity-service": "http://localhost:5001/healthz"
+ },
+ "DefaultAlertRules": [
+ {
+ "Name": "High Response Time",
+ "MetricName": "ResponseTime",
+ "Condition": "GreaterThan",
+ "Threshold": 5000,
+ "Severity": "Warning"
+ }
+ ]
+ }
+}
+```
+
+## Running
+
+```bash
+# Standalone
+cd src/Services/Monitoring/AIMonitoringAgent
+dotnet run
+
+# With Docker Compose (all services)
+cd src
+docker compose up --build
+```
+
+The agent runs on port **5006** and exposes Swagger UI at `/swagger` in development mode.
+
+## Usage Example
+
+Track an AI model invocation:
+
+```bash
+curl -X POST http://localhost:5006/api/monitoring/ai-model/track \
+ -H "Content-Type: application/json" \
+ -d '{
+ "modelName": "gpt-4",
+ "modelVersion": "0613",
+ "latency": 1250.5,
+ "tokensUsed": 450,
+ "promptTokens": 200,
+ "completionTokens": 250,
+ "isSuccessful": true
+ }'
+```
+
+Create an alert rule:
+
+```bash
+curl -X POST http://localhost:5006/api/alerts/rules \
+ -H "Content-Type: application/json" \
+ -d '{
+ "name": "High Latency Alert",
+ "metricName": "ResponseTime",
+ "condition": "GreaterThan",
+ "threshold": 3000,
+ "severity": "Warning"
+ }'
+```
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs
new file mode 100644
index 0000000..6c0e2c6
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs
@@ -0,0 +1,133 @@
+using System.Collections.Concurrent;
+using AIMonitoringAgent.Models;
+
+namespace AIMonitoringAgent.Services;
+
+public interface IAIModelMonitoringService
+{
+ void RecordInvocation(AIModelMetrics metrics);
+ AIModelSummary GetModelSummary(string modelName);
+ IReadOnlyList GetAllModelSummaries();
+ IReadOnlyList GetRecentInvocations(string? modelName = null, int count = 50);
+}
+
+public class AIModelMonitoringService : IAIModelMonitoringService
+{
+ private readonly IAppInsightsTelemetryService _telemetryService;
+ private readonly ILogger _logger;
+ private readonly ConcurrentDictionary _modelTrackers = new();
+ private readonly ConcurrentQueue _recentInvocations = new();
+ private const int MaxRecentInvocations = 1000;
+
+ public AIModelMonitoringService(
+ IAppInsightsTelemetryService telemetryService,
+ ILogger logger)
+ {
+ _telemetryService = telemetryService;
+ _logger = logger;
+ }
+
+ public void RecordInvocation(AIModelMetrics metrics)
+ {
+ _telemetryService.TrackAIModelInvocation(metrics);
+
+ var tracker = _modelTrackers.GetOrAdd(metrics.ModelName, _ => new ModelTracker());
+ tracker.Record(metrics);
+
+ _recentInvocations.Enqueue(metrics);
+ while (_recentInvocations.Count > MaxRecentInvocations)
+ _recentInvocations.TryDequeue(out _);
+
+ _logger.LogInformation(
+ "AI model invocation recorded: {Model} v{Version}, Latency={Latency}ms, Success={Success}",
+ metrics.ModelName, metrics.ModelVersion, metrics.Latency, metrics.IsSuccessful);
+ }
+
+ public AIModelSummary GetModelSummary(string modelName)
+ {
+ if (_modelTrackers.TryGetValue(modelName, out var tracker))
+ return tracker.GetSummary(modelName);
+
+ return new AIModelSummary { ModelName = modelName };
+ }
+
+ public IReadOnlyList GetAllModelSummaries()
+ {
+ return _modelTrackers
+ .Select(kvp => kvp.Value.GetSummary(kvp.Key))
+ .OrderBy(s => s.ModelName)
+ .ToList();
+ }
+
+ public IReadOnlyList GetRecentInvocations(string? modelName = null, int count = 50)
+ {
+ var query = _recentInvocations.AsEnumerable();
+
+ if (!string.IsNullOrEmpty(modelName))
+ query = query.Where(m => m.ModelName.Equals(modelName, StringComparison.OrdinalIgnoreCase));
+
+ return query.OrderByDescending(m => m.Timestamp).Take(count).ToList();
+ }
+
+ private class ModelTracker
+ {
+ private long _totalInvocations;
+ private long _successCount;
+ private long _failureCount;
+ private double _totalLatency;
+ private double _totalTokens;
+ private double _minLatency = double.MaxValue;
+ private double _maxLatency = double.MinValue;
+ private readonly object _lock = new();
+
+ public void Record(AIModelMetrics metrics)
+ {
+ lock (_lock)
+ {
+ _totalInvocations++;
+ _totalLatency += metrics.Latency;
+ _totalTokens += metrics.TokensUsed;
+
+ if (metrics.Latency < _minLatency) _minLatency = metrics.Latency;
+ if (metrics.Latency > _maxLatency) _maxLatency = metrics.Latency;
+
+ if (metrics.IsSuccessful)
+ _successCount++;
+ else
+ _failureCount++;
+ }
+ }
+
+ public AIModelSummary GetSummary(string modelName)
+ {
+ lock (_lock)
+ {
+ return new AIModelSummary
+ {
+ ModelName = modelName,
+ TotalInvocations = _totalInvocations,
+ SuccessCount = _successCount,
+ FailureCount = _failureCount,
+ SuccessRate = _totalInvocations > 0 ? (double)_successCount / _totalInvocations * 100 : 0,
+ AverageLatencyMs = _totalInvocations > 0 ? _totalLatency / _totalInvocations : 0,
+ MinLatencyMs = _minLatency == double.MaxValue ? 0 : _minLatency,
+ MaxLatencyMs = _maxLatency == double.MinValue ? 0 : _maxLatency,
+ TotalTokensUsed = _totalTokens
+ };
+ }
+ }
+ }
+}
+
+public class AIModelSummary
+{
+ public string ModelName { get; set; } = string.Empty;
+ public long TotalInvocations { get; set; }
+ public long SuccessCount { get; set; }
+ public long FailureCount { get; set; }
+ public double SuccessRate { get; set; }
+ public double AverageLatencyMs { get; set; }
+ public double MinLatencyMs { get; set; }
+ public double MaxLatencyMs { get; set; }
+ public double TotalTokensUsed { get; set; }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs
new file mode 100644
index 0000000..c32a003
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs
@@ -0,0 +1,117 @@
+using System.Collections.Concurrent;
+using AIMonitoringAgent.Models;
+
+namespace AIMonitoringAgent.Services;
+
+public interface IAlertingService
+{
+ AlertRule AddRule(AlertRule rule);
+ bool RemoveRule(string ruleId);
+ AlertRule? GetRule(string ruleId);
+ IReadOnlyList GetAllRules();
+ AlertNotification? EvaluateMetric(string metricName, string? serviceName, double currentValue);
+ IReadOnlyList GetRecentAlerts(int count = 50);
+}
+
+public class AlertingService : IAlertingService
+{
+ private readonly IAppInsightsTelemetryService _telemetryService;
+ private readonly ILogger _logger;
+ private readonly ConcurrentDictionary _rules = new();
+ private readonly ConcurrentQueue _recentAlerts = new();
+ private const int MaxRecentAlerts = 500;
+
+ public AlertingService(
+ IAppInsightsTelemetryService telemetryService,
+ ILogger logger)
+ {
+ _telemetryService = telemetryService;
+ _logger = logger;
+ }
+
+ public AlertRule AddRule(AlertRule rule)
+ {
+ _rules[rule.Id] = rule;
+ _logger.LogInformation("Alert rule added: {RuleName} ({RuleId})", rule.Name, rule.Id);
+ return rule;
+ }
+
+ public bool RemoveRule(string ruleId)
+ {
+ var removed = _rules.TryRemove(ruleId, out _);
+ if (removed)
+ _logger.LogInformation("Alert rule removed: {RuleId}", ruleId);
+ return removed;
+ }
+
+ public AlertRule? GetRule(string ruleId)
+ {
+ _rules.TryGetValue(ruleId, out var rule);
+ return rule;
+ }
+
+ public IReadOnlyList GetAllRules()
+ {
+ return _rules.Values.OrderBy(r => r.Name).ToList();
+ }
+
+ public AlertNotification? EvaluateMetric(string metricName, string? serviceName, double currentValue)
+ {
+ var matchingRules = _rules.Values
+ .Where(r => r.IsEnabled
+ && r.MetricName.Equals(metricName, StringComparison.OrdinalIgnoreCase)
+ && (string.IsNullOrEmpty(r.ServiceName) || r.ServiceName.Equals(serviceName, StringComparison.OrdinalIgnoreCase)))
+ .ToList();
+
+ foreach (var rule in matchingRules)
+ {
+ if (!IsConditionMet(rule.Condition, currentValue, rule.Threshold))
+ continue;
+
+ var notification = new AlertNotification
+ {
+ AlertRuleId = rule.Id,
+ AlertRuleName = rule.Name,
+ MetricName = metricName,
+ ServiceName = serviceName,
+ CurrentValue = currentValue,
+ Threshold = rule.Threshold,
+ Severity = rule.Severity,
+ Message = $"Alert '{rule.Name}': {metricName} is {currentValue:F2} " +
+ $"({rule.Condition} threshold {rule.Threshold:F2})"
+ };
+
+ _telemetryService.TrackAlert(notification);
+
+ _recentAlerts.Enqueue(notification);
+ while (_recentAlerts.Count > MaxRecentAlerts)
+ _recentAlerts.TryDequeue(out _);
+
+ _logger.LogWarning("Alert fired: {Message}", notification.Message);
+ return notification;
+ }
+
+ return null;
+ }
+
+ public IReadOnlyList GetRecentAlerts(int count = 50)
+ {
+ return _recentAlerts
+ .OrderByDescending(a => a.FiredAt)
+ .Take(count)
+ .ToList();
+ }
+
+ private static bool IsConditionMet(AlertCondition condition, double current, double threshold)
+ {
+ return condition switch
+ {
+ AlertCondition.GreaterThan => current > threshold,
+ AlertCondition.LessThan => current < threshold,
+ AlertCondition.EqualTo => Math.Abs(current - threshold) < 0.001,
+ AlertCondition.GreaterThanOrEqual => current >= threshold,
+ AlertCondition.LessThanOrEqual => current <= threshold,
+ _ => false
+ };
+ }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs
new file mode 100644
index 0000000..971585d
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs
@@ -0,0 +1,141 @@
+using System.Collections.Concurrent;
+using AIMonitoringAgent.Models;
+
+namespace AIMonitoringAgent.Services;
+
+public interface IAnomalyDetectionService
+{
+ AnomalyDetectionResult Evaluate(string metricName, string serviceName, double currentValue);
+ IReadOnlyList GetRecentAnomalies(int count = 50);
+ void Configure(string metricName, double sensitivityMultiplier);
+}
+
+public class AnomalyDetectionService : IAnomalyDetectionService
+{
+ private readonly IAppInsightsTelemetryService _telemetryService;
+ private readonly ILogger _logger;
+ private readonly ConcurrentDictionary _metricWindows = new();
+ private readonly ConcurrentDictionary _sensitivityConfig = new();
+ private readonly ConcurrentQueue _recentAnomalies = new();
+ private const int MaxRecentAnomalies = 200;
+ private const double DefaultSensitivityMultiplier = 2.0;
+
+ public AnomalyDetectionService(
+ IAppInsightsTelemetryService telemetryService,
+ ILogger logger)
+ {
+ _telemetryService = telemetryService;
+ _logger = logger;
+ }
+
+ public AnomalyDetectionResult Evaluate(string metricName, string serviceName, double currentValue)
+ {
+ var key = $"{serviceName}:{metricName}";
+ var window = _metricWindows.GetOrAdd(key, _ => new MetricWindow());
+ window.Add(currentValue);
+
+ var stats = window.GetStatistics();
+ var sensitivity = _sensitivityConfig.GetValueOrDefault(metricName, DefaultSensitivityMultiplier);
+
+ var deviation = stats.StdDev > 0
+ ? Math.Abs(currentValue - stats.Mean) / stats.StdDev
+ : 0;
+
+ var isAnomaly = stats.Count >= 10 && deviation > sensitivity;
+
+ var deviationPercentage = stats.Mean != 0
+ ? (currentValue - stats.Mean) / stats.Mean * 100
+ : 0;
+
+ var severity = DetermineSeverity(deviation, sensitivity);
+
+ var result = new AnomalyDetectionResult
+ {
+ MetricName = metricName,
+ ServiceName = serviceName,
+ CurrentValue = currentValue,
+ ExpectedValue = Math.Round(stats.Mean, 2),
+ DeviationPercentage = Math.Round(deviationPercentage, 2),
+ Severity = severity,
+ IsAnomaly = isAnomaly,
+ Description = isAnomaly
+ ? $"{metricName} for {serviceName} is {Math.Abs(deviationPercentage):F1}% " +
+ $"{(currentValue > stats.Mean ? "above" : "below")} the expected value " +
+ $"(current: {currentValue:F2}, expected: {stats.Mean:F2})"
+ : $"{metricName} for {serviceName} is within normal range"
+ };
+
+ if (isAnomaly)
+ {
+ _telemetryService.TrackAnomaly(result);
+ _recentAnomalies.Enqueue(result);
+ while (_recentAnomalies.Count > MaxRecentAnomalies)
+ _recentAnomalies.TryDequeue(out _);
+
+ _logger.LogWarning("Anomaly detected: {Description}", result.Description);
+ }
+
+ return result;
+ }
+
+ public IReadOnlyList GetRecentAnomalies(int count = 50)
+ {
+ return _recentAnomalies
+ .OrderByDescending(a => a.DetectedAt)
+ .Take(count)
+ .ToList();
+ }
+
+ public void Configure(string metricName, double sensitivityMultiplier)
+ {
+ _sensitivityConfig[metricName] = sensitivityMultiplier;
+ _logger.LogInformation("Anomaly detection sensitivity for {Metric} set to {Sensitivity}",
+ metricName, sensitivityMultiplier);
+ }
+
+ private static AnomalySeverity DetermineSeverity(double deviation, double sensitivity)
+ {
+ if (deviation > sensitivity * 3) return AnomalySeverity.Critical;
+ if (deviation > sensitivity * 2) return AnomalySeverity.High;
+ if (deviation > sensitivity * 1.5) return AnomalySeverity.Medium;
+ return AnomalySeverity.Low;
+ }
+
+ private class MetricWindow
+ {
+ private readonly Queue _values = new();
+ private readonly object _lock = new();
+ private const int WindowSize = 100;
+ private static readonly TimeSpan WindowDuration = TimeSpan.FromMinutes(30);
+
+ public void Add(double value)
+ {
+ lock (_lock)
+ {
+ var cutoff = DateTime.UtcNow - WindowDuration;
+ while (_values.Count > 0 && (_values.Peek().Timestamp < cutoff || _values.Count >= WindowSize))
+ _values.Dequeue();
+
+ _values.Enqueue(new TimestampedValue(value, DateTime.UtcNow));
+ }
+ }
+
+ public (double Mean, double StdDev, int Count) GetStatistics()
+ {
+ lock (_lock)
+ {
+ if (_values.Count == 0)
+ return (0, 0, 0);
+
+ var values = _values.Select(v => v.Value).ToArray();
+ var mean = values.Average();
+ var variance = values.Select(v => Math.Pow(v - mean, 2)).Average();
+ var stdDev = Math.Sqrt(variance);
+
+ return (mean, stdDev, values.Length);
+ }
+ }
+
+ private record TimestampedValue(double Value, DateTime Timestamp);
+ }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs
new file mode 100644
index 0000000..997fbcc
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs
@@ -0,0 +1,197 @@
+using Microsoft.ApplicationInsights;
+using Microsoft.ApplicationInsights.DataContracts;
+using Microsoft.ApplicationInsights.Extensibility;
+using AIMonitoringAgent.Models;
+
+namespace AIMonitoringAgent.Services;
+
+public interface IAppInsightsTelemetryService
+{
+ void TrackAIModelInvocation(AIModelMetrics metrics);
+ void TrackServiceHealth(ServiceHealthStatus status);
+ void TrackAnomaly(AnomalyDetectionResult anomaly);
+ void TrackAlert(AlertNotification alert);
+ void TrackCustomMetric(string name, double value, Dictionary? properties = null);
+ void TrackDependency(string dependencyType, string target, string name, double durationMs, bool success);
+ void TrackException(Exception exception, Dictionary? properties = null);
+ void TrackEvent(string eventName, Dictionary? properties = null, Dictionary? metrics = null);
+ void Flush();
+}
+
+public class AppInsightsTelemetryService : IAppInsightsTelemetryService
+{
+ private readonly TelemetryClient _telemetryClient;
+ private readonly ILogger _logger;
+
+ public AppInsightsTelemetryService(
+ TelemetryClient telemetryClient,
+ ILogger logger)
+ {
+ _telemetryClient = telemetryClient;
+ _logger = logger;
+ }
+
+ public void TrackAIModelInvocation(AIModelMetrics metrics)
+ {
+ var properties = new Dictionary
+ {
+ ["ModelName"] = metrics.ModelName,
+ ["ModelVersion"] = metrics.ModelVersion,
+ ["IsSuccessful"] = metrics.IsSuccessful.ToString()
+ };
+
+ if (!string.IsNullOrEmpty(metrics.ErrorMessage))
+ properties["ErrorMessage"] = metrics.ErrorMessage;
+
+ foreach (var kvp in metrics.CustomProperties)
+ properties[kvp.Key] = kvp.Value;
+
+ var telemetryMetrics = new Dictionary
+ {
+ ["LatencyMs"] = metrics.Latency,
+ ["TokensUsed"] = metrics.TokensUsed,
+ ["PromptTokens"] = metrics.PromptTokens,
+ ["CompletionTokens"] = metrics.CompletionTokens
+ };
+
+ _telemetryClient.TrackEvent("AIModelInvocation", properties, telemetryMetrics);
+
+ _telemetryClient.GetMetric("AIModel.Latency", "ModelName")
+ .TrackValue(metrics.Latency, metrics.ModelName);
+ _telemetryClient.GetMetric("AIModel.TokensUsed", "ModelName")
+ .TrackValue(metrics.TokensUsed, metrics.ModelName);
+
+ if (!metrics.IsSuccessful)
+ {
+ _telemetryClient.GetMetric("AIModel.Failures", "ModelName")
+ .TrackValue(1, metrics.ModelName);
+ }
+
+ _logger.LogDebug("Tracked AI model invocation: {ModelName} v{Version}, Latency={Latency}ms",
+ metrics.ModelName, metrics.ModelVersion, metrics.Latency);
+ }
+
+ public void TrackServiceHealth(ServiceHealthStatus status)
+ {
+ var properties = new Dictionary
+ {
+ ["ServiceName"] = status.ServiceName,
+ ["Endpoint"] = status.Endpoint,
+ ["State"] = status.State.ToString(),
+ ["HttpStatusCode"] = status.HttpStatusCode.ToString()
+ };
+
+ if (!string.IsNullOrEmpty(status.ErrorDetails))
+ properties["ErrorDetails"] = status.ErrorDetails;
+
+ var metrics = new Dictionary
+ {
+ ["ResponseTimeMs"] = status.ResponseTimeMs,
+ ["ConsecutiveFailures"] = status.ConsecutiveFailures
+ };
+
+ _telemetryClient.TrackEvent("ServiceHealthCheck", properties, metrics);
+
+ _telemetryClient.GetMetric("Service.ResponseTime", "ServiceName")
+ .TrackValue(status.ResponseTimeMs, status.ServiceName);
+
+ var availability = new AvailabilityTelemetry
+ {
+ Name = $"{status.ServiceName} Health Check",
+ Duration = TimeSpan.FromMilliseconds(status.ResponseTimeMs),
+ Success = status.State == HealthState.Healthy,
+ RunLocation = "AIMonitoringAgent",
+ Message = status.State.ToString(),
+ Timestamp = status.LastCheckedAt
+ };
+
+ _telemetryClient.TrackAvailability(availability);
+ }
+
+ public void TrackAnomaly(AnomalyDetectionResult anomaly)
+ {
+ var properties = new Dictionary
+ {
+ ["MetricName"] = anomaly.MetricName,
+ ["ServiceName"] = anomaly.ServiceName,
+ ["Severity"] = anomaly.Severity.ToString(),
+ ["IsAnomaly"] = anomaly.IsAnomaly.ToString(),
+ ["Description"] = anomaly.Description
+ };
+
+ var metrics = new Dictionary
+ {
+ ["CurrentValue"] = anomaly.CurrentValue,
+ ["ExpectedValue"] = anomaly.ExpectedValue,
+ ["DeviationPercentage"] = anomaly.DeviationPercentage
+ };
+
+ _telemetryClient.TrackEvent("AnomalyDetected", properties, metrics);
+
+ if (anomaly.Severity >= AnomalySeverity.High)
+ {
+ _logger.LogWarning("High severity anomaly detected: {Description}", anomaly.Description);
+ }
+ }
+
+ public void TrackAlert(AlertNotification alert)
+ {
+ var properties = new Dictionary
+ {
+ ["AlertRuleId"] = alert.AlertRuleId,
+ ["AlertRuleName"] = alert.AlertRuleName,
+ ["MetricName"] = alert.MetricName,
+ ["Severity"] = alert.Severity.ToString(),
+ ["Message"] = alert.Message
+ };
+
+ if (!string.IsNullOrEmpty(alert.ServiceName))
+ properties["ServiceName"] = alert.ServiceName;
+
+ var metrics = new Dictionary
+ {
+ ["CurrentValue"] = alert.CurrentValue,
+ ["Threshold"] = alert.Threshold
+ };
+
+ _telemetryClient.TrackEvent("AlertFired", properties, metrics);
+
+ _logger.LogWarning("Alert fired: {AlertName} - {Message}", alert.AlertRuleName, alert.Message);
+ }
+
+ public void TrackCustomMetric(string name, double value, Dictionary? properties = null)
+ {
+ var metricTelemetry = new MetricTelemetry(name, value);
+
+ if (properties != null)
+ {
+ foreach (var kvp in properties)
+ metricTelemetry.Properties[kvp.Key] = kvp.Value;
+ }
+
+ _telemetryClient.TrackMetric(metricTelemetry);
+ }
+
+ public void TrackDependency(string dependencyType, string target, string name, double durationMs, bool success)
+ {
+ _telemetryClient.TrackDependency(dependencyType, target, name,
+ string.Empty, DateTimeOffset.UtcNow,
+ TimeSpan.FromMilliseconds(durationMs), "200", success);
+ }
+
+ public void TrackException(Exception exception, Dictionary? properties = null)
+ {
+ _telemetryClient.TrackException(exception, properties);
+ _logger.LogError(exception, "Exception tracked in Application Insights");
+ }
+
+ public void TrackEvent(string eventName, Dictionary? properties = null, Dictionary? metrics = null)
+ {
+ _telemetryClient.TrackEvent(eventName, properties, metrics);
+ }
+
+ public void Flush()
+ {
+ _telemetryClient.Flush();
+ }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs
new file mode 100644
index 0000000..e484775
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs
@@ -0,0 +1,50 @@
+using AIMonitoringAgent.Configuration;
+using Microsoft.Extensions.Options;
+
+namespace AIMonitoringAgent.Services;
+
+public class HealthCheckBackgroundService : BackgroundService
+{
+ private readonly IServiceProvider _serviceProvider;
+ private readonly ILogger _logger;
+ private readonly MonitoringConfiguration _config;
+
+ public HealthCheckBackgroundService(
+ IServiceProvider serviceProvider,
+ IOptions config,
+ ILogger logger)
+ {
+ _serviceProvider = serviceProvider;
+ _config = config.Value;
+ _logger = logger;
+ }
+
+ protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+ {
+ _logger.LogInformation("Health check background service started. Interval: {Interval}s",
+ _config.HealthCheckIntervalSeconds);
+
+ while (!stoppingToken.IsCancellationRequested)
+ {
+ try
+ {
+ using var scope = _serviceProvider.CreateScope();
+ var healthMonitor = scope.ServiceProvider.GetRequiredService();
+ var results = await healthMonitor.CheckAllServicesAsync();
+
+ var unhealthy = results.Where(r => r.State == Models.HealthState.Unhealthy).ToList();
+ if (unhealthy.Count > 0)
+ {
+ _logger.LogWarning("{Count} service(s) unhealthy: {Services}",
+ unhealthy.Count, string.Join(", ", unhealthy.Select(s => s.ServiceName)));
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.LogError(ex, "Error during health check cycle");
+ }
+
+ await Task.Delay(TimeSpan.FromSeconds(_config.HealthCheckIntervalSeconds), stoppingToken);
+ }
+ }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs
new file mode 100644
index 0000000..ad7a49e
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs
@@ -0,0 +1,105 @@
+using System.Collections.Concurrent;
+using System.Diagnostics;
+using AIMonitoringAgent.Configuration;
+using AIMonitoringAgent.Models;
+using Microsoft.Extensions.Options;
+
+namespace AIMonitoringAgent.Services;
+
+public interface IServiceHealthMonitor
+{
+ Task CheckServiceHealthAsync(string serviceName, string endpoint);
+ Task> CheckAllServicesAsync();
+ IReadOnlyList GetLatestStatuses();
+}
+
+public class ServiceHealthMonitor : IServiceHealthMonitor
+{
+ private readonly HttpClient _httpClient;
+ private readonly IAppInsightsTelemetryService _telemetryService;
+ private readonly IAnomalyDetectionService _anomalyDetectionService;
+ private readonly IAlertingService _alertingService;
+ private readonly ILogger _logger;
+ private readonly MonitoringConfiguration _config;
+ private readonly ConcurrentDictionary _latestStatuses = new();
+
+ public ServiceHealthMonitor(
+ HttpClient httpClient,
+ IAppInsightsTelemetryService telemetryService,
+ IAnomalyDetectionService anomalyDetectionService,
+ IAlertingService alertingService,
+ IOptions config,
+ ILogger logger)
+ {
+ _httpClient = httpClient;
+ _telemetryService = telemetryService;
+ _anomalyDetectionService = anomalyDetectionService;
+ _alertingService = alertingService;
+ _config = config.Value;
+ _logger = logger;
+ }
+
+ public async Task CheckServiceHealthAsync(string serviceName, string endpoint)
+ {
+ var status = new ServiceHealthStatus
+ {
+ ServiceName = serviceName,
+ Endpoint = endpoint
+ };
+
+ var sw = Stopwatch.StartNew();
+ try
+ {
+ using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10));
+ var response = await _httpClient.GetAsync(endpoint, cts.Token);
+ sw.Stop();
+
+ status.ResponseTimeMs = sw.Elapsed.TotalMilliseconds;
+ status.HttpStatusCode = (int)response.StatusCode;
+ status.State = response.IsSuccessStatusCode ? HealthState.Healthy : HealthState.Degraded;
+ status.ConsecutiveFailures = 0;
+ }
+ catch (Exception ex)
+ {
+ sw.Stop();
+ status.ResponseTimeMs = sw.Elapsed.TotalMilliseconds;
+ status.State = HealthState.Unhealthy;
+ status.ErrorDetails = ex.Message;
+
+ if (_latestStatuses.TryGetValue(serviceName, out var previous))
+ status.ConsecutiveFailures = previous.ConsecutiveFailures + 1;
+ else
+ status.ConsecutiveFailures = 1;
+
+ _telemetryService.TrackException(ex, new Dictionary
+ {
+ ["ServiceName"] = serviceName,
+ ["Endpoint"] = endpoint
+ });
+ }
+
+ _latestStatuses[serviceName] = status;
+ _telemetryService.TrackServiceHealth(status);
+ _anomalyDetectionService.Evaluate("ResponseTime", serviceName, status.ResponseTimeMs);
+ _alertingService.EvaluateMetric("ConsecutiveFailures", serviceName, status.ConsecutiveFailures);
+
+ return status;
+ }
+
+ public async Task> CheckAllServicesAsync()
+ {
+ var tasks = _config.ServiceEndpoints
+ .Select(kvp => CheckServiceHealthAsync(kvp.Key, kvp.Value))
+ .ToList();
+
+ var results = await Task.WhenAll(tasks);
+ return results.ToList();
+ }
+
+ public IReadOnlyList GetLatestStatuses()
+ {
+ return _latestStatuses.Values
+ .OrderBy(s => s.ServiceName)
+ .ToList();
+ }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json b/src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json
new file mode 100644
index 0000000..0114e9c
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json
@@ -0,0 +1,9 @@
+{
+ "Logging": {
+ "LogLevel": {
+ "Default": "Debug",
+ "Microsoft.AspNetCore": "Information",
+ "AIMonitoringAgent": "Debug"
+ }
+ }
+}
diff --git a/src/Services/Monitoring/AIMonitoringAgent/appsettings.json b/src/Services/Monitoring/AIMonitoringAgent/appsettings.json
new file mode 100644
index 0000000..1c63f21
--- /dev/null
+++ b/src/Services/Monitoring/AIMonitoringAgent/appsettings.json
@@ -0,0 +1,53 @@
+{
+ "Logging": {
+ "LogLevel": {
+ "Default": "Information",
+ "Microsoft.AspNetCore": "Warning",
+ "AIMonitoringAgent": "Information"
+ }
+ },
+ "AllowedHosts": "*",
+ "Monitoring": {
+ "ApplicationInsightsConnectionString": "",
+ "HealthCheckIntervalSeconds": 30,
+ "AnomalySensitivityMultiplier": 2.0,
+ "ServiceEndpoints": {
+ "identity-service": "http://localhost:5001/healthz",
+ "customer-service": "http://localhost:5002/healthz",
+ "order-service": "http://localhost:5003/healthz",
+ "product-service": "http://localhost:5004/healthz",
+ "notification-service": "http://localhost:5005/healthz",
+ "api-gateway": "http://localhost:5000/healthz"
+ },
+ "DefaultAlertRules": [
+ {
+ "Name": "High Response Time",
+ "MetricName": "ResponseTime",
+ "Condition": "GreaterThan",
+ "Threshold": 5000,
+ "Severity": "Warning"
+ },
+ {
+ "Name": "Consecutive Failures",
+ "MetricName": "ConsecutiveFailures",
+ "Condition": "GreaterThanOrEqual",
+ "Threshold": 3,
+ "Severity": "Critical"
+ },
+ {
+ "Name": "High AI Model Latency",
+ "MetricName": "AIModel.Latency",
+ "Condition": "GreaterThan",
+ "Threshold": 10000,
+ "Severity": "Warning"
+ },
+ {
+ "Name": "AI Model Failure Rate",
+ "MetricName": "AIModel.Failures",
+ "Condition": "GreaterThan",
+ "Threshold": 5,
+ "Severity": "Error"
+ }
+ ]
+ }
+}
diff --git a/src/docker-compose.yml b/src/docker-compose.yml
index bf66a7b..8dfdc9b 100644
--- a/src/docker-compose.yml
+++ b/src/docker-compose.yml
@@ -81,6 +81,22 @@ services:
- ASPNETCORE_ENVIRONMENT=Development
- ConnectionStrings__DefaultConnection=Host=postgres;Database=notificationdb;Username=postgres;Password=postgres
+ ai-monitoring-agent:
+ build:
+ context: .
+ dockerfile: Services/Monitoring/AIMonitoringAgent/Dockerfile
+ ports:
+ - "5006:5006"
+ depends_on:
+ - identity-service
+ - customer-service
+ - order-service
+ - product-service
+ - notification-service
+ environment:
+ - ASPNETCORE_ENVIRONMENT=Development
+ - Monitoring__ApplicationInsightsConnectionString=
+
postgres:
image: postgres:16-alpine
ports: