From bd69818b7d2991f92392c86d55cf17b42e817ee3 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:28:01 +0000 Subject: [PATCH] feat: add AI Monitoring Agent with Azure Application Insights integration - New AIMonitoringAgent service (port 5006) under Services/Monitoring/ - Azure Application Insights telemetry: events, metrics, availability, exceptions - AI model performance tracking: latency, token usage, success/failure rates - Service health monitoring with background polling of all microservice endpoints - Statistical anomaly detection using rolling-window standard deviation analysis - Configurable alert rules with threshold conditions and severity levels - REST API with dashboard, monitoring, and alerts controllers - Swagger UI for development - Dockerfile and docker-compose integration - Default alert rules for response time, failures, and AI model metrics --- README.md | 11 +- .../AIMonitoringAgent.csproj | 17 ++ .../Configuration/MonitoringConfiguration.cs | 32 +++ .../Controllers/AlertsController.cs | 71 +++++++ .../Controllers/MonitoringController.cs | 113 ++++++++++ .../Monitoring/AIMonitoringAgent/Dockerfile | 22 ++ .../Models/AIModelMetrics.cs | 15 ++ .../AIMonitoringAgent/Models/AlertRule.cs | 45 ++++ .../Models/AnomalyDetectionResult.cs | 22 ++ .../Models/ServiceHealthStatus.cs | 21 ++ .../Monitoring/AIMonitoringAgent/Program.cs | 82 ++++++++ .../Monitoring/AIMonitoringAgent/README.md | 112 ++++++++++ .../Services/AIModelMonitoringService.cs | 133 ++++++++++++ .../Services/AlertingService.cs | 117 +++++++++++ .../Services/AnomalyDetectionService.cs | 141 +++++++++++++ .../Services/AppInsightsTelemetryService.cs | 197 ++++++++++++++++++ .../Services/HealthCheckBackgroundService.cs | 50 +++++ .../Services/ServiceHealthMonitor.cs | 105 ++++++++++ .../appsettings.Development.json | 9 + .../AIMonitoringAgent/appsettings.json | 53 +++++ src/docker-compose.yml | 16 ++ 21 files changed, 1380 insertions(+), 4 deletions(-) create mode 100644 src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Dockerfile create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Program.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/README.md create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs create mode 100644 src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json create mode 100644 src/Services/Monitoring/AIMonitoringAgent/appsettings.json diff --git a/README.md b/README.md index d6e9a9a..a25cee9 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ The monolith's bounded contexts are decomposed into the following independently | `product-service` | 5004 | Product catalog management | `ProductsController`, product models | | `notification-service` | 5005 | Email and in-app notifications | `NotificationService`, notification models | | `api-gateway` | 5000 | YARP reverse proxy, request routing, rate limiting | New — replaces monolith's single entry point | +| `ai-monitoring-agent` | 5006 | Azure App Insights AI monitoring, anomaly detection, alerting | New — cross-cutting observability service | ## Project Structure @@ -68,10 +69,12 @@ src/ │ │ ├── Product.API/ │ │ ├── Product.Domain/ │ │ └── Product.Infrastructure/ -│ └── Notification/ -│ ├── Notification.API/ -│ ├── Notification.Domain/ -│ └── Notification.Infrastructure/ +│ ├── Notification/ +│ │ ├── Notification.API/ +│ │ ├── Notification.Domain/ +│ │ └── Notification.Infrastructure/ +│ └── Monitoring/ +│ └── AIMonitoringAgent/ # App Insights AI monitoring agent ├── Shared/ │ ├── Shared.Contracts/ # Shared DTOs, events, interfaces │ └── Shared.Infrastructure/ # Common middleware, logging, health checks diff --git a/src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj b/src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj new file mode 100644 index 0000000..f33ecaa --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj @@ -0,0 +1,17 @@ + + + net10.0 + enable + enable + AIMonitoringAgent + + + + + + + + + + + diff --git a/src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs b/src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs new file mode 100644 index 0000000..58d5c2c --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Configuration/MonitoringConfiguration.cs @@ -0,0 +1,32 @@ +namespace AIMonitoringAgent.Configuration; + +public class MonitoringConfiguration +{ + public const string SectionName = "Monitoring"; + + public string ApplicationInsightsConnectionString { get; set; } = string.Empty; + public int HealthCheckIntervalSeconds { get; set; } = 30; + public double AnomalySensitivityMultiplier { get; set; } = 2.0; + + public Dictionary ServiceEndpoints { get; set; } = new() + { + ["identity-service"] = "http://localhost:5001/healthz", + ["customer-service"] = "http://localhost:5002/healthz", + ["order-service"] = "http://localhost:5003/healthz", + ["product-service"] = "http://localhost:5004/healthz", + ["notification-service"] = "http://localhost:5005/healthz", + ["api-gateway"] = "http://localhost:5000/healthz" + }; + + public List DefaultAlertRules { get; set; } = new(); +} + +public class DefaultAlertRule +{ + public string Name { get; set; } = string.Empty; + public string MetricName { get; set; } = string.Empty; + public string? ServiceName { get; set; } + public string Condition { get; set; } = "GreaterThan"; + public double Threshold { get; set; } + public string Severity { get; set; } = "Warning"; +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs b/src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs new file mode 100644 index 0000000..7b59972 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Controllers/AlertsController.cs @@ -0,0 +1,71 @@ +using AIMonitoringAgent.Models; +using AIMonitoringAgent.Services; +using Microsoft.AspNetCore.Mvc; + +namespace AIMonitoringAgent.Controllers; + +[ApiController] +[Route("api/[controller]")] +public class AlertsController : ControllerBase +{ + private readonly IAlertingService _alertingService; + + public AlertsController(IAlertingService alertingService) + { + _alertingService = alertingService; + } + + [HttpGet("rules")] + public IActionResult GetAllRules() + { + var rules = _alertingService.GetAllRules(); + return Ok(rules); + } + + [HttpGet("rules/{ruleId}")] + public IActionResult GetRule(string ruleId) + { + var rule = _alertingService.GetRule(ruleId); + if (rule == null) return NotFound(); + return Ok(rule); + } + + [HttpPost("rules")] + public IActionResult AddRule([FromBody] AlertRule rule) + { + var created = _alertingService.AddRule(rule); + return CreatedAtAction(nameof(GetRule), new { ruleId = created.Id }, created); + } + + [HttpDelete("rules/{ruleId}")] + public IActionResult DeleteRule(string ruleId) + { + var removed = _alertingService.RemoveRule(ruleId); + if (!removed) return NotFound(); + return NoContent(); + } + + [HttpPost("evaluate")] + public IActionResult EvaluateMetric([FromBody] MetricEvaluationRequest request) + { + var notification = _alertingService.EvaluateMetric(request.MetricName, request.ServiceName, request.Value); + if (notification == null) + return Ok(new { triggered = false, message = "No alert rules triggered" }); + + return Ok(new { triggered = true, alert = notification }); + } + + [HttpGet("recent")] + public IActionResult GetRecentAlerts([FromQuery] int count = 50) + { + var alerts = _alertingService.GetRecentAlerts(count); + return Ok(alerts); + } +} + +public class MetricEvaluationRequest +{ + public string MetricName { get; set; } = string.Empty; + public string? ServiceName { get; set; } + public double Value { get; set; } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs b/src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs new file mode 100644 index 0000000..db96626 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Controllers/MonitoringController.cs @@ -0,0 +1,113 @@ +using AIMonitoringAgent.Models; +using AIMonitoringAgent.Services; +using Microsoft.AspNetCore.Mvc; + +namespace AIMonitoringAgent.Controllers; + +[ApiController] +[Route("api/[controller]")] +public class MonitoringController : ControllerBase +{ + private readonly IAIModelMonitoringService _modelMonitoring; + private readonly IServiceHealthMonitor _healthMonitor; + private readonly IAnomalyDetectionService _anomalyDetection; + private readonly IAppInsightsTelemetryService _telemetryService; + + public MonitoringController( + IAIModelMonitoringService modelMonitoring, + IServiceHealthMonitor healthMonitor, + IAnomalyDetectionService anomalyDetection, + IAppInsightsTelemetryService telemetryService) + { + _modelMonitoring = modelMonitoring; + _healthMonitor = healthMonitor; + _anomalyDetection = anomalyDetection; + _telemetryService = telemetryService; + } + + [HttpPost("ai-model/track")] + public IActionResult TrackAIModelInvocation([FromBody] AIModelMetrics metrics) + { + _modelMonitoring.RecordInvocation(metrics); + return Accepted(new { message = "AI model invocation tracked successfully" }); + } + + [HttpGet("ai-model/summary")] + public IActionResult GetAllModelSummaries() + { + var summaries = _modelMonitoring.GetAllModelSummaries(); + return Ok(summaries); + } + + [HttpGet("ai-model/summary/{modelName}")] + public IActionResult GetModelSummary(string modelName) + { + var summary = _modelMonitoring.GetModelSummary(modelName); + return Ok(summary); + } + + [HttpGet("ai-model/invocations")] + public IActionResult GetRecentInvocations([FromQuery] string? modelName = null, [FromQuery] int count = 50) + { + var invocations = _modelMonitoring.GetRecentInvocations(modelName, count); + return Ok(invocations); + } + + [HttpGet("health/services")] + public IActionResult GetServiceHealthStatuses() + { + var statuses = _healthMonitor.GetLatestStatuses(); + return Ok(statuses); + } + + [HttpPost("health/check")] + public async Task TriggerHealthCheck() + { + var results = await _healthMonitor.CheckAllServicesAsync(); + return Ok(results); + } + + [HttpPost("health/check/{serviceName}")] + public async Task CheckServiceHealth(string serviceName, [FromQuery] string endpoint) + { + var result = await _healthMonitor.CheckServiceHealthAsync(serviceName, endpoint); + return Ok(result); + } + + [HttpGet("anomalies")] + public IActionResult GetRecentAnomalies([FromQuery] int count = 50) + { + var anomalies = _anomalyDetection.GetRecentAnomalies(count); + return Ok(anomalies); + } + + [HttpPost("metrics/custom")] + public IActionResult TrackCustomMetric([FromBody] CustomMetricRequest request) + { + _telemetryService.TrackCustomMetric(request.Name, request.Value, request.Properties); + return Accepted(new { message = "Custom metric tracked" }); + } + + [HttpGet("dashboard")] + public async Task GetDashboard() + { + var modelSummaries = _modelMonitoring.GetAllModelSummaries(); + var serviceStatuses = _healthMonitor.GetLatestStatuses(); + var recentAnomalies = _anomalyDetection.GetRecentAnomalies(10); + + return Ok(new + { + timestamp = DateTime.UtcNow, + aiModels = modelSummaries, + services = serviceStatuses, + recentAnomalies = recentAnomalies + }); + } +} + +public class CustomMetricRequest +{ + public string Name { get; set; } = string.Empty; + public double Value { get; set; } + public Dictionary? Properties { get; set; } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Dockerfile b/src/Services/Monitoring/AIMonitoringAgent/Dockerfile new file mode 100644 index 0000000..50eccc7 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Dockerfile @@ -0,0 +1,22 @@ +FROM mcr.microsoft.com/dotnet/aspnet:10.0-preview AS base +WORKDIR /app +EXPOSE 5006 + +FROM mcr.microsoft.com/dotnet/sdk:10.0-preview AS build +WORKDIR /src +COPY ["Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj", "Services/Monitoring/AIMonitoringAgent/"] +COPY ["Shared/Shared.Contracts/Shared.Contracts.csproj", "Shared/Shared.Contracts/"] +COPY ["Shared/Shared.Infrastructure/Shared.Infrastructure.csproj", "Shared/Shared.Infrastructure/"] +RUN dotnet restore "Services/Monitoring/AIMonitoringAgent/AIMonitoringAgent.csproj" +COPY . . +WORKDIR "/src/Services/Monitoring/AIMonitoringAgent" +RUN dotnet build "AIMonitoringAgent.csproj" -c Release -o /app/build + +FROM build AS publish +RUN dotnet publish "AIMonitoringAgent.csproj" -c Release -o /app/publish /p:UseAppHost=false + +FROM base AS final +WORKDIR /app +COPY --from=publish /app/publish . +ENV ASPNETCORE_URLS=http://+:5006 +ENTRYPOINT ["dotnet", "AIMonitoringAgent.dll"] diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs new file mode 100644 index 0000000..4dacf46 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Models/AIModelMetrics.cs @@ -0,0 +1,15 @@ +namespace AIMonitoringAgent.Models; + +public class AIModelMetrics +{ + public string ModelName { get; set; } = string.Empty; + public string ModelVersion { get; set; } = string.Empty; + public double Latency { get; set; } + public double TokensUsed { get; set; } + public double PromptTokens { get; set; } + public double CompletionTokens { get; set; } + public bool IsSuccessful { get; set; } + public string? ErrorMessage { get; set; } + public DateTime Timestamp { get; set; } = DateTime.UtcNow; + public Dictionary CustomProperties { get; set; } = new(); +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs new file mode 100644 index 0000000..0443ebd --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Models/AlertRule.cs @@ -0,0 +1,45 @@ +namespace AIMonitoringAgent.Models; + +public class AlertRule +{ + public string Id { get; set; } = Guid.NewGuid().ToString(); + public string Name { get; set; } = string.Empty; + public string MetricName { get; set; } = string.Empty; + public string? ServiceName { get; set; } + public AlertCondition Condition { get; set; } = AlertCondition.GreaterThan; + public double Threshold { get; set; } + public int EvaluationWindowMinutes { get; set; } = 5; + public AlertSeverityLevel Severity { get; set; } = AlertSeverityLevel.Warning; + public bool IsEnabled { get; set; } = true; + public DateTime CreatedAt { get; set; } = DateTime.UtcNow; +} + +public enum AlertCondition +{ + GreaterThan, + LessThan, + EqualTo, + GreaterThanOrEqual, + LessThanOrEqual +} + +public enum AlertSeverityLevel +{ + Information, + Warning, + Error, + Critical +} + +public class AlertNotification +{ + public string AlertRuleId { get; set; } = string.Empty; + public string AlertRuleName { get; set; } = string.Empty; + public string MetricName { get; set; } = string.Empty; + public string? ServiceName { get; set; } + public double CurrentValue { get; set; } + public double Threshold { get; set; } + public AlertSeverityLevel Severity { get; set; } + public DateTime FiredAt { get; set; } = DateTime.UtcNow; + public string Message { get; set; } = string.Empty; +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs new file mode 100644 index 0000000..9912322 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Models/AnomalyDetectionResult.cs @@ -0,0 +1,22 @@ +namespace AIMonitoringAgent.Models; + +public class AnomalyDetectionResult +{ + public string MetricName { get; set; } = string.Empty; + public string ServiceName { get; set; } = string.Empty; + public double CurrentValue { get; set; } + public double ExpectedValue { get; set; } + public double DeviationPercentage { get; set; } + public AnomalySeverity Severity { get; set; } + public bool IsAnomaly { get; set; } + public DateTime DetectedAt { get; set; } = DateTime.UtcNow; + public string Description { get; set; } = string.Empty; +} + +public enum AnomalySeverity +{ + Low, + Medium, + High, + Critical +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs b/src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs new file mode 100644 index 0000000..56483b9 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Models/ServiceHealthStatus.cs @@ -0,0 +1,21 @@ +namespace AIMonitoringAgent.Models; + +public class ServiceHealthStatus +{ + public string ServiceName { get; set; } = string.Empty; + public string Endpoint { get; set; } = string.Empty; + public HealthState State { get; set; } = HealthState.Unknown; + public double ResponseTimeMs { get; set; } + public int HttpStatusCode { get; set; } + public string? ErrorDetails { get; set; } + public DateTime LastCheckedAt { get; set; } = DateTime.UtcNow; + public int ConsecutiveFailures { get; set; } +} + +public enum HealthState +{ + Healthy, + Degraded, + Unhealthy, + Unknown +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Program.cs b/src/Services/Monitoring/AIMonitoringAgent/Program.cs new file mode 100644 index 0000000..3ae17da --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Program.cs @@ -0,0 +1,82 @@ +using AIMonitoringAgent.Configuration; +using AIMonitoringAgent.Models; +using AIMonitoringAgent.Services; + +var builder = WebApplication.CreateBuilder(args); + +// Configuration +builder.Services.Configure( + builder.Configuration.GetSection(MonitoringConfiguration.SectionName)); + +// Application Insights +var appInsightsConnectionString = builder.Configuration + .GetValue("Monitoring:ApplicationInsightsConnectionString"); + +builder.Services.AddApplicationInsightsTelemetry(options => +{ + if (!string.IsNullOrEmpty(appInsightsConnectionString)) + options.ConnectionString = appInsightsConnectionString; +}); + +// Core services +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); + +// Health monitor with HttpClient +builder.Services.AddHttpClient(); + +// Background health check service +builder.Services.AddHostedService(); + +// ASP.NET Core +builder.Services.AddControllers(); +builder.Services.AddEndpointsApiExplorer(); +builder.Services.AddSwaggerGen(options => +{ + options.SwaggerDoc("v1", new Microsoft.OpenApi.Models.OpenApiInfo + { + Title = "AI Monitoring Agent API", + Version = "v1", + Description = "Azure Application Insights AI Monitoring Agent for microservices observability" + }); +}); +builder.Services.AddHealthChecks(); + +// Seed default alert rules +var monitoringConfig = builder.Configuration + .GetSection(MonitoringConfiguration.SectionName) + .Get(); + +var app = builder.Build(); + +// Seed default alert rules from configuration +if (monitoringConfig?.DefaultAlertRules.Count > 0) +{ + var alertingService = app.Services.GetRequiredService(); + foreach (var ruleConfig in monitoringConfig.DefaultAlertRules) + { + var rule = new AlertRule + { + Name = ruleConfig.Name, + MetricName = ruleConfig.MetricName, + ServiceName = ruleConfig.ServiceName, + Condition = Enum.Parse(ruleConfig.Condition), + Threshold = ruleConfig.Threshold, + Severity = Enum.Parse(ruleConfig.Severity) + }; + alertingService.AddRule(rule); + } +} + +if (app.Environment.IsDevelopment()) +{ + app.UseSwagger(); + app.UseSwaggerUI(); +} + +app.MapControllers(); +app.MapHealthChecks("/healthz"); + +app.Run(); diff --git a/src/Services/Monitoring/AIMonitoringAgent/README.md b/src/Services/Monitoring/AIMonitoringAgent/README.md new file mode 100644 index 0000000..ce7076f --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/README.md @@ -0,0 +1,112 @@ +# AI Monitoring Agent + +Azure Application Insights-powered monitoring agent for the microservices platform. Provides AI model performance tracking, service health monitoring, anomaly detection, and configurable alerting. + +## Features + +- **AI Model Performance Tracking** — Track latency, token usage, success/failure rates for AI/ML model invocations +- **Service Health Monitoring** — Periodic health checks against all microservice endpoints with availability telemetry +- **Anomaly Detection** — Statistical anomaly detection using rolling-window standard deviation analysis +- **Configurable Alerting** — Define alert rules with thresholds, conditions, and severity levels +- **Application Insights Integration** — All telemetry (events, metrics, availability, exceptions) sent to Azure App Insights +- **Dashboard API** — Aggregated view of AI models, service health, and recent anomalies + +## API Endpoints + +### Monitoring + +| Method | Route | Description | +|--------|-------|-------------| +| `POST` | `/api/monitoring/ai-model/track` | Track an AI model invocation | +| `GET` | `/api/monitoring/ai-model/summary` | Get summaries for all tracked models | +| `GET` | `/api/monitoring/ai-model/summary/{modelName}` | Get summary for a specific model | +| `GET` | `/api/monitoring/ai-model/invocations` | Get recent invocations | +| `GET` | `/api/monitoring/health/services` | Get latest health statuses | +| `POST` | `/api/monitoring/health/check` | Trigger health check for all services | +| `POST` | `/api/monitoring/health/check/{serviceName}` | Check a specific service | +| `GET` | `/api/monitoring/anomalies` | Get recent anomalies | +| `POST` | `/api/monitoring/metrics/custom` | Track a custom metric | +| `GET` | `/api/monitoring/dashboard` | Get aggregated dashboard data | + +### Alerts + +| Method | Route | Description | +|--------|-------|-------------| +| `GET` | `/api/alerts/rules` | List all alert rules | +| `GET` | `/api/alerts/rules/{ruleId}` | Get a specific alert rule | +| `POST` | `/api/alerts/rules` | Create an alert rule | +| `DELETE` | `/api/alerts/rules/{ruleId}` | Delete an alert rule | +| `POST` | `/api/alerts/evaluate` | Evaluate a metric against rules | +| `GET` | `/api/alerts/recent` | Get recent alert notifications | + +## Configuration + +Set the App Insights connection string in `appsettings.json` or via environment variable: + +```json +{ + "Monitoring": { + "ApplicationInsightsConnectionString": "InstrumentationKey=...;IngestionEndpoint=...", + "HealthCheckIntervalSeconds": 30, + "AnomalySensitivityMultiplier": 2.0, + "ServiceEndpoints": { + "identity-service": "http://localhost:5001/healthz" + }, + "DefaultAlertRules": [ + { + "Name": "High Response Time", + "MetricName": "ResponseTime", + "Condition": "GreaterThan", + "Threshold": 5000, + "Severity": "Warning" + } + ] + } +} +``` + +## Running + +```bash +# Standalone +cd src/Services/Monitoring/AIMonitoringAgent +dotnet run + +# With Docker Compose (all services) +cd src +docker compose up --build +``` + +The agent runs on port **5006** and exposes Swagger UI at `/swagger` in development mode. + +## Usage Example + +Track an AI model invocation: + +```bash +curl -X POST http://localhost:5006/api/monitoring/ai-model/track \ + -H "Content-Type: application/json" \ + -d '{ + "modelName": "gpt-4", + "modelVersion": "0613", + "latency": 1250.5, + "tokensUsed": 450, + "promptTokens": 200, + "completionTokens": 250, + "isSuccessful": true + }' +``` + +Create an alert rule: + +```bash +curl -X POST http://localhost:5006/api/alerts/rules \ + -H "Content-Type: application/json" \ + -d '{ + "name": "High Latency Alert", + "metricName": "ResponseTime", + "condition": "GreaterThan", + "threshold": 3000, + "severity": "Warning" + }' +``` diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs new file mode 100644 index 0000000..6c0e2c6 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AIModelMonitoringService.cs @@ -0,0 +1,133 @@ +using System.Collections.Concurrent; +using AIMonitoringAgent.Models; + +namespace AIMonitoringAgent.Services; + +public interface IAIModelMonitoringService +{ + void RecordInvocation(AIModelMetrics metrics); + AIModelSummary GetModelSummary(string modelName); + IReadOnlyList GetAllModelSummaries(); + IReadOnlyList GetRecentInvocations(string? modelName = null, int count = 50); +} + +public class AIModelMonitoringService : IAIModelMonitoringService +{ + private readonly IAppInsightsTelemetryService _telemetryService; + private readonly ILogger _logger; + private readonly ConcurrentDictionary _modelTrackers = new(); + private readonly ConcurrentQueue _recentInvocations = new(); + private const int MaxRecentInvocations = 1000; + + public AIModelMonitoringService( + IAppInsightsTelemetryService telemetryService, + ILogger logger) + { + _telemetryService = telemetryService; + _logger = logger; + } + + public void RecordInvocation(AIModelMetrics metrics) + { + _telemetryService.TrackAIModelInvocation(metrics); + + var tracker = _modelTrackers.GetOrAdd(metrics.ModelName, _ => new ModelTracker()); + tracker.Record(metrics); + + _recentInvocations.Enqueue(metrics); + while (_recentInvocations.Count > MaxRecentInvocations) + _recentInvocations.TryDequeue(out _); + + _logger.LogInformation( + "AI model invocation recorded: {Model} v{Version}, Latency={Latency}ms, Success={Success}", + metrics.ModelName, metrics.ModelVersion, metrics.Latency, metrics.IsSuccessful); + } + + public AIModelSummary GetModelSummary(string modelName) + { + if (_modelTrackers.TryGetValue(modelName, out var tracker)) + return tracker.GetSummary(modelName); + + return new AIModelSummary { ModelName = modelName }; + } + + public IReadOnlyList GetAllModelSummaries() + { + return _modelTrackers + .Select(kvp => kvp.Value.GetSummary(kvp.Key)) + .OrderBy(s => s.ModelName) + .ToList(); + } + + public IReadOnlyList GetRecentInvocations(string? modelName = null, int count = 50) + { + var query = _recentInvocations.AsEnumerable(); + + if (!string.IsNullOrEmpty(modelName)) + query = query.Where(m => m.ModelName.Equals(modelName, StringComparison.OrdinalIgnoreCase)); + + return query.OrderByDescending(m => m.Timestamp).Take(count).ToList(); + } + + private class ModelTracker + { + private long _totalInvocations; + private long _successCount; + private long _failureCount; + private double _totalLatency; + private double _totalTokens; + private double _minLatency = double.MaxValue; + private double _maxLatency = double.MinValue; + private readonly object _lock = new(); + + public void Record(AIModelMetrics metrics) + { + lock (_lock) + { + _totalInvocations++; + _totalLatency += metrics.Latency; + _totalTokens += metrics.TokensUsed; + + if (metrics.Latency < _minLatency) _minLatency = metrics.Latency; + if (metrics.Latency > _maxLatency) _maxLatency = metrics.Latency; + + if (metrics.IsSuccessful) + _successCount++; + else + _failureCount++; + } + } + + public AIModelSummary GetSummary(string modelName) + { + lock (_lock) + { + return new AIModelSummary + { + ModelName = modelName, + TotalInvocations = _totalInvocations, + SuccessCount = _successCount, + FailureCount = _failureCount, + SuccessRate = _totalInvocations > 0 ? (double)_successCount / _totalInvocations * 100 : 0, + AverageLatencyMs = _totalInvocations > 0 ? _totalLatency / _totalInvocations : 0, + MinLatencyMs = _minLatency == double.MaxValue ? 0 : _minLatency, + MaxLatencyMs = _maxLatency == double.MinValue ? 0 : _maxLatency, + TotalTokensUsed = _totalTokens + }; + } + } + } +} + +public class AIModelSummary +{ + public string ModelName { get; set; } = string.Empty; + public long TotalInvocations { get; set; } + public long SuccessCount { get; set; } + public long FailureCount { get; set; } + public double SuccessRate { get; set; } + public double AverageLatencyMs { get; set; } + public double MinLatencyMs { get; set; } + public double MaxLatencyMs { get; set; } + public double TotalTokensUsed { get; set; } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs new file mode 100644 index 0000000..c32a003 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AlertingService.cs @@ -0,0 +1,117 @@ +using System.Collections.Concurrent; +using AIMonitoringAgent.Models; + +namespace AIMonitoringAgent.Services; + +public interface IAlertingService +{ + AlertRule AddRule(AlertRule rule); + bool RemoveRule(string ruleId); + AlertRule? GetRule(string ruleId); + IReadOnlyList GetAllRules(); + AlertNotification? EvaluateMetric(string metricName, string? serviceName, double currentValue); + IReadOnlyList GetRecentAlerts(int count = 50); +} + +public class AlertingService : IAlertingService +{ + private readonly IAppInsightsTelemetryService _telemetryService; + private readonly ILogger _logger; + private readonly ConcurrentDictionary _rules = new(); + private readonly ConcurrentQueue _recentAlerts = new(); + private const int MaxRecentAlerts = 500; + + public AlertingService( + IAppInsightsTelemetryService telemetryService, + ILogger logger) + { + _telemetryService = telemetryService; + _logger = logger; + } + + public AlertRule AddRule(AlertRule rule) + { + _rules[rule.Id] = rule; + _logger.LogInformation("Alert rule added: {RuleName} ({RuleId})", rule.Name, rule.Id); + return rule; + } + + public bool RemoveRule(string ruleId) + { + var removed = _rules.TryRemove(ruleId, out _); + if (removed) + _logger.LogInformation("Alert rule removed: {RuleId}", ruleId); + return removed; + } + + public AlertRule? GetRule(string ruleId) + { + _rules.TryGetValue(ruleId, out var rule); + return rule; + } + + public IReadOnlyList GetAllRules() + { + return _rules.Values.OrderBy(r => r.Name).ToList(); + } + + public AlertNotification? EvaluateMetric(string metricName, string? serviceName, double currentValue) + { + var matchingRules = _rules.Values + .Where(r => r.IsEnabled + && r.MetricName.Equals(metricName, StringComparison.OrdinalIgnoreCase) + && (string.IsNullOrEmpty(r.ServiceName) || r.ServiceName.Equals(serviceName, StringComparison.OrdinalIgnoreCase))) + .ToList(); + + foreach (var rule in matchingRules) + { + if (!IsConditionMet(rule.Condition, currentValue, rule.Threshold)) + continue; + + var notification = new AlertNotification + { + AlertRuleId = rule.Id, + AlertRuleName = rule.Name, + MetricName = metricName, + ServiceName = serviceName, + CurrentValue = currentValue, + Threshold = rule.Threshold, + Severity = rule.Severity, + Message = $"Alert '{rule.Name}': {metricName} is {currentValue:F2} " + + $"({rule.Condition} threshold {rule.Threshold:F2})" + }; + + _telemetryService.TrackAlert(notification); + + _recentAlerts.Enqueue(notification); + while (_recentAlerts.Count > MaxRecentAlerts) + _recentAlerts.TryDequeue(out _); + + _logger.LogWarning("Alert fired: {Message}", notification.Message); + return notification; + } + + return null; + } + + public IReadOnlyList GetRecentAlerts(int count = 50) + { + return _recentAlerts + .OrderByDescending(a => a.FiredAt) + .Take(count) + .ToList(); + } + + private static bool IsConditionMet(AlertCondition condition, double current, double threshold) + { + return condition switch + { + AlertCondition.GreaterThan => current > threshold, + AlertCondition.LessThan => current < threshold, + AlertCondition.EqualTo => Math.Abs(current - threshold) < 0.001, + AlertCondition.GreaterThanOrEqual => current >= threshold, + AlertCondition.LessThanOrEqual => current <= threshold, + _ => false + }; + } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs new file mode 100644 index 0000000..971585d --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AnomalyDetectionService.cs @@ -0,0 +1,141 @@ +using System.Collections.Concurrent; +using AIMonitoringAgent.Models; + +namespace AIMonitoringAgent.Services; + +public interface IAnomalyDetectionService +{ + AnomalyDetectionResult Evaluate(string metricName, string serviceName, double currentValue); + IReadOnlyList GetRecentAnomalies(int count = 50); + void Configure(string metricName, double sensitivityMultiplier); +} + +public class AnomalyDetectionService : IAnomalyDetectionService +{ + private readonly IAppInsightsTelemetryService _telemetryService; + private readonly ILogger _logger; + private readonly ConcurrentDictionary _metricWindows = new(); + private readonly ConcurrentDictionary _sensitivityConfig = new(); + private readonly ConcurrentQueue _recentAnomalies = new(); + private const int MaxRecentAnomalies = 200; + private const double DefaultSensitivityMultiplier = 2.0; + + public AnomalyDetectionService( + IAppInsightsTelemetryService telemetryService, + ILogger logger) + { + _telemetryService = telemetryService; + _logger = logger; + } + + public AnomalyDetectionResult Evaluate(string metricName, string serviceName, double currentValue) + { + var key = $"{serviceName}:{metricName}"; + var window = _metricWindows.GetOrAdd(key, _ => new MetricWindow()); + window.Add(currentValue); + + var stats = window.GetStatistics(); + var sensitivity = _sensitivityConfig.GetValueOrDefault(metricName, DefaultSensitivityMultiplier); + + var deviation = stats.StdDev > 0 + ? Math.Abs(currentValue - stats.Mean) / stats.StdDev + : 0; + + var isAnomaly = stats.Count >= 10 && deviation > sensitivity; + + var deviationPercentage = stats.Mean != 0 + ? (currentValue - stats.Mean) / stats.Mean * 100 + : 0; + + var severity = DetermineSeverity(deviation, sensitivity); + + var result = new AnomalyDetectionResult + { + MetricName = metricName, + ServiceName = serviceName, + CurrentValue = currentValue, + ExpectedValue = Math.Round(stats.Mean, 2), + DeviationPercentage = Math.Round(deviationPercentage, 2), + Severity = severity, + IsAnomaly = isAnomaly, + Description = isAnomaly + ? $"{metricName} for {serviceName} is {Math.Abs(deviationPercentage):F1}% " + + $"{(currentValue > stats.Mean ? "above" : "below")} the expected value " + + $"(current: {currentValue:F2}, expected: {stats.Mean:F2})" + : $"{metricName} for {serviceName} is within normal range" + }; + + if (isAnomaly) + { + _telemetryService.TrackAnomaly(result); + _recentAnomalies.Enqueue(result); + while (_recentAnomalies.Count > MaxRecentAnomalies) + _recentAnomalies.TryDequeue(out _); + + _logger.LogWarning("Anomaly detected: {Description}", result.Description); + } + + return result; + } + + public IReadOnlyList GetRecentAnomalies(int count = 50) + { + return _recentAnomalies + .OrderByDescending(a => a.DetectedAt) + .Take(count) + .ToList(); + } + + public void Configure(string metricName, double sensitivityMultiplier) + { + _sensitivityConfig[metricName] = sensitivityMultiplier; + _logger.LogInformation("Anomaly detection sensitivity for {Metric} set to {Sensitivity}", + metricName, sensitivityMultiplier); + } + + private static AnomalySeverity DetermineSeverity(double deviation, double sensitivity) + { + if (deviation > sensitivity * 3) return AnomalySeverity.Critical; + if (deviation > sensitivity * 2) return AnomalySeverity.High; + if (deviation > sensitivity * 1.5) return AnomalySeverity.Medium; + return AnomalySeverity.Low; + } + + private class MetricWindow + { + private readonly Queue _values = new(); + private readonly object _lock = new(); + private const int WindowSize = 100; + private static readonly TimeSpan WindowDuration = TimeSpan.FromMinutes(30); + + public void Add(double value) + { + lock (_lock) + { + var cutoff = DateTime.UtcNow - WindowDuration; + while (_values.Count > 0 && (_values.Peek().Timestamp < cutoff || _values.Count >= WindowSize)) + _values.Dequeue(); + + _values.Enqueue(new TimestampedValue(value, DateTime.UtcNow)); + } + } + + public (double Mean, double StdDev, int Count) GetStatistics() + { + lock (_lock) + { + if (_values.Count == 0) + return (0, 0, 0); + + var values = _values.Select(v => v.Value).ToArray(); + var mean = values.Average(); + var variance = values.Select(v => Math.Pow(v - mean, 2)).Average(); + var stdDev = Math.Sqrt(variance); + + return (mean, stdDev, values.Length); + } + } + + private record TimestampedValue(double Value, DateTime Timestamp); + } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs new file mode 100644 index 0000000..997fbcc --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Services/AppInsightsTelemetryService.cs @@ -0,0 +1,197 @@ +using Microsoft.ApplicationInsights; +using Microsoft.ApplicationInsights.DataContracts; +using Microsoft.ApplicationInsights.Extensibility; +using AIMonitoringAgent.Models; + +namespace AIMonitoringAgent.Services; + +public interface IAppInsightsTelemetryService +{ + void TrackAIModelInvocation(AIModelMetrics metrics); + void TrackServiceHealth(ServiceHealthStatus status); + void TrackAnomaly(AnomalyDetectionResult anomaly); + void TrackAlert(AlertNotification alert); + void TrackCustomMetric(string name, double value, Dictionary? properties = null); + void TrackDependency(string dependencyType, string target, string name, double durationMs, bool success); + void TrackException(Exception exception, Dictionary? properties = null); + void TrackEvent(string eventName, Dictionary? properties = null, Dictionary? metrics = null); + void Flush(); +} + +public class AppInsightsTelemetryService : IAppInsightsTelemetryService +{ + private readonly TelemetryClient _telemetryClient; + private readonly ILogger _logger; + + public AppInsightsTelemetryService( + TelemetryClient telemetryClient, + ILogger logger) + { + _telemetryClient = telemetryClient; + _logger = logger; + } + + public void TrackAIModelInvocation(AIModelMetrics metrics) + { + var properties = new Dictionary + { + ["ModelName"] = metrics.ModelName, + ["ModelVersion"] = metrics.ModelVersion, + ["IsSuccessful"] = metrics.IsSuccessful.ToString() + }; + + if (!string.IsNullOrEmpty(metrics.ErrorMessage)) + properties["ErrorMessage"] = metrics.ErrorMessage; + + foreach (var kvp in metrics.CustomProperties) + properties[kvp.Key] = kvp.Value; + + var telemetryMetrics = new Dictionary + { + ["LatencyMs"] = metrics.Latency, + ["TokensUsed"] = metrics.TokensUsed, + ["PromptTokens"] = metrics.PromptTokens, + ["CompletionTokens"] = metrics.CompletionTokens + }; + + _telemetryClient.TrackEvent("AIModelInvocation", properties, telemetryMetrics); + + _telemetryClient.GetMetric("AIModel.Latency", "ModelName") + .TrackValue(metrics.Latency, metrics.ModelName); + _telemetryClient.GetMetric("AIModel.TokensUsed", "ModelName") + .TrackValue(metrics.TokensUsed, metrics.ModelName); + + if (!metrics.IsSuccessful) + { + _telemetryClient.GetMetric("AIModel.Failures", "ModelName") + .TrackValue(1, metrics.ModelName); + } + + _logger.LogDebug("Tracked AI model invocation: {ModelName} v{Version}, Latency={Latency}ms", + metrics.ModelName, metrics.ModelVersion, metrics.Latency); + } + + public void TrackServiceHealth(ServiceHealthStatus status) + { + var properties = new Dictionary + { + ["ServiceName"] = status.ServiceName, + ["Endpoint"] = status.Endpoint, + ["State"] = status.State.ToString(), + ["HttpStatusCode"] = status.HttpStatusCode.ToString() + }; + + if (!string.IsNullOrEmpty(status.ErrorDetails)) + properties["ErrorDetails"] = status.ErrorDetails; + + var metrics = new Dictionary + { + ["ResponseTimeMs"] = status.ResponseTimeMs, + ["ConsecutiveFailures"] = status.ConsecutiveFailures + }; + + _telemetryClient.TrackEvent("ServiceHealthCheck", properties, metrics); + + _telemetryClient.GetMetric("Service.ResponseTime", "ServiceName") + .TrackValue(status.ResponseTimeMs, status.ServiceName); + + var availability = new AvailabilityTelemetry + { + Name = $"{status.ServiceName} Health Check", + Duration = TimeSpan.FromMilliseconds(status.ResponseTimeMs), + Success = status.State == HealthState.Healthy, + RunLocation = "AIMonitoringAgent", + Message = status.State.ToString(), + Timestamp = status.LastCheckedAt + }; + + _telemetryClient.TrackAvailability(availability); + } + + public void TrackAnomaly(AnomalyDetectionResult anomaly) + { + var properties = new Dictionary + { + ["MetricName"] = anomaly.MetricName, + ["ServiceName"] = anomaly.ServiceName, + ["Severity"] = anomaly.Severity.ToString(), + ["IsAnomaly"] = anomaly.IsAnomaly.ToString(), + ["Description"] = anomaly.Description + }; + + var metrics = new Dictionary + { + ["CurrentValue"] = anomaly.CurrentValue, + ["ExpectedValue"] = anomaly.ExpectedValue, + ["DeviationPercentage"] = anomaly.DeviationPercentage + }; + + _telemetryClient.TrackEvent("AnomalyDetected", properties, metrics); + + if (anomaly.Severity >= AnomalySeverity.High) + { + _logger.LogWarning("High severity anomaly detected: {Description}", anomaly.Description); + } + } + + public void TrackAlert(AlertNotification alert) + { + var properties = new Dictionary + { + ["AlertRuleId"] = alert.AlertRuleId, + ["AlertRuleName"] = alert.AlertRuleName, + ["MetricName"] = alert.MetricName, + ["Severity"] = alert.Severity.ToString(), + ["Message"] = alert.Message + }; + + if (!string.IsNullOrEmpty(alert.ServiceName)) + properties["ServiceName"] = alert.ServiceName; + + var metrics = new Dictionary + { + ["CurrentValue"] = alert.CurrentValue, + ["Threshold"] = alert.Threshold + }; + + _telemetryClient.TrackEvent("AlertFired", properties, metrics); + + _logger.LogWarning("Alert fired: {AlertName} - {Message}", alert.AlertRuleName, alert.Message); + } + + public void TrackCustomMetric(string name, double value, Dictionary? properties = null) + { + var metricTelemetry = new MetricTelemetry(name, value); + + if (properties != null) + { + foreach (var kvp in properties) + metricTelemetry.Properties[kvp.Key] = kvp.Value; + } + + _telemetryClient.TrackMetric(metricTelemetry); + } + + public void TrackDependency(string dependencyType, string target, string name, double durationMs, bool success) + { + _telemetryClient.TrackDependency(dependencyType, target, name, + string.Empty, DateTimeOffset.UtcNow, + TimeSpan.FromMilliseconds(durationMs), "200", success); + } + + public void TrackException(Exception exception, Dictionary? properties = null) + { + _telemetryClient.TrackException(exception, properties); + _logger.LogError(exception, "Exception tracked in Application Insights"); + } + + public void TrackEvent(string eventName, Dictionary? properties = null, Dictionary? metrics = null) + { + _telemetryClient.TrackEvent(eventName, properties, metrics); + } + + public void Flush() + { + _telemetryClient.Flush(); + } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs new file mode 100644 index 0000000..e484775 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Services/HealthCheckBackgroundService.cs @@ -0,0 +1,50 @@ +using AIMonitoringAgent.Configuration; +using Microsoft.Extensions.Options; + +namespace AIMonitoringAgent.Services; + +public class HealthCheckBackgroundService : BackgroundService +{ + private readonly IServiceProvider _serviceProvider; + private readonly ILogger _logger; + private readonly MonitoringConfiguration _config; + + public HealthCheckBackgroundService( + IServiceProvider serviceProvider, + IOptions config, + ILogger logger) + { + _serviceProvider = serviceProvider; + _config = config.Value; + _logger = logger; + } + + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + _logger.LogInformation("Health check background service started. Interval: {Interval}s", + _config.HealthCheckIntervalSeconds); + + while (!stoppingToken.IsCancellationRequested) + { + try + { + using var scope = _serviceProvider.CreateScope(); + var healthMonitor = scope.ServiceProvider.GetRequiredService(); + var results = await healthMonitor.CheckAllServicesAsync(); + + var unhealthy = results.Where(r => r.State == Models.HealthState.Unhealthy).ToList(); + if (unhealthy.Count > 0) + { + _logger.LogWarning("{Count} service(s) unhealthy: {Services}", + unhealthy.Count, string.Join(", ", unhealthy.Select(s => s.ServiceName))); + } + } + catch (Exception ex) + { + _logger.LogError(ex, "Error during health check cycle"); + } + + await Task.Delay(TimeSpan.FromSeconds(_config.HealthCheckIntervalSeconds), stoppingToken); + } + } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs b/src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs new file mode 100644 index 0000000..ad7a49e --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/Services/ServiceHealthMonitor.cs @@ -0,0 +1,105 @@ +using System.Collections.Concurrent; +using System.Diagnostics; +using AIMonitoringAgent.Configuration; +using AIMonitoringAgent.Models; +using Microsoft.Extensions.Options; + +namespace AIMonitoringAgent.Services; + +public interface IServiceHealthMonitor +{ + Task CheckServiceHealthAsync(string serviceName, string endpoint); + Task> CheckAllServicesAsync(); + IReadOnlyList GetLatestStatuses(); +} + +public class ServiceHealthMonitor : IServiceHealthMonitor +{ + private readonly HttpClient _httpClient; + private readonly IAppInsightsTelemetryService _telemetryService; + private readonly IAnomalyDetectionService _anomalyDetectionService; + private readonly IAlertingService _alertingService; + private readonly ILogger _logger; + private readonly MonitoringConfiguration _config; + private readonly ConcurrentDictionary _latestStatuses = new(); + + public ServiceHealthMonitor( + HttpClient httpClient, + IAppInsightsTelemetryService telemetryService, + IAnomalyDetectionService anomalyDetectionService, + IAlertingService alertingService, + IOptions config, + ILogger logger) + { + _httpClient = httpClient; + _telemetryService = telemetryService; + _anomalyDetectionService = anomalyDetectionService; + _alertingService = alertingService; + _config = config.Value; + _logger = logger; + } + + public async Task CheckServiceHealthAsync(string serviceName, string endpoint) + { + var status = new ServiceHealthStatus + { + ServiceName = serviceName, + Endpoint = endpoint + }; + + var sw = Stopwatch.StartNew(); + try + { + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10)); + var response = await _httpClient.GetAsync(endpoint, cts.Token); + sw.Stop(); + + status.ResponseTimeMs = sw.Elapsed.TotalMilliseconds; + status.HttpStatusCode = (int)response.StatusCode; + status.State = response.IsSuccessStatusCode ? HealthState.Healthy : HealthState.Degraded; + status.ConsecutiveFailures = 0; + } + catch (Exception ex) + { + sw.Stop(); + status.ResponseTimeMs = sw.Elapsed.TotalMilliseconds; + status.State = HealthState.Unhealthy; + status.ErrorDetails = ex.Message; + + if (_latestStatuses.TryGetValue(serviceName, out var previous)) + status.ConsecutiveFailures = previous.ConsecutiveFailures + 1; + else + status.ConsecutiveFailures = 1; + + _telemetryService.TrackException(ex, new Dictionary + { + ["ServiceName"] = serviceName, + ["Endpoint"] = endpoint + }); + } + + _latestStatuses[serviceName] = status; + _telemetryService.TrackServiceHealth(status); + _anomalyDetectionService.Evaluate("ResponseTime", serviceName, status.ResponseTimeMs); + _alertingService.EvaluateMetric("ConsecutiveFailures", serviceName, status.ConsecutiveFailures); + + return status; + } + + public async Task> CheckAllServicesAsync() + { + var tasks = _config.ServiceEndpoints + .Select(kvp => CheckServiceHealthAsync(kvp.Key, kvp.Value)) + .ToList(); + + var results = await Task.WhenAll(tasks); + return results.ToList(); + } + + public IReadOnlyList GetLatestStatuses() + { + return _latestStatuses.Values + .OrderBy(s => s.ServiceName) + .ToList(); + } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json b/src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json new file mode 100644 index 0000000..0114e9c --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/appsettings.Development.json @@ -0,0 +1,9 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Debug", + "Microsoft.AspNetCore": "Information", + "AIMonitoringAgent": "Debug" + } + } +} diff --git a/src/Services/Monitoring/AIMonitoringAgent/appsettings.json b/src/Services/Monitoring/AIMonitoringAgent/appsettings.json new file mode 100644 index 0000000..1c63f21 --- /dev/null +++ b/src/Services/Monitoring/AIMonitoringAgent/appsettings.json @@ -0,0 +1,53 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "AIMonitoringAgent": "Information" + } + }, + "AllowedHosts": "*", + "Monitoring": { + "ApplicationInsightsConnectionString": "", + "HealthCheckIntervalSeconds": 30, + "AnomalySensitivityMultiplier": 2.0, + "ServiceEndpoints": { + "identity-service": "http://localhost:5001/healthz", + "customer-service": "http://localhost:5002/healthz", + "order-service": "http://localhost:5003/healthz", + "product-service": "http://localhost:5004/healthz", + "notification-service": "http://localhost:5005/healthz", + "api-gateway": "http://localhost:5000/healthz" + }, + "DefaultAlertRules": [ + { + "Name": "High Response Time", + "MetricName": "ResponseTime", + "Condition": "GreaterThan", + "Threshold": 5000, + "Severity": "Warning" + }, + { + "Name": "Consecutive Failures", + "MetricName": "ConsecutiveFailures", + "Condition": "GreaterThanOrEqual", + "Threshold": 3, + "Severity": "Critical" + }, + { + "Name": "High AI Model Latency", + "MetricName": "AIModel.Latency", + "Condition": "GreaterThan", + "Threshold": 10000, + "Severity": "Warning" + }, + { + "Name": "AI Model Failure Rate", + "MetricName": "AIModel.Failures", + "Condition": "GreaterThan", + "Threshold": 5, + "Severity": "Error" + } + ] + } +} diff --git a/src/docker-compose.yml b/src/docker-compose.yml index bf66a7b..8dfdc9b 100644 --- a/src/docker-compose.yml +++ b/src/docker-compose.yml @@ -81,6 +81,22 @@ services: - ASPNETCORE_ENVIRONMENT=Development - ConnectionStrings__DefaultConnection=Host=postgres;Database=notificationdb;Username=postgres;Password=postgres + ai-monitoring-agent: + build: + context: . + dockerfile: Services/Monitoring/AIMonitoringAgent/Dockerfile + ports: + - "5006:5006" + depends_on: + - identity-service + - customer-service + - order-service + - product-service + - notification-service + environment: + - ASPNETCORE_ENVIRONMENT=Development + - Monitoring__ApplicationInsightsConnectionString= + postgres: image: postgres:16-alpine ports: