From aaca5e712730926be5f7f93d4d548a9db10be65b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 30 Apr 2026 15:19:35 +0000 Subject: [PATCH] feat: add App Insights AI Monitoring Agent Azure Function - Add Azure Function project (C#, .NET 10, isolated worker) for AI-powered monitoring - Implement timer-triggered functions: AnomalyDetector (5min), HealthMonitor (30min) - Implement HTTP-triggered functions: GetHealthReport, GetServiceTelemetry, GetAnomalies, AlertWebhook, TriggerManualAnalysis - Add AppInsightsQueryService for KQL-based telemetry queries (requests, exceptions, dependencies) - Add AiAnalysisService with rule-based + statistical z-score + Azure OpenAI anomaly detection - Add AlertService with Teams/Slack adaptive card webhook notifications - Add Dockerfile and docker-compose service definition - Update solution file and README with monitoring agent documentation --- README.md | 52 +++ .../Configuration/MonitoringOptions.cs | 30 ++ .../Monitoring.Functions/Dockerfile | 15 + .../Functions/AlertWebhookFunction.cs | 147 +++++++ .../Functions/AnomalyDetectorFunction.cs | 94 ++++ .../Functions/HealthMonitorFunction.cs | 140 ++++++ .../Functions/MonitoringDashboardFunction.cs | 156 +++++++ .../Models/AnomalyResult.cs | 32 ++ .../Models/HealthReport.cs | 45 ++ .../Models/MonitoringAlert.cs | 30 ++ .../Models/TelemetryData.cs | 51 +++ .../Monitoring.Functions.csproj | 27 ++ .../Monitoring.Functions/Program.cs | 25 ++ .../Services/AiAnalysisService.cs | 416 ++++++++++++++++++ .../Services/AlertService.cs | 286 ++++++++++++ .../Services/AppInsightsQueryService.cs | 277 ++++++++++++ .../Services/IAiAnalysisService.cs | 18 + .../Services/IAlertService.cs | 20 + .../Services/IAppInsightsQueryService.cs | 24 + .../Monitoring/Monitoring.Functions/host.json | 24 + .../Monitoring.Functions/local.settings.json | 18 + src/Microservices.sln | 9 + src/docker-compose.yml | 17 + 23 files changed, 1953 insertions(+) 
create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Configuration/MonitoringOptions.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Dockerfile create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Functions/AlertWebhookFunction.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Functions/AnomalyDetectorFunction.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Functions/HealthMonitorFunction.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Functions/MonitoringDashboardFunction.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Models/AnomalyResult.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Models/HealthReport.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Models/MonitoringAlert.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Models/TelemetryData.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Program.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Services/AiAnalysisService.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Services/AlertService.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Services/AppInsightsQueryService.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Services/IAiAnalysisService.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Services/IAlertService.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/Services/IAppInsightsQueryService.cs create mode 100644 src/Functions/Monitoring/Monitoring.Functions/host.json create mode 100644 src/Functions/Monitoring/Monitoring.Functions/local.settings.json diff --git a/README.md b/README.md index d6e9a9a..ef518dc 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ The monolith's bounded contexts are 
decomposed into the following independently | `product-service` | 5004 | Product catalog management | `ProductsController`, product models | | `notification-service` | 5005 | Email and in-app notifications | `NotificationService`, notification models | | `api-gateway` | 5000 | YARP reverse proxy, request routing, rate limiting | New — replaces monolith's single entry point | +| `monitoring-agent` | 7071 | AI-powered App Insights monitoring Azure Function | New — observability and anomaly detection | ## Project Structure @@ -72,6 +73,16 @@ src/ │ ├── Notification.API/ │ ├── Notification.Domain/ │ └── Notification.Infrastructure/ +├── Functions/ +│ └── Monitoring/ +│ └── Monitoring.Functions/ # Azure Function — AI monitoring agent +│ ├── Functions/ # Timer & HTTP-triggered functions +│ ├── Services/ # App Insights query, AI analysis, alerting +│ ├── Models/ # Telemetry, anomaly, alert models +│ ├── Configuration/ # Monitoring options +│ ├── Program.cs +│ ├── host.json +│ └── Dockerfile ├── Shared/ │ ├── Shared.Contracts/ # Shared DTOs, events, interfaces │ └── Shared.Infrastructure/ # Common middleware, logging, health checks @@ -86,6 +97,9 @@ src/ - **Entity Framework Core** — per-service database (database-per-service pattern) - **YARP** — API gateway / reverse proxy - **RabbitMQ** — async messaging between services +- **Azure Functions v4** — serverless monitoring agent (isolated worker) +- **Azure Application Insights** — telemetry collection and querying +- **Azure OpenAI** — AI-powered anomaly analysis and health insights - **Docker** — containerized services - **Kubernetes** — orchestration (see `app_dotnet_angular_containerized_decomposition_iac` for Helm charts) @@ -102,6 +116,44 @@ cd src/Services/Identity/Identity.API dotnet run ``` +## AI Monitoring Agent + +The `monitoring-agent` is an Azure Function (C#, .NET 10, isolated worker) that provides AI-powered observability for all microservices via Azure Application Insights. 
+ +### Functions + +| Function | Trigger | Schedule | Description | +|----------|---------|----------|-------------| +| `AnomalyDetector` | Timer | Every 5 min | Queries App Insights telemetry and detects anomalies using rule-based + statistical (z-score) + AI analysis | +| `HealthMonitor` | Timer | Every 30 min | Generates comprehensive health reports with AI summaries and sends webhook notifications | +| `GetHealthReport` | HTTP GET | On-demand | Returns full platform health report with per-service AI insights (`/api/monitoring/health`) | +| `GetServiceTelemetry` | HTTP GET | On-demand | Returns detailed telemetry and AI analysis for a specific service (`/api/monitoring/services/{name}`) | +| `GetAnomalies` | HTTP GET | On-demand | Lists all detected anomalies across services (`/api/monitoring/anomalies`) | +| `AlertWebhook` | HTTP POST | On-demand | Receives App Insights alert webhooks and enriches with AI analysis (`/api/monitoring/alerts/webhook`) | +| `TriggerManualAnalysis` | HTTP POST | On-demand | Triggers on-demand AI analysis for a specific service (`/api/monitoring/analyze/{name}`) | + +### Capabilities + +- **Anomaly Detection**: Rule-based thresholds (failure rate, response time) + statistical z-score analysis on time-series data +- **AI-Powered Insights**: Azure OpenAI generates root cause analysis, health summaries, and exception pattern analysis +- **Webhook Alerts**: Sends adaptive card notifications to Microsoft Teams or Slack +- **KQL Queries**: Queries App Insights via Azure Monitor Query SDK (requests, exceptions, dependencies) + +### Configuration + +Set the following environment variables (or `local.settings.json` values): + +| Variable | Description | +|----------|-------------| +| `APPLICATIONINSIGHTS_CONNECTION_STRING` | App Insights connection string | +| `Monitoring__WorkspaceId` | Log Analytics workspace ID | +| `Monitoring__AzureOpenAIEndpoint` | Azure OpenAI endpoint URL | +| `Monitoring__AzureOpenAIDeployment` | Model deployment 
name (default: `gpt-4o`) |
+| `Monitoring__AlertWebhookUrl` | Teams/Slack incoming webhook URL |
+| `Monitoring__FailureRateThresholdPercent` | Failure rate alert threshold (default: `5.0`) |
+| `Monitoring__ResponseTimeThresholdMs` | P95 response time threshold (default: `2000`) |
+| `Monitoring__MonitoredServices` | Comma-separated service names to monitor |
+
 ## Related Repositories
 
 | Repo | Purpose |
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Configuration/MonitoringOptions.cs b/src/Functions/Monitoring/Monitoring.Functions/Configuration/MonitoringOptions.cs
new file mode 100644
index 0000000..cfd0eec
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Configuration/MonitoringOptions.cs
@@ -0,0 +1,41 @@
+namespace Monitoring.Functions.Configuration;
+
+/// <summary>
+/// Configuration for the monitoring agent: endpoints, intervals, thresholds and the monitored service list.
+/// </summary>
+public sealed class MonitoringOptions
+{
+    public string ApplicationInsightsConnectionString { get; set; } = string.Empty;
+    public string WorkspaceId { get; set; } = string.Empty;
+    public string AzureOpenAIEndpoint { get; set; } = string.Empty;
+    public string AzureOpenAIDeployment { get; set; } = "gpt-4o";
+    public string AlertWebhookUrl { get; set; } = string.Empty;
+    public int AnomalyDetectionIntervalMinutes { get; set; } = 5;
+
+    /// <summary>Health-check cadence in minutes; the default matches the 30-minute HealthMonitor timer.</summary>
+    public int HealthCheckIntervalMinutes { get; set; } = 30;
+
+    public double FailureRateThresholdPercent { get; set; } = 5.0;
+    public double ResponseTimeThresholdMs { get; set; } = 2000;
+
+    /// <summary>Comma-separated names of the services to monitor.</summary>
+    public string MonitoredServices { get; set; } = string.Empty;
+
+    /// <summary>Splits <see cref="MonitoredServices"/> on commas, trimming entries and dropping empty ones.</summary>
+    public IReadOnlyList<string> GetMonitoredServiceNames() =>
+        MonitoredServices
+            .Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
+            .ToList();
+
+    /// <summary>Builds one <see cref="MonitoredServiceConfig"/> per monitored service name.</summary>
+    public IReadOnlyList<MonitoredServiceConfig> GetServiceConfigs() =>
+        GetMonitoredServiceNames()
+            .Select(name => new MonitoredServiceConfig(name))
+            .ToList();
+}
+
+/// <summary>A monitored service; its cloud role name is the service name itself.</summary>
+public record MonitoredServiceConfig(string ServiceName)
+{
+    public string CloudRoleName => ServiceName;
+}
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Dockerfile
b/src/Functions/Monitoring/Monitoring.Functions/Dockerfile
new file mode 100644
index 0000000..f0b7f08
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Dockerfile
@@ -0,0 +1,15 @@
+FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
+WORKDIR /src
+
+COPY Shared/Shared.Contracts/Shared.Contracts.csproj Shared/Shared.Contracts/
+COPY Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj Functions/Monitoring/Monitoring.Functions/
+RUN dotnet restore Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj
+
+COPY Shared/Shared.Contracts/ Shared/Shared.Contracts/
+COPY Functions/Monitoring/Monitoring.Functions/ Functions/Monitoring/Monitoring.Functions/
+RUN dotnet publish Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj \
+    -c Release -o /app/publish --no-restore
+
+FROM mcr.microsoft.com/azure-functions/dotnet-isolated:4-dotnet-isolated10.0
+ENV AzureWebJobsScriptRoot=/home/site/wwwroot
+COPY --from=build /app/publish /home/site/wwwroot
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Functions/AlertWebhookFunction.cs b/src/Functions/Monitoring/Monitoring.Functions/Functions/AlertWebhookFunction.cs
new file mode 100644
index 0000000..bb4c6f8
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Functions/AlertWebhookFunction.cs
@@ -0,0 +1,177 @@
+using System.Text.Json;
+using Microsoft.AspNetCore.Http;
+using Microsoft.AspNetCore.Mvc;
+using Microsoft.Azure.Functions.Worker;
+using Microsoft.Extensions.Logging;
+using Monitoring.Functions.Models;
+using Monitoring.Functions.Services;
+
+namespace Monitoring.Functions.Functions;
+
+/// <summary>
+/// HTTP-triggered functions that turn alert webhooks and manual requests into AI-enriched alerts.
+/// </summary>
+public sealed class AlertWebhookFunction
+{
+    // Reused across requests: a fresh JsonSerializerOptions per call discards its cached metadata.
+    private static readonly JsonSerializerOptions PayloadJsonOptions =
+        new() { PropertyNameCaseInsensitive = true };
+
+    private readonly IAppInsightsQueryService _queryService;
+    private readonly IAiAnalysisService _aiService;
+    private readonly IAlertService _alertService;
+    private readonly ILogger<AlertWebhookFunction> _logger;
+
+    public AlertWebhookFunction(
+        IAppInsightsQueryService queryService,
+        IAiAnalysisService aiService,
+        IAlertService alertService,
+        ILogger<AlertWebhookFunction> logger)
+    {
+        _queryService = queryService;
+        _aiService = aiService;
+        _alertService = alertService;
+        _logger = logger;
+    }
+
+    /// <summary>
+    /// Handles an alert webhook: analyses the last 30 minutes of telemetry for the alerting
+    /// resource, then creates and sends an alert enriched with the AI exception analysis.
+    /// </summary>
+    /// <returns>200 with the alert id, 400 for an empty or malformed payload, 500 on other failures.</returns>
+    [Function("AlertWebhook")]
+    public async Task<IActionResult> RunAsync(
+        [HttpTrigger(AuthorizationLevel.Function, "post", Route = "monitoring/alerts/webhook")] HttpRequest req,
+        CancellationToken cancellationToken)
+    {
+        _logger.LogInformation("Alert webhook triggered");
+
+        try
+        {
+            AppInsightsAlertPayload? alertPayload;
+            try
+            {
+                // Read straight from the request stream: no undisposed StreamReader, no body string.
+                alertPayload = await JsonSerializer.DeserializeAsync<AppInsightsAlertPayload>(
+                    req.Body, PayloadJsonOptions, cancellationToken);
+            }
+            catch (JsonException ex)
+            {
+                // An empty or malformed body is the caller's error (400), not a server fault (500).
+                _logger.LogWarning(ex, "Alert webhook payload is not valid JSON");
+                return new BadRequestObjectResult(new { error = "Invalid alert payload" });
+            }
+
+            if (alertPayload is null)
+                return new BadRequestObjectResult(new { error = "Invalid alert payload" });
+
+            var serviceName = alertPayload.Data?.Context?.ResourceName ?? "unknown-service";
+
+            var telemetry = await _queryService.GetServiceTelemetryAsync(
+                serviceName, TimeSpan.FromMinutes(30), cancellationToken);
+
+            var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken);
+            var exceptionAnalysis = await _aiService.AnalyzeExceptionPatternAsync(
+                telemetry.TopExceptions, serviceName, cancellationToken);
+
+            var alert = await _alertService.CreateAlertAsync(
+                serviceName,
+                MapSeverity(alertPayload.Data?.Context?.Severity),
+                alertPayload.Data?.Context?.Name ?? "App Insights Alert",
+                $"Alert triggered: {alertPayload.Data?.Context?.Description}. " +
+                $"AI Analysis: {exceptionAnalysis}",
+                anomalies,
+                cancellationToken);
+
+            await _alertService.SendAlertNotificationAsync(alert, cancellationToken);
+
+            return new OkObjectResult(new
+            {
+                alertId = alert.AlertId,
+                processed = true,
+                anomaliesDetected = anomalies.Count,
+                aiEnriched = true
+            });
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failed to process alert webhook");
+            return new StatusCodeResult(500);
+        }
+    }
+
+    /// <summary>
+    /// Runs anomaly detection, health insight and exception analysis over the last hour of
+    /// telemetry for <paramref name="serviceName"/>; sends an alert when any anomaly is found.
+    /// </summary>
+    [Function("TriggerManualAnalysis")]
+    public async Task<IActionResult> TriggerManualAnalysisAsync(
+        [HttpTrigger(AuthorizationLevel.Function, "post", Route = "monitoring/analyze/{serviceName}")] HttpRequest req,
+        string serviceName,
+        CancellationToken cancellationToken)
+    {
+        _logger.LogInformation("Manual analysis triggered for {Service}", serviceName);
+
+        var telemetry = await _queryService.GetServiceTelemetryAsync(
+            serviceName, TimeSpan.FromHours(1), cancellationToken);
+
+        var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken);
+        var insight = await _aiService.GenerateHealthInsightAsync(telemetry, cancellationToken);
+        var exceptionAnalysis = await _aiService.AnalyzeExceptionPatternAsync(
+            telemetry.TopExceptions, serviceName, cancellationToken);
+
+        if (anomalies.Count > 0)
+        {
+            var maxSeverity = anomalies.Max(a => a.Severity);
+            var alert = await _alertService.CreateAlertAsync(
+                serviceName,
+                maxSeverity == AnomalySeverity.Critical ? AlertLevel.Critical : AlertLevel.Warning,
+                $"Manual analysis: {serviceName}",
+                insight,
+                anomalies,
+                cancellationToken);
+
+            await _alertService.SendAlertNotificationAsync(alert, cancellationToken);
+        }
+
+        return new OkObjectResult(new
+        {
+            serviceName,
+            telemetry,
+            healthInsight = insight,
+            exceptionAnalysis,
+            anomalies,
+            analyzedAt = DateTime.UtcNow
+        });
+    }
+
+    /// <summary>
+    /// Maps a webhook severity ("Sev0", a bare digit such as "0", or a level name) to an
+    /// <see cref="AlertLevel"/>; missing or unrecognised values map to Information.
+    /// </summary>
+    private static AlertLevel MapSeverity(string? severity) => severity?.ToLowerInvariant() switch
+    {
+        "sev0" or "0" or "critical" => AlertLevel.Critical,
+        "sev1" or "1" or "error" => AlertLevel.Error,
+        "sev2" or "2" or "warning" => AlertLevel.Warning,
+        _ => AlertLevel.Information
+    };
+}
+
+public record AppInsightsAlertPayload(
+    string? SchemaId,
+    AppInsightsAlertData? Data
+);
+
+public record AppInsightsAlertData(
+    AppInsightsAlertContext? Context
+);
+
+public record AppInsightsAlertContext(
+    string? Name,
+    string? Description,
+    string? ResourceName,
+    string? ResourceGroupName,
+    string? Severity,
+    string? ConditionType
+);
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Functions/AnomalyDetectorFunction.cs b/src/Functions/Monitoring/Monitoring.Functions/Functions/AnomalyDetectorFunction.cs
new file mode 100644
index 0000000..2170bc5
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Functions/AnomalyDetectorFunction.cs
@@ -0,0 +1,94 @@
+using Microsoft.Azure.Functions.Worker;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using Monitoring.Functions.Configuration;
+using Monitoring.Functions.Models;
+using Monitoring.Functions.Services;
+
+namespace Monitoring.Functions.Functions;
+
+public sealed class AnomalyDetectorFunction
+{
+    private readonly IAppInsightsQueryService _queryService;
+    private readonly IAiAnalysisService _aiService;
+    private readonly IAlertService _alertService;
+    private readonly MonitoringOptions _options;
+    private readonly ILogger<AnomalyDetectorFunction> _logger;
+
+    public AnomalyDetectorFunction(
+        IAppInsightsQueryService queryService,
+        IAiAnalysisService aiService,
+        IAlertService alertService,
+        IOptions<MonitoringOptions> options,
+        ILogger<AnomalyDetectorFunction> logger)
+    {
+        _queryService = queryService;
+        _aiService = aiService;
+        _alertService = alertService;
+        _options = options.Value;
+        _logger = logger;
+    }
+
+    [Function("AnomalyDetector")]
+    public async Task RunAsync(
+        [TimerTrigger("0 */5 * * * *")] TimerInfo timer,
+        CancellationToken cancellationToken)
+    {
+
_logger.LogInformation("Anomaly detection cycle started at {Time}", DateTime.UtcNow); + + var monitoringPeriod = TimeSpan.FromMinutes(_options.AnomalyDetectionIntervalMinutes * 6); + + try + { + var allTelemetry = await _queryService.GetAllServicesTelemetryAsync( + monitoringPeriod, cancellationToken); + + var allAnomalies = new List(); + + foreach (var telemetry in allTelemetry) + { + var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken); + + if (anomalies.Count > 0) + { + _logger.LogWarning( + "Detected {Count} anomalies for {Service}", + anomalies.Count, telemetry.ServiceName); + + allAnomalies.AddRange(anomalies); + + var maxSeverity = anomalies.Max(a => a.Severity); + var alertLevel = maxSeverity switch + { + AnomalySeverity.Critical => AlertLevel.Critical, + AnomalySeverity.Warning => AlertLevel.Warning, + _ => AlertLevel.Information + }; + + var alert = await _alertService.CreateAlertAsync( + telemetry.ServiceName, + alertLevel, + $"Anomalies detected in {telemetry.ServiceName}", + $"{anomalies.Count} anomalies detected: " + + string.Join(", ", anomalies.Select(a => a.Type.ToString()).Distinct()), + anomalies, + cancellationToken); + + if (alertLevel >= AlertLevel.Warning) + { + await _alertService.SendAlertNotificationAsync(alert, cancellationToken); + } + } + } + + _logger.LogInformation( + "Anomaly detection completed. 
Services: {ServiceCount}, Anomalies: {AnomalyCount}", + allTelemetry.Count, allAnomalies.Count); + } + catch (Exception ex) + { + _logger.LogError(ex, "Anomaly detection cycle failed"); + throw; + } + } +} diff --git a/src/Functions/Monitoring/Monitoring.Functions/Functions/HealthMonitorFunction.cs b/src/Functions/Monitoring/Monitoring.Functions/Functions/HealthMonitorFunction.cs new file mode 100644 index 0000000..bc83cba --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Functions/HealthMonitorFunction.cs @@ -0,0 +1,140 @@ +using Microsoft.Azure.Functions.Worker; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Monitoring.Functions.Configuration; +using Monitoring.Functions.Models; +using Monitoring.Functions.Services; + +namespace Monitoring.Functions.Functions; + +public sealed class HealthMonitorFunction +{ + private readonly IAppInsightsQueryService _queryService; + private readonly IAiAnalysisService _aiService; + private readonly IAlertService _alertService; + private readonly MonitoringOptions _options; + private readonly ILogger _logger; + + public HealthMonitorFunction( + IAppInsightsQueryService queryService, + IAiAnalysisService aiService, + IAlertService alertService, + IOptions options, + ILogger logger) + { + _queryService = queryService; + _aiService = aiService; + _alertService = alertService; + _options = options.Value; + _logger = logger; + } + + [Function("HealthMonitor")] + public async Task RunAsync( + [TimerTrigger("0 */30 * * * *")] TimerInfo timer, + CancellationToken cancellationToken) + { + _logger.LogInformation("Health monitoring cycle started at {Time}", DateTime.UtcNow); + + try + { + var monitoringPeriod = TimeSpan.FromHours(1); + var allTelemetry = await _queryService.GetAllServicesTelemetryAsync( + monitoringPeriod, cancellationToken); + + var serviceStatuses = new List(); + var activeAlerts = new List(); + + foreach (var telemetry in allTelemetry) + { + var state = 
DetermineHealthState(telemetry); + var insight = await _aiService.GenerateHealthInsightAsync(telemetry, cancellationToken); + var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken); + + serviceStatuses.Add(new ServiceHealthStatus( + ServiceName: telemetry.ServiceName, + State: state, + UptimePercent: CalculateUptime(telemetry), + AverageResponseTimeMs: telemetry.AverageResponseTimeMs, + FailureRatePercent: telemetry.FailureRatePercent, + ActiveAnomalies: anomalies.Count, + LastChecked: DateTime.UtcNow, + AiInsight: insight + )); + + if (anomalies.Count > 0) + { + var alert = await _alertService.CreateAlertAsync( + telemetry.ServiceName, + state == HealthState.Unhealthy ? AlertLevel.Error : AlertLevel.Warning, + $"{telemetry.ServiceName} health: {state}", + insight, + anomalies, + cancellationToken); + activeAlerts.Add(alert); + } + } + + var overallStatus = DetermineOverallStatus(serviceStatuses); + var aiSummary = await _aiService.GeneratePlatformSummaryAsync(allTelemetry, cancellationToken); + + var report = new HealthReport( + GeneratedAt: DateTime.UtcNow, + OverallStatus: overallStatus, + AiSummary: aiSummary, + Services: serviceStatuses, + ActiveAlerts: activeAlerts, + Platform: new PlatformMetrics( + TotalServicesMonitored: allTelemetry.Count, + TotalRequestsLast24h: allTelemetry.Sum(t => t.TotalRequests), + OverallFailureRatePercent: allTelemetry.Count > 0 + ? allTelemetry.Average(t => t.FailureRatePercent) : 0, + AverageP95ResponseTimeMs: allTelemetry.Count > 0 + ? allTelemetry.Average(t => t.P95ResponseTimeMs) : 0, + ActiveAnomalies: serviceStatuses.Sum(s => s.ActiveAnomalies), + AlertsTriggeredLast24h: activeAlerts.Count + ) + ); + + await _alertService.SendHealthReportNotificationAsync(report, cancellationToken); + + _logger.LogInformation( + "Health report generated. 
Overall: {Status}, Services: {Count}, Alerts: {Alerts}", + overallStatus, serviceStatuses.Count, activeAlerts.Count); + } + catch (Exception ex) + { + _logger.LogError(ex, "Health monitoring cycle failed"); + throw; + } + } + + private HealthState DetermineHealthState(ServiceTelemetrySummary telemetry) + { + if (telemetry.FailureRatePercent > _options.FailureRateThresholdPercent * 4) + return HealthState.Unhealthy; + if (telemetry.FailureRatePercent > _options.FailureRateThresholdPercent) + return HealthState.Degraded; + if (telemetry.P95ResponseTimeMs > _options.ResponseTimeThresholdMs * 2) + return HealthState.Degraded; + if (telemetry.TotalRequests == 0) + return HealthState.Unknown; + return HealthState.Healthy; + } + + private static OverallHealthStatus DetermineOverallStatus(List services) + { + if (services.Any(s => s.State == HealthState.Unhealthy)) + return OverallHealthStatus.Unhealthy; + if (services.Any(s => s.State == HealthState.Degraded)) + return OverallHealthStatus.Degraded; + return OverallHealthStatus.Healthy; + } + + private static double CalculateUptime(ServiceTelemetrySummary telemetry) + { + if (telemetry.TotalRequests == 0) + return 100.0; + return 100.0 - telemetry.FailureRatePercent; + } +} diff --git a/src/Functions/Monitoring/Monitoring.Functions/Functions/MonitoringDashboardFunction.cs b/src/Functions/Monitoring/Monitoring.Functions/Functions/MonitoringDashboardFunction.cs new file mode 100644 index 0000000..9336a6a --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Functions/MonitoringDashboardFunction.cs @@ -0,0 +1,156 @@ +using System.Text.Json; +using Microsoft.AspNetCore.Http; +using Microsoft.AspNetCore.Mvc; +using Microsoft.Azure.Functions.Worker; +using Microsoft.Extensions.Logging; +using Monitoring.Functions.Models; +using Monitoring.Functions.Services; + +namespace Monitoring.Functions.Functions; + +public sealed class MonitoringDashboardFunction +{ + private readonly IAppInsightsQueryService _queryService; + 
private readonly IAiAnalysisService _aiService; + private readonly ILogger _logger; + + public MonitoringDashboardFunction( + IAppInsightsQueryService queryService, + IAiAnalysisService aiService, + ILogger logger) + { + _queryService = queryService; + _aiService = aiService; + _logger = logger; + } + + [Function("GetHealthReport")] + public async Task GetHealthReportAsync( + [HttpTrigger(AuthorizationLevel.Function, "get", Route = "monitoring/health")] HttpRequest req, + CancellationToken cancellationToken) + { + _logger.LogInformation("Health report requested"); + + var periodParam = req.Query["period"].FirstOrDefault() ?? "1h"; + var period = ParsePeriod(periodParam); + + var allTelemetry = await _queryService.GetAllServicesTelemetryAsync(period, cancellationToken); + + var serviceStatuses = new List(); + foreach (var telemetry in allTelemetry) + { + var insight = await _aiService.GenerateHealthInsightAsync(telemetry, cancellationToken); + var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken); + + serviceStatuses.Add(new ServiceHealthStatus( + ServiceName: telemetry.ServiceName, + State: telemetry.FailureRatePercent > 20 ? HealthState.Unhealthy : + telemetry.FailureRatePercent > 5 ? HealthState.Degraded : HealthState.Healthy, + UptimePercent: 100.0 - telemetry.FailureRatePercent, + AverageResponseTimeMs: telemetry.AverageResponseTimeMs, + FailureRatePercent: telemetry.FailureRatePercent, + ActiveAnomalies: anomalies.Count, + LastChecked: DateTime.UtcNow, + AiInsight: insight + )); + } + + var aiSummary = await _aiService.GeneratePlatformSummaryAsync(allTelemetry, cancellationToken); + + var report = new HealthReport( + GeneratedAt: DateTime.UtcNow, + OverallStatus: serviceStatuses.Any(s => s.State == HealthState.Unhealthy) + ? OverallHealthStatus.Unhealthy + : serviceStatuses.Any(s => s.State == HealthState.Degraded) + ? 
OverallHealthStatus.Degraded + : OverallHealthStatus.Healthy, + AiSummary: aiSummary, + Services: serviceStatuses, + ActiveAlerts: [], + Platform: new PlatformMetrics( + TotalServicesMonitored: allTelemetry.Count, + TotalRequestsLast24h: allTelemetry.Sum(t => t.TotalRequests), + OverallFailureRatePercent: allTelemetry.Count > 0 + ? allTelemetry.Average(t => t.FailureRatePercent) : 0, + AverageP95ResponseTimeMs: allTelemetry.Count > 0 + ? allTelemetry.Average(t => t.P95ResponseTimeMs) : 0, + ActiveAnomalies: serviceStatuses.Sum(s => s.ActiveAnomalies), + AlertsTriggeredLast24h: 0 + ) + ); + + return new OkObjectResult(report); + } + + [Function("GetServiceTelemetry")] + public async Task GetServiceTelemetryAsync( + [HttpTrigger(AuthorizationLevel.Function, "get", Route = "monitoring/services/{serviceName}")] HttpRequest req, + string serviceName, + CancellationToken cancellationToken) + { + _logger.LogInformation("Telemetry requested for {Service}", serviceName); + + var periodParam = req.Query["period"].FirstOrDefault() ?? "1h"; + var period = ParsePeriod(periodParam); + + var telemetry = await _queryService.GetServiceTelemetryAsync(serviceName, period, cancellationToken); + var insight = await _aiService.GenerateHealthInsightAsync(telemetry, cancellationToken); + var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken); + var exceptionAnalysis = await _aiService.AnalyzeExceptionPatternAsync( + telemetry.TopExceptions, serviceName, cancellationToken); + + return new OkObjectResult(new + { + telemetry, + aiInsight = insight, + anomalies, + exceptionAnalysis + }); + } + + [Function("GetAnomalies")] + public async Task GetAnomaliesAsync( + [HttpTrigger(AuthorizationLevel.Function, "get", Route = "monitoring/anomalies")] HttpRequest req, + CancellationToken cancellationToken) + { + _logger.LogInformation("Anomaly report requested"); + + var periodParam = req.Query["period"].FirstOrDefault() ?? 
"30m"; + var period = ParsePeriod(periodParam); + + var allTelemetry = await _queryService.GetAllServicesTelemetryAsync(period, cancellationToken); + var allAnomalies = new List(); + + foreach (var telemetry in allTelemetry) + { + var anomalies = await _aiService.DetectAnomaliesAsync(telemetry, cancellationToken); + allAnomalies.AddRange(anomalies); + } + + return new OkObjectResult(new + { + period = periodParam, + totalAnomalies = allAnomalies.Count, + bySeverity = new + { + critical = allAnomalies.Count(a => a.Severity == AnomalySeverity.Critical), + warning = allAnomalies.Count(a => a.Severity == AnomalySeverity.Warning), + info = allAnomalies.Count(a => a.Severity == AnomalySeverity.Info) + }, + anomalies = allAnomalies.OrderByDescending(a => a.Severity).ThenByDescending(a => a.DetectedAt) + }); + } + + private static TimeSpan ParsePeriod(string period) => period.ToLowerInvariant() switch + { + "5m" => TimeSpan.FromMinutes(5), + "15m" => TimeSpan.FromMinutes(15), + "30m" => TimeSpan.FromMinutes(30), + "1h" => TimeSpan.FromHours(1), + "6h" => TimeSpan.FromHours(6), + "12h" => TimeSpan.FromHours(12), + "24h" or "1d" => TimeSpan.FromHours(24), + "7d" => TimeSpan.FromDays(7), + _ => TimeSpan.FromHours(1) + }; +} diff --git a/src/Functions/Monitoring/Monitoring.Functions/Models/AnomalyResult.cs b/src/Functions/Monitoring/Monitoring.Functions/Models/AnomalyResult.cs new file mode 100644 index 0000000..f451480 --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Models/AnomalyResult.cs @@ -0,0 +1,32 @@ +namespace Monitoring.Functions.Models; + +public record AnomalyResult( + string ServiceName, + AnomalyType Type, + AnomalySeverity Severity, + string Description, + string AiAnalysis, + string RecommendedAction, + double CurrentValue, + double BaselineValue, + double DeviationPercent, + DateTime DetectedAt +); + +public enum AnomalyType +{ + HighFailureRate, + ResponseTimeSpike, + ExceptionBurst, + DependencyDegradation, + TrafficAnomaly, + MemoryLeak, + 
CpuSpike +} + +public enum AnomalySeverity +{ + Info, + Warning, + Critical +} diff --git a/src/Functions/Monitoring/Monitoring.Functions/Models/HealthReport.cs b/src/Functions/Monitoring/Monitoring.Functions/Models/HealthReport.cs new file mode 100644 index 0000000..39b30a1 --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Models/HealthReport.cs @@ -0,0 +1,45 @@ +namespace Monitoring.Functions.Models; + +public record HealthReport( + DateTime GeneratedAt, + OverallHealthStatus OverallStatus, + string AiSummary, + List Services, + List ActiveAlerts, + PlatformMetrics Platform +); + +public record ServiceHealthStatus( + string ServiceName, + HealthState State, + double UptimePercent, + double AverageResponseTimeMs, + double FailureRatePercent, + int ActiveAnomalies, + DateTime LastChecked, + string AiInsight +); + +public record PlatformMetrics( + int TotalServicesMonitored, + long TotalRequestsLast24h, + double OverallFailureRatePercent, + double AverageP95ResponseTimeMs, + int ActiveAnomalies, + int AlertsTriggeredLast24h +); + +public enum OverallHealthStatus +{ + Healthy, + Degraded, + Unhealthy +} + +public enum HealthState +{ + Healthy, + Degraded, + Unhealthy, + Unknown +} diff --git a/src/Functions/Monitoring/Monitoring.Functions/Models/MonitoringAlert.cs b/src/Functions/Monitoring/Monitoring.Functions/Models/MonitoringAlert.cs new file mode 100644 index 0000000..71f4f66 --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Models/MonitoringAlert.cs @@ -0,0 +1,30 @@ +namespace Monitoring.Functions.Models; + +public record MonitoringAlert( + Guid AlertId, + string ServiceName, + AlertLevel Level, + string Title, + string Summary, + string DetailedAnalysis, + List Anomalies, + List RecommendedActions, + DateTime CreatedAt, + AlertStatus Status +); + +public enum AlertLevel +{ + Information, + Warning, + Error, + Critical +} + +public enum AlertStatus +{ + Active, + Acknowledged, + Resolved, + Suppressed +} diff --git 
a/src/Functions/Monitoring/Monitoring.Functions/Models/TelemetryData.cs b/src/Functions/Monitoring/Monitoring.Functions/Models/TelemetryData.cs
new file mode 100644
index 0000000..0cfef98
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Models/TelemetryData.cs
@@ -0,0 +1,56 @@
+namespace Monitoring.Functions.Models;
+
+/// <summary>Point-in-time telemetry for one service: request totals, latency percentiles, exceptions and dependencies.</summary>
+public record TelemetryData(
+    string ServiceName,
+    DateTime Timestamp,
+    long TotalRequests,
+    long FailedRequests,
+    double AverageResponseTimeMs,
+    double P95ResponseTimeMs,
+    double P99ResponseTimeMs,
+    double FailureRatePercent,
+    List<ExceptionEntry> TopExceptions,
+    List<DependencyMetric> DependencyMetrics
+);
+
+/// <summary>An exception type/message pair with its occurrence count and most recent occurrence.</summary>
+public record ExceptionEntry(
+    string ExceptionType,
+    string Message,
+    long Count,
+    DateTime LastOccurrence
+);
+
+/// <summary>Latency, failure rate and call volume of one outbound dependency.</summary>
+public record DependencyMetric(
+    string DependencyType,
+    string DependencyName,
+    double AverageLatencyMs,
+    double FailureRatePercent,
+    long CallCount
+);
+
+/// <summary>Telemetry for one service over a period, including the time series used for statistical anomaly detection.</summary>
+public record ServiceTelemetrySummary(
+    string ServiceName,
+    DateTime PeriodStart,
+    DateTime PeriodEnd,
+    long TotalRequests,
+    long FailedRequests,
+    double FailureRatePercent,
+    double AverageResponseTimeMs,
+    double P95ResponseTimeMs,
+    double P99ResponseTimeMs,
+    int UniqueExceptionTypes,
+    List<ExceptionEntry> TopExceptions,
+    List<DependencyMetric> DependencyMetrics,
+    List<TimeSeriesDataPoint> RequestTimeSeries,
+    List<TimeSeriesDataPoint> ResponseTimeTimeSeries
+);
+
+/// <summary>A single (timestamp, value) sample of a time series.</summary>
+public record TimeSeriesDataPoint(
+    DateTime Timestamp,
+    double Value
+);
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj b/src/Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj
new file mode 100644
index 0000000..8035de6
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Monitoring.Functions.csproj
@@ -0,0 +1,27 @@
+
+
+ net10.0
+ v4
+ Exe
+ enable
+ enable
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Program.cs b/src/Functions/Monitoring/Monitoring.Functions/Program.cs
new file mode 100644
index 0000000..39cd7be
--- /dev/null
+++ 
b/src/Functions/Monitoring/Monitoring.Functions/Program.cs
@@ -0,0 +1,26 @@
+using Microsoft.Azure.Functions.Worker;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Hosting;
+using Monitoring.Functions.Configuration;
+using Monitoring.Functions.Services;
+
+var host = new HostBuilder()
+    .ConfigureFunctionsWebApplication()
+    .ConfigureServices((context, services) =>
+    {
+        services.AddApplicationInsightsTelemetryWorkerService();
+        services.ConfigureFunctionsApplicationInsights();
+
+        services.Configure<MonitoringOptions>(
+            context.Configuration.GetSection("Monitoring"));
+
+        services.AddHttpClient();
+        // These services hold only options, loggers and SDK clients, so one shared instance each is enough.
+        services.AddSingleton<IAppInsightsQueryService, AppInsightsQueryService>();
+        services.AddSingleton<IAiAnalysisService, AiAnalysisService>();
+        services.AddSingleton<IAlertService, AlertService>();
+    })
+    .Build();
+
+host.Run();
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Services/AiAnalysisService.cs b/src/Functions/Monitoring/Monitoring.Functions/Services/AiAnalysisService.cs
new file mode 100644
index 0000000..081ef47
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Services/AiAnalysisService.cs
@@ -0,0 +1,460 @@
+using System.Text;
+using System.Text.Json;
+using Azure;
+using Azure.AI.OpenAI;
+using Azure.Identity;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using Monitoring.Functions.Configuration;
+using Monitoring.Functions.Models;
+using OpenAI.Chat;
+
+namespace Monitoring.Functions.Services;
+
+/// <summary>
+/// Detects anomalies in service telemetry (fixed thresholds plus z-score checks on the time
+/// series) and asks an Azure OpenAI chat deployment to explain them. Every AI call degrades to
+/// a non-AI fallback when the model call fails or no endpoint is configured.
+/// </summary>
+public sealed class AiAnalysisService : IAiAnalysisService
+{
+    private const string PlatformSummaryFallback =
+        "AI analysis unavailable. Review individual service metrics for details.";
+
+    private readonly MonitoringOptions _options;
+    private readonly ILogger<AiAnalysisService> _logger;
+    // Null when no valid Azure OpenAI endpoint is configured; each method then uses its fallback.
+    private readonly ChatClient? _chatClient;
+
+    public AiAnalysisService(
+        IOptions<MonitoringOptions> options,
+        ILogger<AiAnalysisService> logger)
+    {
+        _options = options.Value;
+        _logger = logger;
+
+        // A blank or malformed endpoint must not throw here: that would make this singleton unresolvable.
+        if (!Uri.TryCreate(_options.AzureOpenAIEndpoint, UriKind.Absolute, out var endpoint))
+        {
+            _logger.LogWarning("Azure OpenAI endpoint is missing or invalid; AI analysis is disabled");
+            return;
+        }
+
+        var azureClient = new AzureOpenAIClient(
+            endpoint,
+            new DefaultAzureCredential());
+
+        _chatClient = azureClient.GetChatClient(_options.AzureOpenAIDeployment);
+    }
+
+    /// <inheritdoc />
+    public async Task<List<AnomalyResult>> DetectAnomaliesAsync(
+        ServiceTelemetrySummary telemetry, CancellationToken cancellationToken = default)
+    {
+        var anomalies = new List<AnomalyResult>();
+
+        // Rule-based detection first
+        anomalies.AddRange(DetectRuleBasedAnomalies(telemetry));
+
+        // Statistical anomaly detection on time series
+        anomalies.AddRange(DetectTimeSeriesAnomalies(telemetry));
+
+        // AI-enhanced analysis for detected anomalies
+        if (anomalies.Count > 0)
+        {
+            anomalies = await EnrichAnomaliesWithAiAsync(anomalies, telemetry, cancellationToken);
+        }
+
+        return anomalies;
+    }
+
+    /// <inheritdoc />
+    public async Task<string> GenerateHealthInsightAsync(
+        ServiceTelemetrySummary telemetry, CancellationToken cancellationToken = default)
+    {
+        if (_chatClient is null)
+            return GenerateFallbackInsight(telemetry);
+
+        var prompt = BuildHealthInsightPrompt(telemetry);
+
+        try
+        {
+            var completion = await _chatClient.CompleteChatAsync(
+                [
+                    new SystemChatMessage(
+                        """
+                        You are an AI monitoring agent for a microservices platform.
+                        Analyze the telemetry data and provide a concise health insight.
+                        Focus on actionable observations. Be specific about metrics.
+                        Keep the response under 200 words.
+                        """),
+                    new UserChatMessage(prompt)
+                ],
+                new ChatCompletionOptions { Temperature = 0.3f },
+                cancellationToken);
+
+            return completion.Value.Content[0].Text;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "AI analysis unavailable for {Service}", telemetry.ServiceName);
+            return GenerateFallbackInsight(telemetry);
+        }
+    }
+
+    /// <inheritdoc />
+    public async Task<string> GeneratePlatformSummaryAsync(
+        List<ServiceTelemetrySummary> allTelemetry, CancellationToken cancellationToken = default)
+    {
+        if (_chatClient is null)
+            return PlatformSummaryFallback;
+
+        var prompt = BuildPlatformSummaryPrompt(allTelemetry);
+
+        try
+        {
+            var completion = await _chatClient.CompleteChatAsync(
+                [
+                    new SystemChatMessage(
+                        """
+                        You are an AI monitoring agent for a distributed microservices platform.
+                        Provide a holistic platform health summary based on all service telemetry.
+                        Identify cross-service patterns, correlations, and systemic issues.
+                        Prioritize critical findings. Keep the response under 300 words.
+                        """),
+                    new UserChatMessage(prompt)
+                ],
+                new ChatCompletionOptions { Temperature = 0.3f },
+                cancellationToken);
+
+            return completion.Value.Content[0].Text;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "AI platform summary generation failed");
+            return PlatformSummaryFallback;
+        }
+    }
+
+    /// <inheritdoc />
+    public async Task<string> AnalyzeExceptionPatternAsync(
+        List<ExceptionEntry> exceptions, string serviceName, CancellationToken cancellationToken = default)
+    {
+        if (exceptions.Count == 0)
+            return "No exceptions detected in the monitoring period.";
+
+        var fallback = $"Found {exceptions.Count} exception types. Top: {exceptions.First().ExceptionType} ({exceptions.First().Count} occurrences).";
+        if (_chatClient is null)
+            return fallback;
+
+        var sb = new StringBuilder();
+        sb.AppendLine($"Service: {serviceName}");
+        sb.AppendLine("Exception patterns:");
+        foreach (var ex in exceptions.Take(10))
+        {
+            sb.AppendLine($"  - {ex.ExceptionType}: {ex.Message} (Count: {ex.Count}, Last: {ex.LastOccurrence:u})");
+        }
+
+        try
+        {
+            var completion = await _chatClient.CompleteChatAsync(
+                [
+                    new SystemChatMessage(
+                        """
+                        You are an AI monitoring agent. Analyze the exception patterns and provide:
+                        1. Root cause hypothesis for the most frequent exceptions
+                        2. Whether exceptions are correlated
+                        3. Recommended remediation steps
+                        Keep the response under 200 words.
+                        """),
+                    new UserChatMessage(sb.ToString())
+                ],
+                new ChatCompletionOptions { Temperature = 0.3f },
+                cancellationToken);
+
+            return completion.Value.Content[0].Text;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "AI exception analysis failed for {Service}", serviceName);
+            return fallback;
+        }
+    }
+
+    /// <summary>Applies fixed-threshold rules to failure rate, P95 latency, exception variety and dependency failures.</summary>
+    private List<AnomalyResult> DetectRuleBasedAnomalies(ServiceTelemetrySummary telemetry)
+    {
+        var anomalies = new List<AnomalyResult>();
+
+        if (telemetry.FailureRatePercent > _options.FailureRateThresholdPercent)
+        {
+            anomalies.Add(new AnomalyResult(
+                ServiceName: telemetry.ServiceName,
+                Type: AnomalyType.HighFailureRate,
+                Severity: telemetry.FailureRatePercent > 20 ? AnomalySeverity.Critical : AnomalySeverity.Warning,
+                Description: $"Failure rate {telemetry.FailureRatePercent:F1}% exceeds threshold {_options.FailureRateThresholdPercent}%",
+                AiAnalysis: string.Empty,
+                RecommendedAction: "Investigate error logs and recent deployments",
+                CurrentValue: telemetry.FailureRatePercent,
+                BaselineValue: _options.FailureRateThresholdPercent,
+                DeviationPercent: ((telemetry.FailureRatePercent - _options.FailureRateThresholdPercent)
+                    / _options.FailureRateThresholdPercent) * 100,
+                DetectedAt: DateTime.UtcNow
+            ));
+        }
+
+        if (telemetry.P95ResponseTimeMs > _options.ResponseTimeThresholdMs)
+        {
+            anomalies.Add(new AnomalyResult(
+                ServiceName: telemetry.ServiceName,
+                Type: AnomalyType.ResponseTimeSpike,
+                Severity: telemetry.P95ResponseTimeMs > _options.ResponseTimeThresholdMs * 3
+                    ? AnomalySeverity.Critical : AnomalySeverity.Warning,
+                Description: $"P95 response time {telemetry.P95ResponseTimeMs:F0}ms exceeds threshold {_options.ResponseTimeThresholdMs}ms",
+                AiAnalysis: string.Empty,
+                RecommendedAction: "Check dependency latency, database queries, and resource utilization",
+                CurrentValue: telemetry.P95ResponseTimeMs,
+                BaselineValue: _options.ResponseTimeThresholdMs,
+                DeviationPercent: ((telemetry.P95ResponseTimeMs - _options.ResponseTimeThresholdMs)
+                    / _options.ResponseTimeThresholdMs) * 100,
+                DetectedAt: DateTime.UtcNow
+            ));
+        }
+
+        if (telemetry.UniqueExceptionTypes > 5)
+        {
+            anomalies.Add(new AnomalyResult(
+                ServiceName: telemetry.ServiceName,
+                Type: AnomalyType.ExceptionBurst,
+                Severity: telemetry.UniqueExceptionTypes > 15 ? AnomalySeverity.Critical : AnomalySeverity.Warning,
+                Description: $"High number of unique exception types: {telemetry.UniqueExceptionTypes}",
+                AiAnalysis: string.Empty,
+                RecommendedAction: "Review exception patterns for systemic issues",
+                CurrentValue: telemetry.UniqueExceptionTypes,
+                BaselineValue: 5,
+                DeviationPercent: ((telemetry.UniqueExceptionTypes - 5.0) / 5.0) * 100,
+                DetectedAt: DateTime.UtcNow
+            ));
+        }
+
+        foreach (var dep in telemetry.DependencyMetrics.Where(d => d.FailureRatePercent > 10))
+        {
+            anomalies.Add(new AnomalyResult(
+                ServiceName: telemetry.ServiceName,
+                Type: AnomalyType.DependencyDegradation,
+                Severity: dep.FailureRatePercent > 50 ? AnomalySeverity.Critical : AnomalySeverity.Warning,
+                Description: $"Dependency '{dep.DependencyName}' ({dep.DependencyType}) failure rate: {dep.FailureRatePercent:F1}%",
+                AiAnalysis: string.Empty,
+                RecommendedAction: $"Check health of dependency '{dep.DependencyName}'",
+                CurrentValue: dep.FailureRatePercent,
+                BaselineValue: 10,
+                DeviationPercent: ((dep.FailureRatePercent - 10.0) / 10.0) * 100,
+                DetectedAt: DateTime.UtcNow
+            ));
+        }
+
+        return anomalies;
+    }
+
+    /// <summary>Runs the z-score check over the request-rate and response-time series.</summary>
+    private static List<AnomalyResult> DetectTimeSeriesAnomalies(ServiceTelemetrySummary telemetry)
+    {
+        var anomalies = new List<AnomalyResult>();
+
+        anomalies.AddRange(DetectZScoreAnomalies(
+            telemetry.RequestTimeSeries, telemetry.ServiceName, AnomalyType.TrafficAnomaly, "request rate"));
+
+        anomalies.AddRange(DetectZScoreAnomalies(
+            telemetry.ResponseTimeTimeSeries, telemetry.ServiceName, AnomalyType.ResponseTimeSpike, "response time"));
+
+        return anomalies;
+    }
+
+    /// <summary>Flags any of the last three points whose z-score against the whole series exceeds 2.5 in magnitude.</summary>
+    // NOTE(review): mean/stddev include the points under test. With a population stddev a single outlier among
+    // n points reaches at most |z| = sqrt(n - 1), so series with fewer than 8 points cannot cross 2.5 - confirm.
+    private static List<AnomalyResult> DetectZScoreAnomalies(
+        List<TimeSeriesDataPoint> timeSeries,
+        string serviceName,
+        AnomalyType anomalyType,
+        string metricName)
+    {
+        var anomalies = new List<AnomalyResult>();
+
+        if (timeSeries.Count < 5)
+            return anomalies;
+
+        var values = timeSeries.Select(p => p.Value).ToList();
+        var mean = values.Average();
+        var stdDev = Math.Sqrt(values.Average(v => Math.Pow(v - mean, 2)));
+
+        if (stdDev < 0.001)
+            return anomalies;
+
+        const double zScoreThreshold = 2.5;
+
+        foreach (var point in timeSeries.TakeLast(3))
+        {
+            var zScore = (point.Value - mean) / stdDev;
+            if (Math.Abs(zScore) > zScoreThreshold)
+            {
+                anomalies.Add(new AnomalyResult(
+                    ServiceName: serviceName,
+                    Type: anomalyType,
+                    Severity: Math.Abs(zScore) > 3.5 ? AnomalySeverity.Critical : AnomalySeverity.Warning,
+                    Description: $"Statistical anomaly in {metricName}: value {point.Value:F1} at {point.Timestamp:u} " +
+                                 $"(z-score: {zScore:F2}, mean: {mean:F1}, stddev: {stdDev:F1})",
+                    AiAnalysis: string.Empty,
+                    RecommendedAction: $"Investigate {metricName} deviation from baseline",
+                    CurrentValue: point.Value,
+                    BaselineValue: mean,
+                    DeviationPercent: mean > 0 ? ((point.Value - mean) / mean) * 100 : 0,
+                    DetectedAt: DateTime.UtcNow
+                ));
+            }
+        }
+
+        return anomalies;
+    }
+
+    /// <summary>Asks the model for a per-anomaly analysis and action; returns the input unchanged when that is not possible.</summary>
+    private async Task<List<AnomalyResult>> EnrichAnomaliesWithAiAsync(
+        List<AnomalyResult> anomalies,
+        ServiceTelemetrySummary telemetry,
+        CancellationToken cancellationToken)
+    {
+        if (_chatClient is null)
+            return anomalies;
+
+        var prompt = BuildAnomalyAnalysisPrompt(anomalies, telemetry);
+
+        try
+        {
+            var completion = await _chatClient.CompleteChatAsync(
+                [
+                    new SystemChatMessage(
+                        """
+                        You are an AI monitoring agent. Analyze the detected anomalies in context of the
+                        service telemetry. For each anomaly, provide a brief root cause analysis and
+                        specific remediation steps. Return a JSON array with objects containing:
+                        {"index": 0, "analysis": "...", "action": "..."}
+                        Return ONLY the JSON array, no markdown.
+                        """),
+                    new UserChatMessage(prompt)
+                ],
+                new ChatCompletionOptions { Temperature = 0.2f },
+                cancellationToken);
+
+            var aiResponse = completion.Value.Content[0].Text;
+            var enrichments = JsonSerializer.Deserialize<List<AnomalyEnrichment>>(aiResponse,
+                new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
+
+            if (enrichments is not null)
+            {
+                return anomalies.Select((anomaly, index) =>
+                {
+                    var enrichment = enrichments.FirstOrDefault(e => e.Index == index);
+                    return enrichment is not null
+                        ? anomaly with
+                        {
+                            AiAnalysis = enrichment.Analysis,
+                            RecommendedAction = enrichment.Action
+                        }
+                        : anomaly;
+                }).ToList();
+            }
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "AI anomaly enrichment failed for {Service}", telemetry.ServiceName);
+        }
+
+        return anomalies;
+    }
+
+    /// <summary>Renders one service's metrics, top five exceptions and top five dependencies as prompt text.</summary>
+    private static string BuildHealthInsightPrompt(ServiceTelemetrySummary telemetry)
+    {
+        var sb = new StringBuilder();
+        sb.AppendLine($"Service: {telemetry.ServiceName}");
+        sb.AppendLine($"Period: {telemetry.PeriodStart:u} to {telemetry.PeriodEnd:u}");
+        sb.AppendLine($"Total Requests: {telemetry.TotalRequests:N0}");
+        sb.AppendLine($"Failed Requests: {telemetry.FailedRequests:N0}");
+        sb.AppendLine($"Failure Rate: {telemetry.FailureRatePercent:F2}%");
+        sb.AppendLine($"Avg Response Time: {telemetry.AverageResponseTimeMs:F0}ms");
+        sb.AppendLine($"P95 Response Time: {telemetry.P95ResponseTimeMs:F0}ms");
+        sb.AppendLine($"P99 Response Time: {telemetry.P99ResponseTimeMs:F0}ms");
+        sb.AppendLine($"Unique Exception Types: {telemetry.UniqueExceptionTypes}");
+
+        if (telemetry.TopExceptions.Count > 0)
+        {
+            sb.AppendLine("Top Exceptions:");
+            foreach (var ex in telemetry.TopExceptions.Take(5))
+                sb.AppendLine($"  - {ex.ExceptionType}: {ex.Count} occurrences");
+        }
+
+        if (telemetry.DependencyMetrics.Count > 0)
+        {
+            sb.AppendLine("Dependencies:");
+            foreach (var dep in telemetry.DependencyMetrics.Take(5))
+                sb.AppendLine($"  - {dep.DependencyName} ({dep.DependencyType}): " +
+                              $"Avg {dep.AverageLatencyMs:F0}ms, Failures {dep.FailureRatePercent:F1}%");
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>Renders a short per-service metrics section for every service as prompt text.</summary>
+    private static string BuildPlatformSummaryPrompt(List<ServiceTelemetrySummary> allTelemetry)
+    {
+        var sb = new StringBuilder();
+        sb.AppendLine("Platform Telemetry Summary:");
+        sb.AppendLine($"Services Monitored: {allTelemetry.Count}");
+        sb.AppendLine();
+
+        foreach (var telemetry in allTelemetry)
+        {
+            sb.AppendLine($"--- {telemetry.ServiceName} ---");
+            sb.AppendLine($"  Requests: {telemetry.TotalRequests:N0}, Failures: {telemetry.FailureRatePercent:F2}%");
+            sb.AppendLine($"  Avg/P95/P99: {telemetry.AverageResponseTimeMs:F0}/{telemetry.P95ResponseTimeMs:F0}/{telemetry.P99ResponseTimeMs:F0} ms");
+            sb.AppendLine($"  Exceptions: {telemetry.UniqueExceptionTypes} types");
+            sb.AppendLine();
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>Lists the anomalies with their list index, which the model echoes back so answers can be matched up.</summary>
+    private static string BuildAnomalyAnalysisPrompt(
+        List<AnomalyResult> anomalies, ServiceTelemetrySummary telemetry)
+    {
+        var sb = new StringBuilder();
+        sb.AppendLine($"Service: {telemetry.ServiceName}");
+        sb.AppendLine($"Requests: {telemetry.TotalRequests:N0}, Failure Rate: {telemetry.FailureRatePercent:F2}%");
+        sb.AppendLine($"Response Times - Avg: {telemetry.AverageResponseTimeMs:F0}ms, P95: {telemetry.P95ResponseTimeMs:F0}ms");
+        sb.AppendLine();
+        sb.AppendLine("Detected Anomalies:");
+
+        for (int i = 0; i < anomalies.Count; i++)
+        {
+            var a = anomalies[i];
+            sb.AppendLine($"  [{i}] {a.Type}: {a.Description} (Severity: {a.Severity})");
+        }
+
+        return sb.ToString();
+    }
+
+    /// <summary>Non-AI insight: classifies the service by failure rate (below 1% healthy, below 5% degraded, else unhealthy).</summary>
+    private static string GenerateFallbackInsight(ServiceTelemetrySummary telemetry)
+    {
+        var status = telemetry.FailureRatePercent < 1 ? "healthy" :
+                     telemetry.FailureRatePercent < 5 ? "degraded" : "unhealthy";
+
+        return $"{telemetry.ServiceName} is {status}. " +
+               $"Processed {telemetry.TotalRequests:N0} requests with {telemetry.FailureRatePercent:F2}% failure rate. " +
+               $"Average response time: {telemetry.AverageResponseTimeMs:F0}ms (P95: {telemetry.P95ResponseTimeMs:F0}ms).";
+    }
+
+    /// <summary>Shape of one element of the JSON array requested from the model in the enrichment prompt.</summary>
+    private record AnomalyEnrichment(int Index, string Analysis, string Action);
+}
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Services/AlertService.cs b/src/Functions/Monitoring/Monitoring.Functions/Services/AlertService.cs
new file mode 100644
index 0000000..67b070f
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Services/AlertService.cs
@@ -0,0 +1,298 @@
+using System.Text;
+using System.Text.Json;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using Monitoring.Functions.Configuration;
+using Monitoring.Functions.Models;
+
+namespace Monitoring.Functions.Services;
+
+/// <summary>
+/// Builds alerts from anomalies and posts alert / health-report Adaptive Cards to the configured webhook.
+/// </summary>
+public sealed class AlertService : IAlertService
+{
+    private readonly MonitoringOptions _options;
+    private readonly IHttpClientFactory _httpClientFactory;
+    private readonly ILogger<AlertService> _logger;
+
+    public AlertService(
+        IOptions<MonitoringOptions> options,
+        IHttpClientFactory httpClientFactory,
+        ILogger<AlertService> logger)
+    {
+        _options = options.Value;
+        _httpClientFactory = httpClientFactory;
+        _logger = logger;
+    }
+
+    /// <inheritdoc />
+    public Task<MonitoringAlert> CreateAlertAsync(
+        string serviceName,
+        AlertLevel level,
+        string title,
+        string summary,
+        List<AnomalyResult> anomalies,
+        CancellationToken cancellationToken = default)
+    {
+        var alert = new MonitoringAlert(
+            AlertId: Guid.NewGuid(),
+            ServiceName: serviceName,
+            Level: level,
+            Title: title,
+            Summary: summary,
+            DetailedAnalysis: BuildDetailedAnalysis(anomalies),
+            Anomalies: anomalies,
+            RecommendedActions: anomalies
+                .Select(a => a.RecommendedAction)
+                .Where(a => !string.IsNullOrEmpty(a))
+                .Distinct()
+                .ToList(),
+            CreatedAt: DateTime.UtcNow,
+            Status: AlertStatus.Active
+        );
+
+        _logger.LogWarning(
+            "Alert created: [{Level}] {Title} for {Service} - {Summary}",
+            level, title, serviceName, summary);
+
+        return Task.FromResult(alert);
+    }
+
+    /// <inheritdoc />
+    public async Task SendAlertNotificationAsync(
+        MonitoringAlert alert, CancellationToken cancellationToken = default)
+    {
+        if (string.IsNullOrWhiteSpace(_options.AlertWebhookUrl))
+        {
+            _logger.LogInformation("No webhook URL configured; alert logged only: {AlertId}", alert.AlertId);
+            return;
+        }
+
+        var payload = BuildWebhookPayload(alert);
+        await PostWebhookAsync(payload, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task SendHealthReportNotificationAsync(
+        HealthReport report, CancellationToken cancellationToken = default)
+    {
+        if (string.IsNullOrWhiteSpace(_options.AlertWebhookUrl))
+        {
+            _logger.LogInformation("No webhook URL configured; health report logged only");
+            return;
+        }
+
+        var payload = BuildHealthReportPayload(report);
+        await PostWebhookAsync(payload, cancellationToken);
+    }
+
+    /// <summary>Posts the payload as camelCase JSON. Best effort: failures are logged and never thrown to the caller.</summary>
+    private async Task PostWebhookAsync(object payload, CancellationToken cancellationToken)
+    {
+        try
+        {
+            var client = _httpClientFactory.CreateClient();
+            var json = JsonSerializer.Serialize(payload, new JsonSerializerOptions
+            {
+                PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+                WriteIndented = false
+            });
+
+            using var content = new StringContent(json, Encoding.UTF8, "application/json");
+            using var response = await client.PostAsync(
+                _options.AlertWebhookUrl,
+                content,
+                cancellationToken);
+
+            if (response.IsSuccessStatusCode)
+            {
+                _logger.LogInformation("Webhook notification sent successfully");
+            }
+            else
+            {
+                _logger.LogWarning(
+                    "Webhook notification failed: {StatusCode} {Reason}",
+                    response.StatusCode, response.ReasonPhrase);
+            }
+        }
+        catch (Exception ex)
+        {
+            _logger.LogError(ex, "Failed to send webhook notification");
+        }
+    }
+
+    /// <summary>Builds the Adaptive Card message for an alert; the title icon and color follow the alert level.</summary>
+    private static object BuildWebhookPayload(MonitoringAlert alert)
+    {
+        // Adaptive Card TextBlock colors are named values, not hex codes.
+        var color = alert.Level switch
+        {
+            AlertLevel.Critical => "Attention",
+            AlertLevel.Error => "Attention",
+            AlertLevel.Warning => "Warning",
+            _ => "Good"
+        };
+
+        var icon = alert.Level switch
+        {
+            AlertLevel.Critical => "🔴",
+            AlertLevel.Error => "🟠",
+            AlertLevel.Warning => "🟡",
+            _ => "🟢"
+        };
+
+        return new
+        {
+            type = "message",
+            attachments = new[]
+            {
+                new
+                {
+                    contentType = "application/vnd.microsoft.card.adaptive",
+                    content = new
+                    {
+                        type = "AdaptiveCard",
+                        version = "1.4",
+                        body = new object[]
+                        {
+                            new
+                            {
+                                type = "TextBlock",
+                                text = $"{icon} {alert.Title}",
+                                weight = "Bolder",
+                                size = "Large",
+                                color
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = $"**Service:** {alert.ServiceName} | **Level:** {alert.Level}",
+                                isSubtle = true
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = alert.Summary,
+                                wrap = true
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = $"**Anomalies:** {alert.Anomalies.Count} detected",
+                                spacing = "Medium"
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = string.Join("\n",
+                                    alert.RecommendedActions.Select(a => $"- {a}")),
+                                wrap = true,
+                                spacing = "Small"
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = $"Alert ID: {alert.AlertId} | {alert.CreatedAt:u}",
+                                isSubtle = true,
+                                size = "Small"
+                            }
+                        }
+                    }
+                }
+            }
+        };
+    }
+
+    /// <summary>Builds the Adaptive Card message for a platform health report, one line per service.</summary>
+    private static object BuildHealthReportPayload(HealthReport report)
+    {
+        var statusIcon = report.OverallStatus switch
+        {
+            OverallHealthStatus.Healthy => "🟢",
+            OverallHealthStatus.Degraded => "🟡",
+            _ => "🔴"
+        };
+
+        var serviceLines = report.Services
+            .Select(s =>
+            {
+                var sIcon = s.State switch
+                {
+                    HealthState.Healthy => "🟢",
+                    HealthState.Degraded => "🟡",
+                    HealthState.Unhealthy => "🔴",
+                    _ => "⚪"
+                };
+                return $"{sIcon} **{s.ServiceName}** — {s.FailureRatePercent:F1}% errors, " +
+                       $"{s.AverageResponseTimeMs:F0}ms avg";
+            });
+
+        return new
+        {
+            type = "message",
+            attachments = new[]
+            {
+                new
+                {
+                    contentType = "application/vnd.microsoft.card.adaptive",
+                    content = new
+                    {
+                        type = "AdaptiveCard",
+                        version = "1.4",
+                        body = new object[]
+                        {
+                            new
+                            {
+                                type = "TextBlock",
+                                text = $"{statusIcon} Platform Health Report",
+                                weight = "Bolder",
+                                size = "Large"
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = report.AiSummary,
+                                wrap = true
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = string.Join("\n", serviceLines),
+                                wrap = true,
+                                spacing = "Medium"
+                            },
+                            new
+                            {
+                                type = "TextBlock",
+                                text = $"Active Alerts: {report.ActiveAlerts.Count} | " +
+                                       $"Generated: {report.GeneratedAt:u}",
+                                isSubtle = true,
+                                size = "Small"
+                            }
+                        }
+                    }
+                }
+            }
+        };
+    }
+
+    /// <summary>Renders each anomaly as a severity/type/description block with its analysis (when present) and action.</summary>
+    private static string BuildDetailedAnalysis(List<AnomalyResult> anomalies)
+    {
+        if (anomalies.Count == 0)
+            return "No anomalies detected.";
+
+        var sb = new StringBuilder();
+        foreach (var anomaly in anomalies)
+        {
+            sb.AppendLine($"[{anomaly.Severity}] {anomaly.Type}: {anomaly.Description}");
+            if (!string.IsNullOrEmpty(anomaly.AiAnalysis))
+                sb.AppendLine($"  Analysis: {anomaly.AiAnalysis}");
+            sb.AppendLine($"  Action: {anomaly.RecommendedAction}");
+            sb.AppendLine();
+        }
+
+        return sb.ToString();
+    }
+}
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Services/AppInsightsQueryService.cs b/src/Functions/Monitoring/Monitoring.Functions/Services/AppInsightsQueryService.cs
new file mode 100644
index 0000000..131fa14
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Services/AppInsightsQueryService.cs
@@ -0,0 +1,308 @@
+using Azure.Identity;
+using Azure.Monitor.Query;
+using Azure.Monitor.Query.Models;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using Monitoring.Functions.Configuration;
+using Monitoring.Functions.Models;
+
+namespace Monitoring.Functions.Services;
+
+/// <summary>
+/// Runs KQL queries against the configured workspace. A failed query is logged and yields empty or zeroed data.
+/// </summary>
+public sealed class AppInsightsQueryService : IAppInsightsQueryService
+{
+    private readonly LogsQueryClient _logsClient;
+    private readonly MonitoringOptions _options;
+    private readonly ILogger<AppInsightsQueryService> _logger;
+
+    public AppInsightsQueryService(
+        IOptions<MonitoringOptions> options,
+        ILogger<AppInsightsQueryService> logger)
+    {
+        _options = options.Value;
+        _logger = logger;
+        _logsClient = new LogsQueryClient(new DefaultAzureCredential());
+    }
+
+    /// <inheritdoc />
+    public async Task<ServiceTelemetrySummary> GetServiceTelemetryAsync(
+        string serviceName, TimeSpan period, CancellationToken cancellationToken = default)
+    {
+        var periodEnd = DateTime.UtcNow;
+        var periodStart = periodEnd - period;
+
+        var requestMetrics = await QueryRequestMetricsAsync(serviceName, period, cancellationToken);
+        var exceptions = await GetTopExceptionsAsync(serviceName, period, 10, cancellationToken);
+        var dependencies = await GetDependencyMetricsAsync(serviceName, period, cancellationToken);
+        var requestTimeSeries = await GetRequestRateTimeSeriesAsync(
+            serviceName, period, TimeSpan.FromMinutes(5), cancellationToken);
+        var responseTimeSeries = await GetResponseTimeTimeSeriesAsync(
+            serviceName, period, TimeSpan.FromMinutes(5), cancellationToken);
+
+        return new ServiceTelemetrySummary(
+            ServiceName: serviceName,
+            PeriodStart: periodStart,
+            PeriodEnd: periodEnd,
+            TotalRequests: requestMetrics.TotalRequests,
+            FailedRequests: requestMetrics.FailedRequests,
+            FailureRatePercent: requestMetrics.FailureRatePercent,
+            AverageResponseTimeMs: requestMetrics.AverageResponseTimeMs,
+            P95ResponseTimeMs: requestMetrics.P95ResponseTimeMs,
+            P99ResponseTimeMs: requestMetrics.P99ResponseTimeMs,
+            UniqueExceptionTypes: exceptions.Select(e => e.ExceptionType).Distinct().Count(),
+            TopExceptions: exceptions,
+            DependencyMetrics: dependencies,
+            RequestTimeSeries: requestTimeSeries,
+            ResponseTimeTimeSeries: responseTimeSeries
+        );
+    }
+
+    /// <inheritdoc />
+    public async Task<List<ServiceTelemetrySummary>> GetAllServicesTelemetryAsync(
+        TimeSpan period, CancellationToken cancellationToken = default)
+    {
+        var services = _options.GetMonitoredServiceNames();
+        var tasks = services.Select(s => GetServiceTelemetryAsync(s, period, cancellationToken));
+        var results = await Task.WhenAll(tasks);
+        return results.ToList();
+    }
+
+    /// <inheritdoc />
+    public async Task<List<ExceptionEntry>> GetTopExceptionsAsync(
+        string serviceName, TimeSpan period, int top = 10, CancellationToken cancellationToken = default)
+    {
+        var query = $"""
+            exceptions
+            | where cloud_RoleName == '{EscapeKqlString(serviceName)}'
+            | where timestamp > ago({FormatTimeSpan(period)})
+            | summarize Count=count(), LastOccurrence=max(timestamp) by type, outerMessage
+            | top {top} by Count desc
+            """;
+
+        try
+        {
+            var response = await _logsClient.QueryWorkspaceAsync(
+                _options.WorkspaceId, query, new QueryTimeRange(period),
+                cancellationToken: cancellationToken);
+
+            return response.Value.Table.Rows.Select(row => new ExceptionEntry(
+                ExceptionType: row.GetString("type") ?? "Unknown",
+                Message: row.GetString("outerMessage") ?? "No message",
+                Count: row.GetInt64("Count") ?? 0,
+                LastOccurrence: row.GetDateTimeOffset("LastOccurrence")?.UtcDateTime ?? DateTime.UtcNow
+            )).ToList();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Failed to query exceptions for {Service}", serviceName);
+            return [];
+        }
+    }
+
+    /// <inheritdoc />
+    public async Task<List<TimeSeriesDataPoint>> GetRequestRateTimeSeriesAsync(
+        string serviceName, TimeSpan period, TimeSpan interval, CancellationToken cancellationToken = default)
+    {
+        var query = $"""
+            requests
+            | where cloud_RoleName == '{EscapeKqlString(serviceName)}'
+            | where timestamp > ago({FormatTimeSpan(period)})
+            | summarize RequestCount=count() by bin(timestamp, {FormatTimeSpan(interval)})
+            | order by timestamp asc
+            """;
+
+        try
+        {
+            var response = await _logsClient.QueryWorkspaceAsync(
+                _options.WorkspaceId, query, new QueryTimeRange(period),
+                cancellationToken: cancellationToken);
+
+            return response.Value.Table.Rows.Select(row => new TimeSeriesDataPoint(
+                Timestamp: row.GetDateTimeOffset("timestamp")?.UtcDateTime ?? DateTime.UtcNow,
+                Value: row.GetDouble("RequestCount") ?? 0
+            )).ToList();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Failed to query request rate for {Service}", serviceName);
+            return [];
+        }
+    }
+
+    /// <inheritdoc />
+    public async Task<List<TimeSeriesDataPoint>> GetResponseTimeTimeSeriesAsync(
+        string serviceName, TimeSpan period, TimeSpan interval, CancellationToken cancellationToken = default)
+    {
+        var query = $"""
+            requests
+            | where cloud_RoleName == '{EscapeKqlString(serviceName)}'
+            | where timestamp > ago({FormatTimeSpan(period)})
+            | summarize AvgDuration=avg(duration) by bin(timestamp, {FormatTimeSpan(interval)})
+            | order by timestamp asc
+            """;
+
+        try
+        {
+            var response = await _logsClient.QueryWorkspaceAsync(
+                _options.WorkspaceId, query, new QueryTimeRange(period),
+                cancellationToken: cancellationToken);
+
+            return response.Value.Table.Rows.Select(row => new TimeSeriesDataPoint(
+                Timestamp: row.GetDateTimeOffset("timestamp")?.UtcDateTime ?? DateTime.UtcNow,
+                Value: row.GetDouble("AvgDuration") ?? 0
+            )).ToList();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Failed to query response times for {Service}", serviceName);
+            return [];
+        }
+    }
+
+    /// <inheritdoc />
+    public async Task<List<DependencyMetric>> GetDependencyMetricsAsync(
+        string serviceName, TimeSpan period, CancellationToken cancellationToken = default)
+    {
+        var query = $"""
+            dependencies
+            | where cloud_RoleName == '{EscapeKqlString(serviceName)}'
+            | where timestamp > ago({FormatTimeSpan(period)})
+            | summarize
+                AvgLatency=avg(duration),
+                FailureRate=100.0 * countif(success == false) / count(),
+                CallCount=count()
+              by type, target
+            | order by CallCount desc
+            """;
+
+        try
+        {
+            var response = await _logsClient.QueryWorkspaceAsync(
+                _options.WorkspaceId, query, new QueryTimeRange(period),
+                cancellationToken: cancellationToken);
+
+            return response.Value.Table.Rows.Select(row => new DependencyMetric(
+                DependencyType: row.GetString("type") ?? "Unknown",
+                DependencyName: row.GetString("target") ?? "Unknown",
+                AverageLatencyMs: row.GetDouble("AvgLatency") ?? 0,
+                FailureRatePercent: row.GetDouble("FailureRate") ?? 0,
+                CallCount: row.GetInt64("CallCount") ?? 0
+            )).ToList();
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Failed to query dependencies for {Service}", serviceName);
+            return [];
+        }
+    }
+
+    /// <summary>Queries request totals and latency percentiles; returns an all-zero summary on failure or when no row comes back.</summary>
+    private async Task<RequestMetricsSummary> QueryRequestMetricsAsync(
+        string serviceName, TimeSpan period, CancellationToken cancellationToken)
+    {
+        var query = $"""
+            requests
+            | where cloud_RoleName == '{EscapeKqlString(serviceName)}'
+            | where timestamp > ago({FormatTimeSpan(period)})
+            | summarize
+                TotalRequests=count(),
+                FailedRequests=countif(success == false),
+                AvgDuration=avg(duration),
+                P95Duration=percentile(duration, 95),
+                P99Duration=percentile(duration, 99)
+            """;
+
+        try
+        {
+            var response = await _logsClient.QueryWorkspaceAsync(
+                _options.WorkspaceId, query, new QueryTimeRange(period),
+                cancellationToken: cancellationToken);
+
+            var row = response.Value.Table.Rows.FirstOrDefault();
+            if (row is null)
+                return new RequestMetricsSummary(0, 0, 0, 0, 0, 0);
+
+            var total = row.GetInt64("TotalRequests") ?? 0;
+            var failed = row.GetInt64("FailedRequests") ?? 0;
+            var failureRate = total > 0 ? (double)failed / total * 100 : 0;
+
+            return new RequestMetricsSummary(
+                TotalRequests: total,
+                FailedRequests: failed,
+                FailureRatePercent: failureRate,
+                AverageResponseTimeMs: row.GetDouble("AvgDuration") ?? 0,
+                P95ResponseTimeMs: row.GetDouble("P95Duration") ?? 0,
+                P99ResponseTimeMs: row.GetDouble("P99Duration") ?? 0
+            );
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Failed to query request metrics for {Service}", serviceName);
+            return new RequestMetricsSummary(0, 0, 0, 0, 0, 0);
+        }
+    }
+
+    /// <summary>Escapes backslashes and single quotes so a value cannot terminate the single-quoted KQL literal it is embedded in.</summary>
+    private static string EscapeKqlString(string value) =>
+        value.Replace("\\", "\\\\").Replace("'", "\\'");
+
+    /// <summary>
+    /// Formats a duration as a KQL timespan literal using the largest unit that represents it
+    /// exactly, so 90 minutes becomes "90m" instead of being truncated to "1h".
+    /// </summary>
+    private static string FormatTimeSpan(TimeSpan ts)
+    {
+        var totalSeconds = (long)ts.TotalSeconds;
+
+        return totalSeconds switch
+        {
+            <= 0 => "0m",
+            _ when totalSeconds % 86_400 == 0 => $"{totalSeconds / 86_400}d",
+            _ when totalSeconds % 3_600 == 0 => $"{totalSeconds / 3_600}h",
+            _ when totalSeconds % 60 == 0 => $"{totalSeconds / 60}m",
+            _ => $"{totalSeconds}s"
+        };
+    }
+
+    /// <summary>Request totals, failure rate and latency figures produced by <see cref="QueryRequestMetricsAsync"/>.</summary>
+    private record RequestMetricsSummary(
+        long TotalRequests,
+        long FailedRequests,
+        double FailureRatePercent,
+        double AverageResponseTimeMs,
+        double P95ResponseTimeMs,
+        double P99ResponseTimeMs
+    );
+}
+
+/// <summary>
+/// Typed column readers for <see cref="LogsTableRow"/> that return null instead of throwing when the read or conversion fails.
+/// </summary>
+internal static class LogsQueryRowExtensions
+{
+    public static string? GetString(this LogsTableRow row, string column)
+    {
+        try { return row[column]?.ToString(); }
+        catch { return null; }
+    }
+
+    public static long? GetInt64(this LogsTableRow row, string column)
+    {
+        try { return Convert.ToInt64(row[column]); }
+        catch { return null; }
+    }
+
+    public static double? GetDouble(this LogsTableRow row, string column)
+    {
+        try { return Convert.ToDouble(row[column]); }
+        catch { return null; }
+    }
+
+    public static DateTimeOffset? GetDateTimeOffset(this LogsTableRow row, string column)
+    {
+        try { return (DateTimeOffset?)row[column]; }
+        catch { return null; }
+    }
+}
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Services/IAiAnalysisService.cs b/src/Functions/Monitoring/Monitoring.Functions/Services/IAiAnalysisService.cs
new file mode 100644
index 0000000..e0d5c58
--- /dev/null
+++ b/src/Functions/Monitoring/Monitoring.Functions/Services/IAiAnalysisService.cs
@@ -0,0 +1,23 @@
+using Monitoring.Functions.Models;
+
+namespace Monitoring.Functions.Services;
+
+/// <summary>Anomaly detection and AI-generated commentary over service telemetry.</summary>
+public interface IAiAnalysisService
+{
+    /// <summary>Detects anomalies in one service's telemetry.</summary>
+    Task<List<AnomalyResult>> DetectAnomaliesAsync(
+        ServiceTelemetrySummary telemetry, CancellationToken cancellationToken = default);
+
+    /// <summary>Produces a short health insight text for one service.</summary>
+    Task<string> GenerateHealthInsightAsync(
+        ServiceTelemetrySummary telemetry, CancellationToken cancellationToken = default);
+
+    /// <summary>Produces a platform-wide summary text across all services.</summary>
+    Task<string> GeneratePlatformSummaryAsync(
+        List<ServiceTelemetrySummary> allTelemetry, CancellationToken cancellationToken = default);
+
+    /// <summary>Produces an analysis text for a service's exception entries.</summary>
+    Task<string> AnalyzeExceptionPatternAsync(
+        List<ExceptionEntry> exceptions, string serviceName, CancellationToken cancellationToken = default);
+}
diff --git a/src/Functions/Monitoring/Monitoring.Functions/Services/IAlertService.cs 
b/src/Functions/Monitoring/Monitoring.Functions/Services/IAlertService.cs new file mode 100644 index 0000000..0a66c01 --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Services/IAlertService.cs @@ -0,0 +1,20 @@ +using Monitoring.Functions.Models; + +namespace Monitoring.Functions.Services; + +public interface IAlertService +{ + Task CreateAlertAsync( + string serviceName, + AlertLevel level, + string title, + string summary, + List anomalies, + CancellationToken cancellationToken = default); + + Task SendAlertNotificationAsync( + MonitoringAlert alert, CancellationToken cancellationToken = default); + + Task SendHealthReportNotificationAsync( + HealthReport report, CancellationToken cancellationToken = default); +} diff --git a/src/Functions/Monitoring/Monitoring.Functions/Services/IAppInsightsQueryService.cs b/src/Functions/Monitoring/Monitoring.Functions/Services/IAppInsightsQueryService.cs new file mode 100644 index 0000000..5db5bca --- /dev/null +++ b/src/Functions/Monitoring/Monitoring.Functions/Services/IAppInsightsQueryService.cs @@ -0,0 +1,24 @@ +using Monitoring.Functions.Models; + +namespace Monitoring.Functions.Services; + +public interface IAppInsightsQueryService +{ + Task GetServiceTelemetryAsync( + string serviceName, TimeSpan period, CancellationToken cancellationToken = default); + + Task> GetAllServicesTelemetryAsync( + TimeSpan period, CancellationToken cancellationToken = default); + + Task> GetTopExceptionsAsync( + string serviceName, TimeSpan period, int top = 10, CancellationToken cancellationToken = default); + + Task> GetRequestRateTimeSeriesAsync( + string serviceName, TimeSpan period, TimeSpan interval, CancellationToken cancellationToken = default); + + Task> GetResponseTimeTimeSeriesAsync( + string serviceName, TimeSpan period, TimeSpan interval, CancellationToken cancellationToken = default); + + Task> GetDependencyMetricsAsync( + string serviceName, TimeSpan period, CancellationToken cancellationToken = default); +} 
{
  "version": "2.0",
  "logging": {
    "applicationInsights": {
      "samplingSettings": {
        "isEnabled": true,
        "excludedTypes": "Request"
      },
      "enableLiveMetricsFilters": true
    },
    "logLevel": {
      "default": "Information",
      "Host.Results": "Error",
      "Function": "Information",
      "Host.Aggregator": "Trace"
    }
  },
  "functionTimeout": "00:10:00"
}
"ApiGateway\ApiGateway.csproj", "{C0000001-0000-0000-0000-000000000001}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Functions", "Functions", "{A1B2C3D4-0008-0000-0000-000000000001}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Monitoring", "Monitoring", "{A1B2C3D4-0009-0000-0000-000000000001}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Monitoring.Functions", "Functions\Monitoring\Monitoring.Functions\Monitoring.Functions.csproj", "{E0000001-0000-0000-0000-000000000001}" +EndProject + Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Shared", "Shared", "{A1B2C3D4-0007-0000-0000-000000000001}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shared.Contracts", "Shared\Shared.Contracts\Shared.Contracts.csproj", "{D0000001-0000-0000-0000-000000000001}" @@ -88,5 +95,7 @@ Global {B5000001-0000-0000-0000-000000000001} = {A1B2C3D4-0006-0000-0000-000000000001} {B5000002-0000-0000-0000-000000000001} = {A1B2C3D4-0006-0000-0000-000000000001} {B5000003-0000-0000-0000-000000000001} = {A1B2C3D4-0006-0000-0000-000000000001} + {A1B2C3D4-0009-0000-0000-000000000001} = {A1B2C3D4-0008-0000-0000-000000000001} + {E0000001-0000-0000-0000-000000000001} = {A1B2C3D4-0009-0000-0000-000000000001} EndGlobalSection EndGlobal diff --git a/src/docker-compose.yml b/src/docker-compose.yml index bf66a7b..5cd1647 100644 --- a/src/docker-compose.yml +++ b/src/docker-compose.yml @@ -81,6 +81,23 @@ services: - ASPNETCORE_ENVIRONMENT=Development - ConnectionStrings__DefaultConnection=Host=postgres;Database=notificationdb;Username=postgres;Password=postgres + monitoring-agent: + build: + context: . 
+ dockerfile: Functions/Monitoring/Monitoring.Functions/Dockerfile + ports: + - "7071:80" + environment: + - AzureWebJobsStorage=UseDevelopmentStorage=true + - FUNCTIONS_WORKER_RUNTIME=dotnet-isolated + - APPLICATIONINSIGHTS_CONNECTION_STRING=${APPLICATIONINSIGHTS_CONNECTION_STRING:-} + - Monitoring__ApplicationInsightsConnectionString=${APPLICATIONINSIGHTS_CONNECTION_STRING:-} + - Monitoring__WorkspaceId=${LOG_ANALYTICS_WORKSPACE_ID:-} + - Monitoring__AzureOpenAIEndpoint=${AZURE_OPENAI_ENDPOINT:-} + - Monitoring__AzureOpenAIDeployment=${AZURE_OPENAI_DEPLOYMENT:-gpt-4o} + - Monitoring__AlertWebhookUrl=${ALERT_WEBHOOK_URL:-} + - Monitoring__MonitoredServices=identity-service,customer-service,order-service,product-service,notification-service,api-gateway + postgres: image: postgres:16-alpine ports: