diff --git a/src/ApiGateway/ApiGateway.csproj b/src/ApiGateway/ApiGateway.csproj index 6ac003b..8387f9f 100644 --- a/src/ApiGateway/ApiGateway.csproj +++ b/src/ApiGateway/ApiGateway.csproj @@ -6,6 +6,9 @@ enable + + + diff --git a/src/ApiGateway/Program.cs b/src/ApiGateway/Program.cs index 89d4860..c494126 100644 --- a/src/ApiGateway/Program.cs +++ b/src/ApiGateway/Program.cs @@ -1,3 +1,5 @@ +using Shared.Monitoring.Extensions; + var builder = WebApplication.CreateBuilder(args); builder.Services.AddReverseProxy() @@ -5,8 +7,11 @@ builder.Services.AddHealthChecks(); +builder.Services.AddAppInsightsMonitoring(builder.Configuration, "ApiGateway"); + var app = builder.Build(); +app.UseAppInsightsMonitoring(); app.MapReverseProxy(); app.MapHealthChecks("/healthz"); diff --git a/src/Microservices.sln b/src/Microservices.sln index bf2eaa1..37b169f 100644 --- a/src/Microservices.sln +++ b/src/Microservices.sln @@ -61,6 +61,13 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shared.Contracts", "Shared\ EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shared.Infrastructure", "Shared\Shared.Infrastructure\Shared.Infrastructure.csproj", "{D0000002-0000-0000-0000-000000000001}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Shared.Monitoring", "Shared\Shared.Monitoring\Shared.Monitoring.csproj", "{D0000003-0000-0000-0000-000000000001}" +EndProject + +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Monitoring", "Monitoring", "{A1B2C3D4-0008-0000-0000-000000000001}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Monitoring.Agent", "Services\Monitoring\Monitoring.Agent\Monitoring.Agent.csproj", "{B6000001-0000-0000-0000-000000000001}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -88,5 +95,8 @@ Global {B5000001-0000-0000-0000-000000000001} = {A1B2C3D4-0006-0000-0000-000000000001} {B5000002-0000-0000-0000-000000000001} = {A1B2C3D4-0006-0000-0000-000000000001} 
{B5000003-0000-0000-0000-000000000001} = {A1B2C3D4-0006-0000-0000-000000000001} + {D0000003-0000-0000-0000-000000000001} = {A1B2C3D4-0007-0000-0000-000000000001} + {A1B2C3D4-0008-0000-0000-000000000001} = {A1B2C3D4-0001-0000-0000-000000000001} + {B6000001-0000-0000-0000-000000000001} = {A1B2C3D4-0008-0000-0000-000000000001} EndGlobalSection EndGlobal diff --git a/src/Services/Customer/Customer.API/Customer.API.csproj b/src/Services/Customer/Customer.API/Customer.API.csproj index 8c284d4..481300b 100644 --- a/src/Services/Customer/Customer.API/Customer.API.csproj +++ b/src/Services/Customer/Customer.API/Customer.API.csproj @@ -7,8 +7,9 @@ - - + + + diff --git a/src/Services/Customer/Customer.API/Program.cs b/src/Services/Customer/Customer.API/Program.cs index e46940a..ec87e0d 100644 --- a/src/Services/Customer/Customer.API/Program.cs +++ b/src/Services/Customer/Customer.API/Program.cs @@ -1,5 +1,6 @@ using Customer.Infrastructure.Data; using Microsoft.EntityFrameworkCore; +using Shared.Monitoring.Extensions; var builder = WebApplication.CreateBuilder(args); @@ -11,6 +12,8 @@ builder.Services.AddDbContext(options => options.UseNpgsql(builder.Configuration.GetConnectionString("DefaultConnection"))); +builder.Services.AddAppInsightsMonitoring(builder.Configuration, "CustomerService"); + var app = builder.Build(); if (app.Environment.IsDevelopment()) @@ -19,6 +22,7 @@ app.UseSwaggerUI(); } +app.UseAppInsightsMonitoring(); app.MapControllers(); app.MapHealthChecks("/healthz"); diff --git a/src/Services/Identity/Identity.API/Identity.API.csproj b/src/Services/Identity/Identity.API/Identity.API.csproj index 9b3e932..2bb8da9 100644 --- a/src/Services/Identity/Identity.API/Identity.API.csproj +++ b/src/Services/Identity/Identity.API/Identity.API.csproj @@ -7,8 +7,9 @@ - - + + + diff --git a/src/Services/Identity/Identity.API/Program.cs b/src/Services/Identity/Identity.API/Program.cs index b475a08..df8bcf4 100644 --- a/src/Services/Identity/Identity.API/Program.cs +++ 
b/src/Services/Identity/Identity.API/Program.cs @@ -1,5 +1,6 @@ using Identity.Infrastructure.Data; using Microsoft.EntityFrameworkCore; +using Shared.Monitoring.Extensions; var builder = WebApplication.CreateBuilder(args); @@ -11,6 +12,8 @@ builder.Services.AddDbContext(options => options.UseNpgsql(builder.Configuration.GetConnectionString("DefaultConnection"))); +builder.Services.AddAppInsightsMonitoring(builder.Configuration, "IdentityService"); + var app = builder.Build(); if (app.Environment.IsDevelopment()) @@ -19,6 +22,7 @@ app.UseSwaggerUI(); } +app.UseAppInsightsMonitoring(); app.MapControllers(); app.MapHealthChecks("/healthz"); diff --git a/src/Services/Monitoring/Monitoring.Agent/Controllers/MonitoringController.cs b/src/Services/Monitoring/Monitoring.Agent/Controllers/MonitoringController.cs new file mode 100644 index 0000000..ce6b597 --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Controllers/MonitoringController.cs @@ -0,0 +1,141 @@ +using Microsoft.AspNetCore.Mvc; +using Monitoring.Agent.Models; +using Monitoring.Agent.Services; + +namespace Monitoring.Agent.Controllers; + +[ApiController] +[Route("api/[controller]")] +public class MonitoringController : ControllerBase +{ + private readonly ServiceHealthAggregator _aggregator; + private readonly ILogger _logger; + + public MonitoringController( + ServiceHealthAggregator aggregator, + ILogger logger) + { + _aggregator = aggregator; + _logger = logger; + } + + /// + /// Returns the full AI monitoring dashboard with health scores, anomalies, and insights. + /// + [HttpGet("dashboard")] + public async Task> GetDashboard() + { + var dashboard = await _aggregator.GetDashboardAsync(); + return Ok(dashboard); + } + + /// + /// Returns the most recent cached dashboard snapshot without re-probing services. 
+ /// + [HttpGet("dashboard/latest")] + public ActionResult GetLatestDashboard() + { + var dashboard = MonitoringBackgroundService.LatestDashboard; + if (dashboard is null) + return NotFound("No monitoring data available yet. The background service may still be initializing."); + + return Ok(dashboard); + } + + /// + /// Returns health score and status for all monitored services. + /// + [HttpGet("services")] + public async Task>> GetServiceHealth() + { + var dashboard = await _aggregator.GetDashboardAsync(); + return Ok(dashboard.Services); + } + + /// + /// Returns health details for a specific service. + /// + [HttpGet("services/{serviceName}")] + public async Task> GetServiceHealth(string serviceName) + { + var dashboard = await _aggregator.GetDashboardAsync(); + var service = dashboard.Services + .FirstOrDefault(s => s.ServiceName.Equals(serviceName, StringComparison.OrdinalIgnoreCase)); + + if (service is null) + return NotFound($"Service '{serviceName}' not found."); + + return Ok(service); + } + + /// + /// Returns all active anomalies detected by the AI engine. + /// + [HttpGet("anomalies")] + public async Task>> GetAnomalies( + [FromQuery] AnomalySeverity? severity = null) + { + var dashboard = await _aggregator.GetDashboardAsync(); + var anomalies = dashboard.RecentAnomalies.AsEnumerable(); + + if (severity.HasValue) + anomalies = anomalies.Where(a => a.Severity == severity.Value); + + return Ok(anomalies); + } + + /// + /// Returns AI-generated insights and recommendations. + /// + [HttpGet("insights")] + public async Task>> GetInsights( + [FromQuery] InsightCategory? category = null, + [FromQuery] InsightPriority? 
priority = null) + { + var dashboard = await _aggregator.GetDashboardAsync(); + var insights = dashboard.Insights.AsEnumerable(); + + if (category.HasValue) + insights = insights.Where(i => i.Category == category.Value); + if (priority.HasValue) + insights = insights.Where(i => i.Priority == priority.Value); + + return Ok(insights); + } + + /// + /// Returns historical dashboard snapshots for trend analysis. + /// + [HttpGet("history")] + public ActionResult> GetHistory( + [FromQuery] int count = 10) + { + var history = MonitoringBackgroundService.DashboardHistory + .TakeLast(Math.Min(count, 60)) + .ToList(); + + return Ok(history); + } + + /// + /// Returns system-wide health summary. + /// + [HttpGet("summary")] + public async Task> GetSummary() + { + var dashboard = await _aggregator.GetDashboardAsync(); + return Ok(new + { + dashboard.GeneratedAt, + dashboard.OverallStatus, + dashboard.SystemHealthScore, + ServiceCount = dashboard.Services.Count, + HealthyCount = dashboard.Services.Count(s => s.Status == ServiceStatus.Healthy), + DegradedCount = dashboard.Services.Count(s => s.Status == ServiceStatus.Degraded), + UnhealthyCount = dashboard.Services.Count(s => s.Status == ServiceStatus.Unhealthy), + AnomalyCount = dashboard.RecentAnomalies.Count, + CriticalAnomalyCount = dashboard.RecentAnomalies.Count(a => a.Severity == AnomalySeverity.Critical), + InsightCount = dashboard.Insights.Count + }); + } +} diff --git a/src/Services/Monitoring/Monitoring.Agent/Models/AnomalyReport.cs b/src/Services/Monitoring/Monitoring.Agent/Models/AnomalyReport.cs new file mode 100644 index 0000000..df5001e --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Models/AnomalyReport.cs @@ -0,0 +1,95 @@ +namespace Monitoring.Agent.Models; + +public record AnomalyReport( + string ServiceName, + DateTime DetectedAt, + AnomalySeverity Severity, + string Category, + string Description, + double CurrentValue, + double ThresholdValue, + string RecommendedAction); + +public enum 
AnomalySeverity +{ + Info, + Warning, + Critical +} + +public record ServiceHealthReport( + string ServiceName, + DateTime Timestamp, + HealthScore HealthScore, + ServiceStatus Status, + PerformanceMetrics Performance, + ResourceUtilization Resources, + List ActiveAnomalies, + List Recommendations); + +public record HealthScore( + double Overall, + double Availability, + double Performance, + double ErrorRate, + double ResourceUsage); + +public enum ServiceStatus +{ + Healthy, + Degraded, + Unhealthy, + Unknown +} + +public record PerformanceMetrics( + double AverageResponseTimeMs, + double P95ResponseTimeMs, + int RequestsPerMinute, + double ErrorRatePercent, + int ActiveConnections); + +public record ResourceUtilization( + double CpuPercent, + double MemoryMb, + double MemoryPercent, + long GcGen0Collections, + long GcGen1Collections, + long GcGen2Collections, + double GcTotalMemoryMb); + +public record AiInsight( + string InsightId, + DateTime GeneratedAt, + InsightCategory Category, + string Title, + string Description, + InsightPriority Priority, + List AffectedServices, + List ActionItems); + +public enum InsightCategory +{ + Performance, + Reliability, + Scalability, + CostOptimization, + Security +} + +public enum InsightPriority +{ + Low, + Medium, + High, + Urgent +} + +public record MonitoringDashboard( + DateTime GeneratedAt, + string OverallStatus, + double SystemHealthScore, + List Services, + List RecentAnomalies, + List Insights, + Dictionary SystemMetrics); diff --git a/src/Services/Monitoring/Monitoring.Agent/Monitoring.Agent.csproj b/src/Services/Monitoring/Monitoring.Agent/Monitoring.Agent.csproj new file mode 100644 index 0000000..7e8e9ed --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Monitoring.Agent.csproj @@ -0,0 +1,15 @@ + + + net10.0 + enable + enable + + + + + + + + + + diff --git a/src/Services/Monitoring/Monitoring.Agent/Program.cs b/src/Services/Monitoring/Monitoring.Agent/Program.cs new file mode 100644 index 
0000000..2fad84f --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Program.cs @@ -0,0 +1,40 @@ +using Monitoring.Agent.Services; +using Shared.Monitoring; +using Shared.Monitoring.Extensions; + +var builder = WebApplication.CreateBuilder(args); + +builder.Services.AddControllers(); +builder.Services.AddEndpointsApiExplorer(); +builder.Services.AddSwaggerGen(); + +builder.Services.AddAppInsightsMonitoring(builder.Configuration, "MonitoringAgent"); + +builder.Services.AddHttpClient("MonitoringAgent", client => +{ + client.Timeout = TimeSpan.FromSeconds(5); +}); + +var anomalyConfig = new AnomalyDetectionConfig(); +builder.Configuration.GetSection("ApplicationInsights:AnomalyDetection").Bind(anomalyConfig); +builder.Services.AddSingleton(anomalyConfig); + +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +builder.Services.AddHostedService(); + +var app = builder.Build(); + +if (app.Environment.IsDevelopment()) +{ + app.UseSwagger(); + app.UseSwaggerUI(); +} + +app.UseAppInsightsMonitoring(); +app.MapControllers(); +app.MapHealthChecks("/healthz"); + +app.Run(); diff --git a/src/Services/Monitoring/Monitoring.Agent/Services/AiInsightsEngine.cs b/src/Services/Monitoring/Monitoring.Agent/Services/AiInsightsEngine.cs new file mode 100644 index 0000000..af1c0e3 --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Services/AiInsightsEngine.cs @@ -0,0 +1,181 @@ +using Monitoring.Agent.Models; +using Shared.Monitoring.Metrics; + +namespace Monitoring.Agent.Services; + +/// +/// Generates AI-driven insights by analyzing patterns across service health +/// snapshots. Produces actionable recommendations for performance, reliability, +/// scalability, and cost optimization. 
+///
+public class AiInsightsEngine
+{
+    private readonly ILogger<AiInsightsEngine> _logger;
+    private int _insightCounter;
+
+    public AiInsightsEngine(ILogger<AiInsightsEngine> logger)
+    {
+        _logger = logger;
+    }
+
+    /// <summary>
+    /// Runs the performance, reliability, scalability and cross-service analyses
+    /// over the supplied data and returns every insight they produce.
+    /// </summary>
+    /// <param name="snapshots">Health snapshot per service, keyed by service name.</param>
+    /// <param name="anomalies">
+    /// Detected anomalies per service, keyed by service name. A service may be
+    /// present with an empty list.
+    /// </param>
+    /// <returns>The generated insights; empty when no pattern matched.</returns>
+    /// <exception cref="ArgumentNullException">An argument is null.</exception>
+    public List<AiInsight> GenerateInsights(
+        Dictionary<string, ServiceHealthSnapshot> snapshots,
+        Dictionary<string, List<AnomalyReport>> anomalies)
+    {
+        ArgumentNullException.ThrowIfNull(snapshots);
+        ArgumentNullException.ThrowIfNull(anomalies);
+
+        var insights = new List<AiInsight>();
+
+        AnalyzePerformancePatterns(snapshots, insights);
+        AnalyzeReliabilityPatterns(snapshots, insights);
+        AnalyzeScalabilityPatterns(snapshots, insights);
+        AnalyzeCrossServicePatterns(anomalies, insights);
+
+        return insights;
+    }
+
+    /// <summary>
+    /// Adds one insight listing the services whose P95 response time exceeds
+    /// 1000 ms. Services without recent requests are ignored.
+    /// </summary>
+    private void AnalyzePerformancePatterns(
+        Dictionary<string, ServiceHealthSnapshot> snapshots,
+        List<AiInsight> insights)
+    {
+        var slowServices = snapshots
+            .Where(s => s.Value.P95ResponseTimeMs > 1000 && s.Value.RecentRequestCount > 0)
+            .Select(s => s.Key)
+            .ToList();
+
+        if (slowServices.Count > 0)
+        {
+            insights.Add(new AiInsight(
+                GenerateInsightId(),
+                DateTime.UtcNow,
+                InsightCategory.Performance,
+                "Slow Service Response Times Detected",
+                $"{slowServices.Count} service(s) have P95 response times exceeding 1 second. " +
+                "Consider enabling response caching, optimizing database queries, or implementing read replicas.",
+                // More than two slow services is escalated to High.
+                slowServices.Count > 2 ? InsightPriority.High : InsightPriority.Medium,
+                slowServices,
+                new List<string>
+                {
+                    "Enable Application Insights Profiler to identify hot paths",
+                    "Review database query execution plans for N+1 query patterns",
+                    "Consider implementing response caching for frequently accessed endpoints",
+                    "Evaluate connection pooling configuration"
+                }));
+        }
+    }
+
+    /// <summary>
+    /// Adds an insight for services with an error rate above 1% (only those with
+    /// recent requests) and another for services whose dependency failure ratio
+    /// exceeds 5%.
+    /// </summary>
+    private void AnalyzeReliabilityPatterns(
+        Dictionary<string, ServiceHealthSnapshot> snapshots,
+        List<AiInsight> insights)
+    {
+        var servicesWithErrors = snapshots
+            .Where(s => s.Value.ErrorRatePercent > 1 && s.Value.RecentRequestCount > 0)
+            .Select(s => s.Key)
+            .ToList();
+
+        if (servicesWithErrors.Count > 0)
+        {
+            insights.Add(new AiInsight(
+                GenerateInsightId(),
+                DateTime.UtcNow,
+                InsightCategory.Reliability,
+                "Elevated Error Rates Across Services",
+                $"{servicesWithErrors.Count} service(s) are experiencing error rates above 1%. " +
+                "This may indicate systemic issues with shared dependencies.",
+                InsightPriority.High,
+                servicesWithErrors,
+                new List<string>
+                {
+                    "Check shared dependency health (PostgreSQL, RabbitMQ)",
+                    "Review recent deployment changes for breaking modifications",
+                    "Implement circuit breaker patterns for inter-service communication",
+                    "Set up structured exception logging for root cause analysis"
+                }));
+        }
+
+        // The TotalDependencyCalls > 0 guard keeps the ratio from dividing by zero.
+        var servicesWithDependencyIssues = snapshots
+            .Where(s => s.Value.TotalDependencyCalls > 0 &&
+                        (double)s.Value.FailedDependencyCalls / s.Value.TotalDependencyCalls > 0.05)
+            .Select(s => s.Key)
+            .ToList();
+
+        if (servicesWithDependencyIssues.Count > 0)
+        {
+            insights.Add(new AiInsight(
+                GenerateInsightId(),
+                DateTime.UtcNow,
+                InsightCategory.Reliability,
+                "Dependency Failure Pattern Detected",
+                // State the real count: this insight is also raised for a single service.
+                $"{servicesWithDependencyIssues.Count} service(s) report high dependency failure rates. " +
+                "Downstream services or infrastructure may be degraded.",
+                InsightPriority.Urgent,
+                servicesWithDependencyIssues,
+                new List<string>
+                {
+                    "Verify database connection pool health across all services",
+                    "Check RabbitMQ broker status and queue depths",
+                    "Implement retry policies with exponential backoff",
+                    "Consider adding bulkhead isolation for critical dependencies"
+                }));
+        }
+    }
+
+    /// <summary>
+    /// Adds one insight listing the services whose memory usage exceeds 512 MB.
+    /// </summary>
+    private void AnalyzeScalabilityPatterns(
+        Dictionary<string, ServiceHealthSnapshot> snapshots,
+        List<AiInsight> insights)
+    {
+        var highMemoryServices = snapshots
+            .Where(s => s.Value.MemoryUsageMb > 512)
+            .Select(s => s.Key)
+            .ToList();
+
+        if (highMemoryServices.Count > 0)
+        {
+            insights.Add(new AiInsight(
+                GenerateInsightId(),
+                DateTime.UtcNow,
+                InsightCategory.Scalability,
+                "High Memory Consumption Detected",
+                $"{highMemoryServices.Count} service(s) are using more than 512MB of memory. " +
+                "This may indicate memory leaks or inefficient object allocation.",
+                InsightPriority.Medium,
+                highMemoryServices,
+                new List<string>
+                {
+                    "Capture memory dumps and analyze with dotnet-dump",
+                    "Enable Application Insights memory profiling",
+                    "Review object lifetime management and IDisposable patterns",
+                    "Consider implementing object pooling for high-allocation scenarios"
+                }));
+        }
+    }
+
+    /// <summary>
+    /// Adds a system-wide instability insight when more than five anomalies were
+    /// detected in total.
+    /// </summary>
+    private void AnalyzeCrossServicePatterns(
+        Dictionary<string, List<AnomalyReport>> anomalies,
+        List<AiInsight> insights)
+    {
+        var totalAnomalies = anomalies.Values.Sum(a => a.Count);
+        if (totalAnomalies > 5)
+        {
+            // The dictionary can hold services with an empty anomaly list; only
+            // services that actually reported anomalies are counted and listed.
+            var affectedServices = anomalies
+                .Where(a => a.Value.Count > 0)
+                .Select(a => a.Key)
+                .ToList();
+
+            insights.Add(new AiInsight(
+                GenerateInsightId(),
+                DateTime.UtcNow,
+                InsightCategory.Reliability,
+                "System-Wide Instability Detected",
+                $"{totalAnomalies} anomalies detected across {affectedServices.Count} services. " +
+                "This pattern suggests a cascading failure or shared infrastructure issue.",
+                InsightPriority.Urgent,
+                affectedServices,
+                new List<string>
+                {
+                    "Investigate shared infrastructure components (network, DNS, load balancers)",
+                    "Check for correlated deployment events across services",
+                    "Review API Gateway health and routing configuration",
+                    "Consider enabling distributed tracing correlation for root cause analysis"
+                }));
+        }
+    }
+
+    // Builds an id from an atomically incremented counter and the current UTC time.
+    private string GenerateInsightId() =>
+        $"insight-{Interlocked.Increment(ref _insightCounter):D6}-{DateTime.UtcNow:yyyyMMddHHmmss}";
+}
diff --git a/src/Services/Monitoring/Monitoring.Agent/Services/AnomalyDetectionEngine.cs b/src/Services/Monitoring/Monitoring.Agent/Services/AnomalyDetectionEngine.cs
new file mode 100644
index 0000000..fb883e4
--- /dev/null
+++ b/src/Services/Monitoring/Monitoring.Agent/Services/AnomalyDetectionEngine.cs
@@ -0,0 +1,172 @@
+using Monitoring.Agent.Models;
+using Shared.Monitoring;
+using Shared.Monitoring.Metrics;
+
+namespace Monitoring.Agent.Services;
+
+///
+/// AI-powered anomaly detection engine that analyzes service health snapshots
+/// against configurable thresholds and historical baselines.
+/// Uses statistical analysis to detect performance degradation, error spikes,
+/// and resource exhaustion patterns.
+///
+/// <remarks>
+/// Keeps up to 100 recent snapshots per service in memory; every access to that
+/// history is serialized with a private lock.
+/// </remarks>
+public class AnomalyDetectionEngine
+{
+    // Upper bound on snapshots kept per service; the oldest entry is evicted first.
+    private const int MaxHistoryPerService = 100;
+
+    // Spike detection needs at least this many earlier snapshots that carried traffic.
+    private const int MinBaselineSamples = 3;
+
+    // Memory size, in MB, that the utilisation estimate treats as 100%.
+    private const double AssumedMemoryLimitMb = 1024.0;
+
+    private readonly AnomalyDetectionConfig _config;
+    private readonly ILogger<AnomalyDetectionEngine> _logger;
+    private readonly Dictionary<string, List<ServiceHealthSnapshot>> _history = new();
+    private readonly object _lock = new();
+
+    public AnomalyDetectionEngine(
+        AnomalyDetectionConfig config,
+        ILogger<AnomalyDetectionEngine> logger)
+    {
+        _config = config;
+        _logger = logger;
+    }
+
+    /// <summary>
+    /// Runs every detector against <paramref name="snapshot"/> and then appends the
+    /// snapshot to the service's history.
+    /// </summary>
+    /// <param name="snapshot">The snapshot to evaluate.</param>
+    /// <returns>The anomalies found; empty when none were detected.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="snapshot"/> is null.</exception>
+    public List<AnomalyReport> Analyze(ServiceHealthSnapshot snapshot)
+    {
+        ArgumentNullException.ThrowIfNull(snapshot);
+
+        var anomalies = new List<AnomalyReport>();
+
+        DetectHighErrorRate(snapshot, anomalies);
+        DetectSlowResponseTime(snapshot, anomalies);
+        DetectHighMemoryUsage(snapshot, anomalies);
+        DetectDependencyFailures(snapshot, anomalies);
+        DetectResponseTimeSpike(snapshot, anomalies);
+
+        // Record only after the detectors have run. If the current sample were part
+        // of its own baseline it would raise both the mean and the standard deviation,
+        // and with few samples the 2-sigma spike check could never trigger.
+        RecordSnapshot(snapshot);
+
+        return anomalies;
+    }
+
+    /// <summary>
+    /// Reports an anomaly when the error rate exceeds the configured threshold;
+    /// Critical when it exceeds twice the threshold. Skipped without recent requests.
+    /// </summary>
+    private void DetectHighErrorRate(ServiceHealthSnapshot snapshot, List<AnomalyReport> anomalies)
+    {
+        if (snapshot.RecentRequestCount == 0)
+            return;
+
+        if (snapshot.ErrorRatePercent > _config.ErrorRateThresholdPercent)
+        {
+            var severity = snapshot.ErrorRatePercent > _config.ErrorRateThresholdPercent * 2
+                ? AnomalySeverity.Critical
+                : AnomalySeverity.Warning;
+
+            anomalies.Add(new AnomalyReport(
+                snapshot.ServiceName,
+                DateTime.UtcNow,
+                severity,
+                "ErrorRate",
+                $"Error rate of {snapshot.ErrorRatePercent:F1}% exceeds threshold of {_config.ErrorRateThresholdPercent}%.",
+                snapshot.ErrorRatePercent,
+                _config.ErrorRateThresholdPercent,
+                "Investigate recent deployments and dependency health. Check application logs for recurring exceptions."));
+        }
+    }
+
+    /// <summary>
+    /// Reports an anomaly when the P95 response time exceeds the configured threshold;
+    /// Critical when it exceeds twice the threshold. Skipped without recent requests.
+    /// </summary>
+    private void DetectSlowResponseTime(ServiceHealthSnapshot snapshot, List<AnomalyReport> anomalies)
+    {
+        if (snapshot.RecentRequestCount == 0)
+            return;
+
+        if (snapshot.P95ResponseTimeMs > _config.ResponseTimeThresholdMs)
+        {
+            var severity = snapshot.P95ResponseTimeMs > _config.ResponseTimeThresholdMs * 2
+                ? AnomalySeverity.Critical
+                : AnomalySeverity.Warning;
+
+            anomalies.Add(new AnomalyReport(
+                snapshot.ServiceName,
+                DateTime.UtcNow,
+                severity,
+                "ResponseTime",
+                $"P95 response time of {snapshot.P95ResponseTimeMs:F0}ms exceeds threshold of {_config.ResponseTimeThresholdMs}ms.",
+                snapshot.P95ResponseTimeMs,
+                _config.ResponseTimeThresholdMs,
+                "Profile slow endpoints. Check database query performance and downstream service latency."));
+        }
+    }
+
+    /// <summary>
+    /// Reports a Warning when memory usage, expressed as a percentage of the assumed
+    /// 1024 MB limit, exceeds the configured memory threshold.
+    /// </summary>
+    private void DetectHighMemoryUsage(ServiceHealthSnapshot snapshot, List<AnomalyReport> anomalies)
+    {
+        var estimatedMemoryPercent = snapshot.MemoryUsageMb / AssumedMemoryLimitMb * 100;
+        if (estimatedMemoryPercent > _config.MemoryThresholdPercent)
+        {
+            anomalies.Add(new AnomalyReport(
+                snapshot.ServiceName,
+                DateTime.UtcNow,
+                AnomalySeverity.Warning,
+                "MemoryUsage",
+                $"Memory usage of {snapshot.MemoryUsageMb:F0}MB is elevated.",
+                // Report the percentage so the current value and the threshold share a unit.
+                estimatedMemoryPercent,
+                _config.MemoryThresholdPercent,
+                "Analyze memory allocation patterns. Check for potential memory leaks using dotnet-dump or Application Insights profiler."));
+        }
+    }
+
+    /// <summary>
+    /// Reports a Critical anomaly when the share of failed dependency calls exceeds
+    /// the configured error-rate threshold. Skipped when no dependency calls were made.
+    /// </summary>
+    private void DetectDependencyFailures(ServiceHealthSnapshot snapshot, List<AnomalyReport> anomalies)
+    {
+        if (snapshot.TotalDependencyCalls == 0)
+            return;
+
+        var failureRate = (double)snapshot.FailedDependencyCalls / snapshot.TotalDependencyCalls * 100;
+        if (failureRate > _config.ErrorRateThresholdPercent)
+        {
+            anomalies.Add(new AnomalyReport(
+                snapshot.ServiceName,
+                DateTime.UtcNow,
+                AnomalySeverity.Critical,
+                "DependencyFailure",
+                $"Dependency failure rate of {failureRate:F1}% indicates downstream service issues.",
+                failureRate,
+                _config.ErrorRateThresholdPercent,
+                "Check health of dependent services (database, message broker, external APIs). Verify network connectivity and circuit breaker states."));
+        }
+    }
+
+    /// <summary>
+    /// Reports a Warning when the average response time is more than two standard
+    /// deviations above the mean of the previously recorded snapshots that had traffic.
+    /// </summary>
+    private void DetectResponseTimeSpike(ServiceHealthSnapshot snapshot, List<AnomalyReport> anomalies)
+    {
+        lock (_lock)
+        {
+            if (!_history.TryGetValue(snapshot.ServiceName, out var history) || history.Count < MinBaselineSamples)
+                return;
+
+            var baseline = history
+                .Where(h => h.RecentRequestCount > 0)
+                .Select(h => h.AverageResponseTimeMs)
+                .ToArray();
+
+            if (baseline.Length < MinBaselineSamples)
+                return;
+
+            var mean = baseline.Average();
+            // Population standard deviation of the baseline samples.
+            var stdDev = Math.Sqrt(baseline.Average(v => Math.Pow(v - mean, 2)));
+
+            // A zero deviation (perfectly flat baseline) never triggers a spike.
+            if (stdDev > 0 && snapshot.AverageResponseTimeMs > mean + 2 * stdDev)
+            {
+                anomalies.Add(new AnomalyReport(
+                    snapshot.ServiceName,
+                    DateTime.UtcNow,
+                    AnomalySeverity.Warning,
+                    "ResponseTimeSpike",
+                    $"Response time spike detected: {snapshot.AverageResponseTimeMs:F0}ms vs baseline {mean:F0}ms (2-sigma: {mean + 2 * stdDev:F0}ms).",
+                    snapshot.AverageResponseTimeMs,
+                    mean + 2 * stdDev,
+                    "Correlate with deployment events or traffic pattern changes. Check for GC pauses or thread pool starvation."));
+            }
+        }
+    }
+
+    /// <summary>
+    /// Appends the snapshot to its service's history, evicting the oldest entry once
+    /// the history holds more than 100 snapshots.
+    /// </summary>
+    private void RecordSnapshot(ServiceHealthSnapshot snapshot)
+    {
+        lock (_lock)
+        {
+            // Single lookup instead of ContainsKey followed by repeated indexer access.
+            if (!_history.TryGetValue(snapshot.ServiceName, out var history))
+            {
+                history = new List<ServiceHealthSnapshot>();
+                _history[snapshot.ServiceName] = history;
+            }
+
+            history.Add(snapshot);
+
+            if (history.Count > MaxHistoryPerService)
+                history.RemoveAt(0);
+        }
+    }
+}
diff --git a/src/Services/Monitoring/Monitoring.Agent/Services/HealthScoringEngine.cs b/src/Services/Monitoring/Monitoring.Agent/Services/HealthScoringEngine.cs
new file mode 100644
index 0000000..755a39f
--- /dev/null
+++ b/src/Services/Monitoring/Monitoring.Agent/Services/HealthScoringEngine.cs
@@ -0,0 +1,102 @@
+using Monitoring.Agent.Models;
+using Shared.Monitoring.Metrics;
+
+namespace Monitoring.Agent.Services;
+
+///
+/// Computes a composite health score (0-100) for each service based on
+/// availability, performance, error rate, and resource utilization.
+/// Weights are tuned for microservice workloads. +/// +public class HealthScoringEngine +{ + private const double AvailabilityWeight = 0.30; + private const double PerformanceWeight = 0.30; + private const double ErrorRateWeight = 0.25; + private const double ResourceWeight = 0.15; + + public HealthScore CalculateScore(ServiceHealthSnapshot snapshot, List anomalies) + { + var availability = CalculateAvailabilityScore(snapshot); + var performance = CalculatePerformanceScore(snapshot); + var errorRate = CalculateErrorRateScore(snapshot); + var resource = CalculateResourceScore(snapshot); + + var overall = availability * AvailabilityWeight + + performance * PerformanceWeight + + errorRate * ErrorRateWeight + + resource * ResourceWeight; + + var anomalyPenalty = anomalies.Sum(a => a.Severity switch + { + AnomalySeverity.Critical => 15, + AnomalySeverity.Warning => 5, + _ => 0 + }); + + overall = Math.Max(0, overall - anomalyPenalty); + + return new HealthScore( + Overall: Math.Round(overall, 1), + Availability: Math.Round(availability, 1), + Performance: Math.Round(performance, 1), + ErrorRate: Math.Round(errorRate, 1), + ResourceUsage: Math.Round(resource, 1)); + } + + public ServiceStatus DetermineStatus(double overallScore) => overallScore switch + { + >= 80 => ServiceStatus.Healthy, + >= 50 => ServiceStatus.Degraded, + _ => ServiceStatus.Unhealthy + }; + + private static double CalculateAvailabilityScore(ServiceHealthSnapshot snapshot) + { + if (snapshot.TotalRequests == 0) return 100; + var successRate = 1.0 - (double)snapshot.FailedRequests / snapshot.TotalRequests; + return Math.Max(0, successRate * 100); + } + + private static double CalculatePerformanceScore(ServiceHealthSnapshot snapshot) + { + if (snapshot.RecentRequestCount == 0) return 100; + return snapshot.P95ResponseTimeMs switch + { + < 100 => 100, + < 250 => 90, + < 500 => 80, + < 1000 => 60, + < 2000 => 40, + < 5000 => 20, + _ => 5 + }; + } + + private static double 
CalculateErrorRateScore(ServiceHealthSnapshot snapshot) + { + if (snapshot.RecentRequestCount == 0) return 100; + return snapshot.ErrorRatePercent switch + { + < 0.1 => 100, + < 1 => 90, + < 2 => 75, + < 5 => 50, + < 10 => 25, + _ => 5 + }; + } + + private static double CalculateResourceScore(ServiceHealthSnapshot snapshot) + { + var memoryScore = snapshot.MemoryUsageMb switch + { + < 128 => 100, + < 256 => 90, + < 512 => 75, + < 1024 => 50, + _ => 25 + }; + return memoryScore; + } +} diff --git a/src/Services/Monitoring/Monitoring.Agent/Services/MonitoringBackgroundService.cs b/src/Services/Monitoring/Monitoring.Agent/Services/MonitoringBackgroundService.cs new file mode 100644 index 0000000..8812ae0 --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Services/MonitoringBackgroundService.cs @@ -0,0 +1,118 @@ +using System.Collections.Concurrent; +using Microsoft.ApplicationInsights; +using Monitoring.Agent.Models; + +namespace Monitoring.Agent.Services; + +/// +/// Background service that periodically collects telemetry from all +/// monitored services and publishes AI-analyzed metrics to Application Insights. +/// +public class MonitoringBackgroundService : BackgroundService +{ + private readonly ServiceHealthAggregator _aggregator; + private readonly TelemetryClient _telemetryClient; + private readonly ILogger _logger; + private readonly TimeSpan _interval; + + private static readonly ConcurrentQueue RecentDashboards = new(); + private const int MaxDashboardHistory = 60; + + public MonitoringBackgroundService( + ServiceHealthAggregator aggregator, + TelemetryClient telemetryClient, + IConfiguration configuration, + ILogger logger) + { + _aggregator = aggregator; + _telemetryClient = telemetryClient; + _logger = logger; + + var intervalSeconds = configuration + .GetValue("ApplicationInsights:MetricCollectionIntervalSeconds", 60); + _interval = TimeSpan.FromSeconds(intervalSeconds); + } + + public static MonitoringDashboard? 
LatestDashboard =>
+        // ExecuteAsync enqueues each new dashboard at the tail of the FIFO queue, so
+        // the newest snapshot is the last element. TryPeek returns the head, which is
+        // the oldest retained dashboard. Yields null while the queue is still empty.
+        RecentDashboards.LastOrDefault();
+
+    /// <summary>
+    /// Returns a point-in-time copy of the retained dashboards, oldest first.
+    /// </summary>
+    public static IReadOnlyCollection<MonitoringDashboard> DashboardHistory =>
+        RecentDashboards.ToArray();
+
+    /// <summary>
+    /// Collection loop: builds a dashboard, stores it in the bounded history,
+    /// publishes it to Application Insights, then waits for the configured interval.
+    /// A failing cycle is logged and does not stop the loop.
+    /// </summary>
+    /// <param name="stoppingToken">Signals that the loop should stop.</param>
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        _logger.LogInformation("AI Monitoring Agent started. Collection interval: {Interval}s",
+            _interval.TotalSeconds);
+
+        while (!stoppingToken.IsCancellationRequested)
+        {
+            try
+            {
+                var dashboard = await _aggregator.GetDashboardAsync();
+
+                RecentDashboards.Enqueue(dashboard);
+                // Drop the oldest entries until the history is back within its cap.
+                while (RecentDashboards.Count > MaxDashboardHistory)
+                    RecentDashboards.TryDequeue(out _);
+
+                PublishToAppInsights(dashboard);
+
+                _logger.LogInformation(
+                    "Monitoring cycle complete. System health: {Score}/100 ({Status}). " +
+                    "Services: {Total}, Anomalies: {Anomalies}, Insights: {Insights}",
+                    dashboard.SystemHealthScore,
+                    dashboard.OverallStatus,
+                    dashboard.Services.Count,
+                    dashboard.RecentAnomalies.Count,
+                    dashboard.Insights.Count);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Error during monitoring cycle");
+            }
+
+            await Task.Delay(_interval, stoppingToken);
+        }
+    }
+
+    /// <summary>
+    /// Publishes the dashboard's scores as metrics and its anomalies and insights
+    /// as custom events.
+    /// </summary>
+    private void PublishToAppInsights(MonitoringDashboard dashboard)
+    {
+        _telemetryClient.GetMetric("AI.SystemHealthScore")
+            .TrackValue(dashboard.SystemHealthScore);
+
+        foreach (var service in dashboard.Services)
+        {
+            _telemetryClient.GetMetric("AI.ServiceHealthScore", "ServiceName")
+                .TrackValue(service.HealthScore.Overall, service.ServiceName);
+            _telemetryClient.GetMetric("AI.ServiceAvailability", "ServiceName")
+                .TrackValue(service.HealthScore.Availability, service.ServiceName);
+            _telemetryClient.GetMetric("AI.ServiceResponseTime", "ServiceName")
+                .TrackValue(service.Performance.AverageResponseTimeMs, service.ServiceName);
+            _telemetryClient.GetMetric("AI.ServiceErrorRate", "ServiceName")
+                .TrackValue(service.Performance.ErrorRatePercent, service.ServiceName);
+            _telemetryClient.GetMetric("AI.ServiceMemoryMb", "ServiceName")
.TrackValue(service.Resources.MemoryMb, service.ServiceName); + } + + foreach (var anomaly in dashboard.RecentAnomalies) + { + _telemetryClient.TrackEvent("AI.AnomalyDetected", new Dictionary + { + ["ServiceName"] = anomaly.ServiceName, + ["Severity"] = anomaly.Severity.ToString(), + ["Category"] = anomaly.Category, + ["Description"] = anomaly.Description + }); + } + + foreach (var insight in dashboard.Insights) + { + _telemetryClient.TrackEvent("AI.InsightGenerated", new Dictionary + { + ["InsightId"] = insight.InsightId, + ["Category"] = insight.Category.ToString(), + ["Priority"] = insight.Priority.ToString(), + ["Title"] = insight.Title + }); + } + } +} diff --git a/src/Services/Monitoring/Monitoring.Agent/Services/ServiceHealthAggregator.cs b/src/Services/Monitoring/Monitoring.Agent/Services/ServiceHealthAggregator.cs new file mode 100644 index 0000000..5809406 --- /dev/null +++ b/src/Services/Monitoring/Monitoring.Agent/Services/ServiceHealthAggregator.cs @@ -0,0 +1,206 @@ +using System.Diagnostics; +using Monitoring.Agent.Models; +using Shared.Monitoring; +using Shared.Monitoring.Metrics; + +namespace Monitoring.Agent.Services; + +/// +/// Aggregates health data from all monitored microservices by polling +/// their /healthz endpoints and collecting local process metrics. +/// Produces a unified monitoring dashboard with AI-generated insights. 
/// <summary>
/// Builds the monitoring dashboard: probes the <c>/healthz</c> endpoint of every
/// configured service, samples resource usage of the current process, and runs
/// the anomaly, scoring and insight engines over the result.
/// </summary>
/// <remarks>
/// The performance and resource figures in each report come from
/// <see cref="CreateLocalSnapshot"/>, i.e. from the process hosting this
/// aggregator, with every request counter set to zero. Only the reachability
/// check reflects the remote service itself.
/// </remarks>
public class ServiceHealthAggregator
{
    // MemoryPercent is reported relative to this fixed budget (1 GiB).
    private const double AssumedMemoryBudgetMb = 1024.0;

    private readonly HttpClient _httpClient;
    private readonly AnomalyDetectionEngine _anomalyDetection;
    private readonly HealthScoringEngine _healthScoring;
    private readonly AiInsightsEngine _insightsEngine;
    private readonly ILogger<ServiceHealthAggregator> _logger;
    private readonly Dictionary<string, string> _serviceEndpoints;

    /// <summary>
    /// Creates the aggregator and resolves the list of monitored services from the
    /// <c>MonitoredServices</c> configuration section (service name → base URL).
    /// When the section is missing, a built-in localhost map is used instead.
    /// </summary>
    public ServiceHealthAggregator(
        IHttpClientFactory httpClientFactory,
        AnomalyDetectionEngine anomalyDetection,
        HealthScoringEngine healthScoring,
        AiInsightsEngine insightsEngine,
        IConfiguration configuration,
        ILogger<ServiceHealthAggregator> logger)
    {
        _httpClient = httpClientFactory.CreateClient("MonitoringAgent");
        _httpClient.Timeout = TimeSpan.FromSeconds(5);
        _anomalyDetection = anomalyDetection;
        _healthScoring = healthScoring;
        _insightsEngine = insightsEngine;
        _logger = logger;

        _serviceEndpoints = configuration
            .GetSection("MonitoredServices")
            .Get<Dictionary<string, string>>() ?? new Dictionary<string, string>
            {
                ["Identity"] = "http://localhost:5001",
                ["Customer"] = "http://localhost:5002",
                ["Order"] = "http://localhost:5003",
                ["Product"] = "http://localhost:5004",
                ["Notification"] = "http://localhost:5005",
                ["ApiGateway"] = "http://localhost:5000"
            };
    }

    /// <summary>
    /// Probes all configured services concurrently and assembles the dashboard.
    /// </summary>
    /// <param name="cancellationToken">Cancels the outstanding health probes.</param>
    /// <returns>
    /// A dashboard whose overall status is "Healthy" (score ≥ 80), "Degraded"
    /// (score ≥ 50) or "Unhealthy", based on the average per-service health score.
    /// </returns>
    public async Task<MonitoringDashboard> GetDashboardAsync(CancellationToken cancellationToken = default)
    {
        var probes = await Task.WhenAll(_serviceEndpoints.Select(async endpoint =>
        {
            var result = await ProbeServiceAsync(endpoint.Key, endpoint.Value, cancellationToken);
            return (Name: endpoint.Key, Result: result);
        }));

        var serviceReports = probes.Select(p => p.Result.HealthReport).ToList();
        var snapshots = probes.ToDictionary(p => p.Name, p => p.Result.Snapshot);

        // NOTE(review): the anomaly list is read back from the report, which is
        // built with the very list returned by the anomaly engine. This assumes
        // ServiceHealthReport.ActiveAnomalies is declared with the list type that
        // GenerateInsights expects — confirm against the model definition.
        var allAnomalies = probes.ToDictionary(p => p.Name, p => p.Result.HealthReport.ActiveAnomalies);

        var insights = _insightsEngine.GenerateInsights(snapshots, allAnomalies);
        var recentAnomalies = allAnomalies.Values.SelectMany(a => a).ToList();
        var systemScore = serviceReports.Count > 0
            ? serviceReports.Average(r => r.HealthScore.Overall)
            : 0;

        var overallStatus = systemScore switch
        {
            >= 80 => "Healthy",
            >= 50 => "Degraded",
            _ => "Unhealthy"
        };

        return new MonitoringDashboard(
            GeneratedAt: DateTime.UtcNow,
            OverallStatus: overallStatus,
            SystemHealthScore: Math.Round(systemScore, 1),
            Services: serviceReports,
            RecentAnomalies: recentAnomalies,
            Insights: insights,
            // Target-typed: the dictionary's type arguments come from the
            // MonitoringDashboard.SystemMetrics parameter declaration.
            SystemMetrics: new()
            {
                ["TotalServices"] = _serviceEndpoints.Count,
                ["HealthyServices"] = serviceReports.Count(r => r.Status == ServiceStatus.Healthy),
                ["DegradedServices"] = serviceReports.Count(r => r.Status == ServiceStatus.Degraded),
                ["UnhealthyServices"] = serviceReports.Count(r => r.Status == ServiceStatus.Unhealthy),
                ["TotalAnomalies"] = recentAnomalies.Count,
                ["CriticalAnomalies"] = recentAnomalies.Count(a => a.Severity == AnomalySeverity.Critical)
            });
    }

    /// <summary>
    /// Produces the health report for one service: local snapshot, anomaly
    /// analysis, health score, and a reachability check against its health endpoint.
    /// An unreachable service is reported with <see cref="ServiceStatus.Unknown"/>.
    /// </summary>
    private async Task<ServiceProbeResult> ProbeServiceAsync(
        string serviceName,
        string baseUrl,
        CancellationToken cancellationToken)
    {
        var snapshot = CreateLocalSnapshot(serviceName);
        var anomalies = _anomalyDetection.Analyze(snapshot);
        var healthScore = _healthScoring.CalculateScore(snapshot, anomalies);
        var status = _healthScoring.DetermineStatus(healthScore.Overall);

        var isReachable = await IsReachableAsync(serviceName, baseUrl, cancellationToken);
        if (!isReachable)
            status = ServiceStatus.Unknown;

        var performance = new PerformanceMetrics(
            AverageResponseTimeMs: snapshot.AverageResponseTimeMs,
            P95ResponseTimeMs: snapshot.P95ResponseTimeMs,
            RequestsPerMinute: snapshot.RecentRequestCount,
            ErrorRatePercent: snapshot.ErrorRatePercent,
            ActiveConnections: 0);

        var resources = new ResourceUtilization(
            // Previously the cumulative CPU seconds were passed straight through
            // as a percentage; convert them to an actual utilisation figure.
            CpuPercent: CalculateAverageCpuPercent(snapshot.CpuTimeSeconds),
            MemoryMb: snapshot.MemoryUsageMb,
            MemoryPercent: snapshot.MemoryUsageMb / AssumedMemoryBudgetMb * 100,
            GcGen0Collections: GC.CollectionCount(0),
            GcGen1Collections: GC.CollectionCount(1),
            GcGen2Collections: GC.CollectionCount(2),
            GcTotalMemoryMb: Math.Round(GC.GetTotalMemory(false) / (1024.0 * 1024.0), 2));

        var recommendations = GenerateRecommendations(snapshot, anomalies.Count, isReachable);

        var healthReport = new ServiceHealthReport(
            ServiceName: serviceName,
            Timestamp: DateTime.UtcNow,
            HealthScore: healthScore,
            Status: status,
            Performance: performance,
            Resources: resources,
            ActiveAnomalies: anomalies,
            Recommendations: recommendations);

        return new ServiceProbeResult(healthReport, snapshot);
    }

    /// <summary>
    /// Issues a GET to <c>{baseUrl}/healthz</c> and reports whether it returned a
    /// success status. Transport failures and client timeouts are logged and
    /// reported as unreachable; cancellation requested by the caller propagates.
    /// </summary>
    private async Task<bool> IsReachableAsync(
        string serviceName,
        string baseUrl,
        CancellationToken cancellationToken)
    {
        try
        {
            // Trim so a configured trailing slash does not yield "//healthz".
            using var response = await _httpClient.GetAsync(
                $"{baseUrl.TrimEnd('/')}/healthz", cancellationToken);
            return response.IsSuccessStatusCode;
        }
        catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
        {
            _logger.LogWarning(ex, "Failed to reach {Service} at {Url}", serviceName, baseUrl);
            return false;
        }
    }

    /// <summary>
    /// Converts cumulative CPU time of the current process into an average
    /// utilisation percentage since process start, normalised by core count and
    /// clamped to 0–100.
    /// </summary>
    private static double CalculateAverageCpuPercent(double cpuTimeSeconds)
    {
        using var process = Process.GetCurrentProcess();
        var uptimeSeconds = (DateTime.UtcNow - process.StartTime.ToUniversalTime()).TotalSeconds;
        var capacitySeconds = uptimeSeconds * Environment.ProcessorCount;
        if (capacitySeconds <= 0)
            return 0;

        return Math.Round(Math.Clamp(cpuTimeSeconds / capacitySeconds * 100, 0, 100), 2);
    }

    /// <summary>
    /// Creates a snapshot labelled with <paramref name="serviceName"/> whose memory
    /// and CPU values are read from the current process; all request and
    /// dependency counters are zero.
    /// </summary>
    private static ServiceHealthSnapshot CreateLocalSnapshot(string serviceName)
    {
        // Process wraps an OS handle; release it once the two values are read.
        using var process = Process.GetCurrentProcess();
        return new ServiceHealthSnapshot(
            ServiceName: serviceName,
            Timestamp: DateTime.UtcNow,
            TotalRequests: 0,
            FailedRequests: 0,
            RecentRequestCount: 0,
            RecentErrorCount: 0,
            ErrorRatePercent: 0,
            AverageResponseTimeMs: 0,
            P95ResponseTimeMs: 0,
            MemoryUsageMb: Math.Round(process.WorkingSet64 / (1024.0 * 1024.0), 2),
            CpuTimeSeconds: Math.Round(process.TotalProcessorTime.TotalSeconds, 2),
            TotalDependencyCalls: 0,
            FailedDependencyCalls: 0);
    }

    /// <summary>
    /// Maps probe findings to operator-facing advice. Always returns at least one
    /// entry: the "no action required" message when nothing else applies.
    /// </summary>
    private static List<string> GenerateRecommendations(
        ServiceHealthSnapshot snapshot,
        int anomalyCount,
        bool isReachable)
    {
        var recommendations = new List<string>();

        if (!isReachable)
            recommendations.Add("Service is unreachable. Verify deployment status and network configuration.");

        if (snapshot.P95ResponseTimeMs > 1000)
            recommendations.Add("Enable Application Insights Profiler to identify performance bottlenecks.");

        if (snapshot.ErrorRatePercent > 2)
            recommendations.Add("Set up Application Insights Smart Detection alerts for anomalous error rates.");

        if (anomalyCount > 3)
            recommendations.Add("Multiple anomalies detected. Consider scaling the service or investigating root cause.");

        if (recommendations.Count == 0)
            recommendations.Add("No immediate action required. Service is operating within normal parameters.");

        return recommendations;
    }

    /// <summary>Result of probing one service: its report and the snapshot it was built from.</summary>
    private sealed record ServiceProbeResult(
        ServiceHealthReport HealthReport,
        ServiceHealthSnapshot Snapshot);
}
b/src/Services/Notification/Notification.API/Notification.API.csproj index 25b5fb0..663abf1 100644 --- a/src/Services/Notification/Notification.API/Notification.API.csproj +++ b/src/Services/Notification/Notification.API/Notification.API.csproj @@ -7,8 +7,9 @@ - - + + + diff --git a/src/Services/Notification/Notification.API/Program.cs b/src/Services/Notification/Notification.API/Program.cs index 6219c9c..5b8c1c5 100644 --- a/src/Services/Notification/Notification.API/Program.cs +++ b/src/Services/Notification/Notification.API/Program.cs @@ -3,6 +3,7 @@ using Notification.Infrastructure.Data; using Notification.Infrastructure.Repositories; using Microsoft.EntityFrameworkCore; +using Shared.Monitoring.Extensions; var builder = WebApplication.CreateBuilder(args); @@ -18,6 +19,8 @@ builder.Services.AddScoped(); builder.Services.AddScoped(); +builder.Services.AddAppInsightsMonitoring(builder.Configuration, "NotificationService"); + var app = builder.Build(); using (var scope = app.Services.CreateScope()) @@ -32,6 +35,7 @@ app.UseSwaggerUI(); } +app.UseAppInsightsMonitoring(); app.MapControllers(); app.MapHealthChecks("/healthz"); diff --git a/src/Services/Order/Order.API/Order.API.csproj b/src/Services/Order/Order.API/Order.API.csproj index 54aad4b..ad10c2a 100644 --- a/src/Services/Order/Order.API/Order.API.csproj +++ b/src/Services/Order/Order.API/Order.API.csproj @@ -7,8 +7,9 @@ - - + + + diff --git a/src/Services/Order/Order.API/Program.cs b/src/Services/Order/Order.API/Program.cs index 4512675..2783fef 100644 --- a/src/Services/Order/Order.API/Program.cs +++ b/src/Services/Order/Order.API/Program.cs @@ -1,5 +1,6 @@ using Order.Infrastructure.Data; using Microsoft.EntityFrameworkCore; +using Shared.Monitoring.Extensions; var builder = WebApplication.CreateBuilder(args); @@ -11,6 +12,8 @@ builder.Services.AddDbContext(options => options.UseNpgsql(builder.Configuration.GetConnectionString("DefaultConnection"))); 
/// <summary>
/// Options object bound from the <c>ApplicationInsights</c> configuration section.
/// Every property carries the default used when the section omits it.
/// </summary>
public class AppInsightsConfig
{
    /// <summary>Name of the configuration section this type is bound from.</summary>
    public const string SectionName = "ApplicationInsights";

    /// <summary>Application Insights connection string; empty by default.</summary>
    public string ConnectionString { get; set; } = "";

    /// <summary>Cloud role name override; empty by default.</summary>
    public string CloudRoleName { get; set; } = "";

    /// <summary>Adaptive sampling switch; on by default.</summary>
    public bool EnableAdaptiveSampling { get; set; } = true;

    /// <summary>Sampling percentage; defaults to 100.</summary>
    public double SamplingPercentage { get; set; } = 100d;

    /// <summary>Dependency tracking switch; on by default.</summary>
    public bool EnableDependencyTracking { get; set; } = true;

    /// <summary>Performance counter switch; on by default.</summary>
    public bool EnablePerformanceCounters { get; set; } = true;

    /// <summary>AI diagnostics switch; on by default.</summary>
    public bool EnableAiDiagnostics { get; set; } = true;

    /// <summary>Metric collection interval in seconds; defaults to 60.</summary>
    public int MetricCollectionIntervalSeconds { get; set; } = 60;

    /// <summary>Nested anomaly-detection thresholds; never null by default.</summary>
    public AnomalyDetectionConfig AnomalyDetection { get; set; } = new AnomalyDetectionConfig();
}

/// <summary>
/// Thresholds bound from the <c>AnomalyDetection</c> sub-section.
/// </summary>
public class AnomalyDetectionConfig
{
    /// <summary>Response-time threshold in milliseconds; defaults to 2000.</summary>
    public double ResponseTimeThresholdMs { get; set; } = 2000d;

    /// <summary>Error-rate threshold in percent; defaults to 5.</summary>
    public double ErrorRateThresholdPercent { get; set; } = 5d;

    /// <summary>CPU threshold in percent; defaults to 80.</summary>
    public double CpuThresholdPercent { get; set; } = 80d;

    /// <summary>Memory threshold in percent; defaults to 85.</summary>
    public double MemoryThresholdPercent { get; set; } = 85d;

    /// <summary>Evaluation window in minutes; defaults to 5.</summary>
    public int EvaluationWindowMinutes { get; set; } = 5;
}
/// <summary>
/// Registration helpers that wire Application Insights telemetry, the custom
/// initializers/processor, the metrics collector and the health check into a
/// service with a single call each.
/// </summary>
public static class AppInsightsServiceExtensions
{
    /// <summary>
    /// Registers Application Insights and the shared monitoring components.
    /// </summary>
    /// <param name="services">The service collection to add registrations to.</param>
    /// <param name="configuration">
    /// Configuration root; the <see cref="AppInsightsConfig.SectionName"/> section
    /// is bound to <see cref="AppInsightsConfig"/>.
    /// </param>
    /// <param name="serviceName">
    /// Name used for the metrics collector and, when no <c>CloudRoleName</c> is
    /// configured, as the cloud role name.
    /// </param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static IServiceCollection AddAppInsightsMonitoring(
        this IServiceCollection services,
        IConfiguration configuration,
        string serviceName)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);
        ArgumentNullException.ThrowIfNull(serviceName);

        var config = new AppInsightsConfig();
        configuration.GetSection(AppInsightsConfig.SectionName).Bind(config);

        // One registration path for both cases. The connection string is only
        // overridden when one is configured; the feature switches are applied
        // either way. Previously EnableDependencyTracking and
        // EnablePerformanceCounters were bound but never applied, and
        // EnableAdaptiveSampling was dropped when no connection string was set.
        // SamplingPercentage, EnableAiDiagnostics and
        // MetricCollectionIntervalSeconds are still not consumed here.
        services.AddApplicationInsightsTelemetry(options =>
        {
            if (!string.IsNullOrEmpty(config.ConnectionString))
            {
                options.ConnectionString = config.ConnectionString;
            }

            options.EnableAdaptiveSampling = config.EnableAdaptiveSampling;
            options.EnableDependencyTrackingTelemetryModule = config.EnableDependencyTracking;
            options.EnablePerformanceCounterCollectionModule = config.EnablePerformanceCounters;
        });

        // A configured CloudRoleName wins over the caller-supplied service name.
        var roleName = string.IsNullOrEmpty(config.CloudRoleName)
            ? serviceName
            : config.CloudRoleName;
        services.AddSingleton<ITelemetryInitializer>(new ServiceTelemetryInitializer(roleName));

        services.AddHttpContextAccessor();
        services.AddSingleton<ITelemetryInitializer, CorrelationTelemetryInitializer>();

        services.AddApplicationInsightsTelemetryProcessor<AiDiagnosticTelemetryProcessor>();

        // Fully qualified: Microsoft.ApplicationInsights is not among this file's usings.
        services.AddSingleton(sp => new ServiceMetricsCollector(
            sp.GetRequiredService<Microsoft.ApplicationInsights.TelemetryClient>(),
            serviceName));

        services.AddHealthChecks()
            .AddCheck<AppInsightsHealthCheck>("app-insights");

        return services;
    }

    /// <summary>
    /// Adds <see cref="AiTelemetryMiddleware"/> to the request pipeline.
    /// </summary>
    /// <param name="app">The application builder to extend.</param>
    /// <returns>The builder, for chaining.</returns>
    public static IApplicationBuilder UseAppInsightsMonitoring(this IApplicationBuilder app)
    {
        ArgumentNullException.ThrowIfNull(app);
        return app.UseMiddleware<AiTelemetryMiddleware>();
    }
}
/// <summary>
/// Health check for the Application Insights client. Reports Degraded when
/// neither an instrumentation key nor a connection string is set; otherwise
/// flushes the client and reports Healthy. Any exception yields Unhealthy.
/// </summary>
public class AppInsightsHealthCheck : IHealthCheck
{
    private readonly TelemetryClient _telemetryClient;

    public AppInsightsHealthCheck(TelemetryClient telemetryClient) =>
        _telemetryClient = telemetryClient;

    /// <summary>
    /// Runs the check synchronously and wraps the outcome in a completed task.
    /// </summary>
    /// <param name="context">Health check context (not inspected).</param>
    /// <param name="cancellationToken">Cancellation token (not observed).</param>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        HealthCheckResult outcome;
        try
        {
            // Short-circuits: the configuration is only consulted when the
            // client carries no instrumentation key.
            var isConfigured =
                !string.IsNullOrEmpty(_telemetryClient.InstrumentationKey)
                || !string.IsNullOrEmpty(_telemetryClient.TelemetryConfiguration.ConnectionString);

            if (isConfigured)
            {
                _telemetryClient.Flush();
                outcome = HealthCheckResult.Healthy(
                    "Application Insights telemetry channel is active.");
            }
            else
            {
                outcome = HealthCheckResult.Degraded(
                    "Application Insights connection string is not configured.");
            }
        }
        catch (Exception ex)
        {
            outcome = HealthCheckResult.Unhealthy(
                "Application Insights health check failed.", ex);
        }

        return Task.FromResult(outcome);
    }
}
/// <summary>
/// Records per-request and per-dependency metrics for one service. Keeps
/// lifetime counters plus a five-minute rolling window of individual requests,
/// and forwards each observation to Application Insights as a custom metric.
/// </summary>
public class ServiceMetricsCollector
{
    private readonly TelemetryClient _telemetryClient;
    private readonly string _serviceName;
    private readonly ConcurrentQueue<RequestMetricEntry> _recentRequests = new();
    private readonly TimeSpan _windowSize = TimeSpan.FromMinutes(5);

    // Serialises pruning so a peek and its matching dequeue act on the same entry.
    private readonly object _pruneGate = new();

    private long _totalRequests;
    private long _failedRequests;
    private long _totalDependencyCalls;
    private long _failedDependencyCalls;

    public ServiceMetricsCollector(TelemetryClient telemetryClient, string serviceName)
    {
        _telemetryClient = telemetryClient;
        _serviceName = serviceName;
    }

    /// <summary>
    /// Records one completed request in the counters, the rolling window and the
    /// <c>{service}.RequestDuration</c> / <c>{service}.RequestCount</c> metrics.
    /// </summary>
    /// <param name="endpoint">Endpoint label used as the duration metric dimension.</param>
    /// <param name="durationMs">Request duration in milliseconds.</param>
    /// <param name="success">Whether the request counts as successful.</param>
    /// <param name="statusCode">HTTP status code, used as the count metric dimension.</param>
    public void TrackRequest(string endpoint, double durationMs, bool success, int statusCode)
    {
        Interlocked.Increment(ref _totalRequests);
        if (!success)
            Interlocked.Increment(ref _failedRequests);

        _recentRequests.Enqueue(
            new RequestMetricEntry(DateTime.UtcNow, endpoint, durationMs, success, statusCode));

        PruneOldEntries();

        _telemetryClient.GetMetric($"{_serviceName}.RequestDuration", "Endpoint")
            .TrackValue(durationMs, endpoint);
        _telemetryClient.GetMetric($"{_serviceName}.RequestCount", "StatusCode")
            .TrackValue(1, statusCode.ToString());
    }

    /// <summary>
    /// Records one dependency call in the counters and the
    /// <c>{service}.DependencyDuration</c> metric, dimensioned by
    /// <paramref name="dependencyType"/>. <paramref name="target"/> is accepted
    /// but not recorded.
    /// </summary>
    public void TrackDependency(string dependencyType, string target, double durationMs, bool success)
    {
        Interlocked.Increment(ref _totalDependencyCalls);
        if (!success)
            Interlocked.Increment(ref _failedDependencyCalls);

        _telemetryClient.GetMetric($"{_serviceName}.DependencyDuration", "Type")
            .TrackValue(durationMs, dependencyType);
    }

    /// <summary>
    /// Sends a custom event named <c>{service}.{eventName}</c> with optional properties.
    /// </summary>
    public void TrackCustomEvent(string eventName, IDictionary<string, string>? properties = null)
    {
        _telemetryClient.TrackEvent($"{_serviceName}.{eventName}", properties);
    }

    /// <summary>
    /// Builds a point-in-time snapshot: lifetime counters, statistics over the
    /// rolling window (count, errors, error rate, mean and 95th-percentile
    /// duration) and the current process's working set and CPU time.
    /// </summary>
    public ServiceHealthSnapshot GetHealthSnapshot()
    {
        PruneOldEntries();
        var recent = _recentRequests.ToArray();

        var totalRecent = recent.Length;
        var failedRecent = recent.Count(r => !r.Success);
        var avgDuration = totalRecent > 0
            ? recent.Average(r => r.DurationMs)
            : 0;
        var p95Duration = totalRecent > 0
            ? CalculatePercentile(recent.Select(r => r.DurationMs).ToArray(), 95)
            : 0;

        // Process wraps an OS handle; dispose it once the values are captured.
        using var process = Process.GetCurrentProcess();
        var memoryMb = process.WorkingSet64 / (1024.0 * 1024.0);
        var cpuTime = process.TotalProcessorTime;

        return new ServiceHealthSnapshot(
            ServiceName: _serviceName,
            Timestamp: DateTime.UtcNow,
            TotalRequests: Interlocked.Read(ref _totalRequests),
            FailedRequests: Interlocked.Read(ref _failedRequests),
            RecentRequestCount: totalRecent,
            RecentErrorCount: failedRecent,
            ErrorRatePercent: totalRecent > 0 ? (double)failedRecent / totalRecent * 100 : 0,
            AverageResponseTimeMs: Math.Round(avgDuration, 2),
            P95ResponseTimeMs: Math.Round(p95Duration, 2),
            MemoryUsageMb: Math.Round(memoryMb, 2),
            CpuTimeSeconds: Math.Round(cpuTime.TotalSeconds, 2),
            TotalDependencyCalls: Interlocked.Read(ref _totalDependencyCalls),
            FailedDependencyCalls: Interlocked.Read(ref _failedDependencyCalls));
    }

    /// <summary>
    /// Drops entries older than the rolling window from the head of the queue.
    /// </summary>
    private void PruneOldEntries()
    {
        var cutoff = DateTime.UtcNow - _windowSize;

        // Without the lock, two callers could both peek the same expired head and
        // the second dequeue would then remove a newer, still-valid entry.
        lock (_pruneGate)
        {
            while (_recentRequests.TryPeek(out var oldest) && oldest.Timestamp < cutoff)
                _recentRequests.TryDequeue(out _);
        }
    }

    /// <summary>
    /// Nearest-rank percentile. Sorts <paramref name="values"/> in place, so
    /// callers must pass an array they own. Returns 0 for an empty array.
    /// </summary>
    private static double CalculatePercentile(double[] values, int percentile)
    {
        if (values.Length == 0) return 0;
        Array.Sort(values);
        var index = (int)Math.Ceiling(percentile / 100.0 * values.Length) - 1;
        return values[Math.Max(0, index)];
    }
}

/// <summary>One request observation held in the rolling window.</summary>
public record RequestMetricEntry(
    DateTime Timestamp,
    string Endpoint,
    double DurationMs,
    bool Success,
    int StatusCode);

/// <summary>Point-in-time health figures for one service.</summary>
public record ServiceHealthSnapshot(
    string ServiceName,
    DateTime Timestamp,
    long TotalRequests,
    long FailedRequests,
    int RecentRequestCount,
    int RecentErrorCount,
    double ErrorRatePercent,
    double AverageResponseTimeMs,
    double P95ResponseTimeMs,
    double MemoryUsageMb,
    double CpuTimeSeconds,
    long TotalDependencyCalls,
    long FailedDependencyCalls);
/// <summary>
/// Times every request passing through the pipeline and reports method, path,
/// duration and outcome to the <see cref="ServiceMetricsCollector"/>.
/// </summary>
public class AiTelemetryMiddleware
{
    private readonly RequestDelegate _next;
    private readonly ServiceMetricsCollector _metricsCollector;
    private readonly ILogger<AiTelemetryMiddleware> _logger;

    public AiTelemetryMiddleware(
        RequestDelegate next,
        ServiceMetricsCollector metricsCollector,
        ILogger<AiTelemetryMiddleware> logger)
    {
        _next = next;
        _metricsCollector = metricsCollector;
        _logger = logger;
    }

    /// <summary>
    /// Invokes the rest of the pipeline and records the request afterwards.
    /// Exceptions are logged and rethrown unchanged; the request is still
    /// recorded, as a failure.
    /// </summary>
    /// <param name="context">The current HTTP context.</param>
    public async Task InvokeAsync(HttpContext context)
    {
        var stopwatch = Stopwatch.StartNew();
        var endpoint = $"{context.Request.Method} {context.Request.Path}";
        var faulted = false;

        try
        {
            await _next(context);
        }
        catch (Exception ex)
        {
            faulted = true;
            _logger.LogError(ex, "Unhandled exception on {Endpoint}", endpoint);
            throw;
        }
        finally
        {
            stopwatch.Stop();

            // When the pipeline throws, nothing has set the response status yet,
            // so it still reads as a non-error code. Record such requests as 500
            // rather than counting them as successes.
            var statusCode = context.Response.StatusCode;
            if (faulted && statusCode < StatusCodes.Status400BadRequest)
                statusCode = StatusCodes.Status500InternalServerError;

            _metricsCollector.TrackRequest(
                endpoint,
                stopwatch.Elapsed.TotalMilliseconds,
                success: statusCode < StatusCodes.Status400BadRequest,
                statusCode);
        }
    }
}
/// <summary>
/// Telemetry processor that tags items with diagnostic properties before
/// passing them on: a slow-request flag and duration bucket on requests,
/// failure markers on unsuccessful dependencies, and the exception type name
/// on exceptions.
/// </summary>
public class AiDiagnosticTelemetryProcessor : ITelemetryProcessor
{
    private readonly ITelemetryProcessor _next;
    private readonly double _slowRequestThresholdMs;

    public AiDiagnosticTelemetryProcessor(
        ITelemetryProcessor next,
        double slowRequestThresholdMs = 2000)
    {
        _next = next;
        _slowRequestThresholdMs = slowRequestThresholdMs;
    }

    /// <summary>
    /// Adds the <c>AI.*</c> properties that apply to <paramref name="item"/>'s
    /// type, then forwards the item to the next processor.
    /// </summary>
    public void Process(ITelemetry item)
    {
        switch (item)
        {
            case RequestTelemetry request:
            {
                var elapsedMs = request.Duration.TotalMilliseconds;
                var isSlow = elapsedMs > _slowRequestThresholdMs;
                request.Properties["AI.IsSlowRequest"] = isSlow.ToString();
                request.Properties["AI.DurationBucket"] = GetDurationBucket(elapsedMs);
                break;
            }

            // An unset Success value is treated as success and left untagged.
            case DependencyTelemetry dependency when dependency.Success == false:
                dependency.Properties["AI.FailedDependency"] = "true";
                dependency.Properties["AI.DependencyTarget"] = dependency.Target;
                break;

            case ExceptionTelemetry failure:
            {
                var typeName = failure.Exception?.GetType().Name;
                failure.Properties["AI.ExceptionType"] = typeName ?? "Unknown";
                break;
            }
        }

        _next.Process(item);
    }

    // Buckets: under 100 ms, under 500 ms, under 2000 ms, everything else.
    private static string GetDurationBucket(double ms)
    {
        if (ms < 100) return "Fast";
        if (ms < 500) return "Normal";
        if (ms < 2000) return "Slow";
        return "Critical";
    }
}

/// <summary>
/// Telemetry initializer that copies the <c>X-Correlation-ID</c> request header,
/// when present, into the <c>CorrelationId</c> property of telemetry items.
/// </summary>
public class CorrelationTelemetryInitializer : ITelemetryInitializer
{
    private const string CorrelationIdHeader = "X-Correlation-ID";
    private readonly IHttpContextAccessor _httpContextAccessor;

    public CorrelationTelemetryInitializer(IHttpContextAccessor httpContextAccessor) =>
        _httpContextAccessor = httpContextAccessor;

    /// <summary>
    /// Does nothing when there is no current HTTP context, the header is absent,
    /// or the telemetry item does not support custom properties.
    /// </summary>
    public void Initialize(ITelemetry telemetry)
    {
        var request = _httpContextAccessor.HttpContext?.Request;
        if (request is null)
            return;

        if (!request.Headers.TryGetValue(CorrelationIdHeader, out var headerValue))
            return;

        if (telemetry is ISupportProperties target)
            target.Properties["CorrelationId"] = headerValue.ToString();
    }
}
/// <summary>
/// Telemetry initializer that stamps every item with a fixed cloud role name
/// and the machine name as the role instance.
/// </summary>
public class ServiceTelemetryInitializer : ITelemetryInitializer
{
    private readonly string _roleName;

    // Captured once at construction.
    private readonly string _roleInstance = Environment.MachineName;

    public ServiceTelemetryInitializer(string roleName) => _roleName = roleName;

    /// <summary>
    /// Overwrites the cloud role name and role instance on <paramref name="telemetry"/>.
    /// </summary>
    public void Initialize(ITelemetry telemetry)
    {
        var cloud = telemetry.Context.Cloud;
        cloud.RoleName = _roleName;
        cloud.RoleInstance = _roleInstance;
    }
}