From 7e1f6705d37a5bf05db39e2a91f6632cc80959a8 Mon Sep 17 00:00:00 2001 From: Dennis Kliban Date: Mon, 20 Apr 2026 11:55:35 -0400 Subject: [PATCH] fix: prevent gunicorn worker recycling from corrupting histogram aggregation Gunicorn worker recycling causes in-memory Prometheus counters to reset. The OTel aggregation pipeline strips worker.name and sums all workers into a single cumulative counter via groupbyattrs. When a worker recycles, its counter resets to 0, decreasing the aggregate. This manifests as a "hidden counter reset" in Prometheus: if the recycled worker's final le=+Inf value coincidentally equals the new worker's starting value (e.g. both are 1 because the new worker immediately handled a slow request), Prometheus does not detect the reset for le=+Inf. But le=1000 resets visibly. This inflates rate(le=1000) relative to rate(le=+Inf), producing SLI ratios greater than 1. Fix: insert cumulativetodelta before worker aggregation so we sum per-worker deltas (always non-negative) instead of cumulative totals. Worker recycles produce a 0-delta rather than a negative value that corrupts the aggregate. Add deltatocumulative after groupbyattrs to convert the aggregate delta back to a cumulative counter for the Prometheus exporter. 
Co-Authored-By: Claude Sonnet 4.6 --- deploy/clowdapp.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deploy/clowdapp.yaml b/deploy/clowdapp.yaml index 8d39da83..d5b954c3 100644 --- a/deploy/clowdapp.yaml +++ b/deploy/clowdapp.yaml @@ -61,7 +61,7 @@ objects: metrics/aggregation: receivers: [otlp] - processors: + processors: - memory_limiter - filter/filter_pulp_api_request_duration - attributes/remove_worker_name @@ -373,6 +373,8 @@ objects: value: ${{OTEL_PYTHON_EXCLUDED_URLS}} - name: PULP_OTEL_PULP_API_HISTOGRAM_BUCKETS value: ${PULP_OTEL_PULP_API_HISTOGRAM_BUCKETS} + - name: OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE + value: "delta" - name: PULP_REDIS_PORT value: "6379" - name: SENTRY_DSN @@ -543,6 +545,8 @@ objects: value: ${OTEL_METRIC_EXPORT_TIMEOUT} - name: OTEL_TRACES_EXPORTER value: "none" + - name: OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE + value: "delta" - name: PULP_REDIS_PORT value: "6379" - name: SENTRY_DSN @@ -658,6 +662,8 @@ objects: value: ${{OTEL_EXPORTER_OTLP_ENDPOINT}} - name: OTEL_TRACES_EXPORTER value: "none" + - name: OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE + value: "delta" - name: PULP_OTEL_METRICS_DISPATCH_INTERVAL_MINUTES value: ${PULP_OTEL_METRICS_DISPATCH_INTERVAL_MINUTES} - name: PULP_REDIS_PORT