diff --git a/framework/.changeset/v0.14.2.md b/framework/.changeset/v0.14.2.md new file mode 100644 index 000000000..18f1e0cf6 --- /dev/null +++ b/framework/.changeset/v0.14.2.md @@ -0,0 +1 @@ +- Observability stack: update Tempo version to latest, enable traces-to-profiles, run Pyroscope with bigger and higher resolution profiles \ No newline at end of file diff --git a/framework/components/blockchain/anvil.go b/framework/components/blockchain/anvil.go index f8fac1a7e..c96439a08 100644 --- a/framework/components/blockchain/anvil.go +++ b/framework/components/blockchain/anvil.go @@ -36,7 +36,7 @@ func defaultAnvil(in *Input) { in.Port = "8545" } if in.ContainerName == "" { - in.ContainerName = "anvil" + in.ContainerName = "anvil-" + in.ChainID } } diff --git a/framework/components/clnode/default.go b/framework/components/clnode/default.go index 7ed8f5383..206cf65db 100644 --- a/framework/components/clnode/default.go +++ b/framework/components/clnode/default.go @@ -22,6 +22,7 @@ Level = 'debug' [Pyroscope] ServerAddress = 'http://host.docker.internal:4040' Environment = 'local' +LinkTracesToProfiles = true [WebServer] HTTPWriteTimeout = '30s' diff --git a/framework/observability/compose/conf/provisioning/datasources/loki.yaml b/framework/observability/compose/conf/provisioning/datasources/loki.yaml index 5e8cc01bb..a176bc3bb 100644 --- a/framework/observability/compose/conf/provisioning/datasources/loki.yaml +++ b/framework/observability/compose/conf/provisioning/datasources/loki.yaml @@ -3,22 +3,46 @@ apiVersion: 1 datasources: - name: Tempo type: tempo + uid: tempo access: proxy url: http://tempo:3200 isDefault: false + jsonData: + tracesToLogsV2: + datasourceUid: P8E80F9AEF21F6940 + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + filterByTraceID: true + filterBySpanID: true + tracesToProfiles: + datasourceUid: pyroscope + profileTypeId: 'process_cpu:cpu:nanoseconds:cpu:nanoseconds' + customQuery: true + query: '{service_name="chainlink-node"}' + serviceMap: + datasourceUid: PBFA97CFB590B2093 + nodeGraph: + enabled: true - name: Loki type: loki + uid: P8E80F9AEF21F6940 isDefault: true access: proxy url: http://loki:3100 jsonData: maxLines: 5000 + derivedFields: + - datasourceUid: tempo + matcherRegex: '"traceID":"([a-f0-9]+)"' + name: TraceID + url: '$${__value.raw}' - name: Prometheus type: prometheus + uid: PBFA97CFB590B2093 access: proxy - url: http://prometheus:9090 # Replace with your Prometheus URL + url: http://prometheus:9090 jsonData: - timeInterval: 5s # Adjust this interval as needed + timeInterval: 5s - name: PostgreSQL 0 type: postgres @@ -82,8 +106,11 @@ datasources: - name: Pyroscope type: grafana-pyroscope-datasource + uid: pyroscope access: proxy url: http://pyroscope:4040 isDefault: false version: 1 editable: true + jsonData: + maxNodes: 100000 diff --git a/framework/observability/compose/docker-compose.yaml b/framework/observability/compose/docker-compose.yaml index 11175ba8b..8daf27cb7 100644 --- a/framework/observability/compose/docker-compose.yaml +++ b/framework/observability/compose/docker-compose.yaml @@ -1,6 +1,7 @@ services: tempo: - image: grafana/tempo:2.3.1 + image: grafana/tempo:2.7.1 + user: '0:0' command: [ "-config.file=/etc/tempo.yaml" ] volumes: - ./tempo.yaml:/etc/tempo.yaml @@ -10,7 +11,11 @@ services: - "9411:9411" # zipkin - "14268:14268" # jaeger ingest - "14250:14250" # jaeger grpc - - "14400:55680" # otlp http legacy + deploy: + resources: + limits: + memory: 2G + restart: unless-stopped otel-collector: image: otel/opentelemetry-collector-contrib:0.123.0 volumes: @@ -40,6 +45,10 @@ services: prometheus: image: prom/prometheus:v3.4.1 user: '0:0' + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--web.enable-remote-write-receiver' + - '--enable-feature=exemplar-storage' volumes: - /var/run/docker.sock:/var/run/docker.sock - ./conf/prometheus.yml:/etc/prometheus/prometheus.yml @@ -88,8 +97,9 @@ services: ports: - '4040:4040' command: - - 'server' + - '-config.file=/etc/pyroscope/config.yaml' volumes: + - ./pyroscope-config.yaml:/etc/pyroscope/config.yaml - pyroscope-storage:/var/lib/pyroscope postgres_exporter_0: diff --git a/framework/observability/compose/pyroscope-config.yaml b/framework/observability/compose/pyroscope-config.yaml new file mode 100644 index 000000000..6492bf3e6 --- /dev/null +++ b/framework/observability/compose/pyroscope-config.yaml @@ -0,0 +1,22 @@ +# Pyroscope server configuration +# Increased limits to preserve more profile data for comparison analysis + +limits: + # Ingestion rate limits (default: 4 MiB/s rate, 2 MiB burst) + ingestion_rate_mb: 100 + ingestion_burst_size_mb: 50 + + # Disable sample limit to prevent profile rejection (default: 16000) + max_profile_stacktrace_samples: 0 + + # Increase max stacktrace depth (default: 1000) + max_profile_stacktrace_depth: 2000 + + # Increase default flame graph nodes shown (default: 8192) + max_flamegraph_nodes_default: 100000 + + # Disable max flame graph nodes limit (default: 1048576) + max_flamegraph_nodes_max: 0 + + # Increase max profile size (default: 4MB) + max_profile_size_bytes: 16777216 diff --git a/framework/observability/compose/tempo.yaml b/framework/observability/compose/tempo.yaml index 0cffaae26..6925ab742 100644 --- a/framework/observability/compose/tempo.yaml +++ b/framework/observability/compose/tempo.yaml @@ -6,10 +6,54 @@ distributor: otlp: protocols: grpc: + endpoint: 0.0.0.0:4317 http: + endpoint: 0.0.0.0:4318 + +# Query frontend for TraceQL metrics +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + metrics: + max_duration: 0s + +# Querier configuration +querier: + search: + prefer_self: 10 + +# Metrics generator for Grafana Drilldown (RED metrics from traces) +metrics_generator: + registry: + external_labels: + source: tempo + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + processor: + service_graphs: + dimensions: + - service.name + - service.namespace + span_metrics: + dimensions: + - service.name + - span.name + - span.kind + - status.code + local_blocks: + flush_to_storage: false + max_block_bytes: 100000000 # 100MB max block size for metrics generator overrides: max_traces_per_user: 50000 + metrics_generator_processors: + - service-graphs + - span-metrics + - local-blocks storage: trace: