diff --git a/internal/common/ingest/metrics/metrics.go b/internal/common/ingest/metrics/metrics.go index 61ca2232d05..7ff0550037f 100644 --- a/internal/common/ingest/metrics/metrics.go +++ b/internal/common/ingest/metrics/metrics.go @@ -79,7 +79,7 @@ func NewMetrics(prefix string) *Metrics { pulsarMessageProcessingDelay: promauto.NewGaugeVec(pulsarMessageProcessingDelayOpts, []string{"subscription", "partition"}), pulsarMessagePublishTime: promauto.NewGaugeVec(pulsarMessagePublishTime, []string{"subscription", "partition"}), pulsarMessagesProcessed: promauto.NewCounter(pulsarMessagesProcessedOpts), - eventsProcessed: promauto.NewCounterVec(eventsProcessedOpts, []string{"queue", "eventType", "msgType"}), + eventsProcessed: promauto.NewCounterVec(eventsProcessedOpts, []string{"queue", "eventType", "event_type", "msgType", "msg_type"}), } } @@ -110,9 +110,9 @@ func (m *Metrics) RecordPulsarProcessingDelay(subscriptionName string, partition } func (m *Metrics) RecordEventSequenceProcessed(queue string, msgType string) { - m.eventsProcessed.With(map[string]string{"queue": queue, "eventType": JobSetEventsLabel, "msgType": msgType}).Inc() + m.eventsProcessed.With(map[string]string{"queue": queue, "eventType": JobSetEventsLabel, "event_type": JobSetEventsLabel, "msgType": msgType, "msg_type": msgType}).Inc() } func (m *Metrics) RecordControlPlaneEventProcessed(msgType string) { - m.eventsProcessed.With(map[string]string{"queue": "N/A", "eventType": ControlPlaneEventsLabel, "msgType": msgType}).Inc() + m.eventsProcessed.With(map[string]string{"queue": "N/A", "eventType": ControlPlaneEventsLabel, "event_type": ControlPlaneEventsLabel, "msgType": msgType, "msg_type": msgType}).Inc() } diff --git a/internal/common/metrics/scheduler_metrics.go b/internal/common/metrics/scheduler_metrics.go index 8e22fb029d3..8798140cc41 100644 --- a/internal/common/metrics/scheduler_metrics.go +++ b/internal/common/metrics/scheduler_metrics.go @@ -46,133 +46,133 @@ var QueueDistinctSchedulingKeysDesc = prometheus.NewDesc( var QueueResourcesDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_queued", "Resource required by queued jobs", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource", "accounting_role"}, nil, ) var MinQueueResourcesDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_queued_min", "Min resource required by queued job", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource", "accounting_role"}, nil, ) var MaxQueueResourcesDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_queued_max", "Max resource required by queued job", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource", "accounting_role"}, nil, ) var MedianQueueResourcesDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_queued_median", "Median resource required by queued jobs", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource", "accounting_role"}, nil, ) var CountQueueResourcesDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_queued_count", "Count of queued jobs requiring resource", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource", "accounting_role"}, nil, ) var MinQueueDurationDesc = prometheus.NewDesc( MetricPrefix+"job_queued_seconds_min", "Min queue time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "accounting_role"}, nil, ) var MaxQueueDurationDesc = prometheus.NewDesc( MetricPrefix+"job_queued_seconds_max", "Max queue time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "accounting_role"}, nil, ) var MedianQueueDurationDesc = prometheus.NewDesc( MetricPrefix+"job_queued_seconds_median", "Median queue time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "accounting_role"}, nil, ) var QueueDurationDesc = prometheus.NewDesc( MetricPrefix+"job_queued_seconds", "Queued time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "accounting_role"}, nil, ) var MinJobRunDurationDesc = prometheus.NewDesc( MetricPrefix+"job_run_time_seconds_min", "Min run time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue"}, nil, ) var MaxJobRunDurationDesc = prometheus.NewDesc( MetricPrefix+"job_run_time_seconds_max", "Max run time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue"}, nil, ) var MedianJobRunDurationDesc = prometheus.NewDesc( MetricPrefix+"job_run_time_seconds_median", "Median run time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue"}, nil, ) var JobRunDurationDesc = prometheus.NewDesc( MetricPrefix+"job_run_time_seconds", "Run time for Armada jobs", - []string{"pool", "priorityClass", "queueName", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue"}, nil, ) var QueueAllocatedDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_allocated", "Resource allocated to running jobs of a queue", - []string{"cluster", "pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType", "nodeType", "reservation", "physical_pool"}, + []string{"cluster", "pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource", "nodeType", "node_type", "reservation", "physical_pool"}, nil, ) var MinQueueAllocatedDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_allocated_min", "Min resource allocated by a running job", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource"}, nil, ) var MaxQueueAllocatedDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_allocated_max", "Max resource allocated by a running job", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource"}, nil, ) var MedianQueueAllocatedDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_allocated_median", "Median resource allocated by a running job", - []string{"pool", "priorityClass", "queueName", "queue", "priceBand", "resourceType"}, + []string{"pool", "priorityClass", "priority_class", "queueName", "queue", "priceBand", "price_band", "resourceType", "resource"}, nil, ) var QueueUsedDesc = prometheus.NewDesc( MetricPrefix+"queue_resource_used", "Resource actually being used by running jobs of a queue", - []string{"cluster", "pool", "queueName", "queue", "resourceType", "nodeType", "reservation", "physical_pool"}, + []string{"cluster", "pool", "queueName", "queue", "resourceType", "resource", "nodeType", "node_type", "reservation", "physical_pool"}, nil, ) var QueueLeasedPodCountDesc = prometheus.NewDesc( MetricPrefix+"queue_leased_pod_count", "Number of leased pods", - []string{"cluster", "pool", "queueName", "queue", "phase", "nodeType", "reservation"}, + []string{"cluster", "pool", "queueName", "queue", "phase", "nodeType", "node_type", "reservation"}, nil, ) @@ -193,7 +193,7 @@ var NodeJobPhaseCounterDesc = prometheus.NewDesc( var ClusterCapacityDesc = prometheus.NewDesc( MetricPrefix+"cluster_capacity", "Cluster capacity", - []string{"cluster", "pool", "resourceType", "nodeType", "reservation", "physical_pool", "capacity_class"}, + []string{"cluster", "pool", "resourceType", "resource", "nodeType", "node_type", "reservation", "physical_pool", "capacity_class"}, nil, ) @@ -207,7 +207,7 @@ var ClusterCapacityDesc = prometheus.NewDesc( var ClusterFarmCapacityDesc = prometheus.NewDesc( MetricPrefix+"cluster_farm_capacity", "Cluster capacity less usage from non-Armada pods", - []string{"cluster", "pool", "resourceType", "nodeType", "reservation", "physical_pool", "capacity_class"}, + []string{"cluster", "pool", "resourceType", "resource", "nodeType", "node_type", "reservation", "physical_pool", "capacity_class"}, nil, ) @@ -221,14 +221,14 @@ var ClusterFarmCapacityDesc = prometheus.NewDesc( var ClusterAvailableCapacityDesc = prometheus.NewDesc( MetricPrefix+"cluster_available_capacity", "Cluster capacity available for Armada jobs", - []string{"cluster", "pool", "resourceType", "nodeType", "reservation", "physical_pool", "capacity_class"}, + []string{"cluster", "pool", "resourceType", "resource", "nodeType", "node_type", "reservation", "physical_pool", "capacity_class"}, nil, ) var ClusterCordonedStatusDesc = prometheus.NewDesc( MetricPrefix+"cluster_cordoned_status", "Cluster cordoned status", - []string{"cluster", "reason", "setByUser"}, + []string{"cluster", "reason", "setByUser", "set_by_user"}, nil, ) @@ -242,49 +242,49 @@ var QueuePriorityDesc = prometheus.NewDesc( var MinQueuePriceQueuedDesc = prometheus.NewDesc( MetricPrefix+"queue_price_queued_min", "Minimum price of queued jobs", - []string{"pool", "priorityClass", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queue", "accounting_role"}, nil, ) var MaxQueuePriceQueuedDesc = prometheus.NewDesc( MetricPrefix+"queue_price_queued_max", "Maximum price of queued jobs", - []string{"pool", "priorityClass", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queue", "accounting_role"}, nil, ) var MedianQueuePriceQueuedDesc = prometheus.NewDesc( MetricPrefix+"queue_price_queued_median", "Median price of queued jobs", - []string{"pool", "priorityClass", "queue", "accounting_role"}, + []string{"pool", "priorityClass", "priority_class", "queue", "accounting_role"}, nil, ) var MinQueuePriceRunningDesc = prometheus.NewDesc( MetricPrefix+"queue_price_running_min", "Minimum price of running jobs", - []string{"pool", "priorityClass", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queue"}, nil, ) var MaxQueuePriceRunningDesc = prometheus.NewDesc( MetricPrefix+"queue_price_running_max", "Maximum price of running jobs", - []string{"pool", "priorityClass", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queue"}, nil, ) var MedianQueuePriceRunningDesc = prometheus.NewDesc( MetricPrefix+"queue_price_running_median", "Median price of running jobs", - []string{"pool", "priorityClass", "queue"}, + []string{"pool", "priorityClass", "priority_class", "queue"}, nil, ) var QueuePriceBandPhaseBidDesc = prometheus.NewDesc( MetricPrefix+"queue_price_band_phase_bid", "Bid price for a queues price band", - []string{"pool", "queueName", "queue", "phase", "priceBand"}, + []string{"pool", "queueName", "queue", "phase", "priceBand", "price_band"}, nil, ) @@ -466,95 +466,95 @@ func NewQueueDistinctSchedulingKeyMetric(value int, queue string) prometheus.Met } func NewQueueDuration(count uint64, sum float64, buckets map[float64]uint64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstHistogram(QueueDurationDesc, count, sum, buckets, pool, priorityClass, queue, queue, accountingRole) + return prometheus.MustNewConstHistogram(QueueDurationDesc, count, sum, buckets, pool, priorityClass, priorityClass, queue, queue, accountingRole) } func NewQueueResources(value float64, pool string, priorityClass string, queue string, priceBand string, resource string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(QueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource, accountingRole) + return prometheus.MustNewConstMetric(QueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource, accountingRole) } func NewMaxQueueResources(value float64, pool string, priorityClass string, queue string, priceBand string, resource string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MaxQueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource, accountingRole) + return prometheus.MustNewConstMetric(MaxQueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource, accountingRole) } func NewMinQueueResources(value float64, pool string, priorityClass string, queue string, priceBand string, resource string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MinQueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource, accountingRole) + return prometheus.MustNewConstMetric(MinQueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource, accountingRole) } func NewMedianQueueResources(value float64, pool string, priorityClass string, queue string, priceBand string, resource string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MedianQueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource, accountingRole) + return prometheus.MustNewConstMetric(MedianQueueResourcesDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource, accountingRole) } func NewCountQueueResources(value uint64, pool string, priorityClass string, queue string, priceBand string, resource string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(CountQueueResourcesDesc, prometheus.GaugeValue, float64(value), pool, priorityClass, queue, queue, priceBand, resource, accountingRole) + return prometheus.MustNewConstMetric(CountQueueResourcesDesc, prometheus.GaugeValue, float64(value), pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource, accountingRole) } func NewMinQueueDuration(value float64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MinQueueDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, accountingRole) + return prometheus.MustNewConstMetric(MinQueueDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, accountingRole) } func NewMaxQueueDuration(value float64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MaxQueueDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, accountingRole) + return prometheus.MustNewConstMetric(MaxQueueDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, accountingRole) } func NewMedianQueueDuration(value float64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MedianQueueDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, accountingRole) + return prometheus.MustNewConstMetric(MedianQueueDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, accountingRole) } func NewMinJobRunDuration(value float64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstMetric(MinJobRunDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue) + return prometheus.MustNewConstMetric(MinJobRunDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue) } func NewMaxJobRunDuration(value float64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstMetric(MaxJobRunDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue) + return prometheus.MustNewConstMetric(MaxJobRunDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue) } func NewMedianJobRunDuration(value float64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstMetric(MedianJobRunDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue) + return prometheus.MustNewConstMetric(MedianJobRunDurationDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue) } func NewJobRunRunDuration(count uint64, sum float64, buckets map[float64]uint64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstHistogram(JobRunDurationDesc, count, sum, buckets, pool, priorityClass, queue, queue) + return prometheus.MustNewConstHistogram(JobRunDurationDesc, count, sum, buckets, pool, priorityClass, priorityClass, queue, queue) } func NewMinQueueAllocated(value float64, pool string, priorityClass string, queue string, priceBand string, resource string) prometheus.Metric { - return prometheus.MustNewConstMetric(MinQueueAllocatedDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource) + return prometheus.MustNewConstMetric(MinQueueAllocatedDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource) } func NewMaxQueueAllocated(value float64, pool string, priorityClass string, queue string, priceBand string, resource string) prometheus.Metric { - return prometheus.MustNewConstMetric(MaxQueueAllocatedDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource) + return prometheus.MustNewConstMetric(MaxQueueAllocatedDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource) } func NewMedianQueueAllocated(value float64, pool string, priorityClass string, queue string, priceBand string, resource string) prometheus.Metric { - return prometheus.MustNewConstMetric(MedianQueueAllocatedDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, queue, priceBand, resource) + return prometheus.MustNewConstMetric(MedianQueueAllocatedDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource) } func NewQueueAllocated(value float64, queue string, cluster string, pool string, priorityClass string, priceBand string, resource string, nodeType string, reservation string, physicalPool string) prometheus.Metric { - return prometheus.MustNewConstMetric(QueueAllocatedDesc, prometheus.GaugeValue, value, cluster, pool, priorityClass, queue, queue, priceBand, resource, nodeType, reservation, physicalPool) + return prometheus.MustNewConstMetric(QueueAllocatedDesc, prometheus.GaugeValue, value, cluster, pool, priorityClass, priorityClass, queue, queue, priceBand, priceBand, resource, resource, nodeType, nodeType, reservation, physicalPool) } func NewQueueLeasedPodCount(value float64, cluster string, pool string, queue string, phase string, nodeType string, reservation string) prometheus.Metric { - return prometheus.MustNewConstMetric(QueueLeasedPodCountDesc, prometheus.GaugeValue, value, cluster, pool, queue, queue, phase, nodeType, reservation) + return prometheus.MustNewConstMetric(QueueLeasedPodCountDesc, prometheus.GaugeValue, value, cluster, pool, queue, queue, phase, nodeType, nodeType, reservation) } func NewClusterAvailableCapacity(value float64, cluster string, pool string, resource string, nodeType string, reservation string, physicalPool string, capacityClass string) prometheus.Metric { - return prometheus.MustNewConstMetric(ClusterAvailableCapacityDesc, prometheus.GaugeValue, value, cluster, pool, resource, nodeType, reservation, physicalPool, capacityClass) + return prometheus.MustNewConstMetric(ClusterAvailableCapacityDesc, prometheus.GaugeValue, value, cluster, pool, resource, resource, nodeType, nodeType, reservation, physicalPool, capacityClass) } func NewClusterFarmCapacity(value float64, cluster string, pool string, resource string, nodeType string, reservation string, physicalPool string, capacityClass string) prometheus.Metric { - return prometheus.MustNewConstMetric(ClusterFarmCapacityDesc, prometheus.GaugeValue, value, cluster, pool, resource, nodeType, reservation, physicalPool, capacityClass) + return prometheus.MustNewConstMetric(ClusterFarmCapacityDesc, prometheus.GaugeValue, value, cluster, pool, resource, resource, nodeType, nodeType, reservation, physicalPool, capacityClass) } func NewClusterTotalCapacity(value float64, cluster string, pool string, resource string, nodeType string, reservation string, physicalPool string, capacityClass string) prometheus.Metric { - return prometheus.MustNewConstMetric(ClusterCapacityDesc, prometheus.GaugeValue, value, cluster, pool, resource, nodeType, reservation, physicalPool, capacityClass) + return prometheus.MustNewConstMetric(ClusterCapacityDesc, prometheus.GaugeValue, value, cluster, pool, resource, resource, nodeType, nodeType, reservation, physicalPool, capacityClass) } func NewClusterCordonedStatus(value float64, cluster string, reason string, setByUser string) prometheus.Metric { - return prometheus.MustNewConstMetric(ClusterCordonedStatusDesc, prometheus.GaugeValue, value, cluster, reason, setByUser) + return prometheus.MustNewConstMetric(ClusterCordonedStatusDesc, prometheus.GaugeValue, value, cluster, reason, setByUser, setByUser) } func NewQueueUsed(value float64, queue string, cluster string, pool string, resource string, nodeType string, reservation string, physicalPool string) prometheus.Metric { - return prometheus.MustNewConstMetric(QueueUsedDesc, prometheus.GaugeValue, value, cluster, pool, queue, queue, resource, nodeType, reservation, physicalPool) + return prometheus.MustNewConstMetric(QueueUsedDesc, prometheus.GaugeValue, value, cluster, pool, queue, queue, resource, resource, nodeType, nodeType, reservation, physicalPool) } func NewQueuePriorityMetric(value float64, queue string) prometheus.Metric { @@ -562,31 +562,31 @@ func NewQueuePriorityMetric(value float64, queue string) prometheus.Metric { } func NewMinQueuePriceQueuedMetric(value float64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MinQueuePriceQueuedDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, accountingRole) + return prometheus.MustNewConstMetric(MinQueuePriceQueuedDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, accountingRole) } func NewMaxQueuePriceQueuedMetric(value float64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MaxQueuePriceQueuedDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, accountingRole) + return prometheus.MustNewConstMetric(MaxQueuePriceQueuedDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, accountingRole) } func NewMedianQueuePriceQueuedMetric(value float64, pool string, priorityClass string, queue string, accountingRole string) prometheus.Metric { - return prometheus.MustNewConstMetric(MedianQueuePriceQueuedDesc, prometheus.GaugeValue, value, pool, priorityClass, queue, accountingRole) + return prometheus.MustNewConstMetric(MedianQueuePriceQueuedDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue, accountingRole) } func NewMinQueuePriceRunningMetric(value float64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstMetric(MinQueuePriceRunningDesc, prometheus.GaugeValue, value, pool, priorityClass, queue) + return prometheus.MustNewConstMetric(MinQueuePriceRunningDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue) } func NewMaxQueuePriceRunningMetric(value float64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstMetric(MaxQueuePriceRunningDesc, prometheus.GaugeValue, value, pool, priorityClass, queue) + return prometheus.MustNewConstMetric(MaxQueuePriceRunningDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue) } func NewMedianQueuePriceRunningMetric(value float64, pool string, priorityClass string, queue string) prometheus.Metric { - return prometheus.MustNewConstMetric(MedianQueuePriceRunningDesc, prometheus.GaugeValue, value, pool, priorityClass, queue) + return prometheus.MustNewConstMetric(MedianQueuePriceRunningDesc, prometheus.GaugeValue, value, pool, priorityClass, priorityClass, queue) } func NewQueuePriceBandBidMetric(value float64, pool string, queue string, phase, priceBand string) prometheus.Metric { - return prometheus.MustNewConstMetric(QueuePriceBandPhaseBidDesc, prometheus.GaugeValue, value, pool, queue, queue, phase, priceBand) + return prometheus.MustNewConstMetric(QueuePriceBandPhaseBidDesc, prometheus.GaugeValue, value, pool, queue, queue, phase, priceBand, priceBand) } func NewJobDBCumulativeInternedStrings(value float64) prometheus.Metric { diff --git a/internal/executor/metrics/pod_metrics/cluster_context.go b/internal/executor/metrics/pod_metrics/cluster_context.go index 89500ee4ffd..615f2ecd49d 100644 --- a/internal/executor/metrics/pod_metrics/cluster_context.go +++ b/internal/executor/metrics/pod_metrics/cluster_context.go @@ -17,11 +17,13 @@ import ( ) const ( - leasedPhase = "Leased" - queueLabel = "queue" - phaseLabel = "phase" - resourceTypeLabel = "resourceType" - nodeTypeLabel = "nodeType" + leasedPhase = "Leased" + queueLabel = "queue" + phaseLabel = "phase" + resourceTypeLabel = "resourceType" + resourceTypeLabelSnake = "resource" + nodeTypeLabel = "nodeType" + nodeTypeLabelSnake = "node_type" ) const ( @@ -32,37 +34,37 @@ const ( var podCountDesc = prometheus.NewDesc( metrics.ArmadaExecutorMetricsPrefix+"job_pod", "Pods in different phases by queue", - []string{queueLabel, phaseLabel, nodeTypeLabel}, nil, + []string{queueLabel, phaseLabel, nodeTypeLabel, nodeTypeLabelSnake}, nil, ) var podResourceRequestDesc = prometheus.NewDesc( metrics.ArmadaExecutorMetricsPrefix+"job_pod_resource_request", "Pod resource requests in different phases by queue", - []string{queueLabel, phaseLabel, resourceTypeLabel, nodeTypeLabel}, nil, + []string{queueLabel, phaseLabel, resourceTypeLabel, resourceTypeLabelSnake, nodeTypeLabel, nodeTypeLabelSnake}, nil, ) var podResourceUsageDesc = prometheus.NewDesc( metrics.ArmadaExecutorMetricsPrefix+"job_pod_resource_usage", "Pod resource usage in different phases by queue", - []string{queueLabel, phaseLabel, resourceTypeLabel, nodeTypeLabel}, nil, + []string{queueLabel, phaseLabel, resourceTypeLabel, resourceTypeLabelSnake, nodeTypeLabel, nodeTypeLabelSnake}, nil, ) var nodeCountDesc = prometheus.NewDesc( metrics.ArmadaExecutorMetricsPrefix+"available_node_count", "Number of nodes available for Armada jobs", - []string{nodeTypeLabel}, nil, + []string{nodeTypeLabel, nodeTypeLabelSnake}, nil, ) var nodeAvailableResourceDesc = prometheus.NewDesc( metrics.ArmadaExecutorMetricsPrefix+"available_node_resource_allocatable", "Resource allocatable on nodes available for Armada jobs", - []string{resourceTypeLabel, nodeTypeLabel}, nil, + []string{resourceTypeLabel, resourceTypeLabelSnake, nodeTypeLabel, nodeTypeLabelSnake}, nil, ) var nodeTotalResourceDesc = prometheus.NewDesc( metrics.ArmadaExecutorMetricsPrefix+"available_node_resource_total", "Total resource on nodes available for Armada jobs", - []string{resourceTypeLabel, nodeTypeLabel}, nil, + []string{resourceTypeLabel, resourceTypeLabelSnake, nodeTypeLabel, nodeTypeLabelSnake}, nil, ) type ClusterContextMetrics struct { @@ -208,27 +210,27 @@ func (m *ClusterContextMetrics) Collect(metrics chan<- prometheus.Metric) { for phase, phaseMetric := range phaseMetrics { for resourceType, request := range phaseMetric.resourceRequest { metrics <- prometheus.MustNewConstMetric(podResourceRequestDesc, prometheus.GaugeValue, - request.AsApproximateFloat64(), queue, phase, resourceType, nodeType) + request.AsApproximateFloat64(), queue, phase, resourceType, resourceType, nodeType, nodeType) } for resourceType, usage := range phaseMetric.resourceUsage { metrics <- prometheus.MustNewConstMetric(podResourceUsageDesc, prometheus.GaugeValue, - usage.AsApproximateFloat64(), queue, phase, resourceType, nodeType) + usage.AsApproximateFloat64(), queue, phase, resourceType, resourceType, nodeType, nodeType) } - metrics <- prometheus.MustNewConstMetric(podCountDesc, prometheus.GaugeValue, phaseMetric.count, queue, phase, nodeType) + metrics <- prometheus.MustNewConstMetric(podCountDesc, prometheus.GaugeValue, phaseMetric.count, queue, phase, nodeType, nodeType) } } } for _, nodeGroup := range nodeGroupAllocationInfos { - metrics <- prometheus.MustNewConstMetric(nodeCountDesc, prometheus.GaugeValue, float64(len(nodeGroup.Nodes)), nodeGroup.NodeType) + metrics <- prometheus.MustNewConstMetric(nodeCountDesc, prometheus.GaugeValue, float64(len(nodeGroup.Nodes)), nodeGroup.NodeType, nodeGroup.NodeType) for resourceType, allocatable := range nodeGroup.NodeGroupAllocatableCapacity { metrics <- prometheus.MustNewConstMetric(nodeAvailableResourceDesc, - prometheus.GaugeValue, allocatable.AsApproximateFloat64(), resourceType, - nodeGroup.NodeType) + prometheus.GaugeValue, allocatable.AsApproximateFloat64(), resourceType, resourceType, + nodeGroup.NodeType, nodeGroup.NodeType) } for resourceType, total := range nodeGroup.NodeGroupCapacity { - metrics <- prometheus.MustNewConstMetric(nodeTotalResourceDesc, prometheus.GaugeValue, total.AsApproximateFloat64(), resourceType, nodeGroup.NodeType) + metrics <- prometheus.MustNewConstMetric(nodeTotalResourceDesc, prometheus.GaugeValue, total.AsApproximateFloat64(), resourceType, resourceType, nodeGroup.NodeType, nodeGroup.NodeType) } } } diff --git a/internal/scheduler/metrics/constants.go b/internal/scheduler/metrics/constants.go index b7f1926d5e0..4af5a2919b0 100644 --- a/internal/scheduler/metrics/constants.go +++ b/internal/scheduler/metrics/constants.go @@ -14,15 +14,18 @@ const ( priorityClassLabel = "priority_class" nodeLabel = "node" nodeTypeLabel = "nodeType" + nodeTypeLabelSnake = "node_type" clusterLabel = "cluster" errorCategoryLabel = "category" errorSubcategoryLabel = "subcategory" stateLabel = "state" priorStateLabel = "priorState" + priorStateLabelSnake = "prior_state" resourceLabel = "resource" reservationLabel = "reservation" schedulableLabel = "schedulable" overAllocatedLabel = "overAllocated" + overAllocatedLabelSnake = "over_allocated" physicalPoolLabel = "physical_pool" capacityClassLabel = "capacity_class" jobShapeLabel = "job_shape" diff --git a/internal/scheduler/metrics/cycle_metrics.go b/internal/scheduler/metrics/cycle_metrics.go index 1756d2a3b77..f8c6dcd8d8d 100644 --- a/internal/scheduler/metrics/cycle_metrics.go +++ b/internal/scheduler/metrics/cycle_metrics.go @@ -31,7 +31,7 @@ var ( poolAndShapeAndReasonLabels = []string{poolLabel, jobShapeLabel, unschedulableReasonLabel} poolQueueAndResourceLabels = []string{poolLabel, queueLabel, resourceLabel} poolAndOutcomeLabels = []string{poolLabel, outcomeLabel, terminationReasonLabel} - nodeLabels = []string{poolLabel, nodeLabel, clusterLabel, nodeTypeLabel, resourceLabel, reservationLabel, schedulableLabel, overAllocatedLabel, physicalPoolLabel, capacityClassLabel} + nodeLabels = []string{poolLabel, nodeLabel, clusterLabel, nodeTypeLabel, nodeTypeLabelSnake, resourceLabel, reservationLabel, schedulableLabel, overAllocatedLabel, overAllocatedLabelSnake, physicalPoolLabel, capacityClassLabel} defaultType = "unknown" reconcilerFailureType = "reconciler" ) @@ -253,7 +253,7 @@ func newPerCycleMetrics() *perCycleMetrics { Name: prefix + "node_preemptibility", Help: "is it possible to clear this node by preempting any jobs on it?", }, - []string{poolLabel, nodeLabel, clusterLabel, nodeTypeLabel, "isPreemptible", "reason"}, + []string{poolLabel, nodeLabel, clusterLabel, nodeTypeLabel, nodeTypeLabelSnake, "isPreemptible", "is_preemptible", "reason"}, ) protectedFractionOfFairShare := prometheus.NewGaugeVec( @@ -597,6 +597,8 @@ func (m *cycleMetrics) ReportSchedulerResult(ctx *armadacontext.Context, result nodePreemptiblityStats.NodeName, nodePreemptiblityStats.Cluster, nodePreemptiblityStats.NodeType, + nodePreemptiblityStats.NodeType, + fmt.Sprintf("%t", nodePreemptiblityStats.Preemptible), fmt.Sprintf("%t", nodePreemptiblityStats.Preemptible), nodePreemptiblityStats.Reason).Set(1.0) } @@ -615,15 +617,15 @@ func (m *cycleMetrics) ReportSchedulerResult(ctx *armadacontext.Context, result nodeCapacityClass = CapacityClassShared } for _, resource := range node.GetAllocatableResources().GetAll() { - currentCycle.nodeAllocatableResource.WithLabelValues(pool, node.GetName(), node.GetExecutor(), node.GetReportingNodeType(), resource.Name, node.GetReservation(), - isSchedulable, isOverallocated, node.GetPool(), nodeCapacityClass).Set(resource.Value.AsApproximateFloat64()) + currentCycle.nodeAllocatableResource.WithLabelValues(pool, node.GetName(), node.GetExecutor(), node.GetReportingNodeType(), node.GetReportingNodeType(), resource.Name, node.GetReservation(), + isSchedulable, isOverallocated, isOverallocated, node.GetPool(), nodeCapacityClass).Set(resource.Value.AsApproximateFloat64()) } allocated := node.GetAllocatableResources().Subtract(node.AllocatableByPriority[internaltypes.EvictedPriority]) for _, resource := range allocated.GetAll() { allocatableValue := math.Max(resource.Value.AsApproximateFloat64(), 0) - currentCycle.nodeAllocatedResource.WithLabelValues(pool, node.GetName(), node.GetExecutor(), node.GetReportingNodeType(), resource.Name, node.GetReservation(), - isSchedulable, isOverallocated, node.GetPool(), nodeCapacityClass).Set(allocatableValue) + currentCycle.nodeAllocatedResource.WithLabelValues(pool, node.GetName(), node.GetExecutor(), node.GetReportingNodeType(), node.GetReportingNodeType(), resource.Name, node.GetReservation(), + isSchedulable, isOverallocated, isOverallocated, node.GetPool(), nodeCapacityClass).Set(allocatableValue) } } } diff --git a/internal/scheduler/metrics/cycle_metrics_test.go b/internal/scheduler/metrics/cycle_metrics_test.go index f90afc6ddfb..0969023d6c1 100644 --- a/internal/scheduler/metrics/cycle_metrics_test.go +++ b/internal/scheduler/metrics/cycle_metrics_test.go @@ -155,7 +155,7 @@ func TestResetLeaderMetrics_ResetsLatestCycleMetrics(t *testing.T) { poolLabelValues := []string{"pool1"} poolQueueLabelValues := []string{"pool1", "queue1"} poolQueueResourceLabelValues := []string{"pool1", "queue1", "cpu"} - nodeResourceLabelValues := []string{"pool1", "node1", "cluster1", "type1", "cpu", "", "true", "false", "pool1", CapacityClassDedicated} + nodeResourceLabelValues := []string{"pool1", "node1", "cluster1", "type1", "type1", "cpu", "", "true", "false", "false", "pool1", CapacityClassDedicated} testResetGauge := func(getVec func(metrics *cycleMetrics) *prometheus.GaugeVec, labelValues []string) { vec := getVec(m) @@ -227,8 +227,8 @@ func TestDisableLeaderMetrics(t *testing.T) { m.latestCycleMetrics.Load().loopNumber.WithLabelValues("pool1").Inc() m.latestCycleMetrics.Load().evictedJobs.WithLabelValues("pool1", "queue1").Inc() m.latestCycleMetrics.Load().evictedResources.WithLabelValues("pool1", "queue1", "cpu").Inc() - m.latestCycleMetrics.Load().nodeAllocatableResource.WithLabelValues("pool1", "node1", "cluster1", "type1", "cpu", "", "true", "false", "pool1", CapacityClassDedicated).Inc() - m.latestCycleMetrics.Load().nodeAllocatedResource.WithLabelValues("pool1", "node1", "cluster1", "type1", "cpu", "", "true", "false", "pool1", CapacityClassDedicated).Inc() + m.latestCycleMetrics.Load().nodeAllocatableResource.WithLabelValues("pool1", "node1", "cluster1", "type1", "type1", "cpu", "", "true", "false", "false", "pool1", CapacityClassDedicated).Inc() + m.latestCycleMetrics.Load().nodeAllocatedResource.WithLabelValues("pool1", "node1", "cluster1", "type1", "type1", "cpu", "", "true", "false", "false", "pool1", CapacityClassDedicated).Inc() m.latestCycleMetrics.Load().nodePoolSize.WithLabelValues("pool1").Inc() ch := make(chan prometheus.Metric, 1000) diff --git a/internal/scheduler/metrics/state_metrics.go b/internal/scheduler/metrics/state_metrics.go index 741994101c0..6b588b23392 100644 --- a/internal/scheduler/metrics/state_metrics.go +++ b/internal/scheduler/metrics/state_metrics.go @@ -53,42 +53,42 @@ func newJobStateMetrics( Name: prefix + "job_state_counter_by_queue", Help: "Job states at queue level", }, - []string{queueLabel, poolLabel, stateLabel, priorStateLabel}, + []string{queueLabel, poolLabel, stateLabel, priorStateLabel, priorStateLabelSnake}, ) jobStateCounterByNode := prometheus.NewCounterVec( prometheus.CounterOpts{ Name: prefix + "job_state_counter_by_node", Help: "Job states at node level", }, - []string{nodeLabel, poolLabel, clusterLabel, stateLabel, priorStateLabel}, + []string{nodeLabel, poolLabel, clusterLabel, stateLabel, priorStateLabel, priorStateLabelSnake}, ) jobStateSecondsByQueue := prometheus.NewCounterVec( prometheus.CounterOpts{ Name: prefix + "job_state_seconds_by_queue", Help: "time spent in different states at the queue level", }, - []string{queueLabel, poolLabel, stateLabel, priorStateLabel}, + []string{queueLabel, poolLabel, stateLabel, priorStateLabel, priorStateLabelSnake}, ) jobStateSecondsByNode := prometheus.NewCounterVec( prometheus.CounterOpts{ Name: prefix + "job_state_seconds_by_node", Help: "time spent in different states at the node level", }, - []string{nodeLabel, poolLabel, clusterLabel, stateLabel, priorStateLabel}, + []string{nodeLabel, poolLabel, clusterLabel, stateLabel, priorStateLabel, priorStateLabelSnake}, ) jobStateResourceSecondsByQueue := prometheus.NewCounterVec( prometheus.CounterOpts{ Name: prefix + "job_state_resource_seconds_by_queue", Help: "Resource-seconds spent in different states at the queue level", }, - []string{queueLabel, poolLabel, stateLabel, priorStateLabel, resourceLabel}, + []string{queueLabel, poolLabel, stateLabel, priorStateLabel, priorStateLabelSnake, resourceLabel}, ) jobStateResourceSecondsByNode := prometheus.NewCounterVec( prometheus.CounterOpts{ Name: prefix + "job_state_resource_seconds_by_node", Help: "Resource-seconds spent in different states at the node level", }, - []string{nodeLabel, poolLabel, clusterLabel, stateLabel, priorStateLabel, resourceLabel}, + []string{nodeLabel, poolLabel, clusterLabel, stateLabel, priorStateLabel, priorStateLabelSnake, resourceLabel}, ) jobErrorsByQueue := prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -270,17 +270,17 @@ func (m *jobStateMetrics) updateStateDuration(job *jobdb.Job, state string, prio // Counters m.jobStateCounterByQueue. - WithLabelValues(queue, pool, state, priorState).Inc() + WithLabelValues(queue, pool, state, priorState, priorState).Inc() m.jobStateCounterByNode. - WithLabelValues(node, pool, cluster, state, priorState).Inc() + WithLabelValues(node, pool, cluster, state, priorState, priorState).Inc() // State seconds m.jobStateSecondsByQueue. - WithLabelValues(queue, pool, state, priorState).Add(duration) + WithLabelValues(queue, pool, state, priorState, priorState).Add(duration) m.jobStateSecondsByNode. - WithLabelValues(node, pool, cluster, state, priorState).Add(duration) + WithLabelValues(node, pool, cluster, state, priorState, priorState).Add(duration) // Resource Seconds for _, res := range m.trackedResourceNames { @@ -288,9 +288,9 @@ func (m *jobStateMetrics) updateStateDuration(job *jobdb.Job, state string, prio resSeconds := duration * float64(resQty.MilliValue()) / 1000 resSeconds = math.Max(resSeconds, 0) m.jobStateResourceSecondsByQueue. - WithLabelValues(queue, pool, state, priorState, res.String()).Add(resSeconds) + WithLabelValues(queue, pool, state, priorState, priorState, res.String()).Add(resSeconds) m.jobStateResourceSecondsByNode. - WithLabelValues(node, pool, cluster, state, priorState, res.String()).Add(resSeconds) + WithLabelValues(node, pool, cluster, state, priorState, priorState, res.String()).Add(resSeconds) } } diff --git a/internal/scheduler/metrics/state_metrics_test.go b/internal/scheduler/metrics/state_metrics_test.go index 8bf836ca717..5c009363718 100644 --- a/internal/scheduler/metrics/state_metrics_test.go +++ b/internal/scheduler/metrics/state_metrics_test.go @@ -48,12 +48,12 @@ func TestReportJobStateTransitions(t *testing.T) { trackedResourceNames []v1.ResourceName jsts []jobdb.JobStateTransitions jobRunErrorsByRunId map[string]*armadaevents.Error - expectedJobStateCounterByQueue map[[4]string]float64 - expectedJobStateCounterByNode map[[5]string]float64 - expectedJobStateSecondsByQueue map[[4]string]float64 - expectedJobStateSecondsByNode map[[5]string]float64 - expectedJobStateResourceSecondsByQueue map[[5]string]float64 - expectedJobStateResourceSecondsByNode map[[6]string]float64 + expectedJobStateCounterByQueue map[[5]string]float64 + expectedJobStateCounterByNode map[[6]string]float64 + expectedJobStateSecondsByQueue map[[5]string]float64 + expectedJobStateSecondsByNode map[[6]string]float64 + expectedJobStateResourceSecondsByQueue map[[6]string]float64 + expectedJobStateResourceSecondsByNode map[[7]string]float64 }{ "Pending": { trackedResourceNames: []v1.ResourceName{"cpu"}, @@ -67,23 +67,23 @@ func TestReportJobStateTransitions(t *testing.T) { Pending: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "pending", "leased"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "pending", "leased", "leased"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "pending", "leased"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "pending", "leased", "leased"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "pending", "leased"}: 2, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "pending", "leased", "leased"}: 2, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "pending", "leased"}: 2, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "pending", "leased", "leased"}: 2, }, - expectedJobStateResourceSecondsByQueue: map[[5]string]float64{ - {testQueue, testPool, "pending", "leased", "cpu"}: 2 * 16, + expectedJobStateResourceSecondsByQueue: map[[6]string]float64{ + {testQueue, testPool, "pending", "leased", "leased", "cpu"}: 2 * 16, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "pending", "leased", "cpu"}: 2 * 16, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "pending", "leased", "leased", "cpu"}: 2 * 16, }, }, "Running": { @@ -99,23 +99,23 @@ func TestReportJobStateTransitions(t *testing.T) { Running: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "running", "pending"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "running", "pending", "pending"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "running", "pending"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "running", "pending", "pending"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "running", "pending"}: 10, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "running", "pending", "pending"}: 10, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "running", "pending"}: 10, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "running", "pending", "pending"}: 10, }, - expectedJobStateResourceSecondsByQueue: map[[5]string]float64{ - {testQueue, testPool, "running", "pending", "cpu"}: 10 * 16, + expectedJobStateResourceSecondsByQueue: map[[6]string]float64{ + {testQueue, testPool, "running", "pending", "pending", "cpu"}: 10 * 16, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "running", "pending", "cpu"}: 10 * 16, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "running", "pending", "pending", "cpu"}: 10 * 16, }, }, "Succeeded": { @@ -132,23 +132,23 @@ func TestReportJobStateTransitions(t *testing.T) { Succeeded: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "succeeded", "running"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "succeeded", "running", "running"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "succeeded", "running"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "succeeded", "running", "running"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "succeeded", "running"}: 8, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "succeeded", "running", "running"}: 8, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "succeeded", "running"}: 8, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "succeeded", "running", "running"}: 8, }, - expectedJobStateResourceSecondsByQueue: map[[5]string]float64{ - {testQueue, testPool, "succeeded", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByQueue: map[[6]string]float64{ + {testQueue, testPool, "succeeded", "running", "running", "cpu"}: 8 * 16, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "succeeded", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "succeeded", "running", "running", "cpu"}: 8 * 16, }, }, "Cancelled": { @@ -165,23 +165,23 @@ func TestReportJobStateTransitions(t *testing.T) { Cancelled: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "cancelled", "running"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "cancelled", "running", "running"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "cancelled", "running"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "cancelled", "running", "running"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "cancelled", "running"}: 8, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "cancelled", "running", "running"}: 8, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "cancelled", "running"}: 8, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "cancelled", "running", "running"}: 8, }, - expectedJobStateResourceSecondsByQueue: map[[5]string]float64{ - {testQueue, testPool, "cancelled", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByQueue: map[[6]string]float64{ + {testQueue, testPool, "cancelled", "running", "running", "cpu"}: 8 * 16, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "cancelled", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "cancelled", "running", "running", "cpu"}: 8 * 16, }, }, "Failed": { @@ -198,23 +198,23 @@ func TestReportJobStateTransitions(t *testing.T) { Failed: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "failed", "running"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "failed", "running", "running"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "failed", "running"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "failed", "running", "running"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "failed", "running"}: 8, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "failed", "running", "running"}: 8, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "failed", "running"}: 8, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "failed", "running", "running"}: 8, }, - expectedJobStateResourceSecondsByQueue: map[[5]string]float64{ - {testQueue, testPool, "failed", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByQueue: map[[6]string]float64{ + {testQueue, testPool, "failed", "running", "running", "cpu"}: 8 * 16, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "failed", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "failed", "running", "running", "cpu"}: 8 * 16, }, }, "Preempted": { @@ -231,23 +231,23 @@ func TestReportJobStateTransitions(t *testing.T) { Preempted: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "preempted", "running"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "preempted", "running", "running"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "preempted", "running"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "preempted", "running", "running"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "preempted", "running"}: 8, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "preempted", "running", "running"}: 8, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "preempted", "running"}: 8, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "preempted", "running", "running"}: 8, }, - expectedJobStateResourceSecondsByQueue: map[[5]string]float64{ - {testQueue, testPool, "preempted", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByQueue: map[[6]string]float64{ + {testQueue, testPool, "preempted", "running", "running", "cpu"}: 8 * 16, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "preempted", "running", "cpu"}: 8 * 16, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "preempted", "running", "running", "cpu"}: 8 * 16, }, }, "Multiple transitions": { @@ -267,30 +267,30 @@ func TestReportJobStateTransitions(t *testing.T) { Succeeded: true, }, }, - expectedJobStateCounterByQueue: map[[4]string]float64{ - {testQueue, testPool, "pending", "leased"}: 1, - {testQueue, testPool, "running", "pending"}: 1, - {testQueue, testPool, "succeeded", "running"}: 1, + expectedJobStateCounterByQueue: map[[5]string]float64{ + {testQueue, testPool, "pending", "leased", "leased"}: 1, + {testQueue, testPool, "running", "pending", "pending"}: 1, + {testQueue, testPool, "succeeded", "running", "running"}: 1, }, - expectedJobStateCounterByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "pending", "leased"}: 1, - {testNode, testPool, testCluster, "running", "pending"}: 1, - {testNode, testPool, testCluster, "succeeded", "running"}: 1, + expectedJobStateCounterByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "pending", "leased", "leased"}: 1, + {testNode, testPool, testCluster, "running", "pending", "pending"}: 1, + {testNode, testPool, testCluster, "succeeded", "running", "running"}: 1, }, - expectedJobStateSecondsByQueue: map[[4]string]float64{ - {testQueue, testPool, "pending", "leased"}: 2, - {testQueue, testPool, "running", "pending"}: 3, - {testQueue, testPool, "succeeded", "running"}: 4, + expectedJobStateSecondsByQueue: map[[5]string]float64{ + {testQueue, testPool, "pending", "leased", "leased"}: 2, + {testQueue, testPool, "running", "pending", "pending"}: 3, + {testQueue, testPool, "succeeded", "running", "running"}: 4, }, - expectedJobStateSecondsByNode: map[[5]string]float64{ - {testNode, testPool, testCluster, "pending", "leased"}: 2, - {testNode, testPool, testCluster, "running", "pending"}: 3, - {testNode, testPool, testCluster, "succeeded", "running"}: 4, + expectedJobStateSecondsByNode: map[[6]string]float64{ + {testNode, testPool, testCluster, "pending", "leased", "leased"}: 2, + {testNode, testPool, testCluster, "running", "pending", "pending"}: 3, + {testNode, testPool, testCluster, "succeeded", "running", "running"}: 4, }, - expectedJobStateResourceSecondsByNode: map[[6]string]float64{ - {testNode, testPool, testCluster, "pending", "leased", "cpu"}: 32, - {testNode, testPool, testCluster, "running", "pending", "cpu"}: 48, - {testNode, testPool, testCluster, "succeeded", "running", "cpu"}: 64, + expectedJobStateResourceSecondsByNode: map[[7]string]float64{ + {testNode, testPool, testCluster, "pending", "leased", "leased", "cpu"}: 32, + {testNode, testPool, testCluster, "running", "pending", "pending", "cpu"}: 48, + {testNode, testPool, testCluster, "succeeded", "running", "running", "cpu"}: 64, }, }, } @@ -417,11 +417,13 @@ func TestCategoriseErrors(t *testing.T) { func TestReset(t *testing.T) { byQueueLabels := []string{testQueue, testPool} - byQueueAndStateLabels := append(byQueueLabels, "running", "pending") - byNodeLabels := []string{testNode, testPool, testCluster, "running", "pending"} + byQueueAndStateLabels := append(byQueueLabels, "running", "pending", "pending") + byNodeLabels := []string{testNode, testPool, testCluster, "running", "pending", "pending"} byQueueResourceLabels := append(byQueueAndStateLabels, "cpu") byNodeResourceLabels := append(byNodeLabels, "cpu") resourceSecondsLostToPreemptionLabels := append(byQueueLabels, "none", "cpu") + byQueueErrorLabels := []string{testQueue, testPool, "podError", "generic pod error"} + byNodeErrorLabels := []string{testNode, testPool, testCluster, "podError", "generic pod error"} m := newJobStateMetrics(nil, nil, []time.Duration{}, 12*time.Hour) testReset := func(vec *prometheus.CounterVec, labels []string) { @@ -439,18 +441,20 @@ func TestReset(t *testing.T) { testReset(m.jobStateSecondsByNode, byNodeLabels) testReset(m.jobStateResourceSecondsByQueue, byQueueResourceLabels) testReset(m.jobStateResourceSecondsByNode, byNodeResourceLabels) - testReset(m.jobErrorsByQueue, byQueueAndStateLabels) - testReset(m.jobErrorsByNode, byNodeLabels) + testReset(m.jobErrorsByQueue, byQueueErrorLabels) + testReset(m.jobErrorsByNode, byNodeErrorLabels) testReset(m.jobResourceSecondsLostToPreemptionByQueue, resourceSecondsLostToPreemptionLabels) } func TestDisable(t *testing.T) { byQueueLabels := []string{testQueue, testPool} - byQueueAndStateLabels := append(byQueueLabels, "running", "pending") - byNodeLabels := []string{testNode, testPool, testCluster, "running", "pending"} + byQueueAndStateLabels := append(byQueueLabels, "running", "pending", "pending") + byNodeLabels := []string{testNode, testPool, testCluster, "running", "pending", "pending"} byQueueResourceLabels := append(byQueueAndStateLabels, "cpu") byNodeResourceLabels := append(byNodeLabels, "cpu") resourceSecondsLostToPreemptionLabels := append(byQueueLabels, "none", "cpu") + byQueueErrorLabels := []string{testQueue, testPool, "podError", "generic pod error"} + byNodeErrorLabels := []string{testNode, testPool, testCluster, "podError", "generic pod error"} collect := func(m *jobStateMetrics) []prometheus.Metric { m.jobStateCounterByQueue.WithLabelValues(byQueueAndStateLabels...).Inc() @@ -459,8 +463,8 @@ func TestDisable(t *testing.T) { m.jobStateSecondsByNode.WithLabelValues(byNodeLabels...).Inc() m.jobStateResourceSecondsByQueue.WithLabelValues(byQueueResourceLabels...).Inc() m.jobStateResourceSecondsByNode.WithLabelValues(byNodeResourceLabels...).Inc() - m.jobErrorsByQueue.WithLabelValues(byQueueAndStateLabels...).Inc() - m.jobErrorsByNode.WithLabelValues(byNodeLabels...).Inc() + m.jobErrorsByQueue.WithLabelValues(byQueueErrorLabels...).Inc() + m.jobErrorsByNode.WithLabelValues(byNodeErrorLabels...).Inc() m.jobResourceSecondsLostToPreemptionByQueue.WithLabelValues(resourceSecondsLostToPreemptionLabels...).Inc() ch := make(chan prometheus.Metric, 1000)