From 7b8aabb3dc0fbcab433c34ccb8871c4fe1394802 Mon Sep 17 00:00:00 2001 From: myersCody Date: Fri, 24 Apr 2026 15:43:46 -0400 Subject: [PATCH 01/18] [COST-7401] Potential wasted cost solutions --- docs/architecture/efficiency-scores/README.md | 7 +- .../solution-options-and-limitations.md | 237 ++++++++++++++++++ 2 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 docs/architecture/efficiency-scores/solution-options-and-limitations.md diff --git a/docs/architecture/efficiency-scores/README.md b/docs/architecture/efficiency-scores/README.md index 996ac218da..d05e4cf82d 100644 --- a/docs/architecture/efficiency-scores/README.md +++ b/docs/architecture/efficiency-scores/README.md @@ -6,7 +6,7 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef ## One-paragraph scope -**Implemented:** For `cpu` and `memory` report types, the API adds **`total_score`** on the **`total`** object and **`score`** on each **data** row (when scores are computed), exposing **`usage_efficiency_percent`** and **`wasted_cost`** derived from aggregated usage/request hours and CPU- or memory-scoped **`cost_total`** on [`OCPUsageLineItemDailySummary`](../../../koku/reporting/models.py). **Not implemented:** a separate `efficiency`-only route; cost/volume reports do not expose these fields. **Backlog** (out of code): idle/cost-efficiency/overhead scores and dedicated Optimizations Summary routing in the UI. +**Implemented:** For `cpu` and `memory` report types, the API adds **`total_score`** on the **`total`** object and **`score`** on each **data** row (when scores are computed), exposing **`usage_efficiency_percent`** and **`wasted_cost`** derived from aggregated usage/request hours and CPU- or memory-scoped **`cost_total`** on [`OCPUsageLineItemDailySummary`](../../../koku/reporting/models.py). **Not implemented:** a separate `efficiency`-only route; cost/volume reports do not expose these fields. **Backlog** (out of code): idle/cost-efficiency/overhead scores and dedicated Optimizations Summary routing in the UI. **Design note:** pooled aggregate math vs “sum of per-workload waste” and CPU+memory double-count are analyzed in [solution-options-and-limitations.md](./solution-options-and-limitations.md) (not bugs—semantic choices). --- @@ -27,6 +27,7 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef |-------|----------|------| | 1 | This README | As-built behavior, code map, builder handoff | | 2 | [formulas-and-data-contract.md](./formulas-and-data-contract.md) | Exact math, cost basis, rounding, when scores are empty | +| 3 | [solution-options-and-limitations.md](./solution-options-and-limitations.md) | Known math limitations (aggregation / double-count), solution axes, IQ for future changes | --- @@ -82,6 +83,7 @@ flowchart LR | — | **`request_sum == 0`** semantics | Implementation coalesces **`usage_efficiency_percent`** to **0** and **`wasted_cost`** to **0** (not `null`). Confirm UX/OpenAPI wording. | | — | Tag / multi-dimension `group_by` | Scores intentionally empty; document for UI. | | — | Tag **`exclude`** vs `should_compute` | Code does not pass `"exclude"` into [`get_tag_filter_keys`](../../../koku/api/report/queries.py); align product/UI expectations or extend `has_tag_interaction` if excludes should suppress scores. | +| IQ-7–IQ-10 | Bottom-up waste grain, metric alignment, dual-metric naming, infra split | See [solution-options-and-limitations.md](./solution-options-and-limitations.md). | --- @@ -89,6 +91,7 @@ flowchart LR | Date | Summary | |------|---------| +| 2026-04-24 | Added [solution-options-and-limitations.md](./solution-options-and-limitations.md) (cost-efficiency problems brief → solution axes and IQ-7–10). | | 2026-04-16 | Initial agent-focused hub and formulas doc from product brief. | | 2026-04-17 | Rewrote hub for **as-built** implementation (compute/memory, `total_score` / `score`, formulas, no new pipeline). | | 2026-04-21 | Corrected **when scores are omitted**: tag **`exclude`** does not affect `should_compute` today (only tag `group_by` and tag **`filter`** keys). | @@ -99,7 +102,7 @@ flowchart LR | Block | Content | |-------|---------| -| **Doc map** | This README (overview + links) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math + JSON). Reading order: README → formulas. | +| **Doc map** | This README (overview + links) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math + JSON) → [solution-options-and-limitations.md](./solution-options-and-limitations.md) (limitations + future fixes). Reading order: README → formulas → solution options (if changing behavior). | | **Assumptions** | None beyond what is cited from code; UI “Optimizations Summary” tab wiring lives in koku-ui. | | **IQ / decisions** | Resolved items in **Resolved decisions** table; backlog in **Open questions**. | | **API contract summary** | `GET …/reports/openshift/compute/` and `GET …/reports/openshift/memory/` with `filter` / `group_by` / `order_by` as other OCP inventory reports. Response: **`total.total_score`**: `{ usage_efficiency_percent: int, wasted_cost: { value, units } }` or `{}`. Data leaves: **`score`** same shape or `{}`. **`order_by[usage_efficiency]`**, **`order_by[wasted_cost]`** supported (with valid `group_by` per existing serializer rules). | diff --git a/docs/architecture/efficiency-scores/solution-options-and-limitations.md b/docs/architecture/efficiency-scores/solution-options-and-limitations.md new file mode 100644 index 0000000000..30e4d682e8 --- /dev/null +++ b/docs/architecture/efficiency-scores/solution-options-and-limitations.md @@ -0,0 +1,237 @@ +# Efficiency scores — solution options and known limitations + +Companion to [README.md](./README.md) and [formulas-and-data-contract.md](./formulas-and-data-contract.md). This document **analyzes** the three cost-efficiency issues. It does **not** change implementation; it maps problems to **verified backend behavior** and lists **solution axes** for future work. + +--- + +## Scope restatement + +| Aspect | Detail | +| --- | --- | +| Goal | Produce engineering-ready options so builders and PM can choose semantics without rediscovering math or data grain. | +| In scope | Problem ↔ code mapping; solution families; tradeoffs; tenancy and query-performance notes; IQ/decisions. | +| Out of scope | Serializer code, migrations, SQL templates, UI copy, OpenAPI regeneration. | + +**Success criteria:** A reader can explain why fleet `wasted_cost` can differ from a sum of “intuitive” per-workload waste, what would change under each fix, and what remains a **product definition** choice (e.g. naming two different metrics). + +--- + +## Verified: how the backend computes waste today + +The ORM builds **`usage_efficiency`** and **`wasted_cost`** from **aggregated** `usage_sum` and `request_sum` expressions at the **same grain as the report query** (fleet `aggregate()`, or `values(...).annotate(...)` for each group). The waste term is: + +`Greatest(cost_total * (1 - usage_sum / NullIf(request_sum, 0)), 0)` — see [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) and [formulas-and-data-contract.md](./formulas-and-data-contract.md). + +**Consequences (mathematical, not bugs):** + +1. **Fleet / group `wasted_cost` is not** \(\sum_i \max(c_i \cdot (1 - u_i/r_i), 0)\) **unless** each line item is computed separately and then summed. The shipped formula is **one** ratio applied to **one** pooled `cost_total` for the grouped rows. +2. **`Greatest(..., 0)` applies once** to that pooled expression. If the pooled ratio exceeds 1 (usage > request in aggregate), the whole waste term is **0** for that bucket—even when some underlying rows would have positive waste under a per-row formula. + +These match **Problem 1 (aggregation bias)** and **Problem 2 (negative waste offset)** in the brief. IQE reproducers linked from that brief (`test_api_ocp_ingest_efficiency_calc_logic`, `test_api_ocp_ingest_efficiency_overutilization`) are the regression anchor if behavior changes. + +### Flow contrast (conceptual) + +```mermaid +flowchart TB + subgraph today [Current: one ratio per bucket] + A1[Sum usage, Sum request, Sum cost in bucket] --> A2["waste = max(cost × (1 - U/R), 0)"] + end + subgraph proposed [Typical fix: row-level then sum] + B1[Per row: waste_i = max(cost_i × (1 - u_i/r_i), 0)] --> B2[Sum waste_i in bucket] + end +``` + +--- + +## Problem 1 — Aggregation bias in efficiency / wasted cost + +### Detailed description + +Many FinOps users think of **wasted cost** as an **additive** quantity: for each workload (pod, service, line item), estimate how much of *that* workload’s spend is “unused” relative to requests, then **add** those amounts to get a cluster, project, or fleet total. That is a **bottom-up** definition. + +The backend today uses a **top-down** definition at each API bucket (fleet `total`, or each `group_by` row such as cluster or project). It first **adds** usage hours, request hours, and cost across every line item in the bucket, then applies **one** utilization ratio \(U/R\) to **one** pooled `cost_total` for that bucket (see [formulas-and-data-contract.md](./formulas-and-data-contract.md) and [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py)). + +Those two definitions are **different functions** of the underlying data. They coincide in special cases (e.g. a single homogeneous workload, or proportional cost and identical ratios) but **diverge** when: + +- **Costs are uneven** across rows with different usage/request ratios (cheap rows vs expensive rows pull the pooled ratio and pooled cost differently than a sum of per-row waste). +- **`group_by` changes the bucket** (e.g. project view vs cluster view): the same underlying pods appear in different groupings, so top-down waste **per row** is not a simple roll-up of a single canonical “pod list” unless the math matches bottom-up. + +So the “problem” is not random API error; it is a **semantic mismatch** between **pooled structural waste** (what ships) and **sum of per-workload opportunity** (what many dashboards assume). The gap can appear as “**incorrect** total wasted cost” in **group_by project** and wrong **totals and per-row cluster** behavior in **group_by cluster**. + +### Worked example (two pods, one cluster bucket) + +Assume a single cluster bucket contains **two** workloads. Costs and hours are chosen so bottom-up and top-down disagree (same numbers as in the cost-efficiency brief). + +| Workload | Cost \(c_i\) | Usage \(u_i\) | Request \(r_i\) | Utilization \(u_i/r_i\) | Bottom-up waste \(\max(c_i(1-u_i/r_i), 0)\) | +|----------|-------------|----------------|------------------|-------------------------|-----------------------------------------------| +| Pod 1 | $200 | 5 | 10 | 50% | \(200 \times (1 - 0.5) =\) **$100** | +| Pod 2 | $1,050 | 1 | 5 | 20% | \(1050 \times (1 - 0.2) =\) **$840** | +| **Pooled (what the API uses for the bucket)** | **$1,250** | **6** | **15** | \(6/15 = 40\%\) | \(1250 \times (1 - 0.4) =\) **$750** | + +- **Bottom-up (sum of opportunities):** \(100 + 840 =\) **$940**. +- **Top-down (current backend):** one ratio on pooled sums: \((1 - 6/15) \times 1250 = 0.6 \times 1250 =\) **$750**. + +**Discrepancy:** **$190** (~20% in this toy scenario) between the two definitions. The backend matches **top-down**; users who sum or mentally aggregate **per-pod** waste will expect **bottom-up**. + +### How this ties to `wasted_cost` in code + +For each response bucket, `usage_sum` and `request_sum` are **aggregates over all rows** in that bucket, and `wasted_cost` applies `Greatest(cost_total × (1 - usage_sum/request_sum), 0)` once. There is **no** intermediate per-row waste term in the shipped path. + +### Solution families + +| ID | Approach | Idea | Pros | Cons / risks | +|----|-----------|------|------|----------------| +| **S1-a** | **Per–line-item waste, then `Sum`** in SQL/ORM | Annotate each `OCPUsageLineItemDailySummary` row with row-level waste (same `cost_total` basis as today, but using **row** usage/request fields), aggregate with `Sum` for fleet and for each `group_by`. | Aligns with FinOps “sum of opportunities”; stable under splitting/merging pods in examples. | Heavier queries; must define **row grain** (see IQ-7); `cost_total` at row level must remain defined and consistent with CPU vs memory split (ties to Problem 3). | +| **S1-b** | **Keep formula; rename / document** | Treat current `wasted_cost` as **“pooled structural waste”** (or similar); add separate optional field for bottom-up sum if product wants both. | No query rewrite; clears customer confusion. | Two metrics to explain; UI and API surface decisions. | +| **S1-c** | **Weighted / Shapley-style allocation** | Allocate pooled waste to dimensions by a principled rule. | Single headline number with axioms. | Hard to explain; implementation complexity; may still disagree with pod-sum intuition. | + +**Recommendation for design discussion:** Default engineering path for “fix the number” is **S1-a** at the **finest grain stored in** [`OCPUsageLineItemDailySummary`](../../../koku/reporting/provider/ocp/models.py) **that still carries the same cost and usage fields** as the report—**after** product confirms that grain matches “workload” in their examples (IQ-7). + +--- + +## Problem 2 — Over-utilization masking under-utilization waste + +### Detailed description + +When **some** workloads in a bucket are **under-provisioned** (usage below request, so there is “headline” waste) and **others** are **over-provisioned** (usage above request, sometimes called burst or pressure risk), a reasonable product question is: **should over-utilization reduce reported waste for the whole bucket?** + +Under the **current** formula, **aggregation happens first**: the bucket’s total usage \(U\) and total request \(R\) are summed, then \(1 - U/R\) is computed **once**. If \(U > R\) for the bucket as a whole, then \((1 - U/R) < 0\), and **`Greatest(..., 0)`** forces **`wasted_cost` to $0** for that entire bucket—even if **individual** rows would still show positive waste if waste were computed **per row** and then summed (each over-provisioned row would contribute **$0** after its own clamp, but under-provisioned rows would still contribute positive amounts). + +So **over-utilized** lines do not “subtract” dollar waste in a ledger sense; they **inflate the pooled utilization ratio** so much that the **single** pooled \((1-U/R)\) term goes non-positive, and the **one** `Greatest` at bucket level **zeros out** the whole bucket. That **masks** under-provisioned waste in reporting—a **cluster** or **project** total can read **$0** while **actionable** right-sizing opportunity still exists on a subset of pods. + +This is the same **order-of-operations** issue as Problem 1, but the user-visible symptom is often described as **“negative waste offset”** or **subsidy**: burst usage on one cohort **hides** savings opportunity on another. + +### Worked example (one under-provisioned pod, one over-provisioned pod) + +| Workload | Cost \(c_i\) | Usage \(u_i\) | Request \(r_i\) | \(u_i/r_i\) | Per-row waste \(\max(c_i(1-u_i/r_i),0)\) | +|----------|-------------|----------------|------------------|-------------|-------------------------------------------| +| Pod 1 (under) | $200 | 5 | 10 | 50% | **$100** | +| Pod 2 (over) | $1,150 | 15 | 5 | 300% | \(\max(1150 \times (1 - 3), 0) =\) **$0** | +| **Pooled bucket** | **$1,350** | **20** | **15** | \(20/15 \approx 133\%\) | **Current:** \(\max(1350 \times (1 - 20/15), 0) = \max(\text{negative}, 0) =\) **$0** | + +- **Bottom-up sum of clamped row waste:** \(100 + 0 =\) **$100** (still **$100** of opportunity on Pod 1 if you resize requests). +- **Current backend (pooled):** **$0** for the cluster bucket. + +So the dashboard can show **no** wasted cost at **cluster** (and similar masking can appear at **project** level in the brief’s screenshots), while the **actual** opportunity narrative remains **$100**. IQE coverage: `test_api_ocp_ingest_efficiency_overutilization` (see external brief / plugin MR). + +### How this ties to `wasted_cost` in code + +[`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) applies **`Greatest`** to the expression built from **already-aggregated** `usage_sum_prop` and `request_sum_prop`. There is **no** per-row `Greatest` before the `Sum` in the shipped implementation. + +### Solution families + +| ID | Approach | Relationship to Problem 1 | +|----|-----------|---------------------------| +| **S2-a** | **Row-level clamp, then sum** (`Sum(Greatest(row_expr, 0))`) | Same implementation core as **S1-a**; fixes both bias and masking in typical examples. | +| **S2-b** | **Cap efficiency at 100% for waste only** | `waste ∝ max(0, 1 - min(U/R, 1))` at aggregated or row level; **does not** by itself restore sum-of-pod waste if cost is still pooled in one ratio—usually combined with S1-a or careful definition. | +| **S2-c** | **Document only** | State that fleet waste is **zero whenever aggregate efficiency ≥ 100%** regardless of per-pod mix. | Lowest cost; leaves PM/CS explaining edge cases. | + +**Recommendation:** **S2-a** is the same lever as **S1-a**; separating them in the brief is useful for **QA and messaging**, not for two different code paths unless product wants **both** “pooled” and “opportunity” metrics. + +--- + +## Problem 3 — Infrastructure (and full) cost in both CPU and memory dimensions + +### Detailed description + +OpenShift **compute** (`…/reports/openshift/compute/`) and **memory** (`…/reports/openshift/memory/`) reports are **separate** report types. Each builds **`wasted_cost`** from a **dimension-specific** `cost_total`: CPU uses CPU-related cost model components; memory uses memory-related ones ([formulas-and-data-contract.md](./formulas-and-data-contract.md)). + +However, **both** dimensions’ `cost_total` expressions include the **same** classes of charges that are **not** split by dimension—specifically **`cloud_infrastructure_cost + markup_cost`** in each case, plus the respective **`cost_model_cpu_cost`** or **`cost_model_memory_cost`**. Intuitively, much of **raw infrastructure** spend is **shared** by a running pod (you pay for the node/cluster once, not “once per dimension” in the customer’s mental model). + +If a consumer **adds** `wasted_cost` from the **CPU** response to `wasted_cost` from the **memory** response (or the UI shows two large “waste” numbers that users mentally combine), **shared** infrastructure and markup can be **counted twice** in that **sum**. Finance teams then cannot reconcile “total wasted” from the UI to a **single** bill line or to **actual maximum** waste for one pod (which cannot exceed **100%** of that pod’s true economic cost once). + +### Worked example (one pod, idle on both dimensions) + +Suppose a workload’s **true** attributable spend for the period is **$100** (one economic unit). On **both** CPU and memory reports, usage vs request implies **0%** utilization (idle relative to request) for that dimension. + +| Dimension | Attributed `cost_total` (simplified) | Waste % (from ratio) | `wasted_cost` (illustrative) | +|-----------|--------------------------------------|----------------------|-------------------------------| +| **CPU** | $100 (includes full infra + markup slice used by CPU report) | 100% | **$100** | +| **Memory** | $100 (same infra + markup pattern for memory report) | 100% | **$100** | +| **User mental “total”** | — | — | **$200** if summed | + +- **Maximum plausible waste** for one **$100** asset: **$100** (you cannot waste more than the resource cost in this mental model). +- **Summed API-style interpretation:** **$200** → **100% inflation** vs bill reconciliation. + +The implementation is **consistent** with the chosen **per-dimension cost basis**; the tension is **cross-endpoint aggregation** and **UI/FinOps semantics**, not an accidental double insert in one SQL row. + +### Product / team disposition (from brief) + +The brief records **team agreement** to **ship as-is** for now and revisit if customers object, because **splitting infrastructure cleanly** between CPU and memory is a **data model / cost allocation** problem, not a one-line ORM tweak. + +### Solution families (future) + +| ID | Approach | Notes | +|----|-----------|------| +| **S3-a** | **Allocate `cloud_infrastructure_cost` by driver** (e.g. request-hours shares, or cost model weights) into `cpu_infra` / `memory_infra` in pipeline or reporting | Touches summarization and possibly Trino/self-hosted SQL paths per [`.cursor/rules/onprem-vs-saas.mdc`](../../../.cursor/rules/onprem-vs-saas.mdc); **tenant** tables only under `tenant_context`. | +| **S3-b** | **New combined “workload” endpoint or metric** that computes waste once on a unified cost | API/product change; avoids double-count in **one** contract if UI sums dimensions today. | +| **S3-c** | **UI / docs: never sum CPU + memory wasted** | Operational mitigation; backend unchanged. | + +--- + +## Cross-cutting concerns + +### Tenancy + +Any change continues to run under **`tenant_context`** in [`OCPReportQueryHandler.execute_query`](../../../koku/api/report/ocp/query_handler.py); no public-schema reads for this report path. + +### Performance + +Row-level annotation + `Sum` over the full filtered line set can increase **DB work** versus a single `aggregate()` over pre-joined sums. Needs **budget**: tenant size, date range, indexes on filter columns, and EXPLAIN on representative tenants. + +### Ordering and `group_by` + +Today **`order_by[wasted_cost]`** and **`order_by[usage_efficiency]`** sort on the **same** annotations as the response. If `wasted_cost` becomes **sum-of-row-waste** but `usage_efficiency` stays **ratio-of-sums** (or vice versa), product must decide whether **ordering tracks** the displayed waste definition (IQ-8). + +### `should_compute` / tags + +Tag and multi-`group_by` gates in [`execute_query`](../../../koku/api/report/ocp/query_handler.py) unchanged unless product extends when scores appear; any new metric should reuse the same gate for consistency. + +### Dual-path SQL + +Current scores are **ORM-only** on tenant summaries ([README](./README.md)). **S3-a** or precomputed row facts may touch **`masu/database/sql/`**, **`trino_sql/`**, **`self_hosted_sql/`**; parity is required for on-prem if pipeline changes. + +--- + +## IQ / decisions (solution pass) + +| ID | Question | Owner hint | +|----|----------|------------| +| **IQ-7** | What is the **canonical “row”** for bottom-up waste: one `OCPUsageLineItemDailySummary` row (PK `uuid`), pod-level upstream, or something else? The model has **no `pod` name column**; docstring says *“daily aggregation … aggregated by OCP resource”* — confirm whether one row ≡ one pod for CPU/memory pod reports or a coarser roll-up. | Product + pipeline | +| **IQ-8** | Should **`usage_efficiency_percent`** and **`wasted_cost`** always reflect the **same** aggregation story, or can they diverge (e.g. ratio-of-sums efficiency + sum-of-row waste)? | Product | +| **IQ-9** | If both **pooled** and **opportunity** waste are exposed, **names**, **defaults**, and **deprecation** of the current field? | Product + API review | +| **IQ-10** | For **S3-a**, acceptable **allocation** method for infra (and markup) between CPU and memory for **finance reconciliation**? | Cost model + Finance | + +--- + +## Phased delivery (suggested) + +| Phase | Content | Validation | +|-------|---------|------------| +| **P0** | Docs only (this file + hub links); optional customer-facing “do not sum CPU + memory waste” note via PM/CS. | No code change. | +| **P1** | Spike: ORM or SQL prototype of **S1-a/S2-a** on one tenant; compare fleet vs sum-of-rows for IQE scenarios; EXPLAIN. | Numeric match to reproducers; perf budget. | +| **P2** | Implement chosen semantics; align `order_by`; extend tests in [`test_ocp_query_handler`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py) and related. | Regression + OpenAPI if fields added/renamed. | +| **P3** | **S3-*** cost split or combined endpoint if IQ-10 resolved. | Pipeline + dual-path parity; reconciliation sign-off. | + +--- + +## Builder handoff + +| Block | Content | +|-------|---------| +| **Doc map** | [README.md](./README.md) (as-built) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math) → **this file** (limitations + options). | +| **Assumptions** | External brief is accurate; team disposition on Problem 3 as stated there is still current. | +| **IQ / decisions** | **IQ-7–IQ-10** above. | +| **API contract summary** | **No change** in this doc. Future: possible new fields or redefinition of `wasted_cost` / `usage_efficiency_percent` per IQ-8/IQ-9—requires OpenAPI and UI coordination. | +| **Data & tenancy** | Reads/writes (if pipeline) on **tenant** [`OCPUsageLineItemDailySummary`](../../../koku/reporting/provider/ocp/models.py); `tenant_context` preserved. | +| **Pipeline / tasks** | **None** for S1-a/S2-a if done purely in report ORM; **possible** summarization tasks/SQL for S3-a—**propose** only after spike; update [`celery-tasks.md`](../celery-tasks.md) when tasks are added. | +| **SQL / dual-path** | ORM-only path unchanged today; S3-a likely touches **`masu/database/*`** — flag parity in implementation PR. | +| **Out of scope for builders** | Choosing between pooled vs opportunity headline; finance allocation policy; pasting IQE MR links into this repo (keep in Jira/epic). | + +--- + +## Changelog + +| Date | Summary | +|------|---------| +| 2026-04-24 | Initial doc; expanded Problems 1–3 with detailed descriptions, worked examples, and code tie-ins. | From ccd9e83f337c29e378d40ffd324ce9fcc7d214b4 Mon Sep 17 00:00:00 2001 From: myersCody Date: Wed, 29 Apr 2026 09:25:18 -0400 Subject: [PATCH 02/18] Fix mermaid graphs --- .../solution-options-and-limitations.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/architecture/efficiency-scores/solution-options-and-limitations.md b/docs/architecture/efficiency-scores/solution-options-and-limitations.md index 30e4d682e8..e4b30b917d 100644 --- a/docs/architecture/efficiency-scores/solution-options-and-limitations.md +++ b/docs/architecture/efficiency-scores/solution-options-and-limitations.md @@ -33,12 +33,13 @@ These match **Problem 1 (aggregation bias)** and **Problem 2 (negative waste off ```mermaid flowchart TB - subgraph today [Current: one ratio per bucket] - A1[Sum usage, Sum request, Sum cost in bucket] --> A2["waste = max(cost × (1 - U/R), 0)"] - end - subgraph proposed [Typical fix: row-level then sum] - B1[Per row: waste_i = max(cost_i × (1 - u_i/r_i), 0)] --> B2[Sum waste_i in bucket] - end + cHead([Current — one ratio per bucket]) + cHead --> A1["Sum usage, Sum request, Sum cost in bucket"] + A1 --> A2["waste = max(cost * (1 - U/R), 0)"] + + pHead([Proposed — row-level then sum]) + pHead --> B1["Per row — waste_i = max(cost_i * (1 - u_i/r_i), 0)"] + B1 --> B2["Sum waste_i in bucket"] ``` --- From 3ef643211da2ec6ecad3e0b89227a7c624263db6 Mon Sep 17 00:00:00 2001 From: myersCody Date: Wed, 29 Apr 2026 12:30:29 -0400 Subject: [PATCH 03/18] Implement s1a --- .../s1a-per-line-item-waste-spec.md | 151 ++++++++++++ koku/api/report/ocp/provider_map.py | 82 ++++-- .../report/test/ocp/test_ocp_provider_map.py | 233 ++++++++++++++++++ 3 files changed, 447 insertions(+), 19 deletions(-) create mode 100644 docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md diff --git a/docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md b/docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md new file mode 100644 index 0000000000..036538f6c0 --- /dev/null +++ b/docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md @@ -0,0 +1,151 @@ +# Spec: S1-a — Per-line-item waste, then `Sum` (current OCP compute/memory flow) + +**Status:** Draft for engineering +**Parent analysis:** [solution-options-and-limitations.md](./solution-options-and-limitations.md) (S1-a, S2-a) +**Math contract (today):** [formulas-and-data-contract.md](./formulas-and-data-contract.md) +**Hub:** [README.md](./README.md) + +--- + +## 1. Objective + +Change **`wasted_cost`** on OpenShift **`cpu`** and **`memory`** inventory reports so it matches **bottom-up “sum of opportunities”**: for each qualifying row in the report queryset, compute waste from **that row’s** usage, request, and dimension-scoped cost, clamp at zero, then **`Sum`** those values for the fleet total and for each **`group_by`** bucket. + +**Unchanged unless product decides otherwise (IQ-8):** +- **`usage_efficiency_percent`** stays **ratio-of-sums** (aggregate `usage_sum / request_sum`, rounded), as today. +- **`should_compute`** gates, response shape (`total_score` / `score`), tenancy (`tenant_context`), and **Problem 3** (CPU + memory double-count of shared infra if users sum endpoints) are **out of scope** for this spec unless explicitly pulled in. + +--- + +## 2. Semantics + +### 2.1 Row grain + +**Assumption (pending IQ-7):** One **`OCPUsageLineItemDailySummary`** row after the report’s filters is the unit of “line item” for S1-a. Product confirms this matches FinOps examples (pod vs coarser aggregation). + +If IQ-7 concludes a different grain, this spec’s ORM expressions must be applied at that grain (e.g. subquery or pipeline pre-aggregation) before **`Sum`**; the **formula** below still applies per grain unit. + +### 2.2 Per-row fields (mirror today’s `cost_total` basis) + +For each report **`report_type`**: + +| Dimension | Row usage (hours) | Row request (hours) | Row `cost_total` (same components as [formulas-and-data-contract.md](./formulas-and-data-contract.md)) | +|-----------|-------------------|---------------------|--------------------------------------------------------------------------------------------------------| +| **cpu** | `pod_usage_cpu_core_hours` | `pod_request_cpu_core_hours` | `cloud_infrastructure_cost` + `markup_cost` + **`cost_model_cpu_cost`** — expressed **per row** (see §3.2) | +| **memory** | `pod_usage_memory_gigabyte_hours` | `pod_request_memory_gigabyte_hours` | `cloud_infrastructure_cost` + `markup_cost` + **`cost_model_memory_cost`** — **per row** | + +Use **`Coalesce(..., 0)`** on usage and request consistently with [`_cpu_usage_sum`](../../../koku/api/report/ocp/provider_map.py) / [`_memory_request_sum`](../../../koku/api/report/ocp/provider_map.py). + +### 2.3 Per-row waste expression + +Let **c_i**, **u_i**, **r_i** be row cost, usage, and request after coalescing. Per row: + +```text +waste_i = max( c_i * (1 - u_i / NULLIF(r_i, 0)), 0 ) +``` + +(`NULLIF` matches SQL / ORM: divide-by-zero avoided; when the ratio is null, the implementation **`Coalesce`s** the clamped term to **0**, same as [formulas-and-data-contract.md](./formulas-and-data-contract.md).) + +**Aggregate** `wasted_cost` for the bucket: + +```text +wasted_cost = sum over i of waste_i +``` + +— over the filtered set (fleet **`aggregate`**) or within each **`values(...).annotate(...)`** group. + +This simultaneously addresses **Problem 1 (aggregation bias)** and **Problem 2 (over-utilization masking)** as described in [solution-options-and-limitations.md](./solution-options-and-limitations.md). + +--- + +## 3. Technical design (current stack) + +### 3.1 Touch points + +| Layer | File | Change | +|-------|------|--------| +| Aggregates & grouped annotations | [`koku/api/report/ocp/provider_map.py`](../../../koku/api/report/ocp/provider_map.py) | Replace pooled **`wasted_cost`** in **`_efficiency_annotations`** usage for **`cpu`** / **`memory`** with **`Sum(per_row_waste_expr)`**, or add a parallel helper (e.g. `_efficiency_annotations_row_waste_sum`) and wire it into **`aggregates`** and **`annotations`** for those report types only. | +| Response packing | [`koku/api/report/ocp/query_handler.py`](../../../koku/api/report/ocp/query_handler.py) | No shape change; still maps internal **`wasted_cost`** to **`wasted_cost.value`**. | +| Ordering | [`koku/api/report/queries.py`](../../../koku/api/report/queries.py) (`_order_by`) | **`order_by[wasted_cost]`** must sort by the **new** annotated `Sum` (same alias **`wasted_cost`**). Confirm in-memory ordering still sees the annotation after queryset evaluation. | +| Tests | [`koku/api/report/test/ocp/test_ocp_query_handler.py`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py) and IQE | Update expectations where fleet/group totals used pooled math; add regression cases matching brief examples / `test_api_ocp_ingest_efficiency_*` scenarios. | + +**Non-goals for this change:** `masu/database/sql/`, `trino_sql/`, Celery tasks — S1-a can remain **ORM-only** on tenant summaries (same as today). + +### 3.2 Row-level `cost_total` expression + +Today, [`cloud_infrastructure_cost`](../../../koku/api/report/ocp/provider_map.py) and [`markup_cost`](../../../koku/api/report/ocp/provider_map.py) are **`Sum(...)`** over row expressions. For per-row waste, introduce **non-aggregated** equivalents (same multipliers and fields, without **`Sum`**), e.g.: + +- Row infra: `Coalesce(F("infrastructure_raw_cost"), 0) * Coalesce(F("infra_exchange_rate"), 1)` +- Row markup: `Coalesce(F("infrastructure_markup_cost"), 0) * Coalesce(F("infra_exchange_rate"), 1)` +- Row CPU cost model: `Coalesce(F("cost_model_cpu_cost"), 0) * Coalesce(F("exchange_rate"), 1)` (memory: `cost_model_memory_cost`) + +**Sum** of these row terms must equal the existing aggregated **`cost_total`** for the same queryset (sanity check / test). + +Compose **`per_row_cost_cpu`** / **`per_row_cost_memory`** as the same linear combinations used in **`cost_total`** for **`cpu`** and **`memory`** blocks (lines ~430 and ~581 in `provider_map.py`). + +### 3.3 ORM pattern + +Conceptually: + +```text +per_row_waste = Coalesce( + Greatest( + per_row_cost * (1 - per_row_usage / NullIf(per_row_request, 0)), + Value(0), + ), + Value(0), +) +wasted_cost = Sum(per_row_waste) # in aggregate() or grouped annotate() +``` + +Reuse the same **`DecimalField`** precision patterns as [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) to avoid silent type widening issues. + +**`usage_efficiency`:** Keep current implementation (aggregated **`usage_sum_prop`** / **`request_sum_prop`**) so the percent column does not change unless IQ-8 chooses alignment. + +--- + +## 4. Product / API decisions (blockers or follow-ups) + +| ID | Question | Impact if unresolved | +|----|----------|----------------------| +| **IQ-7** | Confirm **`OCPUsageLineItemDailySummary`** row = intended “workload” unit | Wrong grain → wrong customer narrative | +| **IQ-8** | OK for **`usage_efficiency_percent`** (pooled ratio) and **`wasted_cost`** (sum of row waste) to **diverge**? | Doc + UI copy; possible future second field if both metrics needed (IQ-9) | +| **IQ-9** | If customers need **both** pooled and opportunity waste, names and defaults | New or renamed JSON fields; OpenAPI | + +Until IQ-8 is explicit, document in release notes that **efficiency %** is still **fleet/group utilization** (ratio of sums), while **wasted cost** is **additive opportunity** (sum of clamped row waste). + +--- + +## 5. Acceptance criteria + +1. **Numeric:** For the worked examples in [solution-options-and-limitations.md](./solution-options-and-limitations.md) (Problems 1 and 2), API **`wasted_cost`** for the same filtered data matches **sum of per-row clamped waste**, not pooled **`cost × (1 - U/R)`**. +2. **`group_by`:** Project, cluster, node, and other supported inventory groupings return **`wasted_cost`** = **`Sum`** of row waste **within that group**; fleet total = **`Sum`** over all filtered rows. +3. **Edge cases:** **`request_sum == 0`** at row level → that row contributes **0** waste; over-utilized rows contribute **0**; under-utilized rows still contribute positive amounts when the group’s pooled ratio would have zeroed pooled waste. +4. **Ordering:** **`order_by[wasted_cost]`** orders by the new aggregate. +5. **Tenancy:** No regression to **`tenant_context`** isolation. +6. **Performance:** Run **`EXPLAIN (ANALYZE)`** (or equivalent) on representative tenant/date windows; record delta vs current pooled expression. If unacceptable, follow-up task: partial indexes, materialized subquery, or precomputed column (out of scope here). + +--- + +## 6. Test plan (minimum) + +- **Unit / handler:** Extend [`test_ocp_query_handler`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py) with small fixtures: two rows in one cluster, costs and usage/request chosen so pooled waste ≠ sum of row waste; assert **`wasted_cost`** equals sum of row formula. +- **Over-utilization mix:** One under-, one over-provisioned row; assert non-zero **`wasted_cost`** when pooled would be zero. +- **Regression:** Existing tests that hard-code old **`wasted_cost`** values must be updated. +- **Ordering:** One test that **`order_by[wasted_cost]`** returns rows in descending **`wasted_cost`** under the new definition. + +--- + +## 7. Documentation updates (post-merge) + +- [formulas-and-data-contract.md](./formulas-and-data-contract.md): **`wasted_cost`** section — replace pooled definition with **sum of row-level clamped waste**; note **`usage_efficiency_percent`** unchanged (if IQ-8 confirms). +- [README.md](./README.md): Resolved / backlog table — note IQ-2 superseded or narrowed for **`wasted_cost`** only. +- [solution-options-and-limitations.md](./solution-options-and-limitations.md): Optional one-line “**Implemented:** S1-a as of …” when shipped. + +--- + +## 8. Changelog + +| Date | Summary | +|------|---------| +| 2026-04-29 | Initial spec from S1-a row in solution-options-and-limitations. | diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index b666995a52..29bd912466 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -210,9 +210,52 @@ def _memory_request_sum(self): """Return a new Sum expression for memory request hours.""" return Sum(Coalesce(F("pod_request_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - def _efficiency_annotations(self, usage_sum_prop, request_sum_prop, cost_total_expr): - """Build usage_efficiency and wasted_cost annotation expressions.""" - _dec = DecimalField(max_digits=33, decimal_places=15) + @staticmethod + def _efficiency_decimal_field(): + return DecimalField(max_digits=33, decimal_places=15) + + def _per_row_cost_cpu_expr(self): + """Row-level CPU cost_total (infra + markup + cost_model_cpu), without Sum.""" + _dec = self._efficiency_decimal_field() + row_infra = Coalesce(F("infrastructure_raw_cost"), Value(0, output_field=_dec)) * Coalesce( + F("infra_exchange_rate"), Value(1, output_field=_dec) + ) + row_markup = Coalesce(F("infrastructure_markup_cost"), Value(0, output_field=_dec)) * Coalesce( + F("infra_exchange_rate"), Value(1, output_field=_dec) + ) + row_cm = Coalesce(F("cost_model_cpu_cost"), Value(0, output_field=_dec)) * Coalesce( + F("exchange_rate"), Value(1, output_field=_dec) + ) + return row_infra + row_markup + row_cm + + def _per_row_cost_memory_expr(self): + """Row-level memory cost_total (infra + markup + cost_model_memory), without Sum.""" + _dec = self._efficiency_decimal_field() + row_infra = Coalesce(F("infrastructure_raw_cost"), Value(0, output_field=_dec)) * Coalesce( + F("infra_exchange_rate"), Value(1, output_field=_dec) + ) + row_markup = Coalesce(F("infrastructure_markup_cost"), Value(0, output_field=_dec)) * Coalesce( + F("infra_exchange_rate"), Value(1, output_field=_dec) + ) + row_cm = Coalesce(F("cost_model_memory_cost"), Value(0, output_field=_dec)) * Coalesce( + F("exchange_rate"), Value(1, output_field=_dec) + ) + return row_infra + row_markup + row_cm + + def _efficiency_annotations_row_waste_sum( + self, usage_sum_prop, request_sum_prop, per_row_cost_expr, usage_field, request_field + ): + """usage_efficiency (ratio-of-sums); wasted_cost = Sum of per-row clamped waste (S1-a).""" + _dec = self._efficiency_decimal_field() + row_u = Coalesce(F(usage_field), Value(0, output_field=_dec)) + row_r = Coalesce(F(request_field), Value(0, output_field=_dec)) + per_row_waste = Coalesce( + Greatest( + per_row_cost_expr * (Value(1, output_field=_dec) - row_u / NullIf(row_r, Value(0, output_field=_dec))), + Value(0, output_field=_dec), + ), + Value(0, output_field=_dec), + ) return { "usage_efficiency": Coalesce( Round(usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) * Value(100)), @@ -220,14 +263,7 @@ def _efficiency_annotations(self, usage_sum_prop, request_sum_prop, cost_total_e output_field=IntegerField(), ), "wasted_cost": Coalesce( - Greatest( - cost_total_expr - * ( - Value(1, output_field=_dec) - - usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) - ), - Value(0, output_field=_dec), - ), + Sum(per_row_waste), Value(0, output_field=_dec), output_field=_dec, ), @@ -441,10 +477,12 @@ def __init__(self, provider, report_type, schema_name): "pod_request_cpu_core_hours", default=Value(0, output_field=DecimalField()) ), "limit": Sum("pod_limit_cpu_core_hours", default=Value(0, output_field=DecimalField())), - **self._efficiency_annotations( + **self._efficiency_annotations_row_waste_sum( self._cpu_usage_sum(), self._cpu_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_cpu_cost, + self._per_row_cost_cpu_expr(), + "pod_usage_cpu_core_hours", + "pod_request_cpu_core_hours", ), }, "capacity_aggregate": { @@ -501,10 +539,12 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( Coalesce(F("pod_limit_cpu_core_hours"), Value(0, output_field=DecimalField())) ), - **self._efficiency_annotations( + **self._efficiency_annotations_row_waste_sum( self._cpu_usage_sum(), self._cpu_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_cpu_cost, + self._per_row_cost_cpu_expr(), + "pod_usage_cpu_core_hours", + "pod_request_cpu_core_hours", ), "capacity": Max("cluster_capacity_cpu_core_hours"), # overwritten in capacity aggregation "clusters": ArrayAgg( @@ -598,10 +638,12 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( "pod_limit_memory_gigabyte_hours", default=Value(0, output_field=DecimalField()) ), - **self._efficiency_annotations( + **self._efficiency_annotations_row_waste_sum( self._memory_usage_sum(), self._memory_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_memory_cost, + self._per_row_cost_memory_expr(), + "pod_usage_memory_gigabyte_hours", + "pod_request_memory_gigabyte_hours", ), }, "capacity_aggregate": { @@ -659,10 +701,12 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( Coalesce(F("pod_limit_memory_gigabyte_hours"), Value(0, output_field=DecimalField())) ), - **self._efficiency_annotations( + **self._efficiency_annotations_row_waste_sum( self._memory_usage_sum(), self._memory_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_memory_cost, + self._per_row_cost_memory_expr(), + "pod_usage_memory_gigabyte_hours", + "pod_request_memory_gigabyte_hours", ), "capacity": Max( "cluster_capacity_memory_gigabyte_hours" diff --git a/koku/api/report/test/ocp/test_ocp_provider_map.py b/koku/api/report/test/ocp/test_ocp_provider_map.py index 244a3311d9..d47f614c1d 100644 --- a/koku/api/report/test/ocp/test_ocp_provider_map.py +++ b/koku/api/report/test/ocp/test_ocp_provider_map.py @@ -8,6 +8,7 @@ from decimal import Decimal from django.db.models import DecimalField +from django.db.models import Sum from django.db.models import Value from django_tenants.utils import tenant_context @@ -71,3 +72,235 @@ def test_distributed_costs_use_correct_exchange_rate(self): ).aggregate(val=getattr(mapper, mapper_attr)) self.assertEqual(result["val"], cost_val * expected_multiplier) + + def test_wasted_cost_cpu_is_sum_of_per_row_clamped_waste(self): + """Pooled utilization can be 100% while rows still have opportunity waste (S1-a).""" + cluster_id = "s1a-waste-test-cpu" + usage_date = self.dh.yesterday.date() + _dec = DecimalField(max_digits=33, decimal_places=15) + one = Value(Decimal("1"), output_field=_dec) + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("10"), + pod_usage_cpu_core_hours=Decimal("5"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + cost_model_cpu_cost=Decimal("0"), + ) + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("10"), + pod_usage_cpu_core_hours=Decimal("15"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + cost_model_cpu_cost=Decimal("0"), + ) + + mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) + aggregates = mapper.report_type_map["aggregates"] + qs = OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod").annotate( + exchange_rate=one, + infra_exchange_rate=one, + ) + result = qs.aggregate( + wasted_cost=aggregates["wasted_cost"], + cost_total=aggregates["cost_total"], + ) + # Row1: 100 * (1 - 5/10) = 50; row2: over-utilized -> 0 + self.assertEqual(result["wasted_cost"], Decimal("50")) + self.assertEqual(result["cost_total"], Decimal("200")) + + pooled_waste = result["cost_total"] * ( + Decimal("1") - Decimal("20") / Decimal("20") + ) # sum usage / sum request == 1 + self.assertEqual(pooled_waste, Decimal("0")) + self.assertNotEqual(result["wasted_cost"], pooled_waste) + + def test_wasted_cost_memory_is_sum_of_per_row_clamped_waste(self): + """Memory report uses the same per-row waste pattern as CPU.""" + cluster_id = "s1a-waste-test-mem" + usage_date = self.dh.yesterday.date() + _dec = DecimalField(max_digits=33, decimal_places=15) + one = Value(Decimal("1"), output_field=_dec) + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + pod_request_memory_gigabyte_hours=Decimal("10"), + pod_usage_memory_gigabyte_hours=Decimal("5"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + cost_model_memory_cost=Decimal("0"), + ) + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + pod_request_memory_gigabyte_hours=Decimal("10"), + pod_usage_memory_gigabyte_hours=Decimal("15"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + cost_model_memory_cost=Decimal("0"), + ) + + mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="memory", schema_name=self.schema_name) + aggregates = mapper.report_type_map["aggregates"] + qs = OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod").annotate( + exchange_rate=one, + infra_exchange_rate=one, + ) + result = qs.aggregate(wasted_cost=aggregates["wasted_cost"]) + self.assertEqual(result["wasted_cost"], Decimal("50")) + + def test_wasted_cost_cpu_row_with_zero_request_contributes_zero(self): + cluster_id = "s1a-waste-test-zero-req" + usage_date = self.dh.yesterday.date() + _dec = DecimalField(max_digits=33, decimal_places=15) + one = Value(Decimal("1"), output_field=_dec) + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("0"), + pod_usage_cpu_core_hours=Decimal("50"), + infrastructure_raw_cost=Decimal("999"), + infrastructure_markup_cost=Decimal("0"), + cost_model_cpu_cost=Decimal("0"), + ) + + mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) + aggregates = mapper.report_type_map["aggregates"] + qs = OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod").annotate( + exchange_rate=one, + infra_exchange_rate=one, + ) + result = qs.aggregate(wasted_cost=aggregates["wasted_cost"]) + self.assertEqual(result["wasted_cost"], Decimal("0")) + + def test_per_row_cost_cpu_sums_to_aggregate_cost_total(self): + """Sanity: Sum(row infra + markup + cm cpu) matches existing cost_total aggregate.""" + cluster_id = "s1a-cost-consistency" + usage_date = self.dh.yesterday.date() + _dec = DecimalField(max_digits=33, decimal_places=15) + one = Value(Decimal("1"), output_field=_dec) + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + infrastructure_raw_cost=Decimal("30"), + infrastructure_markup_cost=Decimal("7"), + cost_model_cpu_cost=Decimal("5"), + pod_request_cpu_core_hours=Decimal("1"), + pod_usage_cpu_core_hours=Decimal("1"), + ) + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="s1a-test", + usage_start=usage_date, + usage_end=usage_date, + infrastructure_raw_cost=Decimal("10"), + infrastructure_markup_cost=Decimal("2"), + cost_model_cpu_cost=Decimal("1"), + pod_request_cpu_core_hours=Decimal("1"), + pod_usage_cpu_core_hours=Decimal("1"), + ) + + mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) + aggregates = mapper.report_type_map["aggregates"] + qs = OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod").annotate( + exchange_rate=one, + infra_exchange_rate=one, + ) + row_cost = mapper._per_row_cost_cpu_expr() + summed = qs.aggregate(from_rows=Sum(row_cost), cost_total=aggregates["cost_total"]) + self.assertEqual(summed["from_rows"], summed["cost_total"]) + + def test_wasted_cost_order_by_desc_matches_annotation(self): + """Grouped wasted_cost annotation is sortable (used by order_by[wasted_cost]).""" + cluster_id = "s1a-waste-order" + usage_date = self.dh.yesterday.date() + _dec = DecimalField(max_digits=33, decimal_places=15) + one = Value(Decimal("1"), output_field=_dec) + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="low-waste", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("10"), + pod_usage_cpu_core_hours=Decimal("9"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + cost_model_cpu_cost=Decimal("0"), + ) + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="high-waste", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("10"), + pod_usage_cpu_core_hours=Decimal("1"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + cost_model_cpu_cost=Decimal("0"), + ) + + mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) + ann = mapper.report_type_map["annotations"] + rows = list( + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod") + .annotate(exchange_rate=one, infra_exchange_rate=one) + .values("namespace") + .annotate(**{k: ann[k] for k in ("wasted_cost",) if k in ann}) + .order_by("-wasted_cost") + ) + self.assertEqual(len(rows), 2) + self.assertEqual(rows[0]["namespace"], "high-waste") + self.assertEqual(rows[1]["namespace"], "low-waste") From 4536352013d1f47402ffda4ce4dd8872a0cf5dfb Mon Sep 17 00:00:00 2001 From: myersCody Date: Tue, 5 May 2026 15:58:07 -0400 Subject: [PATCH 04/18] Update docs --- docs/architecture/efficiency-scores/README.md | 16 +- .../formulas-and-data-contract.md | 25 +- .../s1a-per-line-item-waste-spec.md | 151 ----------- .../solution-options-and-limitations.md | 238 ------------------ 4 files changed, 20 insertions(+), 410 deletions(-) delete mode 100644 docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md delete mode 100644 docs/architecture/efficiency-scores/solution-options-and-limitations.md diff --git a/docs/architecture/efficiency-scores/README.md b/docs/architecture/efficiency-scores/README.md index d05e4cf82d..09cb7651f4 100644 --- a/docs/architecture/efficiency-scores/README.md +++ b/docs/architecture/efficiency-scores/README.md @@ -6,7 +6,7 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef ## One-paragraph scope -**Implemented:** For `cpu` and `memory` report types, the API adds **`total_score`** on the **`total`** object and **`score`** on each **data** row (when scores are computed), exposing **`usage_efficiency_percent`** and **`wasted_cost`** derived from aggregated usage/request hours and CPU- or memory-scoped **`cost_total`** on [`OCPUsageLineItemDailySummary`](../../../koku/reporting/models.py). **Not implemented:** a separate `efficiency`-only route; cost/volume reports do not expose these fields. **Backlog** (out of code): idle/cost-efficiency/overhead scores and dedicated Optimizations Summary routing in the UI. **Design note:** pooled aggregate math vs “sum of per-workload waste” and CPU+memory double-count are analyzed in [solution-options-and-limitations.md](./solution-options-and-limitations.md) (not bugs—semantic choices). +**Implemented:** For `cpu` and `memory` report types, the API adds **`total_score`** on the **`total`** object and **`score`** on each **data** row (when scores are computed), exposing **`usage_efficiency_percent`** (ratio of summed usage to summed request) and **`wasted_cost`** (**sum of per-line-item clamped waste**, same cost basis as **`cost_total`** at row grain — details in [formulas-and-data-contract.md](./formulas-and-data-contract.md)) on [`OCPUsageLineItemDailySummary`](../../../koku/reporting/models.py). **Not implemented:** a separate `efficiency`-only route; cost/volume reports do not expose these fields. **Backlog** (out of code): idle/cost-efficiency/overhead scores and dedicated Optimizations Summary routing in the UI. **Design note:** summing **`wasted_cost`** from both compute and memory can double-count shared infra costs if customers treat the two numbers as additive “total waste”—call that out in UX copy rather than changing the API contract here. --- @@ -27,7 +27,6 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef |-------|----------|------| | 1 | This README | As-built behavior, code map, builder handoff | | 2 | [formulas-and-data-contract.md](./formulas-and-data-contract.md) | Exact math, cost basis, rounding, when scores are empty | -| 3 | [solution-options-and-limitations.md](./solution-options-and-limitations.md) | Known math limitations (aggregation / double-count), solution axes, IQ for future changes | --- @@ -40,7 +39,7 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef | **Scores in response** | [`OCPReportQueryHandler._pack_score`](../../../koku/api/report/ocp/query_handler.py) builds `usage_efficiency_percent` (int) and `wasted_cost` `{ value, units }`. The **`total`** block exposes this as **`total_score`** (rename from internal `score` after packing). **Data rows** keep the key **`score`** (same inner shape). | | **When scores are omitted** | For `cpu` / `memory` only: if **more than one** `group_by` dimension is present **or** there is **tag** `group_by` **or** tag keys under **`filter`**, `should_compute` is false → `total_score` and per-row `score` are **empty objects** `{}`. **Tag `exclude` is not part of this check** ([`execute_query`](../../../koku/api/report/ocp/query_handler.py) uses `get_tag_filter_keys()` with the default **`filter`** parameter in [`ReportQueryHandler.get_tag_filter_keys`](../../../koku/api/report/queries.py)). Multi `group_by` is covered by [`test_efficiency_score_multi_group_by_returns_empty`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py). | | **Reports without scores** | `costs`, `costs_by_project`, `volume`, and other non-inventory types do not add `total_score`. Tests: [`test_efficiency_score_cost_report_excluded`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py), [`test_efficiency_score_volume_report_excluded`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py). | -| **Aggregations** | [`OCPProviderMap._efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) defines ORM expressions for `usage_efficiency` and `wasted_cost` on `cpu` and `memory` **aggregates** and **report_annotations**. | +| **Aggregations** | [`OCPProviderMap._efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py) defines `usage_efficiency` (ratio-of-sums) and `wasted_cost` (`Sum` of per-row clamped waste) for `cpu` and `memory` **aggregates** and **report_annotations**, using [`_per_row_cost_cpu_expr`](../../../koku/api/report/ocp/provider_map.py) / [`_per_row_cost_memory_expr`](../../../koku/api/report/ocp/provider_map.py). | | **Ordering** | In-memory sort includes `usage_efficiency` and `wasted_cost` in [`ReportQueryHandler._order_by`](../../../koku/api/report/queries.py). | | **Tenant boundary** | All queries run under **`tenant_context(self.tenant)`** in [`OCPReportQueryHandler.execute_query`](../../../koku/api/report/ocp/query_handler.py). | | **Pipeline / SQL** | No Celery tasks and no new SQL templates — scores are ORM aggregates on existing tenant line items. **Dual-path** (`trino_sql` / `self_hosted_sql`) is unchanged for this feature. | @@ -68,8 +67,8 @@ flowchart LR | ID | Resolution | Evidence | |----|------------|----------| | IQ-5 (endpoint shape) | **Extend** existing compute/memory inventory endpoints; **no** separate `efficiency` route. | Views unchanged path; handler gates on `report_type in ("cpu", "memory")`. | -| IQ-1 (fleet / total row) | **Single ratio over the filtered row set** — totals use the same aggregate expressions as grouped rows (ratio-of-sums style at the SQL aggregate level). | `query.aggregate(**aggregates)` in [`execute_query`](../../../koku/api/report/ocp/query_handler.py) includes `usage_efficiency` / `wasted_cost` from [`OCPProviderMap`](../../../koku/api/report/ocp/provider_map.py). | -| IQ-2 (wasted cost basis) | **`wasted_cost = max(cost_total * (1 - usage/request), 0)`** with CPU- or memory-specific **`cost_total`** and matching usage/request **sums**. | [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py). Details in [formulas-and-data-contract.md](./formulas-and-data-contract.md). | +| IQ-1 (fleet / total row) | **`usage_efficiency_percent`:** single ratio of sums over the filtered row set. **`wasted_cost`:** `Sum` of per-row clamped waste over that same set (not fleet pooled `cost × (1 - U/R)`). Grouped rows use the same pattern per bucket. | `query.aggregate(**aggregates)` in [`execute_query`](../../../koku/api/report/ocp/query_handler.py); annotations from [`OCPProviderMap`](../../../koku/api/report/ocp/provider_map.py). | +| IQ-2 (wasted cost basis) | **Per-row sum:** each row contributes `max(row_cost × (1 - u/r), 0)` with row cost aligned to **`cost_total`** components; fleet and groups use **`Sum`** of those values. | [`_efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py), [formulas-and-data-contract.md](./formulas-and-data-contract.md). | | IQ-6 (RBAC) | Same access pattern as other OCP report views (no separate optimizations-only permission in backend). | Reuses existing views. | --- @@ -83,7 +82,7 @@ flowchart LR | — | **`request_sum == 0`** semantics | Implementation coalesces **`usage_efficiency_percent`** to **0** and **`wasted_cost`** to **0** (not `null`). Confirm UX/OpenAPI wording. | | — | Tag / multi-dimension `group_by` | Scores intentionally empty; document for UI. | | — | Tag **`exclude`** vs `should_compute` | Code does not pass `"exclude"` into [`get_tag_filter_keys`](../../../koku/api/report/queries.py); align product/UI expectations or extend `has_tag_interaction` if excludes should suppress scores. | -| IQ-7–IQ-10 | Bottom-up waste grain, metric alignment, dual-metric naming, infra split | See [solution-options-and-limitations.md](./solution-options-and-limitations.md). | +| IQ-7–IQ-10 | Grain / naming / infra split follow-ups | **`wasted_cost`** is sum of row-level clamped waste; **`usage_efficiency_percent`** remains ratio-of-sums (the two can diverge). Further product choices are backlog, not encoded here. | --- @@ -91,10 +90,11 @@ flowchart LR | Date | Summary | |------|---------| -| 2026-04-24 | Added [solution-options-and-limitations.md](./solution-options-and-limitations.md) (cost-efficiency problems brief → solution axes and IQ-7–10). | +| 2026-04-24 | Added companion solution-options doc (cost-efficiency brief; doc later removed from tree). | | 2026-04-16 | Initial agent-focused hub and formulas doc from product brief. | | 2026-04-17 | Rewrote hub for **as-built** implementation (compute/memory, `total_score` / `score`, formulas, no new pipeline). | | 2026-04-21 | Corrected **when scores are omitted**: tag **`exclude`** does not affect `should_compute` today (only tag `group_by` and tag **`filter`** keys). | +| 2026-05-05 | **`wasted_cost`** documented as per-row clamped waste **`Sum`**; README tables and aggregation pointer updated (`_efficiency_annotations_row_waste_sum`). | --- @@ -102,7 +102,7 @@ flowchart LR | Block | Content | |-------|---------| -| **Doc map** | This README (overview + links) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math + JSON) → [solution-options-and-limitations.md](./solution-options-and-limitations.md) (limitations + future fixes). Reading order: README → formulas → solution options (if changing behavior). | +| **Doc map** | This README (overview + links) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math + JSON). Reading order: README → formulas when changing behavior. | | **Assumptions** | None beyond what is cited from code; UI “Optimizations Summary” tab wiring lives in koku-ui. | | **IQ / decisions** | Resolved items in **Resolved decisions** table; backlog in **Open questions**. | | **API contract summary** | `GET …/reports/openshift/compute/` and `GET …/reports/openshift/memory/` with `filter` / `group_by` / `order_by` as other OCP inventory reports. Response: **`total.total_score`**: `{ usage_efficiency_percent: int, wasted_cost: { value, units } }` or `{}`. Data leaves: **`score`** same shape or `{}`. **`order_by[usage_efficiency]`**, **`order_by[wasted_cost]`** supported (with valid `group_by` per existing serializer rules). | diff --git a/docs/architecture/efficiency-scores/formulas-and-data-contract.md b/docs/architecture/efficiency-scores/formulas-and-data-contract.md index 9d5088018d..a028ff9104 100644 --- a/docs/architecture/efficiency-scores/formulas-and-data-contract.md +++ b/docs/architecture/efficiency-scores/formulas-and-data-contract.md @@ -28,7 +28,7 @@ Let **`usage_sum`** and **`request_sum`** be the **Sum** expressions for the app | Expression | Meaning | |------------|---------| | Ratio | `usage_sum / NULLIF(request_sum, 0)` — avoids division by zero. | -| **Percent** | `Round(ratio * 100)` as **integer** (Django `Round` + `IntegerField` in [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py)). | +| **Percent** | `Round(ratio * 100)` as **integer** (Django `Round` + `IntegerField` in [`_efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py)). | | **Null / zero request** | `Coalesce(..., 0)` → API exposes **`0`** when the ratio is null (including **`request_sum == 0`**). | So **`usage_efficiency_percent` can exceed 100** when usage exceeds request over the aggregated rows (not clamped). @@ -41,18 +41,16 @@ Response shaping is done in [`OCPReportQueryHandler._pack_score`](../../../koku/ ## Wasted cost (`wasted_cost`) -### Definition (implemented) - -Uses the same **`usage_sum`**, **`request_sum`**, and a **`cost_total_expr`** passed into [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py): +### Definition (implemented) — sum of per-row waste -`wasted_cost = Coalesce(Greatest(cost_total * (1 - usage_sum / NullIf(request_sum, 0)), 0), 0)` (ORM/SQL expression; `NullIf` avoids divide-by-zero). +[`_efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py) computes **`wasted_cost`** as **`Sum(per_row_waste)`** over the same rows that feed the report (fleet aggregate or each `group_by` bucket). **`usage_efficiency_percent`** in that helper remains **ratio-of-sums** (see above); the two fields **can diverge** by design. -- **`cost_total`** for **`cpu`**: `cloud_infrastructure_cost + markup_cost + cost_model_cpu_cost` (same components as the `cost_total` aggregate for CPU inventory). -- **`cost_total`** for **`memory`**: `cloud_infrastructure_cost + markup_cost + cost_model_memory_cost`. +Per row *i*, with coalesced usage **u_i**, request **r_i**, and row cost **c_i** (same linear components as aggregate **`cost_total`**, without wrapping **`Sum`**): -So this is **not** “waste percent × arbitrary headline cost” from a different column — it tracks the **`cost_total` expression for that dimension** in the provider map. +- **c_i** from [`_per_row_cost_cpu_expr`](../../../koku/api/report/ocp/provider_map.py) (**`cpu`**) or [`_per_row_cost_memory_expr`](../../../koku/api/report/ocp/provider_map.py) (**`memory`**): infra raw + markup (each × `infra_exchange_rate`) + cost-model CPU or memory (× `exchange_rate`), matching the inventory **`cost_total`** basis at row grain. +- **waste_i** = `Coalesce(Greatest(c_i * (1 - u_i / NullIf(r_i, 0)), 0), 0)` — `NullIf` avoids divide-by-zero; **`r_i == 0`** → that row contributes **0**; over-requested rows contribute **0** waste. -When **`request_sum`** is zero, the ratio is null; the expression yields null and **`Coalesce`** forces **`wasted_cost`** to **decimal 0**. +**`wasted_cost`** = **Σ waste_i** (ORM **`Sum(per_row_waste)`**), with outer **`Coalesce(..., 0)`** when the aggregate is null. ### API shape @@ -69,8 +67,8 @@ When **`request_sum`** is zero, the ratio is null; the expression yields null an ## Fleet vs grouped rows -- **Total row:** `query.aggregate(**aggregates)` applies the same **`usage_efficiency`** and **`wasted_cost`** definitions over the **entire filtered queryset** — i.e. one combined ratio and one wasted-cost expression for the fleet (not an average of per-cluster percentages). -- **Grouped rows:** Each group gets its own **`usage_sum`**, **`request_sum`**, and **`cost_total`** slice from `values(...).annotate(...)`. +- **Total row:** `query.aggregate(**aggregates)` applies **`usage_efficiency`** as **one ratio of sums** over the **entire filtered queryset**, and **`wasted_cost`** as the **sum of per-row waste** over those same rows (not an average of per-cluster percentages, and not fleet `cost_total × (1 - U/R)`). +- **Grouped rows:** Each group gets its own **`usage_sum`**, **`request_sum`**, and its own **`Sum(per_row_waste)`** from `values(...).annotate(...)`. --- @@ -136,7 +134,7 @@ When scores are disabled, `"total_score": {}`. | Metric | Status | |--------|--------| -| Waste score `max(100 - efficiency, 0)` | Not exposed as its own field; wasted cost follows the formula above. | +| Waste score `max(100 - efficiency, 0)` | Not exposed as its own field; **`wasted_cost`** is sum-of-row clamped waste (not that scalar). | | Idle / signed unused | Backlog; see README IQ-3. | | Cost efficiency / overhead scores | Backlog. | @@ -147,4 +145,5 @@ When scores are disabled, `"total_score": {}`. | Date | Summary | |------|---------| | 2026-04-16 | Initial formulas; illustrative JSON; fleet and wasted-cost options. | -| 2026-04-17 | Aligned with implementation: `_efficiency_annotations`, `_pack_score`, `total_score` vs `score`, `request=0` → 0, wasted cost formula and cost basis. | +| 2026-04-17 | Aligned with implementation: `_pack_score`, `total_score` vs `score`, `request=0` → 0, wasted cost formula and cost basis. | +| 2026-05-05 | **`wasted_cost`:** `_efficiency_annotations_row_waste_sum`, per-row cost helpers, **`Sum`** of clamped row waste; **`usage_efficiency_percent`** unchanged (ratio-of-sums). | diff --git a/docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md b/docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md deleted file mode 100644 index 036538f6c0..0000000000 --- a/docs/architecture/efficiency-scores/s1a-per-line-item-waste-spec.md +++ /dev/null @@ -1,151 +0,0 @@ -# Spec: S1-a — Per-line-item waste, then `Sum` (current OCP compute/memory flow) - -**Status:** Draft for engineering -**Parent analysis:** [solution-options-and-limitations.md](./solution-options-and-limitations.md) (S1-a, S2-a) -**Math contract (today):** [formulas-and-data-contract.md](./formulas-and-data-contract.md) -**Hub:** [README.md](./README.md) - ---- - -## 1. Objective - -Change **`wasted_cost`** on OpenShift **`cpu`** and **`memory`** inventory reports so it matches **bottom-up “sum of opportunities”**: for each qualifying row in the report queryset, compute waste from **that row’s** usage, request, and dimension-scoped cost, clamp at zero, then **`Sum`** those values for the fleet total and for each **`group_by`** bucket. - -**Unchanged unless product decides otherwise (IQ-8):** -- **`usage_efficiency_percent`** stays **ratio-of-sums** (aggregate `usage_sum / request_sum`, rounded), as today. -- **`should_compute`** gates, response shape (`total_score` / `score`), tenancy (`tenant_context`), and **Problem 3** (CPU + memory double-count of shared infra if users sum endpoints) are **out of scope** for this spec unless explicitly pulled in. - ---- - -## 2. Semantics - -### 2.1 Row grain - -**Assumption (pending IQ-7):** One **`OCPUsageLineItemDailySummary`** row after the report’s filters is the unit of “line item” for S1-a. Product confirms this matches FinOps examples (pod vs coarser aggregation). - -If IQ-7 concludes a different grain, this spec’s ORM expressions must be applied at that grain (e.g. subquery or pipeline pre-aggregation) before **`Sum`**; the **formula** below still applies per grain unit. - -### 2.2 Per-row fields (mirror today’s `cost_total` basis) - -For each report **`report_type`**: - -| Dimension | Row usage (hours) | Row request (hours) | Row `cost_total` (same components as [formulas-and-data-contract.md](./formulas-and-data-contract.md)) | -|-----------|-------------------|---------------------|--------------------------------------------------------------------------------------------------------| -| **cpu** | `pod_usage_cpu_core_hours` | `pod_request_cpu_core_hours` | `cloud_infrastructure_cost` + `markup_cost` + **`cost_model_cpu_cost`** — expressed **per row** (see §3.2) | -| **memory** | `pod_usage_memory_gigabyte_hours` | `pod_request_memory_gigabyte_hours` | `cloud_infrastructure_cost` + `markup_cost` + **`cost_model_memory_cost`** — **per row** | - -Use **`Coalesce(..., 0)`** on usage and request consistently with [`_cpu_usage_sum`](../../../koku/api/report/ocp/provider_map.py) / [`_memory_request_sum`](../../../koku/api/report/ocp/provider_map.py). - -### 2.3 Per-row waste expression - -Let **c_i**, **u_i**, **r_i** be row cost, usage, and request after coalescing. Per row: - -```text -waste_i = max( c_i * (1 - u_i / NULLIF(r_i, 0)), 0 ) -``` - -(`NULLIF` matches SQL / ORM: divide-by-zero avoided; when the ratio is null, the implementation **`Coalesce`s** the clamped term to **0**, same as [formulas-and-data-contract.md](./formulas-and-data-contract.md).) - -**Aggregate** `wasted_cost` for the bucket: - -```text -wasted_cost = sum over i of waste_i -``` - -— over the filtered set (fleet **`aggregate`**) or within each **`values(...).annotate(...)`** group. - -This simultaneously addresses **Problem 1 (aggregation bias)** and **Problem 2 (over-utilization masking)** as described in [solution-options-and-limitations.md](./solution-options-and-limitations.md). - ---- - -## 3. Technical design (current stack) - -### 3.1 Touch points - -| Layer | File | Change | -|-------|------|--------| -| Aggregates & grouped annotations | [`koku/api/report/ocp/provider_map.py`](../../../koku/api/report/ocp/provider_map.py) | Replace pooled **`wasted_cost`** in **`_efficiency_annotations`** usage for **`cpu`** / **`memory`** with **`Sum(per_row_waste_expr)`**, or add a parallel helper (e.g. `_efficiency_annotations_row_waste_sum`) and wire it into **`aggregates`** and **`annotations`** for those report types only. | -| Response packing | [`koku/api/report/ocp/query_handler.py`](../../../koku/api/report/ocp/query_handler.py) | No shape change; still maps internal **`wasted_cost`** to **`wasted_cost.value`**. | -| Ordering | [`koku/api/report/queries.py`](../../../koku/api/report/queries.py) (`_order_by`) | **`order_by[wasted_cost]`** must sort by the **new** annotated `Sum` (same alias **`wasted_cost`**). Confirm in-memory ordering still sees the annotation after queryset evaluation. | -| Tests | [`koku/api/report/test/ocp/test_ocp_query_handler.py`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py) and IQE | Update expectations where fleet/group totals used pooled math; add regression cases matching brief examples / `test_api_ocp_ingest_efficiency_*` scenarios. | - -**Non-goals for this change:** `masu/database/sql/`, `trino_sql/`, Celery tasks — S1-a can remain **ORM-only** on tenant summaries (same as today). - -### 3.2 Row-level `cost_total` expression - -Today, [`cloud_infrastructure_cost`](../../../koku/api/report/ocp/provider_map.py) and [`markup_cost`](../../../koku/api/report/ocp/provider_map.py) are **`Sum(...)`** over row expressions. For per-row waste, introduce **non-aggregated** equivalents (same multipliers and fields, without **`Sum`**), e.g.: - -- Row infra: `Coalesce(F("infrastructure_raw_cost"), 0) * Coalesce(F("infra_exchange_rate"), 1)` -- Row markup: `Coalesce(F("infrastructure_markup_cost"), 0) * Coalesce(F("infra_exchange_rate"), 1)` -- Row CPU cost model: `Coalesce(F("cost_model_cpu_cost"), 0) * Coalesce(F("exchange_rate"), 1)` (memory: `cost_model_memory_cost`) - -**Sum** of these row terms must equal the existing aggregated **`cost_total`** for the same queryset (sanity check / test). - -Compose **`per_row_cost_cpu`** / **`per_row_cost_memory`** as the same linear combinations used in **`cost_total`** for **`cpu`** and **`memory`** blocks (lines ~430 and ~581 in `provider_map.py`). - -### 3.3 ORM pattern - -Conceptually: - -```text -per_row_waste = Coalesce( - Greatest( - per_row_cost * (1 - per_row_usage / NullIf(per_row_request, 0)), - Value(0), - ), - Value(0), -) -wasted_cost = Sum(per_row_waste) # in aggregate() or grouped annotate() -``` - -Reuse the same **`DecimalField`** precision patterns as [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) to avoid silent type widening issues. - -**`usage_efficiency`:** Keep current implementation (aggregated **`usage_sum_prop`** / **`request_sum_prop`**) so the percent column does not change unless IQ-8 chooses alignment. - ---- - -## 4. Product / API decisions (blockers or follow-ups) - -| ID | Question | Impact if unresolved | -|----|----------|----------------------| -| **IQ-7** | Confirm **`OCPUsageLineItemDailySummary`** row = intended “workload” unit | Wrong grain → wrong customer narrative | -| **IQ-8** | OK for **`usage_efficiency_percent`** (pooled ratio) and **`wasted_cost`** (sum of row waste) to **diverge**? | Doc + UI copy; possible future second field if both metrics needed (IQ-9) | -| **IQ-9** | If customers need **both** pooled and opportunity waste, names and defaults | New or renamed JSON fields; OpenAPI | - -Until IQ-8 is explicit, document in release notes that **efficiency %** is still **fleet/group utilization** (ratio of sums), while **wasted cost** is **additive opportunity** (sum of clamped row waste). - ---- - -## 5. Acceptance criteria - -1. **Numeric:** For the worked examples in [solution-options-and-limitations.md](./solution-options-and-limitations.md) (Problems 1 and 2), API **`wasted_cost`** for the same filtered data matches **sum of per-row clamped waste**, not pooled **`cost × (1 - U/R)`**. -2. **`group_by`:** Project, cluster, node, and other supported inventory groupings return **`wasted_cost`** = **`Sum`** of row waste **within that group**; fleet total = **`Sum`** over all filtered rows. -3. **Edge cases:** **`request_sum == 0`** at row level → that row contributes **0** waste; over-utilized rows contribute **0**; under-utilized rows still contribute positive amounts when the group’s pooled ratio would have zeroed pooled waste. -4. **Ordering:** **`order_by[wasted_cost]`** orders by the new aggregate. -5. **Tenancy:** No regression to **`tenant_context`** isolation. -6. **Performance:** Run **`EXPLAIN (ANALYZE)`** (or equivalent) on representative tenant/date windows; record delta vs current pooled expression. If unacceptable, follow-up task: partial indexes, materialized subquery, or precomputed column (out of scope here). - ---- - -## 6. Test plan (minimum) - -- **Unit / handler:** Extend [`test_ocp_query_handler`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py) with small fixtures: two rows in one cluster, costs and usage/request chosen so pooled waste ≠ sum of row waste; assert **`wasted_cost`** equals sum of row formula. -- **Over-utilization mix:** One under-, one over-provisioned row; assert non-zero **`wasted_cost`** when pooled would be zero. -- **Regression:** Existing tests that hard-code old **`wasted_cost`** values must be updated. -- **Ordering:** One test that **`order_by[wasted_cost]`** returns rows in descending **`wasted_cost`** under the new definition. - ---- - -## 7. Documentation updates (post-merge) - -- [formulas-and-data-contract.md](./formulas-and-data-contract.md): **`wasted_cost`** section — replace pooled definition with **sum of row-level clamped waste**; note **`usage_efficiency_percent`** unchanged (if IQ-8 confirms). -- [README.md](./README.md): Resolved / backlog table — note IQ-2 superseded or narrowed for **`wasted_cost`** only. -- [solution-options-and-limitations.md](./solution-options-and-limitations.md): Optional one-line “**Implemented:** S1-a as of …” when shipped. - ---- - -## 8. Changelog - -| Date | Summary | -|------|---------| -| 2026-04-29 | Initial spec from S1-a row in solution-options-and-limitations. | diff --git a/docs/architecture/efficiency-scores/solution-options-and-limitations.md b/docs/architecture/efficiency-scores/solution-options-and-limitations.md deleted file mode 100644 index e4b30b917d..0000000000 --- a/docs/architecture/efficiency-scores/solution-options-and-limitations.md +++ /dev/null @@ -1,238 +0,0 @@ -# Efficiency scores — solution options and known limitations - -Companion to [README.md](./README.md) and [formulas-and-data-contract.md](./formulas-and-data-contract.md). This document **analyzes** the three cost-efficiency issues. It does **not** change implementation; it maps problems to **verified backend behavior** and lists **solution axes** for future work. - ---- - -## Scope restatement - -| Aspect | Detail | -| --- | --- | -| Goal | Produce engineering-ready options so builders and PM can choose semantics without rediscovering math or data grain. | -| In scope | Problem ↔ code mapping; solution families; tradeoffs; tenancy and query-performance notes; IQ/decisions. | -| Out of scope | Serializer code, migrations, SQL templates, UI copy, OpenAPI regeneration. | - -**Success criteria:** A reader can explain why fleet `wasted_cost` can differ from a sum of “intuitive” per-workload waste, what would change under each fix, and what remains a **product definition** choice (e.g. naming two different metrics). - ---- - -## Verified: how the backend computes waste today - -The ORM builds **`usage_efficiency`** and **`wasted_cost`** from **aggregated** `usage_sum` and `request_sum` expressions at the **same grain as the report query** (fleet `aggregate()`, or `values(...).annotate(...)` for each group). The waste term is: - -`Greatest(cost_total * (1 - usage_sum / NullIf(request_sum, 0)), 0)` — see [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) and [formulas-and-data-contract.md](./formulas-and-data-contract.md). - -**Consequences (mathematical, not bugs):** - -1. **Fleet / group `wasted_cost` is not** \(\sum_i \max(c_i \cdot (1 - u_i/r_i), 0)\) **unless** each line item is computed separately and then summed. The shipped formula is **one** ratio applied to **one** pooled `cost_total` for the grouped rows. -2. **`Greatest(..., 0)` applies once** to that pooled expression. If the pooled ratio exceeds 1 (usage > request in aggregate), the whole waste term is **0** for that bucket—even when some underlying rows would have positive waste under a per-row formula. - -These match **Problem 1 (aggregation bias)** and **Problem 2 (negative waste offset)** in the brief. IQE reproducers linked from that brief (`test_api_ocp_ingest_efficiency_calc_logic`, `test_api_ocp_ingest_efficiency_overutilization`) are the regression anchor if behavior changes. - -### Flow contrast (conceptual) - -```mermaid -flowchart TB - cHead([Current — one ratio per bucket]) - cHead --> A1["Sum usage, Sum request, Sum cost in bucket"] - A1 --> A2["waste = max(cost * (1 - U/R), 0)"] - - pHead([Proposed — row-level then sum]) - pHead --> B1["Per row — waste_i = max(cost_i * (1 - u_i/r_i), 0)"] - B1 --> B2["Sum waste_i in bucket"] -``` - ---- - -## Problem 1 — Aggregation bias in efficiency / wasted cost - -### Detailed description - -Many FinOps users think of **wasted cost** as an **additive** quantity: for each workload (pod, service, line item), estimate how much of *that* workload’s spend is “unused” relative to requests, then **add** those amounts to get a cluster, project, or fleet total. That is a **bottom-up** definition. - -The backend today uses a **top-down** definition at each API bucket (fleet `total`, or each `group_by` row such as cluster or project). It first **adds** usage hours, request hours, and cost across every line item in the bucket, then applies **one** utilization ratio \(U/R\) to **one** pooled `cost_total` for that bucket (see [formulas-and-data-contract.md](./formulas-and-data-contract.md) and [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py)). - -Those two definitions are **different functions** of the underlying data. They coincide in special cases (e.g. a single homogeneous workload, or proportional cost and identical ratios) but **diverge** when: - -- **Costs are uneven** across rows with different usage/request ratios (cheap rows vs expensive rows pull the pooled ratio and pooled cost differently than a sum of per-row waste). -- **`group_by` changes the bucket** (e.g. project view vs cluster view): the same underlying pods appear in different groupings, so top-down waste **per row** is not a simple roll-up of a single canonical “pod list” unless the math matches bottom-up. - -So the “problem” is not random API error; it is a **semantic mismatch** between **pooled structural waste** (what ships) and **sum of per-workload opportunity** (what many dashboards assume). The gap can appear as “**incorrect** total wasted cost” in **group_by project** and wrong **totals and per-row cluster** behavior in **group_by cluster**. - -### Worked example (two pods, one cluster bucket) - -Assume a single cluster bucket contains **two** workloads. Costs and hours are chosen so bottom-up and top-down disagree (same numbers as in the cost-efficiency brief). - -| Workload | Cost \(c_i\) | Usage \(u_i\) | Request \(r_i\) | Utilization \(u_i/r_i\) | Bottom-up waste \(\max(c_i(1-u_i/r_i), 0)\) | -|----------|-------------|----------------|------------------|-------------------------|-----------------------------------------------| -| Pod 1 | $200 | 5 | 10 | 50% | \(200 \times (1 - 0.5) =\) **$100** | -| Pod 2 | $1,050 | 1 | 5 | 20% | \(1050 \times (1 - 0.2) =\) **$840** | -| **Pooled (what the API uses for the bucket)** | **$1,250** | **6** | **15** | \(6/15 = 40\%\) | \(1250 \times (1 - 0.4) =\) **$750** | - -- **Bottom-up (sum of opportunities):** \(100 + 840 =\) **$940**. -- **Top-down (current backend):** one ratio on pooled sums: \((1 - 6/15) \times 1250 = 0.6 \times 1250 =\) **$750**. - -**Discrepancy:** **$190** (~20% in this toy scenario) between the two definitions. The backend matches **top-down**; users who sum or mentally aggregate **per-pod** waste will expect **bottom-up**. - -### How this ties to `wasted_cost` in code - -For each response bucket, `usage_sum` and `request_sum` are **aggregates over all rows** in that bucket, and `wasted_cost` applies `Greatest(cost_total × (1 - usage_sum/request_sum), 0)` once. There is **no** intermediate per-row waste term in the shipped path. - -### Solution families - -| ID | Approach | Idea | Pros | Cons / risks | -|----|-----------|------|------|----------------| -| **S1-a** | **Per–line-item waste, then `Sum`** in SQL/ORM | Annotate each `OCPUsageLineItemDailySummary` row with row-level waste (same `cost_total` basis as today, but using **row** usage/request fields), aggregate with `Sum` for fleet and for each `group_by`. | Aligns with FinOps “sum of opportunities”; stable under splitting/merging pods in examples. | Heavier queries; must define **row grain** (see IQ-7); `cost_total` at row level must remain defined and consistent with CPU vs memory split (ties to Problem 3). | -| **S1-b** | **Keep formula; rename / document** | Treat current `wasted_cost` as **“pooled structural waste”** (or similar); add separate optional field for bottom-up sum if product wants both. | No query rewrite; clears customer confusion. | Two metrics to explain; UI and API surface decisions. | -| **S1-c** | **Weighted / Shapley-style allocation** | Allocate pooled waste to dimensions by a principled rule. | Single headline number with axioms. | Hard to explain; implementation complexity; may still disagree with pod-sum intuition. | - -**Recommendation for design discussion:** Default engineering path for “fix the number” is **S1-a** at the **finest grain stored in** [`OCPUsageLineItemDailySummary`](../../../koku/reporting/provider/ocp/models.py) **that still carries the same cost and usage fields** as the report—**after** product confirms that grain matches “workload” in their examples (IQ-7). - ---- - -## Problem 2 — Over-utilization masking under-utilization waste - -### Detailed description - -When **some** workloads in a bucket are **under-provisioned** (usage below request, so there is “headline” waste) and **others** are **over-provisioned** (usage above request, sometimes called burst or pressure risk), a reasonable product question is: **should over-utilization reduce reported waste for the whole bucket?** - -Under the **current** formula, **aggregation happens first**: the bucket’s total usage \(U\) and total request \(R\) are summed, then \(1 - U/R\) is computed **once**. If \(U > R\) for the bucket as a whole, then \((1 - U/R) < 0\), and **`Greatest(..., 0)`** forces **`wasted_cost` to $0** for that entire bucket—even if **individual** rows would still show positive waste if waste were computed **per row** and then summed (each over-provisioned row would contribute **$0** after its own clamp, but under-provisioned rows would still contribute positive amounts). - -So **over-utilized** lines do not “subtract” dollar waste in a ledger sense; they **inflate the pooled utilization ratio** so much that the **single** pooled \((1-U/R)\) term goes non-positive, and the **one** `Greatest` at bucket level **zeros out** the whole bucket. That **masks** under-provisioned waste in reporting—a **cluster** or **project** total can read **$0** while **actionable** right-sizing opportunity still exists on a subset of pods. - -This is the same **order-of-operations** issue as Problem 1, but the user-visible symptom is often described as **“negative waste offset”** or **subsidy**: burst usage on one cohort **hides** savings opportunity on another. - -### Worked example (one under-provisioned pod, one over-provisioned pod) - -| Workload | Cost \(c_i\) | Usage \(u_i\) | Request \(r_i\) | \(u_i/r_i\) | Per-row waste \(\max(c_i(1-u_i/r_i),0)\) | -|----------|-------------|----------------|------------------|-------------|-------------------------------------------| -| Pod 1 (under) | $200 | 5 | 10 | 50% | **$100** | -| Pod 2 (over) | $1,150 | 15 | 5 | 300% | \(\max(1150 \times (1 - 3), 0) =\) **$0** | -| **Pooled bucket** | **$1,350** | **20** | **15** | \(20/15 \approx 133\%\) | **Current:** \(\max(1350 \times (1 - 20/15), 0) = \max(\text{negative}, 0) =\) **$0** | - -- **Bottom-up sum of clamped row waste:** \(100 + 0 =\) **$100** (still **$100** of opportunity on Pod 1 if you resize requests). -- **Current backend (pooled):** **$0** for the cluster bucket. - -So the dashboard can show **no** wasted cost at **cluster** (and similar masking can appear at **project** level in the brief’s screenshots), while the **actual** opportunity narrative remains **$100**. IQE coverage: `test_api_ocp_ingest_efficiency_overutilization` (see external brief / plugin MR). - -### How this ties to `wasted_cost` in code - -[`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) applies **`Greatest`** to the expression built from **already-aggregated** `usage_sum_prop` and `request_sum_prop`. There is **no** per-row `Greatest` before the `Sum` in the shipped implementation. - -### Solution families - -| ID | Approach | Relationship to Problem 1 | -|----|-----------|---------------------------| -| **S2-a** | **Row-level clamp, then sum** (`Sum(Greatest(row_expr, 0))`) | Same implementation core as **S1-a**; fixes both bias and masking in typical examples. | -| **S2-b** | **Cap efficiency at 100% for waste only** | `waste ∝ max(0, 1 - min(U/R, 1))` at aggregated or row level; **does not** by itself restore sum-of-pod waste if cost is still pooled in one ratio—usually combined with S1-a or careful definition. | -| **S2-c** | **Document only** | State that fleet waste is **zero whenever aggregate efficiency ≥ 100%** regardless of per-pod mix. | Lowest cost; leaves PM/CS explaining edge cases. | - -**Recommendation:** **S2-a** is the same lever as **S1-a**; separating them in the brief is useful for **QA and messaging**, not for two different code paths unless product wants **both** “pooled” and “opportunity” metrics. - ---- - -## Problem 3 — Infrastructure (and full) cost in both CPU and memory dimensions - -### Detailed description - -OpenShift **compute** (`…/reports/openshift/compute/`) and **memory** (`…/reports/openshift/memory/`) reports are **separate** report types. Each builds **`wasted_cost`** from a **dimension-specific** `cost_total`: CPU uses CPU-related cost model components; memory uses memory-related ones ([formulas-and-data-contract.md](./formulas-and-data-contract.md)). - -However, **both** dimensions’ `cost_total` expressions include the **same** classes of charges that are **not** split by dimension—specifically **`cloud_infrastructure_cost + markup_cost`** in each case, plus the respective **`cost_model_cpu_cost`** or **`cost_model_memory_cost`**. Intuitively, much of **raw infrastructure** spend is **shared** by a running pod (you pay for the node/cluster once, not “once per dimension” in the customer’s mental model). - -If a consumer **adds** `wasted_cost` from the **CPU** response to `wasted_cost` from the **memory** response (or the UI shows two large “waste” numbers that users mentally combine), **shared** infrastructure and markup can be **counted twice** in that **sum**. Finance teams then cannot reconcile “total wasted” from the UI to a **single** bill line or to **actual maximum** waste for one pod (which cannot exceed **100%** of that pod’s true economic cost once). - -### Worked example (one pod, idle on both dimensions) - -Suppose a workload’s **true** attributable spend for the period is **$100** (one economic unit). On **both** CPU and memory reports, usage vs request implies **0%** utilization (idle relative to request) for that dimension. - -| Dimension | Attributed `cost_total` (simplified) | Waste % (from ratio) | `wasted_cost` (illustrative) | -|-----------|--------------------------------------|----------------------|-------------------------------| -| **CPU** | $100 (includes full infra + markup slice used by CPU report) | 100% | **$100** | -| **Memory** | $100 (same infra + markup pattern for memory report) | 100% | **$100** | -| **User mental “total”** | — | — | **$200** if summed | - -- **Maximum plausible waste** for one **$100** asset: **$100** (you cannot waste more than the resource cost in this mental model). -- **Summed API-style interpretation:** **$200** → **100% inflation** vs bill reconciliation. - -The implementation is **consistent** with the chosen **per-dimension cost basis**; the tension is **cross-endpoint aggregation** and **UI/FinOps semantics**, not an accidental double insert in one SQL row. - -### Product / team disposition (from brief) - -The brief records **team agreement** to **ship as-is** for now and revisit if customers object, because **splitting infrastructure cleanly** between CPU and memory is a **data model / cost allocation** problem, not a one-line ORM tweak. - -### Solution families (future) - -| ID | Approach | Notes | -|----|-----------|------| -| **S3-a** | **Allocate `cloud_infrastructure_cost` by driver** (e.g. request-hours shares, or cost model weights) into `cpu_infra` / `memory_infra` in pipeline or reporting | Touches summarization and possibly Trino/self-hosted SQL paths per [`.cursor/rules/onprem-vs-saas.mdc`](../../../.cursor/rules/onprem-vs-saas.mdc); **tenant** tables only under `tenant_context`. | -| **S3-b** | **New combined “workload” endpoint or metric** that computes waste once on a unified cost | API/product change; avoids double-count in **one** contract if UI sums dimensions today. | -| **S3-c** | **UI / docs: never sum CPU + memory wasted** | Operational mitigation; backend unchanged. | - ---- - -## Cross-cutting concerns - -### Tenancy - -Any change continues to run under **`tenant_context`** in [`OCPReportQueryHandler.execute_query`](../../../koku/api/report/ocp/query_handler.py); no public-schema reads for this report path. - -### Performance - -Row-level annotation + `Sum` over the full filtered line set can increase **DB work** versus a single `aggregate()` over pre-joined sums. Needs **budget**: tenant size, date range, indexes on filter columns, and EXPLAIN on representative tenants. - -### Ordering and `group_by` - -Today **`order_by[wasted_cost]`** and **`order_by[usage_efficiency]`** sort on the **same** annotations as the response. If `wasted_cost` becomes **sum-of-row-waste** but `usage_efficiency` stays **ratio-of-sums** (or vice versa), product must decide whether **ordering tracks** the displayed waste definition (IQ-8). - -### `should_compute` / tags - -Tag and multi-`group_by` gates in [`execute_query`](../../../koku/api/report/ocp/query_handler.py) unchanged unless product extends when scores appear; any new metric should reuse the same gate for consistency. - -### Dual-path SQL - -Current scores are **ORM-only** on tenant summaries ([README](./README.md)). **S3-a** or precomputed row facts may touch **`masu/database/sql/`**, **`trino_sql/`**, **`self_hosted_sql/`**; parity is required for on-prem if pipeline changes. - ---- - -## IQ / decisions (solution pass) - -| ID | Question | Owner hint | -|----|----------|------------| -| **IQ-7** | What is the **canonical “row”** for bottom-up waste: one `OCPUsageLineItemDailySummary` row (PK `uuid`), pod-level upstream, or something else? The model has **no `pod` name column**; docstring says *“daily aggregation … aggregated by OCP resource”* — confirm whether one row ≡ one pod for CPU/memory pod reports or a coarser roll-up. | Product + pipeline | -| **IQ-8** | Should **`usage_efficiency_percent`** and **`wasted_cost`** always reflect the **same** aggregation story, or can they diverge (e.g. ratio-of-sums efficiency + sum-of-row waste)? | Product | -| **IQ-9** | If both **pooled** and **opportunity** waste are exposed, **names**, **defaults**, and **deprecation** of the current field? | Product + API review | -| **IQ-10** | For **S3-a**, acceptable **allocation** method for infra (and markup) between CPU and memory for **finance reconciliation**? | Cost model + Finance | - ---- - -## Phased delivery (suggested) - -| Phase | Content | Validation | -|-------|---------|------------| -| **P0** | Docs only (this file + hub links); optional customer-facing “do not sum CPU + memory waste” note via PM/CS. | No code change. | -| **P1** | Spike: ORM or SQL prototype of **S1-a/S2-a** on one tenant; compare fleet vs sum-of-rows for IQE scenarios; EXPLAIN. | Numeric match to reproducers; perf budget. | -| **P2** | Implement chosen semantics; align `order_by`; extend tests in [`test_ocp_query_handler`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py) and related. | Regression + OpenAPI if fields added/renamed. | -| **P3** | **S3-*** cost split or combined endpoint if IQ-10 resolved. | Pipeline + dual-path parity; reconciliation sign-off. | - ---- - -## Builder handoff - -| Block | Content | -|-------|---------| -| **Doc map** | [README.md](./README.md) (as-built) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math) → **this file** (limitations + options). | -| **Assumptions** | External brief is accurate; team disposition on Problem 3 as stated there is still current. | -| **IQ / decisions** | **IQ-7–IQ-10** above. | -| **API contract summary** | **No change** in this doc. Future: possible new fields or redefinition of `wasted_cost` / `usage_efficiency_percent` per IQ-8/IQ-9—requires OpenAPI and UI coordination. | -| **Data & tenancy** | Reads/writes (if pipeline) on **tenant** [`OCPUsageLineItemDailySummary`](../../../koku/reporting/provider/ocp/models.py); `tenant_context` preserved. | -| **Pipeline / tasks** | **None** for S1-a/S2-a if done purely in report ORM; **possible** summarization tasks/SQL for S3-a—**propose** only after spike; update [`celery-tasks.md`](../celery-tasks.md) when tasks are added. | -| **SQL / dual-path** | ORM-only path unchanged today; S3-a likely touches **`masu/database/*`** — flag parity in implementation PR. | -| **Out of scope for builders** | Choosing between pooled vs opportunity headline; finance allocation policy; pasting IQE MR links into this repo (keep in Jira/epic). | - ---- - -## Changelog - -| Date | Summary | -|------|---------| -| 2026-04-24 | Initial doc; expanded Problems 1–3 with detailed descriptions, worked examples, and code tie-ins. | From d098bd1940114f7fbcd1f9e8e5d15d7505420c31 Mon Sep 17 00:00:00 2001 From: myersCody Date: Mon, 11 May 2026 09:47:52 -0400 Subject: [PATCH 05/18] Save current approach --- koku/api/report/ocp/query_handler.py | 179 ++++++++++++++++++ .../report/test/ocp/test_ocp_query_handler.py | 120 ++++++++++++ 2 files changed, 299 insertions(+) diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index a3d878400a..68815f8062 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -15,12 +15,21 @@ from django.db.models import CharField from django.db.models import DecimalField from django.db.models import F +from django.db.models import Sum from django.db.models import Value from django.db.models import When from django.db.models.fields.json import KT from django.db.models.functions import Coalesce +from django.db.models.functions import Greatest +from django.db.models.functions.comparison import NullIf from django_tenants.utils import tenant_context +from api.metrics.constants import OCP_METRIC_CPU_CORE_EFFECTIVE_USAGE_HOUR +from api.metrics.constants import OCP_METRIC_CPU_CORE_REQUEST_HOUR +from api.metrics.constants import OCP_METRIC_CPU_CORE_USAGE_HOUR +from api.metrics.constants import OCP_METRIC_MEM_GB_EFFECTIVE_USAGE_HOUR +from api.metrics.constants import OCP_METRIC_MEM_GB_REQUEST_HOUR +from api.metrics.constants import OCP_METRIC_MEM_GB_USAGE_HOUR from api.models import Provider from api.report.ocp.capacity.cluster_capacity import calculate_unused from api.report.ocp.capacity.cluster_capacity import ClusterCapacity @@ -31,6 +40,7 @@ from api.report.queries import ReportQueryHandler from cost_models.models import CostModel from cost_models.models import CostModelMap +from reporting.models import OCPUsageLineItemDailySummary LOG = logging.getLogger(__name__) @@ -40,6 +50,16 @@ class OCPReportQueryHandler(ReportQueryHandler): provider = Provider.PROVIDER_OCP + _CPU_HOUR_METRICS = frozenset( + {OCP_METRIC_CPU_CORE_USAGE_HOUR, OCP_METRIC_CPU_CORE_REQUEST_HOUR, OCP_METRIC_CPU_CORE_EFFECTIVE_USAGE_HOUR} + ) + _MEM_HOUR_METRICS = frozenset( + {OCP_METRIC_MEM_GB_USAGE_HOUR, OCP_METRIC_MEM_GB_REQUEST_HOUR, OCP_METRIC_MEM_GB_EFFECTIVE_USAGE_HOUR} + ) + _SENTINEL_NAMESPACES = frozenset( + ["Worker unallocated", "Platform unallocated", "Network unattributed", "Storage unattributed"] + ) + def __init__(self, parameters): """Establish OCP report query handler. @@ -253,6 +273,154 @@ def _pack_score(self, row, should_compute): row.pop("wasted_cost", None) row["score"] = {} + def _rate_sum_by_source(self, metric_names: frozenset) -> dict: + """Return {source_uuid_str: sum_of_flat_tier_values_in_cost_model_currency}. + + Parses CostModel.rates (a JSON list) and sums all rate values whose metric + name is in metric_names. Two structures are handled: + + * ``tiered_rates`` – a list of tier dicts each with a ``value`` key. + * ``tag_rates`` – a dict with a ``tag_values`` list; only entries where + ``default`` is true are included. These apply to every pod that does not + match a more-specific tag value (in practice, to all pods in the common + case of a single-value tag rule with ``default: true``). + """ + rate_by_cm: dict = {} + for row in CostModel.objects.all().values("uuid", "rates"): + total = Decimal("0") + for rate_entry in row["rates"] or []: + if rate_entry.get("metric", {}).get("name") in metric_names: + for tier in rate_entry.get("tiered_rates", []): + total += Decimal(str(tier.get("value", 0))) + for tv in (rate_entry.get("tag_rates", {}) or {}).get("tag_values", []) or []: + if tv.get("default"): + total += Decimal(str(tv.get("value", 0))) + rate_by_cm[str(row["uuid"])] = total + + return { + str(mapping_row["provider_uuid"]): rate_by_cm.get(str(mapping_row["cost_model_id"]), Decimal("0")) + for mapping_row in CostModelMap.objects.all().values("provider_uuid", "cost_model_id") + } + + def _line_item_wasted_cost(self, group_by: list) -> dict: + """Compute wasted_cost from OCPUsageLineItemDailySummary at daily per-pod grain. + + OCP pod summary tables (OCPPodSummaryP, etc.) aggregate across all pods within + a cluster/namespace/node before storing. When over-utilised and under-utilised + pods are pooled together their usage/request ratio approaches 1, driving the + waste formula toward zero at the bucket level while real per-pod waste remains. + This method queries the finest available grain (per-pod daily rows) so the + clamped-waste formula is evaluated before any cross-pod aggregation. + + The per-row cost basis is: + max(usage, request) × total_rate_from_cost_model × exchange_rate + + infrastructure_raw_cost × infra_exchange_rate + + infrastructure_markup_cost × infra_exchange_rate + + max(usage, request) is evaluated on the already-aggregated row (not the stored + pod_effective_usage field, which is sum(max(u_i, r_i)) per individual pod and + can exceed max(sum_u, sum_r) when the group contains both over- and + under-utilised pods). Using the row-level max matches the product's wasted-cost + definition and the integration test fixture. + + Rates are read from the CostModel table at query time so the computation is + correct whether or not the cost model application SQL has already run. + Only original rows (cost_model_rate_type IS NULL) are queried to avoid + double-counting when the cost model has been applied. + + Args: + group_by: the query_group_by list used by execute_query (["date", ...]). + + Returns: + A dict keyed by tuple(*group_by field values) → Decimal waste. + The sentinel key '__total__' holds the overall waste sum. + """ + _dec = OCPProviderMap._efficiency_decimal_field() + + if self._report_type == "cpu": + usage_field = "pod_usage_cpu_core_hours" + effective_field = "pod_effective_usage_cpu_core_hours" + rate_by_source = self._rate_sum_by_source(self._CPU_HOUR_METRICS) + else: + usage_field = "pod_usage_memory_gigabyte_hours" + effective_field = "pod_effective_usage_memory_gigabyte_hours" + rate_by_source = self._rate_sum_by_source(self._MEM_HOUR_METRICS) + + # Per-source rate annotation: Case(When(source_uuid=X, then=rate_X), ..., default=0) + rate_whens = [ + When(**{"source_uuid": uuid, "then": Value(rate, output_field=_dec)}) + for uuid, rate in rate_by_source.items() + if rate + ] + rate_ann = ( + Case(*rate_whens, default=Value(Decimal("0"), output_field=_dec), output_field=_dec) + if rate_whens + else Value(Decimal("0"), output_field=_dec) + ) + + row_infra = Coalesce(F("infrastructure_raw_cost"), Value(0, output_field=_dec)) + row_markup = Coalesce(F("infrastructure_markup_cost"), Value(0, output_field=_dec)) + row_u = Coalesce(F(usage_field), Value(0, output_field=_dec)) + + # pod_effective_usage_*_hours stores SUM(MAX(usage_i, request_i)) aggregated + # across all raw intervals in the day. This preserves the per-interval clamping + # so that hours where a pod is over-utilised contribute MAX(u_i, r_i) = u_i + # (no waste for that interval) rather than collapsing to MAX(SUM(u), SUM(r)) + # which would incorrectly cancel within-day over/under utilisation swings. + # waste = (effective - usage) * rate = SUM(MAX(0, r_i - u_i)) * rate ✓ + row_eff = Coalesce(F(effective_field), Value(0, output_field=_dec)) + + per_row_cost = ( + # Cloud infrastructure cost converted to the report currency + (row_infra + row_markup) * Coalesce(F("infra_exchange_rate"), Value(1, output_field=_dec)) + # Cost model contribution: rate × effective_usage → report currency. + # effective_usage = SUM(MAX(u_i, r_i)) already encodes the per-interval + # clamp, giving the correct cost basis for the waste formula. + + row_eff * F("_wc_rate") * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) + ) + # waste = per_row_cost × (1 − usage/effective) = (effective − usage) × rate + # Uses effective in the denominator (not request) so the ratio is always in [0,1] + # and the result equals the fixture's SUM(MAX(0, r_i − u_i)) × rate per group. + per_row_waste = Coalesce( + Greatest( + per_row_cost * (Value(1, output_field=_dec) - row_u / NullIf(row_eff, Value(0, output_field=_dec))), + Value(0, output_field=_dec), + ), + Value(0, output_field=_dec), + ) + waste_ann = Coalesce(Sum(per_row_waste), Value(0, output_field=_dec), output_field=_dec) + + # Original rows only: cost model application creates additional rows in the same + # table with cost_model_rate_type set. Querying only original rows avoids double- + # counting and gives the correct per-pod-group grain. + li_q = OCPUsageLineItemDailySummary.objects.filter(self.query_filter) + li_q = li_q.filter(cost_model_rate_type__isnull=True) + li_q = li_q.exclude(namespace__in=self._SENTINEL_NAMESPACES) + if self.query_exclusions: + li_q = li_q.exclude(self.query_exclusions) + + # Build a minimal annotation set to avoid GROUP BY pollution. + # Applying self.annotations directly adds cluster, currency_annotation, etc. + # as non-aggregate expressions which Django may include in GROUP BY, + # fragmenting groups beyond the intended dimensions. + fresh_ann: dict = { + "date": self.date_trunc("usage_start"), + "_wc_rate": rate_ann, + **self.exchange_rate_annotation_dict, + } + for gb in group_by: + if gb != "date" and gb in self.annotations: + fresh_ann[gb] = self.annotations[gb] + li_q = li_q.annotate(**fresh_ann) + + total = li_q.aggregate(wasted_cost=waste_ann)["wasted_cost"] or Decimal(0) + by_group = { + tuple(row[g] for g in group_by): row["wasted_cost"] + for row in li_q.values(*group_by).annotate(wasted_cost=waste_ann) + } + by_group["__total__"] = total + return by_group + def execute_query(self): # noqa: C901 """Execute query and return provided data. @@ -303,6 +471,14 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): has_tag_interaction = self._tag_group_by or self.get_tag_filter_keys() should_compute = not has_tag_interaction and len(group_by_value) <= 1 + # Compute wasted_cost from the finest available grain (per-pod daily rows) + # rather than from the pre-aggregated summary table buckets. Bucket-level + # aggregation causes over- and under-utilised pods to cancel, collapsing + # the waste formula toward zero even when real per-pod waste exists. + line_item_waste: dict = {} + if should_compute: + line_item_waste = self._line_item_wasted_cost(query_group_by) + query_sum["wasted_cost"] = line_item_waste.get("__total__", Decimal(0)) self._pack_score(query_sum, should_compute) if self._delta: @@ -312,6 +488,9 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): for row in query_data: + if should_compute: + key = tuple(row.get(g) for g in query_group_by) + row["wasted_cost"] = line_item_waste.get(key, Decimal(0)) self._pack_score(row, should_compute) for row in query_data: diff --git a/koku/api/report/test/ocp/test_ocp_query_handler.py b/koku/api/report/test/ocp/test_ocp_query_handler.py index 6fa7ec475b..aa0d8609e1 100644 --- a/koku/api/report/test/ocp/test_ocp_query_handler.py +++ b/koku/api/report/test/ocp/test_ocp_query_handler.py @@ -2286,3 +2286,123 @@ def test_order_by_wasted_cost_with_pagination(self): query_output = handler.execute_query() self.assertIsNotNone(query_output.get("data")) self.assertIsNotNone(query_output.get("total")) + + def test_line_item_wasted_cost_corrects_pooling_bug(self): + """_line_item_wasted_cost uses per-pod daily grain, not cluster-bucket aggregates. + + Two pods in the same cluster/day pool to 100% utilisation at bucket level, + driving the naive bucket-level waste formula to zero. But per-pod, pod-A is + only 50% utilised and carries real waste. This test confirms the method + returns the per-pod sum (50) rather than the bucket result (0). + + Cost basis: infrastructure_raw_cost (OCP-on-cloud path, no cost model). + effective_usage = SUM(MAX(usage_i, request_i)) per interval — for a single-interval + pod this equals MAX(usage, request). No cost model → rate = 0, so: + per_row_cost = effective × 0 + infra_raw × 1.0 = 100. + Pod A: waste = 100 × (1 − usage/effective) = 100 × (1 − 5/10) = 50. + Pod B: waste = max(0, 100 × (1 − 15/15)) = 0. + """ + usage_date = self.dh.yesterday.date() + cluster_id = "pooling-test-cluster-waste" + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + # Pod A: 50% utilized → effective=MAX(5,10)=10, waste = 100 × (1 − 5/10) = 50 + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="pooling-ns", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("10"), + pod_usage_cpu_core_hours=Decimal("5"), + pod_effective_usage_cpu_core_hours=Decimal("10"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + raw_currency="USD", + cost_model_rate_type=None, + ) + # Pod B: 150% utilized → effective=MAX(15,10)=15, waste = max(0, 100×(1−15/15)) = 0 + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="pooling-ns", + usage_start=usage_date, + usage_end=usage_date, + pod_request_cpu_core_hours=Decimal("10"), + pod_usage_cpu_core_hours=Decimal("15"), + pod_effective_usage_cpu_core_hours=Decimal("15"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + raw_currency="USD", + cost_model_rate_type=None, + ) + + url = f"?filter[cluster]={cluster_id}" + query_params = self.mocked_query_params(url, OCPCpuView) + handler = OCPReportQueryHandler(query_params) + query_group_by = ["date"] + + waste_map = handler._line_item_wasted_cost(query_group_by) + + # Per-pod sum: 50 + 0 = 50. Bucket-level would be 200 × (1 − 20/20) = 0. + self.assertEqual(waste_map["__total__"], Decimal("50")) + + def test_line_item_wasted_cost_memory_corrects_pooling_bug(self): + """Same pooling-bug correction for the memory report variant. + + Cost basis: infrastructure_raw_cost (OCP-on-cloud path, no cost model). + effective_usage = SUM(MAX(usage_i, request_i)) per interval. No cost model → rate = 0, so: + per_row_cost = effective × 0 + infra_raw × infra_exchange_rate = 100. + Pod A: waste = 100 × (1 − 5/10) = 50. Pod B: waste = max(0, 100×(1 − 15/15)) = 0. + """ + usage_date = self.dh.yesterday.date() + cluster_id = "pooling-test-cluster-waste-mem" + + with tenant_context(self.tenant): + OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + + # Pod A: 50% utilized → effective=MAX(5,10)=10, waste = 100 × (1 − 5/10) = 50 + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="pooling-ns-mem", + usage_start=usage_date, + usage_end=usage_date, + pod_request_memory_gigabyte_hours=Decimal("10"), + pod_usage_memory_gigabyte_hours=Decimal("5"), + pod_effective_usage_memory_gigabyte_hours=Decimal("10"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + raw_currency="USD", + cost_model_rate_type=None, + ) + # Pod B: 150% utilized → effective=MAX(15,10)=15, waste = max(0, 100×(1−15/15)) = 0 + self.baker.make( + OCPUsageLineItemDailySummary, + cluster_id=cluster_id, + data_source="Pod", + namespace="pooling-ns-mem", + usage_start=usage_date, + usage_end=usage_date, + pod_request_memory_gigabyte_hours=Decimal("10"), + pod_usage_memory_gigabyte_hours=Decimal("15"), + pod_effective_usage_memory_gigabyte_hours=Decimal("15"), + infrastructure_raw_cost=Decimal("100"), + infrastructure_markup_cost=Decimal("0"), + raw_currency="USD", + cost_model_rate_type=None, + ) + + url = f"?filter[cluster]={cluster_id}" + query_params = self.mocked_query_params(url, OCPMemoryView) + handler = OCPReportQueryHandler(query_params) + query_group_by = ["date"] + + waste_map = handler._line_item_wasted_cost(query_group_by) + + self.assertEqual(waste_map["__total__"], Decimal("50")) From e17d95b9e28fc6a693502ae8aa474e2f01db5df4 Mon Sep 17 00:00:00 2001 From: myersCody Date: Mon, 11 May 2026 09:56:58 -0400 Subject: [PATCH 06/18] Reset architecture docs --- docs/architecture/efficiency-scores/README.md | 13 ++++------ .../formulas-and-data-contract.md | 25 ++++++++++--------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/architecture/efficiency-scores/README.md b/docs/architecture/efficiency-scores/README.md index 09cb7651f4..996ac218da 100644 --- a/docs/architecture/efficiency-scores/README.md +++ b/docs/architecture/efficiency-scores/README.md @@ -6,7 +6,7 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef ## One-paragraph scope -**Implemented:** For `cpu` and `memory` report types, the API adds **`total_score`** on the **`total`** object and **`score`** on each **data** row (when scores are computed), exposing **`usage_efficiency_percent`** (ratio of summed usage to summed request) and **`wasted_cost`** (**sum of per-line-item clamped waste**, same cost basis as **`cost_total`** at row grain — details in [formulas-and-data-contract.md](./formulas-and-data-contract.md)) on [`OCPUsageLineItemDailySummary`](../../../koku/reporting/models.py). **Not implemented:** a separate `efficiency`-only route; cost/volume reports do not expose these fields. **Backlog** (out of code): idle/cost-efficiency/overhead scores and dedicated Optimizations Summary routing in the UI. **Design note:** summing **`wasted_cost`** from both compute and memory can double-count shared infra costs if customers treat the two numbers as additive “total waste”—call that out in UX copy rather than changing the API contract here. +**Implemented:** For `cpu` and `memory` report types, the API adds **`total_score`** on the **`total`** object and **`score`** on each **data** row (when scores are computed), exposing **`usage_efficiency_percent`** and **`wasted_cost`** derived from aggregated usage/request hours and CPU- or memory-scoped **`cost_total`** on [`OCPUsageLineItemDailySummary`](../../../koku/reporting/models.py). **Not implemented:** a separate `efficiency`-only route; cost/volume reports do not expose these fields. **Backlog** (out of code): idle/cost-efficiency/overhead scores and dedicated Optimizations Summary routing in the UI. --- @@ -39,7 +39,7 @@ Help FinOps and Dev/Ops reason about CPU and memory utilization using **usage ef | **Scores in response** | [`OCPReportQueryHandler._pack_score`](../../../koku/api/report/ocp/query_handler.py) builds `usage_efficiency_percent` (int) and `wasted_cost` `{ value, units }`. The **`total`** block exposes this as **`total_score`** (rename from internal `score` after packing). **Data rows** keep the key **`score`** (same inner shape). | | **When scores are omitted** | For `cpu` / `memory` only: if **more than one** `group_by` dimension is present **or** there is **tag** `group_by` **or** tag keys under **`filter`**, `should_compute` is false → `total_score` and per-row `score` are **empty objects** `{}`. **Tag `exclude` is not part of this check** ([`execute_query`](../../../koku/api/report/ocp/query_handler.py) uses `get_tag_filter_keys()` with the default **`filter`** parameter in [`ReportQueryHandler.get_tag_filter_keys`](../../../koku/api/report/queries.py)). Multi `group_by` is covered by [`test_efficiency_score_multi_group_by_returns_empty`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py). | | **Reports without scores** | `costs`, `costs_by_project`, `volume`, and other non-inventory types do not add `total_score`. Tests: [`test_efficiency_score_cost_report_excluded`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py), [`test_efficiency_score_volume_report_excluded`](../../../koku/api/report/test/ocp/test_ocp_query_handler.py). | -| **Aggregations** | [`OCPProviderMap._efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py) defines `usage_efficiency` (ratio-of-sums) and `wasted_cost` (`Sum` of per-row clamped waste) for `cpu` and `memory` **aggregates** and **report_annotations**, using [`_per_row_cost_cpu_expr`](../../../koku/api/report/ocp/provider_map.py) / [`_per_row_cost_memory_expr`](../../../koku/api/report/ocp/provider_map.py). | +| **Aggregations** | [`OCPProviderMap._efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py) defines ORM expressions for `usage_efficiency` and `wasted_cost` on `cpu` and `memory` **aggregates** and **report_annotations**. | | **Ordering** | In-memory sort includes `usage_efficiency` and `wasted_cost` in [`ReportQueryHandler._order_by`](../../../koku/api/report/queries.py). | | **Tenant boundary** | All queries run under **`tenant_context(self.tenant)`** in [`OCPReportQueryHandler.execute_query`](../../../koku/api/report/ocp/query_handler.py). | | **Pipeline / SQL** | No Celery tasks and no new SQL templates — scores are ORM aggregates on existing tenant line items. **Dual-path** (`trino_sql` / `self_hosted_sql`) is unchanged for this feature. | @@ -67,8 +67,8 @@ flowchart LR | ID | Resolution | Evidence | |----|------------|----------| | IQ-5 (endpoint shape) | **Extend** existing compute/memory inventory endpoints; **no** separate `efficiency` route. | Views unchanged path; handler gates on `report_type in ("cpu", "memory")`. | -| IQ-1 (fleet / total row) | **`usage_efficiency_percent`:** single ratio of sums over the filtered row set. **`wasted_cost`:** `Sum` of per-row clamped waste over that same set (not fleet pooled `cost × (1 - U/R)`). Grouped rows use the same pattern per bucket. | `query.aggregate(**aggregates)` in [`execute_query`](../../../koku/api/report/ocp/query_handler.py); annotations from [`OCPProviderMap`](../../../koku/api/report/ocp/provider_map.py). | -| IQ-2 (wasted cost basis) | **Per-row sum:** each row contributes `max(row_cost × (1 - u/r), 0)` with row cost aligned to **`cost_total`** components; fleet and groups use **`Sum`** of those values. | [`_efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py), [formulas-and-data-contract.md](./formulas-and-data-contract.md). | +| IQ-1 (fleet / total row) | **Single ratio over the filtered row set** — totals use the same aggregate expressions as grouped rows (ratio-of-sums style at the SQL aggregate level). | `query.aggregate(**aggregates)` in [`execute_query`](../../../koku/api/report/ocp/query_handler.py) includes `usage_efficiency` / `wasted_cost` from [`OCPProviderMap`](../../../koku/api/report/ocp/provider_map.py). | +| IQ-2 (wasted cost basis) | **`wasted_cost = max(cost_total * (1 - usage/request), 0)`** with CPU- or memory-specific **`cost_total`** and matching usage/request **sums**. | [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py). Details in [formulas-and-data-contract.md](./formulas-and-data-contract.md). | | IQ-6 (RBAC) | Same access pattern as other OCP report views (no separate optimizations-only permission in backend). | Reuses existing views. | --- @@ -82,7 +82,6 @@ flowchart LR | — | **`request_sum == 0`** semantics | Implementation coalesces **`usage_efficiency_percent`** to **0** and **`wasted_cost`** to **0** (not `null`). Confirm UX/OpenAPI wording. | | — | Tag / multi-dimension `group_by` | Scores intentionally empty; document for UI. | | — | Tag **`exclude`** vs `should_compute` | Code does not pass `"exclude"` into [`get_tag_filter_keys`](../../../koku/api/report/queries.py); align product/UI expectations or extend `has_tag_interaction` if excludes should suppress scores. | -| IQ-7–IQ-10 | Grain / naming / infra split follow-ups | **`wasted_cost`** is sum of row-level clamped waste; **`usage_efficiency_percent`** remains ratio-of-sums (the two can diverge). Further product choices are backlog, not encoded here. | --- @@ -90,11 +89,9 @@ flowchart LR | Date | Summary | |------|---------| -| 2026-04-24 | Added companion solution-options doc (cost-efficiency brief; doc later removed from tree). | | 2026-04-16 | Initial agent-focused hub and formulas doc from product brief. | | 2026-04-17 | Rewrote hub for **as-built** implementation (compute/memory, `total_score` / `score`, formulas, no new pipeline). | | 2026-04-21 | Corrected **when scores are omitted**: tag **`exclude`** does not affect `should_compute` today (only tag `group_by` and tag **`filter`** keys). | -| 2026-05-05 | **`wasted_cost`** documented as per-row clamped waste **`Sum`**; README tables and aggregation pointer updated (`_efficiency_annotations_row_waste_sum`). | --- @@ -102,7 +99,7 @@ flowchart LR | Block | Content | |-------|---------| -| **Doc map** | This README (overview + links) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math + JSON). Reading order: README → formulas when changing behavior. | +| **Doc map** | This README (overview + links) → [formulas-and-data-contract.md](./formulas-and-data-contract.md) (math + JSON). Reading order: README → formulas. | | **Assumptions** | None beyond what is cited from code; UI “Optimizations Summary” tab wiring lives in koku-ui. | | **IQ / decisions** | Resolved items in **Resolved decisions** table; backlog in **Open questions**. | | **API contract summary** | `GET …/reports/openshift/compute/` and `GET …/reports/openshift/memory/` with `filter` / `group_by` / `order_by` as other OCP inventory reports. Response: **`total.total_score`**: `{ usage_efficiency_percent: int, wasted_cost: { value, units } }` or `{}`. Data leaves: **`score`** same shape or `{}`. **`order_by[usage_efficiency]`**, **`order_by[wasted_cost]`** supported (with valid `group_by` per existing serializer rules). | diff --git a/docs/architecture/efficiency-scores/formulas-and-data-contract.md b/docs/architecture/efficiency-scores/formulas-and-data-contract.md index a028ff9104..9d5088018d 100644 --- a/docs/architecture/efficiency-scores/formulas-and-data-contract.md +++ b/docs/architecture/efficiency-scores/formulas-and-data-contract.md @@ -28,7 +28,7 @@ Let **`usage_sum`** and **`request_sum`** be the **Sum** expressions for the app | Expression | Meaning | |------------|---------| | Ratio | `usage_sum / NULLIF(request_sum, 0)` — avoids division by zero. | -| **Percent** | `Round(ratio * 100)` as **integer** (Django `Round` + `IntegerField` in [`_efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py)). | +| **Percent** | `Round(ratio * 100)` as **integer** (Django `Round` + `IntegerField` in [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py)). | | **Null / zero request** | `Coalesce(..., 0)` → API exposes **`0`** when the ratio is null (including **`request_sum == 0`**). | So **`usage_efficiency_percent` can exceed 100** when usage exceeds request over the aggregated rows (not clamped). @@ -41,16 +41,18 @@ Response shaping is done in [`OCPReportQueryHandler._pack_score`](../../../koku/ ## Wasted cost (`wasted_cost`) -### Definition (implemented) — sum of per-row waste +### Definition (implemented) + +Uses the same **`usage_sum`**, **`request_sum`**, and a **`cost_total_expr`** passed into [`_efficiency_annotations`](../../../koku/api/report/ocp/provider_map.py): -[`_efficiency_annotations_row_waste_sum`](../../../koku/api/report/ocp/provider_map.py) computes **`wasted_cost`** as **`Sum(per_row_waste)`** over the same rows that feed the report (fleet aggregate or each `group_by` bucket). **`usage_efficiency_percent`** in that helper remains **ratio-of-sums** (see above); the two fields **can diverge** by design. +`wasted_cost = Coalesce(Greatest(cost_total * (1 - usage_sum / NullIf(request_sum, 0)), 0), 0)` (ORM/SQL expression; `NullIf` avoids divide-by-zero). -Per row *i*, with coalesced usage **u_i**, request **r_i**, and row cost **c_i** (same linear components as aggregate **`cost_total`**, without wrapping **`Sum`**): +- **`cost_total`** for **`cpu`**: `cloud_infrastructure_cost + markup_cost + cost_model_cpu_cost` (same components as the `cost_total` aggregate for CPU inventory). +- **`cost_total`** for **`memory`**: `cloud_infrastructure_cost + markup_cost + cost_model_memory_cost`. -- **c_i** from [`_per_row_cost_cpu_expr`](../../../koku/api/report/ocp/provider_map.py) (**`cpu`**) or [`_per_row_cost_memory_expr`](../../../koku/api/report/ocp/provider_map.py) (**`memory`**): infra raw + markup (each × `infra_exchange_rate`) + cost-model CPU or memory (× `exchange_rate`), matching the inventory **`cost_total`** basis at row grain. -- **waste_i** = `Coalesce(Greatest(c_i * (1 - u_i / NullIf(r_i, 0)), 0), 0)` — `NullIf` avoids divide-by-zero; **`r_i == 0`** → that row contributes **0**; over-requested rows contribute **0** waste. +So this is **not** “waste percent × arbitrary headline cost” from a different column — it tracks the **`cost_total` expression for that dimension** in the provider map. -**`wasted_cost`** = **Σ waste_i** (ORM **`Sum(per_row_waste)`**), with outer **`Coalesce(..., 0)`** when the aggregate is null. +When **`request_sum`** is zero, the ratio is null; the expression yields null and **`Coalesce`** forces **`wasted_cost`** to **decimal 0**. ### API shape @@ -67,8 +69,8 @@ Per row *i*, with coalesced usage **u_i**, request **r_i**, and row cost **c_i** ## Fleet vs grouped rows -- **Total row:** `query.aggregate(**aggregates)` applies **`usage_efficiency`** as **one ratio of sums** over the **entire filtered queryset**, and **`wasted_cost`** as the **sum of per-row waste** over those same rows (not an average of per-cluster percentages, and not fleet `cost_total × (1 - U/R)`). -- **Grouped rows:** Each group gets its own **`usage_sum`**, **`request_sum`**, and its own **`Sum(per_row_waste)`** from `values(...).annotate(...)`. +- **Total row:** `query.aggregate(**aggregates)` applies the same **`usage_efficiency`** and **`wasted_cost`** definitions over the **entire filtered queryset** — i.e. one combined ratio and one wasted-cost expression for the fleet (not an average of per-cluster percentages). +- **Grouped rows:** Each group gets its own **`usage_sum`**, **`request_sum`**, and **`cost_total`** slice from `values(...).annotate(...)`. --- @@ -134,7 +136,7 @@ When scores are disabled, `"total_score": {}`. | Metric | Status | |--------|--------| -| Waste score `max(100 - efficiency, 0)` | Not exposed as its own field; **`wasted_cost`** is sum-of-row clamped waste (not that scalar). | +| Waste score `max(100 - efficiency, 0)` | Not exposed as its own field; wasted cost follows the formula above. | | Idle / signed unused | Backlog; see README IQ-3. | | Cost efficiency / overhead scores | Backlog. | @@ -145,5 +147,4 @@ When scores are disabled, `"total_score": {}`. | Date | Summary | |------|---------| | 2026-04-16 | Initial formulas; illustrative JSON; fleet and wasted-cost options. | -| 2026-04-17 | Aligned with implementation: `_pack_score`, `total_score` vs `score`, `request=0` → 0, wasted cost formula and cost basis. | -| 2026-05-05 | **`wasted_cost`:** `_efficiency_annotations_row_waste_sum`, per-row cost helpers, **`Sum`** of clamped row waste; **`usage_efficiency_percent`** unchanged (ratio-of-sums). | +| 2026-04-17 | Aligned with implementation: `_efficiency_annotations`, `_pack_score`, `total_score` vs `score`, `request=0` → 0, wasted cost formula and cost basis. | From e497a58ca7c1d463a8113a802a8810dd1ad1e2f3 Mon Sep 17 00:00:00 2001 From: myersCody Date: Mon, 11 May 2026 13:52:22 -0400 Subject: [PATCH 07/18] Back out provider map changes --- koku/api/report/ocp/provider_map.py | 85 +++++++--------------------- koku/api/report/ocp/query_handler.py | 2 +- 2 files changed, 23 insertions(+), 64 deletions(-) diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index 29bd912466..b185d5763d 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -210,52 +210,9 @@ def _memory_request_sum(self): """Return a new Sum expression for memory request hours.""" return Sum(Coalesce(F("pod_request_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - @staticmethod - def _efficiency_decimal_field(): - return DecimalField(max_digits=33, decimal_places=15) - - def _per_row_cost_cpu_expr(self): - """Row-level CPU cost_total (infra + markup + cost_model_cpu), without Sum.""" - _dec = self._efficiency_decimal_field() - row_infra = Coalesce(F("infrastructure_raw_cost"), Value(0, output_field=_dec)) * Coalesce( - F("infra_exchange_rate"), Value(1, output_field=_dec) - ) - row_markup = Coalesce(F("infrastructure_markup_cost"), Value(0, output_field=_dec)) * Coalesce( - F("infra_exchange_rate"), Value(1, output_field=_dec) - ) - row_cm = Coalesce(F("cost_model_cpu_cost"), Value(0, output_field=_dec)) * Coalesce( - F("exchange_rate"), Value(1, output_field=_dec) - ) - return row_infra + row_markup + row_cm - - def _per_row_cost_memory_expr(self): - """Row-level memory cost_total (infra + markup + cost_model_memory), without Sum.""" - _dec = self._efficiency_decimal_field() - row_infra = Coalesce(F("infrastructure_raw_cost"), Value(0, output_field=_dec)) * Coalesce( - F("infra_exchange_rate"), Value(1, output_field=_dec) - ) - row_markup = Coalesce(F("infrastructure_markup_cost"), Value(0, output_field=_dec)) * Coalesce( - F("infra_exchange_rate"), Value(1, output_field=_dec) - ) - row_cm = Coalesce(F("cost_model_memory_cost"), Value(0, output_field=_dec)) * Coalesce( - F("exchange_rate"), Value(1, output_field=_dec) - ) - return row_infra + row_markup + row_cm - - def _efficiency_annotations_row_waste_sum( - self, usage_sum_prop, request_sum_prop, per_row_cost_expr, usage_field, request_field - ): - """usage_efficiency (ratio-of-sums); wasted_cost = Sum of per-row clamped waste (S1-a).""" - _dec = self._efficiency_decimal_field() - row_u = Coalesce(F(usage_field), Value(0, output_field=_dec)) - row_r = Coalesce(F(request_field), Value(0, output_field=_dec)) - per_row_waste = Coalesce( - Greatest( - per_row_cost_expr * (Value(1, output_field=_dec) - row_u / NullIf(row_r, Value(0, output_field=_dec))), - Value(0, output_field=_dec), - ), - Value(0, output_field=_dec), - ) + def _efficiency_annotations(self, usage_sum_prop, request_sum_prop, cost_total_expr): + """Build usage_efficiency and wasted_cost annotation expressions.""" + _dec = DecimalField(max_digits=33, decimal_places=15) return { "usage_efficiency": Coalesce( Round(usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) * Value(100)), @@ -263,7 +220,14 @@ def _efficiency_annotations_row_waste_sum( output_field=IntegerField(), ), "wasted_cost": Coalesce( - Sum(per_row_waste), + Greatest( + cost_total_expr + * ( + Value(1, output_field=_dec) + - usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) + ), + Value(0, output_field=_dec), + ), Value(0, output_field=_dec), output_field=_dec, ), @@ -477,12 +441,10 @@ def __init__(self, provider, report_type, schema_name): "pod_request_cpu_core_hours", default=Value(0, output_field=DecimalField()) ), "limit": Sum("pod_limit_cpu_core_hours", default=Value(0, output_field=DecimalField())), - **self._efficiency_annotations_row_waste_sum( + **self._efficiency_annotations( self._cpu_usage_sum(), self._cpu_request_sum(), - self._per_row_cost_cpu_expr(), - "pod_usage_cpu_core_hours", - "pod_request_cpu_core_hours", + self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_cpu_cost, ), }, "capacity_aggregate": { @@ -539,12 +501,10 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( Coalesce(F("pod_limit_cpu_core_hours"), Value(0, output_field=DecimalField())) ), - **self._efficiency_annotations_row_waste_sum( + **self._efficiency_annotations( self._cpu_usage_sum(), self._cpu_request_sum(), - self._per_row_cost_cpu_expr(), - "pod_usage_cpu_core_hours", - "pod_request_cpu_core_hours", + self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_cpu_cost, ), "capacity": Max("cluster_capacity_cpu_core_hours"), # overwritten in capacity aggregation "clusters": ArrayAgg( @@ -638,12 +598,10 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( "pod_limit_memory_gigabyte_hours", default=Value(0, output_field=DecimalField()) ), - **self._efficiency_annotations_row_waste_sum( + **self._efficiency_annotations( self._memory_usage_sum(), self._memory_request_sum(), - self._per_row_cost_memory_expr(), - "pod_usage_memory_gigabyte_hours", - "pod_request_memory_gigabyte_hours", + self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_memory_cost, ), }, "capacity_aggregate": { @@ -701,12 +659,10 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( Coalesce(F("pod_limit_memory_gigabyte_hours"), Value(0, output_field=DecimalField())) ), - **self._efficiency_annotations_row_waste_sum( + **self._efficiency_annotations( self._memory_usage_sum(), self._memory_request_sum(), - self._per_row_cost_memory_expr(), - "pod_usage_memory_gigabyte_hours", - "pod_request_memory_gigabyte_hours", + self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_memory_cost, ), "capacity": Max( "cluster_capacity_memory_gigabyte_hours" @@ -1211,6 +1167,9 @@ def __init__(self, provider, report_type, schema_name): {"field": "mig_profile", "operation": "gt", "parameter": ""}, ], "group_by": ["mig_profile"], + # filter[limit] ranks individual MIG instances (mig_id level), not profiles. + # A node can have many instances per profile, so limiting by profile is not useful. + "rank_group_by": ["mig_profile", "mig_id"], # Do not synthesize an "Other(s)" bucket when filter[limit] is used; return top-N only. "rank_limit_include_others": False, "cost_units_key": "raw_currency", diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index 68815f8062..d0402edc74 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -335,7 +335,7 @@ def _line_item_wasted_cost(self, group_by: list) -> dict: A dict keyed by tuple(*group_by field values) → Decimal waste. The sentinel key '__total__' holds the overall waste sum. """ - _dec = OCPProviderMap._efficiency_decimal_field() + _dec = DecimalField(max_digits=33, decimal_places=15) if self._report_type == "cpu": usage_field = "pod_usage_cpu_core_hours" From e07d577f54ddadbfb2b4f625bf2865b03e294312 Mon Sep 17 00:00:00 2001 From: myersCody Date: Tue, 12 May 2026 14:38:59 -0400 Subject: [PATCH 08/18] Make the provider map easy to read --- koku/api/report/ocp/provider_map.py | 130 ++++++++++++++++------------ 1 file changed, 74 insertions(+), 56 deletions(-) diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index b185d5763d..ca6067b60c 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -22,7 +22,6 @@ from django.db.models.functions import Cast from django.db.models.functions import Coalesce from django.db.models.functions import Concat -from django.db.models.functions import Greatest from django.db.models.functions import JSONObject from django.db.models.functions import Round from django.db.models.functions.comparison import NullIf @@ -194,45 +193,6 @@ def __cost_model_distributed_cost(self, cost_model_rate_type, exchange_rate_colu * Coalesce(exchange_rate_column, Value(1, output_field=DecimalField())), ) - def _cpu_usage_sum(self): - """Return a new Sum expression for CPU usage hours.""" - return Sum(Coalesce(F("pod_usage_cpu_core_hours"), Value(0, output_field=DecimalField()))) - - def _cpu_request_sum(self): - """Return a new Sum expression for CPU request hours.""" - return Sum(Coalesce(F("pod_request_cpu_core_hours"), Value(0, output_field=DecimalField()))) - - def _memory_usage_sum(self): - """Return a new Sum expression for memory usage hours.""" - return Sum(Coalesce(F("pod_usage_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - - def _memory_request_sum(self): - """Return a new Sum expression for memory request hours.""" - return Sum(Coalesce(F("pod_request_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - - def _efficiency_annotations(self, usage_sum_prop, request_sum_prop, cost_total_expr): - """Build usage_efficiency and wasted_cost annotation expressions.""" - _dec = DecimalField(max_digits=33, decimal_places=15) - return { - "usage_efficiency": Coalesce( - Round(usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) * Value(100)), - Value(0), - output_field=IntegerField(), - ), - "wasted_cost": Coalesce( - Greatest( - cost_total_expr - * ( - Value(1, output_field=_dec) - - usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) - ), - Value(0, output_field=_dec), - ), - Value(0, output_field=_dec), - output_field=_dec, - ), - } - def __init__(self, provider, report_type, schema_name): """Constructor.""" self._schema_name = schema_name @@ -441,11 +401,23 @@ def __init__(self, provider, report_type, schema_name): "pod_request_cpu_core_hours", default=Value(0, output_field=DecimalField()) ), "limit": Sum("pod_limit_cpu_core_hours", default=Value(0, output_field=DecimalField())), - **self._efficiency_annotations( - self._cpu_usage_sum(), - self._cpu_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_cpu_cost, + "usage_efficiency": Coalesce( + Round( + Sum(Coalesce(F("pod_usage_cpu_core_hours"), Value(0, output_field=DecimalField()))) + / NullIf( + Sum( + Coalesce( + F("pod_request_cpu_core_hours"), Value(0, output_field=DecimalField()) + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Value(100) + ), + Value(0), + output_field=IntegerField(), ), + "wasted_cost": Sum(Value(0, output_field=DecimalField())), }, "capacity_aggregate": { "cluster": { @@ -501,11 +473,23 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( Coalesce(F("pod_limit_cpu_core_hours"), Value(0, output_field=DecimalField())) ), - **self._efficiency_annotations( - self._cpu_usage_sum(), - self._cpu_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_cpu_cost, + "usage_efficiency": Coalesce( + Round( + Sum(Coalesce(F("pod_usage_cpu_core_hours"), Value(0, output_field=DecimalField()))) + / NullIf( + Sum( + Coalesce( + F("pod_request_cpu_core_hours"), Value(0, output_field=DecimalField()) + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Value(100) + ), + Value(0), + output_field=IntegerField(), ), + "wasted_cost": Sum(Value(0, output_field=DecimalField())), "capacity": Max("cluster_capacity_cpu_core_hours"), # overwritten in capacity aggregation "clusters": ArrayAgg( Coalesce("cluster_alias", "cluster_id"), distinct=True, default=Value([]) @@ -598,11 +582,28 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( "pod_limit_memory_gigabyte_hours", default=Value(0, output_field=DecimalField()) ), - **self._efficiency_annotations( - self._memory_usage_sum(), - self._memory_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_memory_cost, + "usage_efficiency": Coalesce( + Round( + Sum( + Coalesce( + F("pod_usage_memory_gigabyte_hours"), Value(0, output_field=DecimalField()) + ) + ) + / NullIf( + Sum( + Coalesce( + F("pod_request_memory_gigabyte_hours"), + Value(0, output_field=DecimalField()), + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Value(100) + ), + Value(0), + output_field=IntegerField(), ), + "wasted_cost": Sum(Value(0, output_field=DecimalField())), }, "capacity_aggregate": { "cluster": { @@ -659,11 +660,28 @@ def __init__(self, provider, report_type, schema_name): "limit": Sum( Coalesce(F("pod_limit_memory_gigabyte_hours"), Value(0, output_field=DecimalField())) ), - **self._efficiency_annotations( - self._memory_usage_sum(), - self._memory_request_sum(), - self.cloud_infrastructure_cost + self.markup_cost + self.cost_model_memory_cost, + "usage_efficiency": Coalesce( + Round( + Sum( + Coalesce( + F("pod_usage_memory_gigabyte_hours"), Value(0, output_field=DecimalField()) + ) + ) + / NullIf( + Sum( + Coalesce( + F("pod_request_memory_gigabyte_hours"), + Value(0, output_field=DecimalField()), + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Value(100) + ), + Value(0), + output_field=IntegerField(), ), + "wasted_cost": Sum(Value(0, output_field=DecimalField())), "capacity": Max( "cluster_capacity_memory_gigabyte_hours" ), # This is to keep the order, overwritten with capacity aggregate From a2940883e92b0230e06d4b1338b376813781bf24 Mon Sep 17 00:00:00 2001 From: myersCody Date: Wed, 13 May 2026 12:57:22 -0400 Subject: [PATCH 09/18] Turn off wasted_cost --- koku/api/report/ocp/query_handler.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index d0402edc74..ebbb9cf8c2 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -471,14 +471,15 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): has_tag_interaction = self._tag_group_by or self.get_tag_filter_keys() should_compute = not has_tag_interaction and len(group_by_value) <= 1 - # Compute wasted_cost from the finest available grain (per-pod daily rows) - # rather than from the pre-aggregated summary table buckets. Bucket-level - # aggregation causes over- and under-utilised pods to cancel, collapsing - # the waste formula toward zero even when real per-pod waste exists. - line_item_waste: dict = {} - if should_compute: - line_item_waste = self._line_item_wasted_cost(query_group_by) - query_sum["wasted_cost"] = line_item_waste.get("__total__", Decimal(0)) + # # should_compute = False + # # Compute wasted_cost from the finest available grain (per-pod daily rows) + # # rather than from the pre-aggregated summary table buckets. Bucket-level + # # aggregation causes over- and under-utilised pods to cancel, collapsing + # # the waste formula toward zero even when real per-pod waste exists. + # line_item_waste: dict = {} + # if should_compute: + # line_item_waste = self._line_item_wasted_cost(query_group_by) + # query_sum["wasted_cost"] = line_item_waste.get("__total__", Decimal(0)) self._pack_score(query_sum, should_compute) if self._delta: @@ -488,9 +489,9 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): for row in query_data: - if should_compute: - key = tuple(row.get(g) for g in query_group_by) - row["wasted_cost"] = line_item_waste.get(key, Decimal(0)) + # if should_compute: + # key = tuple(row.get(g) for g in query_group_by) + # row["wasted_cost"] = line_item_waste.get(key, Decimal(0)) self._pack_score(row, should_compute) for row in query_data: From 89ace5a8db2e2a9596f31dfe55a1bd74c57f95fa Mon Sep 17 00:00:00 2001 From: myersCody Date: Thu, 14 May 2026 12:51:09 -0400 Subject: [PATCH 10/18] Save current approach --- koku/api/report/ocp/provider_map.py | 40 +++++++++ koku/api/report/ocp/query_handler.py | 66 ++++++++++++--- .../report/test/ocp/test_ocp_query_handler.py | 12 ++- .../reporting_ocp_pod_summary_by_node_p.sql | 84 ++++++++++++++++--- ...reporting_ocp_pod_summary_by_project_p.sql | 80 +++++++++++++++--- .../reporting_ocp_pod_summary_p.sql | 79 ++++++++++++++--- .../0351_pod_summary_wasted_cost.py | 42 ++++++++++ koku/reporting/provider/ocp/models.py | 12 +++ 8 files changed, 366 insertions(+), 49 deletions(-) create mode 100644 koku/reporting/migrations/0351_pod_summary_wasted_cost.py diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index ca6067b60c..2c477fd99d 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -12,6 +12,7 @@ from django.db.models import DecimalField from django.db.models import F from django.db.models import Func +from django.db.models import Greatest from django.db.models import IntegerField from django.db.models import Max from django.db.models import Q @@ -193,6 +194,45 @@ def __cost_model_distributed_cost(self, cost_model_rate_type, exchange_rate_colu * Coalesce(exchange_rate_column, Value(1, output_field=DecimalField())), ) + def _cpu_usage_sum(self): + """Return a new Sum expression for CPU usage hours.""" + return Sum(Coalesce(F("pod_usage_cpu_core_hours"), Value(0, output_field=DecimalField()))) + + def _cpu_request_sum(self): + """Return a new Sum expression for CPU request hours.""" + return Sum(Coalesce(F("pod_request_cpu_core_hours"), Value(0, output_field=DecimalField()))) + + def _memory_usage_sum(self): + """Return a new Sum expression for memory usage hours.""" + return Sum(Coalesce(F("pod_usage_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) + + def _memory_request_sum(self): + """Return a new Sum expression for memory request hours.""" + return Sum(Coalesce(F("pod_request_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) + + def _efficiency_annotations(self, usage_sum_prop, request_sum_prop, cost_total_expr): + """Build usage_efficiency and wasted_cost annotation expressions.""" + _dec = DecimalField(max_digits=33, decimal_places=15) + return { + "usage_efficiency": Coalesce( + Round(usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) * Value(100)), + Value(0), + output_field=IntegerField(), + ), + "wasted_cost": Coalesce( + Greatest( + cost_total_expr + * ( + Value(1, output_field=_dec) + - usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) + ), + Value(0, output_field=_dec), + ), + Value(0, output_field=_dec), + output_field=_dec, + ), + } + def __init__(self, provider, report_type, schema_name): """Constructor.""" self._schema_name = schema_name diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index ebbb9cf8c2..16a43663a4 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -41,9 +41,15 @@ from cost_models.models import CostModel from cost_models.models import CostModelMap from reporting.models import OCPUsageLineItemDailySummary +from reporting.provider.ocp.models import OCPPodSummaryByNodeP +from reporting.provider.ocp.models import OCPPodSummaryByProjectP +from reporting.provider.ocp.models import OCPPodSummaryP LOG = logging.getLogger(__name__) +# Models that carry pre-computed per-line-item wasted cost columns. +_POD_SUMMARY_MODELS = frozenset({OCPPodSummaryP, OCPPodSummaryByProjectP, OCPPodSummaryByNodeP}) + class OCPReportQueryHandler(ReportQueryHandler): """Handles report queries and responses for OCP.""" @@ -259,19 +265,46 @@ def _format_query_response(self): return output def _pack_score(self, row, should_compute): - """Shape efficiency annotations into the score response object.""" + """Shape efficiency annotations into the score response object. + + wasted_cost is always included when present because it is a SUM of pre-computed + per-line-item values and is correct for any GROUP BY dimension. + usage_efficiency_percent is a ratio-of-sums that is only meaningful when + should_compute is True (single GROUP BY dimension, no tag interaction). + """ + score: dict = {} if should_compute: - row["score"] = { - "usage_efficiency_percent": row.pop("usage_efficiency", 0), - "wasted_cost": { - "value": row.pop("wasted_cost", Decimal(0)), - "units": self.currency, - }, - } + score["usage_efficiency_percent"] = row.pop("usage_efficiency", 0) else: row.pop("usage_efficiency", None) - row.pop("wasted_cost", None) - row["score"] = {} + + wasted = row.pop("wasted_cost", None) + if wasted is not None: + score["wasted_cost"] = {"value": wasted, "units": self.currency} + + row["score"] = score + + def _wasted_cost_precomputed_expr(self): + """Return a Sum expression for the pre-computed wasted cost column, or None. + + Returns a ready-to-use ORM expression only when the active query table is one of + the pod summary models that carry the pre-computed column. Falls back to None so + the caller can use the bucket-level formula from the provider map instead. + """ + if self._report_type not in ("cpu", "memory"): + return None + if self.query_table not in _POD_SUMMARY_MODELS: + return None + _dec = DecimalField(max_digits=33, decimal_places=15) + field = "wasted_cpu_cost" if self._report_type == "cpu" else "wasted_memory_cost" + return Coalesce( + Sum( + Coalesce(F(field), Value(0, output_field=_dec)) + * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) + ), + Value(0, output_field=_dec), + output_field=_dec, + ) def _rate_sum_by_source(self, metric_names: frozenset) -> dict: """Return {source_uuid_str: sum_of_flat_tier_values_in_cost_model_currency}. @@ -441,9 +474,14 @@ def execute_query(self): # noqa: C901 query_group_by = ["date"] + group_by_value query_order_by = ["-date", self.order] - query_data = query.values(*query_group_by).annotate( - **{k: v for k, v in self.report_annotations.items() if k not in group_by_value} - ) + # When the query table carries pre-computed per-line-item wasted cost columns, + # replace the bucket-level formula with a simple SUM of those columns. + wasted_precomputed = self._wasted_cost_precomputed_expr() + row_annotations = {k: v for k, v in self.report_annotations.items() if k not in group_by_value} + if wasted_precomputed is not None: + row_annotations = {**row_annotations, "wasted_cost": wasted_precomputed} + + query_data = query.values(*query_group_by).annotate(**row_annotations) if is_grouped_by_project(self.parameters): query_data = self._project_classification_annotation(query_data) @@ -460,6 +498,8 @@ def execute_query(self): # noqa: C901 # Populate the 'total' section of the API response if query.exists(): aggregates = self._mapper.report_type_map.get("aggregates") + if wasted_precomputed is not None: + aggregates = {**aggregates, "wasted_cost": wasted_precomputed} metric_sum = query.aggregate(**aggregates) query_sum = {key: metric_sum.get(key) for key in aggregates} diff --git a/koku/api/report/test/ocp/test_ocp_query_handler.py b/koku/api/report/test/ocp/test_ocp_query_handler.py index aa0d8609e1..db7851c130 100644 --- a/koku/api/report/test/ocp/test_ocp_query_handler.py +++ b/koku/api/report/test/ocp/test_ocp_query_handler.py @@ -2214,14 +2214,20 @@ def test_efficiency_score_memory_report(self): self.assertIn("value", total_score["wasted_cost"]) self.assertIn("units", total_score["wasted_cost"]) - def test_efficiency_score_multi_group_by_returns_empty(self): - """Test that multi group-by returns empty total_score.""" + def test_efficiency_score_multi_group_by_omits_efficiency_percent(self): + """Test that multi group-by omits usage_efficiency_percent but still returns wasted_cost. + + usage_efficiency_percent is a ratio-of-sums that is only meaningful for a single + group-by dimension. wasted_cost is always a SUM of values and is returned regardless. + """ url = "?group_by[cluster]=*&group_by[project]=*" query_params = self.mocked_query_params(url, OCPCpuView) handler = OCPReportQueryHandler(query_params) query_output = handler.execute_query() total = query_output.get("total") - self.assertEqual(total.get("total_score"), {}) + total_score = total.get("total_score") + self.assertNotIn("usage_efficiency_percent", total_score) + self.assertIn("wasted_cost", total_score) def test_efficiency_score_cost_report_excluded(self): """Test that cost report does not include total_score.""" diff --git a/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_node_p.sql b/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_node_p.sql index cb500a74ef..1ecd228f7e 100644 --- a/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_node_p.sql +++ b/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_node_p.sql @@ -4,6 +4,58 @@ WHERE usage_start >= {{start_date}}::date AND source_uuid = {{source_uuid}} ; +WITH line_items AS ( + SELECT + cluster_id, + cluster_alias, + node, + resource_id, + usage_start, + cost_model_rate_type, + infrastructure_raw_cost, + infrastructure_markup_cost, + cost_model_cpu_cost, + cost_model_memory_cost, + cost_model_volume_cost, + cost_model_gpu_cost, + pod_usage_cpu_core_hours, + pod_request_cpu_core_hours, + pod_effective_usage_cpu_core_hours, + pod_limit_cpu_core_hours, + pod_usage_memory_gigabyte_hours, + pod_request_memory_gigabyte_hours, + pod_effective_usage_memory_gigabyte_hours, + pod_limit_memory_gigabyte_hours, + cluster_capacity_cpu_core_hours, + cluster_capacity_memory_gigabyte_hours, + node_capacity_cpu_cores, + node_capacity_cpu_core_hours, + node_capacity_memory_gigabytes, + node_capacity_memory_gigabyte_hours, + cost_category_id, + raw_currency, + distributed_cost, + data_source, + -- Window: sums from base rows (rate_type IS NULL) at node+cluster+date level. + -- Enables cross-rate-type waste computation in the outer SELECT. + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_usage_cpu_core_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, node, usage_start) AS grp_cpu_usage, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_effective_usage_cpu_core_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, node, usage_start) AS grp_cpu_effective, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_usage_memory_gigabyte_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, node, usage_start) AS grp_mem_usage, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_effective_usage_memory_gigabyte_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, node, usage_start) AS grp_mem_effective + FROM {{schema | sqlsafe}}.reporting_ocpusagelineitem_daily_summary + WHERE usage_start >= {{start_date}}::date + AND usage_start <= {{end_date}}::date + AND source_uuid = {{source_uuid}} + AND data_source = 'Pod' + AND namespace IS DISTINCT FROM 'Worker unallocated' + AND namespace IS DISTINCT FROM 'Platform unallocated' + AND namespace IS DISTINCT FROM 'Network unattributed' + and namespace IS DISTINCT FROM 'Storage unattributed' +) INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_by_node_p ( id, cluster_id, @@ -38,7 +90,9 @@ INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_by_node_p ( source_uuid, cost_category_id, raw_currency, - distributed_cost + distributed_cost, + wasted_cpu_cost, + wasted_memory_cost ) SELECT uuid_generate_v4() as id, cluster_id, @@ -73,15 +127,23 @@ INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_by_node_p ( {{source_uuid}}::uuid as source_uuid, max(cost_category_id) as cost_category_id, max(raw_currency) as raw_currency, - sum(distributed_cost) as distributed_cost - FROM {{schema | sqlsafe}}.reporting_ocpusagelineitem_daily_summary - WHERE usage_start >= {{start_date}}::date - AND usage_start <= {{end_date}}::date - AND source_uuid = {{source_uuid}} - AND data_source = 'Pod' - AND namespace IS DISTINCT FROM 'Worker unallocated' - AND namespace IS DISTINCT FROM 'Platform unallocated' - AND namespace IS DISTINCT FROM 'Network unattributed' - and namespace IS DISTINCT FROM 'Storage unattributed' + sum(distributed_cost) as distributed_cost, + CASE + WHEN cost_model_rate_type IN ('Infrastructure', 'Supplementary') + THEN GREATEST(0, + SUM(COALESCE(cost_model_cpu_cost, 0)) + * (1 - MAX(grp_cpu_usage) / NULLIF(MAX(grp_cpu_effective), 0)) + ) + ELSE 0 + END AS wasted_cpu_cost, + CASE + WHEN cost_model_rate_type IN ('Infrastructure', 'Supplementary') + THEN GREATEST(0, + SUM(COALESCE(cost_model_memory_cost, 0)) + * (1 - MAX(grp_mem_usage) / NULLIF(MAX(grp_mem_effective), 0)) + ) + ELSE 0 + END AS wasted_memory_cost + FROM line_items GROUP BY usage_start, cluster_id, cluster_alias, node, cost_model_rate_type ; diff --git a/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_project_p.sql b/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_project_p.sql index 682ea1fc29..460b2fc770 100644 --- a/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_project_p.sql +++ b/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_by_project_p.sql @@ -4,6 +4,54 @@ WHERE usage_start >= {{start_date}}::date AND source_uuid = {{source_uuid}} ; +WITH line_items AS ( + SELECT + cluster_id, + cluster_alias, + namespace, + resource_id, + usage_start, + cost_model_rate_type, + infrastructure_raw_cost, + infrastructure_markup_cost, + cost_model_cpu_cost, + cost_model_memory_cost, + cost_model_volume_cost, + cost_model_gpu_cost, + pod_usage_cpu_core_hours, + pod_request_cpu_core_hours, + pod_effective_usage_cpu_core_hours, + pod_limit_cpu_core_hours, + pod_usage_memory_gigabyte_hours, + pod_request_memory_gigabyte_hours, + pod_effective_usage_memory_gigabyte_hours, + pod_limit_memory_gigabyte_hours, + cluster_capacity_cpu_core_hours, + cluster_capacity_memory_gigabyte_hours, + cost_category_id, + raw_currency, + distributed_cost, + data_source, + -- Window: sums from base rows (rate_type IS NULL) at namespace+cluster+date level. + -- Enables cross-rate-type waste computation in the outer SELECT. + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_usage_cpu_core_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, namespace, usage_start) AS grp_cpu_usage, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_effective_usage_cpu_core_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, namespace, usage_start) AS grp_cpu_effective, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_usage_memory_gigabyte_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, namespace, usage_start) AS grp_mem_usage, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_effective_usage_memory_gigabyte_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, namespace, usage_start) AS grp_mem_effective + FROM {{schema | sqlsafe}}.reporting_ocpusagelineitem_daily_summary + WHERE usage_start >= {{start_date}}::date + AND usage_start <= {{end_date}}::date + AND source_uuid = {{source_uuid}} + AND data_source = 'Pod' + AND namespace IS DISTINCT FROM 'Worker unallocated' + AND namespace IS DISTINCT FROM 'Platform unallocated' + AND namespace IS DISTINCT FROM 'Network unattributed' + AND namespace IS DISTINCT FROM 'Storage unattributed' +) INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_by_project_p ( id, cluster_id, @@ -34,7 +82,9 @@ INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_by_project_p ( source_uuid, cost_category_id, raw_currency, - distributed_cost + distributed_cost, + wasted_cpu_cost, + wasted_memory_cost ) SELECT uuid_generate_v4() as id, cluster_id, @@ -65,15 +115,23 @@ INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_by_project_p ( {{source_uuid}}::uuid as source_uuid, max(cost_category_id) as cost_category_id, max(raw_currency) as raw_currency, - sum(distributed_cost) as distributed_cost - FROM {{schema | sqlsafe}}.reporting_ocpusagelineitem_daily_summary - WHERE usage_start >= {{start_date}}::date - AND usage_start <= {{end_date}}::date - AND source_uuid = {{source_uuid}} - AND data_source = 'Pod' - AND namespace IS DISTINCT FROM 'Worker unallocated' - AND namespace IS DISTINCT FROM 'Platform unallocated' - AND namespace IS DISTINCT FROM 'Network unattributed' - AND namespace IS DISTINCT FROM 'Storage unattributed' + sum(distributed_cost) as distributed_cost, + CASE + WHEN cost_model_rate_type IN ('Infrastructure', 'Supplementary') + THEN GREATEST(0, + SUM(COALESCE(cost_model_cpu_cost, 0)) + * (1 - MAX(grp_cpu_usage) / NULLIF(MAX(grp_cpu_effective), 0)) + ) + ELSE 0 + END AS wasted_cpu_cost, + CASE + WHEN cost_model_rate_type IN ('Infrastructure', 'Supplementary') + THEN GREATEST(0, + SUM(COALESCE(cost_model_memory_cost, 0)) + * (1 - MAX(grp_mem_usage) / NULLIF(MAX(grp_mem_effective), 0)) + ) + ELSE 0 + END AS wasted_memory_cost + FROM line_items GROUP BY usage_start, cluster_id, cluster_alias, namespace, cost_model_rate_type ; diff --git a/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_p.sql b/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_p.sql index 09128298c8..aaadbd9ffe 100644 --- a/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_p.sql +++ b/koku/masu/database/sql/openshift/ui_summary/reporting_ocp_pod_summary_p.sql @@ -4,6 +4,53 @@ WHERE usage_start >= {{start_date}}::date AND source_uuid = {{source_uuid}} ; +WITH line_items AS ( + SELECT + cluster_id, + cluster_alias, + resource_id, + usage_start, + cost_model_rate_type, + infrastructure_raw_cost, + infrastructure_markup_cost, + cost_model_cpu_cost, + cost_model_memory_cost, + cost_model_volume_cost, + cost_model_gpu_cost, + pod_usage_cpu_core_hours, + pod_request_cpu_core_hours, + pod_effective_usage_cpu_core_hours, + pod_limit_cpu_core_hours, + pod_usage_memory_gigabyte_hours, + pod_request_memory_gigabyte_hours, + pod_effective_usage_memory_gigabyte_hours, + pod_limit_memory_gigabyte_hours, + cluster_capacity_cpu_core_hours, + cluster_capacity_memory_gigabyte_hours, + cost_category_id, + raw_currency, + distributed_cost, + data_source, + -- Window: sums from base rows (rate_type IS NULL) at cluster+date level. + -- Enables cross-rate-type waste computation in the outer SELECT. + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_usage_cpu_core_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, usage_start) AS grp_cpu_usage, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_effective_usage_cpu_core_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, usage_start) AS grp_cpu_effective, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_usage_memory_gigabyte_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, usage_start) AS grp_mem_usage, + SUM(CASE WHEN cost_model_rate_type IS NULL THEN COALESCE(pod_effective_usage_memory_gigabyte_hours, 0) ELSE 0 END) + OVER (PARTITION BY cluster_id, usage_start) AS grp_mem_effective + FROM {{schema | sqlsafe}}.reporting_ocpusagelineitem_daily_summary + WHERE usage_start >= {{start_date}}::date + AND usage_start <= {{end_date}}::date + AND source_uuid = {{source_uuid}} + AND data_source = 'Pod' + AND namespace IS DISTINCT FROM 'Worker unallocated' + AND namespace IS DISTINCT FROM 'Platform unallocated' + AND namespace IS DISTINCT FROM 'Network unattributed' + AND namespace IS DISTINCT FROM 'Storage unattributed' +) INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_p ( id, cluster_id, @@ -33,7 +80,9 @@ INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_p ( source_uuid, cost_category_id, raw_currency, - distributed_cost + distributed_cost, + wasted_cpu_cost, + wasted_memory_cost ) SELECT uuid_generate_v4() as id, cluster_id, @@ -63,15 +112,23 @@ INSERT INTO {{schema | sqlsafe}}.reporting_ocp_pod_summary_p ( {{source_uuid}}::uuid as source_uuid, max(cost_category_id) as cost_category_id, max(raw_currency) as raw_currency, - sum(distributed_cost) as distributed_cost - FROM {{schema | sqlsafe}}.reporting_ocpusagelineitem_daily_summary - WHERE usage_start >= {{start_date}}::date - AND usage_start <= {{end_date}}::date - AND source_uuid = {{source_uuid}} - AND data_source = 'Pod' - AND namespace IS DISTINCT FROM 'Worker unallocated' - AND namespace IS DISTINCT FROM 'Platform unallocated' - AND namespace IS DISTINCT FROM 'Network unattributed' - AND namespace IS DISTINCT FROM 'Storage unattributed' + sum(distributed_cost) as distributed_cost, + CASE + WHEN cost_model_rate_type IN ('Infrastructure', 'Supplementary') + THEN GREATEST(0, + SUM(COALESCE(cost_model_cpu_cost, 0)) + * (1 - MAX(grp_cpu_usage) / NULLIF(MAX(grp_cpu_effective), 0)) + ) + ELSE 0 + END AS wasted_cpu_cost, + CASE + WHEN cost_model_rate_type IN ('Infrastructure', 'Supplementary') + THEN GREATEST(0, + SUM(COALESCE(cost_model_memory_cost, 0)) + * (1 - MAX(grp_mem_usage) / NULLIF(MAX(grp_mem_effective), 0)) + ) + ELSE 0 + END AS wasted_memory_cost + FROM line_items GROUP BY usage_start, cluster_id, cluster_alias, cost_model_rate_type ; diff --git a/koku/reporting/migrations/0351_pod_summary_wasted_cost.py b/koku/reporting/migrations/0351_pod_summary_wasted_cost.py new file mode 100644 index 0000000000..f62f9d002c --- /dev/null +++ b/koku/reporting/migrations/0351_pod_summary_wasted_cost.py @@ -0,0 +1,42 @@ +from django.db import migrations +from django.db import models + + +class Migration(migrations.Migration): + + dependencies = [ + ("reporting", "0350_widen_ratestousage_label_hash"), + ] + + operations = [ + migrations.AddField( + model_name="ocppodsummaryp", + name="wasted_cpu_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocppodsummaryp", + name="wasted_memory_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocppodsummarybyprojectp", + name="wasted_cpu_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocppodsummarybyprojectp", + name="wasted_memory_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocppodsummarybynodep", + name="wasted_cpu_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocppodsummarybynodep", + name="wasted_memory_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + ] diff --git a/koku/reporting/provider/ocp/models.py b/koku/reporting/provider/ocp/models.py index 3922abb227..168a193536 100644 --- a/koku/reporting/provider/ocp/models.py +++ b/koku/reporting/provider/ocp/models.py @@ -740,6 +740,10 @@ class Meta: cost_model_gpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) cost_model_rate_type = models.TextField(null=True) + # Pre-computed per-line-item wasted cost (pipeline-time, correct grain) + wasted_cpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + wasted_memory_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + class OCPPodSummaryByProjectP(models.Model): """A summarized partitioned table specifically for UI API queries. @@ -800,6 +804,10 @@ class Meta: cost_model_gpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) cost_model_rate_type = models.TextField(null=True) + # Pre-computed per-line-item wasted cost (pipeline-time, correct grain) + wasted_cpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + wasted_memory_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + class OCPPodSummaryByNodeP(models.Model): """A summarized partitioned table specifically for UI API queries. @@ -864,6 +872,10 @@ class Meta: cost_model_gpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) cost_model_rate_type = models.TextField(null=True) + # Pre-computed per-line-item wasted cost (pipeline-time, correct grain) + wasted_cpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + wasted_memory_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + class OCPVolumeSummaryP(models.Model): """A summarized partitioned table specifically for UI API queries. From 9d6f693fd4811ce505d76ea2a5776f085187c6e3 Mon Sep 17 00:00:00 2001 From: myersCody Date: Thu, 14 May 2026 13:27:08 -0400 Subject: [PATCH 11/18] Moved logic into provider map --- koku/api/report/ocp/provider_map.py | 58 ++++++------- koku/api/report/ocp/query_handler.py | 55 +----------- .../report/test/ocp/test_ocp_provider_map.py | 84 ++++++------------- ...ummarybynodep_wasted_cpu_cost_and_more.py} | 19 ++++- koku/reporting/provider/ocp/models.py | 2 + 5 files changed, 77 insertions(+), 141 deletions(-) rename koku/reporting/migrations/{0351_pod_summary_wasted_cost.py => 0351_ocppodsummarybynodep_wasted_cpu_cost_and_more.py} (75%) diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index 2c477fd99d..1591b82952 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -12,7 +12,6 @@ from django.db.models import DecimalField from django.db.models import F from django.db.models import Func -from django.db.models import Greatest from django.db.models import IntegerField from django.db.models import Max from django.db.models import Q @@ -210,29 +209,6 @@ def _memory_request_sum(self): """Return a new Sum expression for memory request hours.""" return Sum(Coalesce(F("pod_request_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - def _efficiency_annotations(self, usage_sum_prop, request_sum_prop, cost_total_expr): - """Build usage_efficiency and wasted_cost annotation expressions.""" - _dec = DecimalField(max_digits=33, decimal_places=15) - return { - "usage_efficiency": Coalesce( - Round(usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) * Value(100)), - Value(0), - output_field=IntegerField(), - ), - "wasted_cost": Coalesce( - Greatest( - cost_total_expr - * ( - Value(1, output_field=_dec) - - usage_sum_prop / NullIf(request_sum_prop, Value(0, output_field=_dec)) - ), - Value(0, output_field=_dec), - ), - Value(0, output_field=_dec), - output_field=_dec, - ), - } - def __init__(self, provider, report_type, schema_name): """Constructor.""" self._schema_name = schema_name @@ -457,7 +433,7 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": Sum(Value(0, output_field=DecimalField())), + "wasted_cost": self.wasted_cpu_cost_expr, }, "capacity_aggregate": { "cluster": { @@ -529,7 +505,7 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": Sum(Value(0, output_field=DecimalField())), + "wasted_cost": self.wasted_cpu_cost_expr, "capacity": Max("cluster_capacity_cpu_core_hours"), # overwritten in capacity aggregation "clusters": ArrayAgg( Coalesce("cluster_alias", "cluster_id"), distinct=True, default=Value([]) @@ -643,7 +619,7 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": Sum(Value(0, output_field=DecimalField())), + "wasted_cost": self.wasted_memory_cost_expr, }, "capacity_aggregate": { "cluster": { @@ -721,7 +697,7 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": Sum(Value(0, output_field=DecimalField())), + "wasted_cost": self.wasted_memory_cost_expr, "capacity": Max( "cluster_capacity_memory_gigabyte_hours" ), # This is to keep the order, overwritten with capacity aggregate @@ -1494,6 +1470,32 @@ def cost_model_gpu_cost(self): """Return all GPU cost model costs.""" return self.__cost_model_gpu_cost() + @cached_property + def wasted_cpu_cost_expr(self): + """Sum of pre-computed wasted CPU cost, converted to display currency.""" + _dec = DecimalField(max_digits=33, decimal_places=15) + return Coalesce( + Sum( + Coalesce(F("wasted_cpu_cost"), Value(0, output_field=_dec)) + * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) + ), + Value(0, output_field=_dec), + output_field=_dec, + ) + + @cached_property + def wasted_memory_cost_expr(self): + """Sum of pre-computed wasted memory cost, converted to display currency.""" + _dec = DecimalField(max_digits=33, decimal_places=15) + return Coalesce( + Sum( + Coalesce(F("wasted_memory_cost"), Value(0, output_field=_dec)) + * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) + ), + Value(0, output_field=_dec), + output_field=_dec, + ) + @cached_property def cloud_infrastructure_cost(self): """Return ORM term for cloud infra costs.""" diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index 16a43663a4..a0bf3e14fe 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -41,15 +41,9 @@ from cost_models.models import CostModel from cost_models.models import CostModelMap from reporting.models import OCPUsageLineItemDailySummary -from reporting.provider.ocp.models import OCPPodSummaryByNodeP -from reporting.provider.ocp.models import OCPPodSummaryByProjectP -from reporting.provider.ocp.models import OCPPodSummaryP LOG = logging.getLogger(__name__) -# Models that carry pre-computed per-line-item wasted cost columns. -_POD_SUMMARY_MODELS = frozenset({OCPPodSummaryP, OCPPodSummaryByProjectP, OCPPodSummaryByNodeP}) - class OCPReportQueryHandler(ReportQueryHandler): """Handles report queries and responses for OCP.""" @@ -267,10 +261,9 @@ def _format_query_response(self): def _pack_score(self, row, should_compute): """Shape efficiency annotations into the score response object. - wasted_cost is always included when present because it is a SUM of pre-computed - per-line-item values and is correct for any GROUP BY dimension. - usage_efficiency_percent is a ratio-of-sums that is only meaningful when - should_compute is True (single GROUP BY dimension, no tag interaction). + wasted_cost comes from the provider map's pre-computed column sum and is always + included. usage_efficiency_percent is a ratio-of-sums that is only meaningful + when should_compute is True (single GROUP BY dimension, no tag interaction). """ score: dict = {} if should_compute: @@ -284,28 +277,6 @@ def _pack_score(self, row, should_compute): row["score"] = score - def _wasted_cost_precomputed_expr(self): - """Return a Sum expression for the pre-computed wasted cost column, or None. - - Returns a ready-to-use ORM expression only when the active query table is one of - the pod summary models that carry the pre-computed column. Falls back to None so - the caller can use the bucket-level formula from the provider map instead. - """ - if self._report_type not in ("cpu", "memory"): - return None - if self.query_table not in _POD_SUMMARY_MODELS: - return None - _dec = DecimalField(max_digits=33, decimal_places=15) - field = "wasted_cpu_cost" if self._report_type == "cpu" else "wasted_memory_cost" - return Coalesce( - Sum( - Coalesce(F(field), Value(0, output_field=_dec)) - * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) - ), - Value(0, output_field=_dec), - output_field=_dec, - ) - def _rate_sum_by_source(self, metric_names: frozenset) -> dict: """Return {source_uuid_str: sum_of_flat_tier_values_in_cost_model_currency}. @@ -474,13 +445,7 @@ def execute_query(self): # noqa: C901 query_group_by = ["date"] + group_by_value query_order_by = ["-date", self.order] - # When the query table carries pre-computed per-line-item wasted cost columns, - # replace the bucket-level formula with a simple SUM of those columns. - wasted_precomputed = self._wasted_cost_precomputed_expr() row_annotations = {k: v for k, v in self.report_annotations.items() if k not in group_by_value} - if wasted_precomputed is not None: - row_annotations = {**row_annotations, "wasted_cost": wasted_precomputed} - query_data = query.values(*query_group_by).annotate(**row_annotations) if is_grouped_by_project(self.parameters): @@ -498,8 +463,6 @@ def execute_query(self): # noqa: C901 # Populate the 'total' section of the API response if query.exists(): aggregates = self._mapper.report_type_map.get("aggregates") - if wasted_precomputed is not None: - aggregates = {**aggregates, "wasted_cost": wasted_precomputed} metric_sum = query.aggregate(**aggregates) query_sum = {key: metric_sum.get(key) for key in aggregates} @@ -511,15 +474,6 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): has_tag_interaction = self._tag_group_by or self.get_tag_filter_keys() should_compute = not has_tag_interaction and len(group_by_value) <= 1 - # # should_compute = False - # # Compute wasted_cost from the finest available grain (per-pod daily rows) - # # rather than from the pre-aggregated summary table buckets. Bucket-level - # # aggregation causes over- and under-utilised pods to cancel, collapsing - # # the waste formula toward zero even when real per-pod waste exists. - # line_item_waste: dict = {} - # if should_compute: - # line_item_waste = self._line_item_wasted_cost(query_group_by) - # query_sum["wasted_cost"] = line_item_waste.get("__total__", Decimal(0)) self._pack_score(query_sum, should_compute) if self._delta: @@ -529,9 +483,6 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): for row in query_data: - # if should_compute: - # key = tuple(row.get(g) for g in query_group_by) - # row["wasted_cost"] = line_item_waste.get(key, Decimal(0)) self._pack_score(row, should_compute) for row in query_data: diff --git a/koku/api/report/test/ocp/test_ocp_provider_map.py b/koku/api/report/test/ocp/test_ocp_provider_map.py index d47f614c1d..80e0758062 100644 --- a/koku/api/report/test/ocp/test_ocp_provider_map.py +++ b/koku/api/report/test/ocp/test_ocp_provider_map.py @@ -73,16 +73,22 @@ def test_distributed_costs_use_correct_exchange_rate(self): self.assertEqual(result["val"], cost_val * expected_multiplier) - def test_wasted_cost_cpu_is_sum_of_per_row_clamped_waste(self): - """Pooled utilization can be 100% while rows still have opportunity waste (S1-a).""" + def test_wasted_cost_cpu_reads_precomputed_column_with_exchange_rate(self): + """wasted_cpu_cost_expr sums the pre-computed column and applies exchange_rate. + + The SQL summarization pre-computes per-pod waste before any cross-pod aggregation, + avoiding the pooling problem (over- and under-utilised pods cancelling each other). + The provider map expression simply reads that value and converts to display currency. + """ cluster_id = "s1a-waste-test-cpu" usage_date = self.dh.yesterday.date() _dec = DecimalField(max_digits=33, decimal_places=15) - one = Value(Decimal("1"), output_field=_dec) + exchange_rate = Value(Decimal("2"), output_field=_dec) with tenant_context(self.tenant): OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() + # Row representing a pod group with pre-computed waste=50 (e.g. 50% waste on 100 cost) self.baker.make( OCPUsageLineItemDailySummary, cluster_id=cluster_id, @@ -90,12 +96,9 @@ def test_wasted_cost_cpu_is_sum_of_per_row_clamped_waste(self): namespace="s1a-test", usage_start=usage_date, usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("10"), - pod_usage_cpu_core_hours=Decimal("5"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - cost_model_cpu_cost=Decimal("0"), + wasted_cpu_cost=Decimal("50"), ) + # Row representing an over-utilised pod group — waste already clamped to 0 by SQL self.baker.make( OCPUsageLineItemDailySummary, cluster_id=cluster_id, @@ -103,35 +106,21 @@ def test_wasted_cost_cpu_is_sum_of_per_row_clamped_waste(self): namespace="s1a-test", usage_start=usage_date, usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("10"), - pod_usage_cpu_core_hours=Decimal("15"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - cost_model_cpu_cost=Decimal("0"), + wasted_cpu_cost=Decimal("0"), ) mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) aggregates = mapper.report_type_map["aggregates"] qs = OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod").annotate( - exchange_rate=one, - infra_exchange_rate=one, - ) - result = qs.aggregate( - wasted_cost=aggregates["wasted_cost"], - cost_total=aggregates["cost_total"], + exchange_rate=exchange_rate, + infra_exchange_rate=exchange_rate, ) - # Row1: 100 * (1 - 5/10) = 50; row2: over-utilized -> 0 - self.assertEqual(result["wasted_cost"], Decimal("50")) - self.assertEqual(result["cost_total"], Decimal("200")) - - pooled_waste = result["cost_total"] * ( - Decimal("1") - Decimal("20") / Decimal("20") - ) # sum usage / sum request == 1 - self.assertEqual(pooled_waste, Decimal("0")) - self.assertNotEqual(result["wasted_cost"], pooled_waste) + result = qs.aggregate(wasted_cost=aggregates["wasted_cost"]) + # SUM(50 + 0) * exchange_rate(2) = 100 + self.assertEqual(result["wasted_cost"], Decimal("100")) - def test_wasted_cost_memory_is_sum_of_per_row_clamped_waste(self): - """Memory report uses the same per-row waste pattern as CPU.""" + def test_wasted_cost_memory_reads_precomputed_column_with_exchange_rate(self): + """wasted_memory_cost_expr mirrors the CPU expression for the memory report.""" cluster_id = "s1a-waste-test-mem" usage_date = self.dh.yesterday.date() _dec = DecimalField(max_digits=33, decimal_places=15) @@ -147,11 +136,7 @@ def test_wasted_cost_memory_is_sum_of_per_row_clamped_waste(self): namespace="s1a-test", usage_start=usage_date, usage_end=usage_date, - pod_request_memory_gigabyte_hours=Decimal("10"), - pod_usage_memory_gigabyte_hours=Decimal("5"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - cost_model_memory_cost=Decimal("0"), + wasted_memory_cost=Decimal("30"), ) self.baker.make( OCPUsageLineItemDailySummary, @@ -160,11 +145,7 @@ def test_wasted_cost_memory_is_sum_of_per_row_clamped_waste(self): namespace="s1a-test", usage_start=usage_date, usage_end=usage_date, - pod_request_memory_gigabyte_hours=Decimal("10"), - pod_usage_memory_gigabyte_hours=Decimal("15"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - cost_model_memory_cost=Decimal("0"), + wasted_memory_cost=Decimal("20"), ) mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="memory", schema_name=self.schema_name) @@ -176,8 +157,9 @@ def test_wasted_cost_memory_is_sum_of_per_row_clamped_waste(self): result = qs.aggregate(wasted_cost=aggregates["wasted_cost"]) self.assertEqual(result["wasted_cost"], Decimal("50")) - def test_wasted_cost_cpu_row_with_zero_request_contributes_zero(self): - cluster_id = "s1a-waste-test-zero-req" + def test_wasted_cost_null_column_contributes_zero(self): + """Rows without a pre-computed wasted_cost (NULL) are treated as zero.""" + cluster_id = "s1a-waste-test-null" usage_date = self.dh.yesterday.date() _dec = DecimalField(max_digits=33, decimal_places=15) one = Value(Decimal("1"), output_field=_dec) @@ -192,11 +174,7 @@ def test_wasted_cost_cpu_row_with_zero_request_contributes_zero(self): namespace="s1a-test", usage_start=usage_date, usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("0"), - pod_usage_cpu_core_hours=Decimal("50"), - infrastructure_raw_cost=Decimal("999"), - infrastructure_markup_cost=Decimal("0"), - cost_model_cpu_cost=Decimal("0"), + wasted_cpu_cost=None, ) mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) @@ -272,11 +250,7 @@ def test_wasted_cost_order_by_desc_matches_annotation(self): namespace="low-waste", usage_start=usage_date, usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("10"), - pod_usage_cpu_core_hours=Decimal("9"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - cost_model_cpu_cost=Decimal("0"), + wasted_cpu_cost=Decimal("10"), ) self.baker.make( OCPUsageLineItemDailySummary, @@ -285,11 +259,7 @@ def test_wasted_cost_order_by_desc_matches_annotation(self): namespace="high-waste", usage_start=usage_date, usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("10"), - pod_usage_cpu_core_hours=Decimal("1"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - cost_model_cpu_cost=Decimal("0"), + wasted_cpu_cost=Decimal("90"), ) mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) diff --git a/koku/reporting/migrations/0351_pod_summary_wasted_cost.py b/koku/reporting/migrations/0351_ocppodsummarybynodep_wasted_cpu_cost_and_more.py similarity index 75% rename from koku/reporting/migrations/0351_pod_summary_wasted_cost.py rename to koku/reporting/migrations/0351_ocppodsummarybynodep_wasted_cpu_cost_and_more.py index f62f9d002c..8a0913b5da 100644 --- a/koku/reporting/migrations/0351_pod_summary_wasted_cost.py +++ b/koku/reporting/migrations/0351_ocppodsummarybynodep_wasted_cpu_cost_and_more.py @@ -1,3 +1,4 @@ +# Generated by Django 5.2.12 on 2026-05-14 17:15 from django.db import migrations from django.db import models @@ -10,12 +11,12 @@ class Migration(migrations.Migration): operations = [ migrations.AddField( - model_name="ocppodsummaryp", + model_name="ocppodsummarybynodep", name="wasted_cpu_cost", field=models.DecimalField(decimal_places=15, max_digits=33, null=True), ), migrations.AddField( - model_name="ocppodsummaryp", + model_name="ocppodsummarybynodep", name="wasted_memory_cost", field=models.DecimalField(decimal_places=15, max_digits=33, null=True), ), @@ -30,12 +31,22 @@ class Migration(migrations.Migration): field=models.DecimalField(decimal_places=15, max_digits=33, null=True), ), migrations.AddField( - model_name="ocppodsummarybynodep", + model_name="ocppodsummaryp", name="wasted_cpu_cost", field=models.DecimalField(decimal_places=15, max_digits=33, null=True), ), migrations.AddField( - model_name="ocppodsummarybynodep", + model_name="ocppodsummaryp", + name="wasted_memory_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocpusagelineitemdailysummary", + name="wasted_cpu_cost", + field=models.DecimalField(decimal_places=15, max_digits=33, null=True), + ), + migrations.AddField( + model_name="ocpusagelineitemdailysummary", name="wasted_memory_cost", field=models.DecimalField(decimal_places=15, max_digits=33, null=True), ), diff --git a/koku/reporting/provider/ocp/models.py b/koku/reporting/provider/ocp/models.py index 168a193536..9e58c8bf2b 100644 --- a/koku/reporting/provider/ocp/models.py +++ b/koku/reporting/provider/ocp/models.py @@ -204,6 +204,8 @@ class Meta: source_uuid = models.UUIDField(unique=False, null=True) raw_currency = models.TextField(null=True) distributed_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + wasted_cpu_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) + wasted_memory_cost = models.DecimalField(max_digits=33, decimal_places=15, null=True) class OCPTagsValues(models.Model): From 705ed9166c493b6fa3582435c3a6588161f28bdc Mon Sep 17 00:00:00 2001 From: myersCody Date: Thu, 14 May 2026 13:34:53 -0400 Subject: [PATCH 12/18] Clean up provider map --- koku/api/report/ocp/provider_map.py | 36 ++++++++--------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index 1591b82952..dfc8e2d768 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -193,22 +193,6 @@ def __cost_model_distributed_cost(self, cost_model_rate_type, exchange_rate_colu * Coalesce(exchange_rate_column, Value(1, output_field=DecimalField())), ) - def _cpu_usage_sum(self): - """Return a new Sum expression for CPU usage hours.""" - return Sum(Coalesce(F("pod_usage_cpu_core_hours"), Value(0, output_field=DecimalField()))) - - def _cpu_request_sum(self): - """Return a new Sum expression for CPU request hours.""" - return Sum(Coalesce(F("pod_request_cpu_core_hours"), Value(0, output_field=DecimalField()))) - - def _memory_usage_sum(self): - """Return a new Sum expression for memory usage hours.""" - return Sum(Coalesce(F("pod_usage_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - - def _memory_request_sum(self): - """Return a new Sum expression for memory request hours.""" - return Sum(Coalesce(F("pod_request_memory_gigabyte_hours"), Value(0, output_field=DecimalField()))) - def __init__(self, provider, report_type, schema_name): """Constructor.""" self._schema_name = schema_name @@ -1473,27 +1457,27 @@ def cost_model_gpu_cost(self): @cached_property def wasted_cpu_cost_expr(self): """Sum of pre-computed wasted CPU cost, converted to display currency.""" - _dec = DecimalField(max_digits=33, decimal_places=15) return Coalesce( Sum( - Coalesce(F("wasted_cpu_cost"), Value(0, output_field=_dec)) - * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) + Coalesce(F("wasted_cpu_cost"), Value(0, output_field=DecimalField(max_digits=33, decimal_places=15))) + * Coalesce(F("exchange_rate"), Value(1, output_field=DecimalField(max_digits=33, decimal_places=15))) ), - Value(0, output_field=_dec), - output_field=_dec, + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + output_field=DecimalField(max_digits=33, decimal_places=15), ) @cached_property def wasted_memory_cost_expr(self): """Sum of pre-computed wasted memory cost, converted to display currency.""" - _dec = DecimalField(max_digits=33, decimal_places=15) return Coalesce( Sum( - Coalesce(F("wasted_memory_cost"), Value(0, output_field=_dec)) - * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) + Coalesce( + F("wasted_memory_cost"), Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)) + ) + * Coalesce(F("exchange_rate"), Value(1, output_field=DecimalField(max_digits=33, decimal_places=15))) ), - Value(0, output_field=_dec), - output_field=_dec, + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + output_field=DecimalField(max_digits=33, decimal_places=15), ) @cached_property From 61936978fc394379db9c0ef96cca47db08104cbf Mon Sep 17 00:00:00 2001 From: myersCody Date: Thu, 14 May 2026 13:41:37 -0400 Subject: [PATCH 13/18] Remove unused code from query handler --- koku/api/report/ocp/query_handler.py | 168 --------------------------- 1 file changed, 168 deletions(-) diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index a0bf3e14fe..54d7f3f23a 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -15,21 +15,12 @@ from django.db.models import CharField from django.db.models import DecimalField from django.db.models import F -from django.db.models import Sum from django.db.models import Value from django.db.models import When from django.db.models.fields.json import KT from django.db.models.functions import Coalesce -from django.db.models.functions import Greatest -from django.db.models.functions.comparison import NullIf from django_tenants.utils import tenant_context -from api.metrics.constants import OCP_METRIC_CPU_CORE_EFFECTIVE_USAGE_HOUR -from api.metrics.constants import OCP_METRIC_CPU_CORE_REQUEST_HOUR -from api.metrics.constants import OCP_METRIC_CPU_CORE_USAGE_HOUR -from api.metrics.constants import OCP_METRIC_MEM_GB_EFFECTIVE_USAGE_HOUR -from api.metrics.constants import OCP_METRIC_MEM_GB_REQUEST_HOUR -from api.metrics.constants import OCP_METRIC_MEM_GB_USAGE_HOUR from api.models import Provider from api.report.ocp.capacity.cluster_capacity import calculate_unused from api.report.ocp.capacity.cluster_capacity import ClusterCapacity @@ -40,7 +31,6 @@ from api.report.queries import ReportQueryHandler from cost_models.models import CostModel from cost_models.models import CostModelMap -from reporting.models import OCPUsageLineItemDailySummary LOG = logging.getLogger(__name__) @@ -50,16 +40,6 @@ class OCPReportQueryHandler(ReportQueryHandler): provider = Provider.PROVIDER_OCP - _CPU_HOUR_METRICS = frozenset( - {OCP_METRIC_CPU_CORE_USAGE_HOUR, OCP_METRIC_CPU_CORE_REQUEST_HOUR, OCP_METRIC_CPU_CORE_EFFECTIVE_USAGE_HOUR} - ) - _MEM_HOUR_METRICS = frozenset( - {OCP_METRIC_MEM_GB_USAGE_HOUR, OCP_METRIC_MEM_GB_REQUEST_HOUR, OCP_METRIC_MEM_GB_EFFECTIVE_USAGE_HOUR} - ) - _SENTINEL_NAMESPACES = frozenset( - ["Worker unallocated", "Platform unallocated", "Network unattributed", "Storage unattributed"] - ) - def __init__(self, parameters): """Establish OCP report query handler. @@ -277,154 +257,6 @@ def _pack_score(self, row, should_compute): row["score"] = score - def _rate_sum_by_source(self, metric_names: frozenset) -> dict: - """Return {source_uuid_str: sum_of_flat_tier_values_in_cost_model_currency}. - - Parses CostModel.rates (a JSON list) and sums all rate values whose metric - name is in metric_names. Two structures are handled: - - * ``tiered_rates`` – a list of tier dicts each with a ``value`` key. - * ``tag_rates`` – a dict with a ``tag_values`` list; only entries where - ``default`` is true are included. These apply to every pod that does not - match a more-specific tag value (in practice, to all pods in the common - case of a single-value tag rule with ``default: true``). - """ - rate_by_cm: dict = {} - for row in CostModel.objects.all().values("uuid", "rates"): - total = Decimal("0") - for rate_entry in row["rates"] or []: - if rate_entry.get("metric", {}).get("name") in metric_names: - for tier in rate_entry.get("tiered_rates", []): - total += Decimal(str(tier.get("value", 0))) - for tv in (rate_entry.get("tag_rates", {}) or {}).get("tag_values", []) or []: - if tv.get("default"): - total += Decimal(str(tv.get("value", 0))) - rate_by_cm[str(row["uuid"])] = total - - return { - str(mapping_row["provider_uuid"]): rate_by_cm.get(str(mapping_row["cost_model_id"]), Decimal("0")) - for mapping_row in CostModelMap.objects.all().values("provider_uuid", "cost_model_id") - } - - def _line_item_wasted_cost(self, group_by: list) -> dict: - """Compute wasted_cost from OCPUsageLineItemDailySummary at daily per-pod grain. - - OCP pod summary tables (OCPPodSummaryP, etc.) aggregate across all pods within - a cluster/namespace/node before storing. When over-utilised and under-utilised - pods are pooled together their usage/request ratio approaches 1, driving the - waste formula toward zero at the bucket level while real per-pod waste remains. - This method queries the finest available grain (per-pod daily rows) so the - clamped-waste formula is evaluated before any cross-pod aggregation. - - The per-row cost basis is: - max(usage, request) × total_rate_from_cost_model × exchange_rate - + infrastructure_raw_cost × infra_exchange_rate - + infrastructure_markup_cost × infra_exchange_rate - - max(usage, request) is evaluated on the already-aggregated row (not the stored - pod_effective_usage field, which is sum(max(u_i, r_i)) per individual pod and - can exceed max(sum_u, sum_r) when the group contains both over- and - under-utilised pods). Using the row-level max matches the product's wasted-cost - definition and the integration test fixture. - - Rates are read from the CostModel table at query time so the computation is - correct whether or not the cost model application SQL has already run. - Only original rows (cost_model_rate_type IS NULL) are queried to avoid - double-counting when the cost model has been applied. - - Args: - group_by: the query_group_by list used by execute_query (["date", ...]). - - Returns: - A dict keyed by tuple(*group_by field values) → Decimal waste. - The sentinel key '__total__' holds the overall waste sum. - """ - _dec = DecimalField(max_digits=33, decimal_places=15) - - if self._report_type == "cpu": - usage_field = "pod_usage_cpu_core_hours" - effective_field = "pod_effective_usage_cpu_core_hours" - rate_by_source = self._rate_sum_by_source(self._CPU_HOUR_METRICS) - else: - usage_field = "pod_usage_memory_gigabyte_hours" - effective_field = "pod_effective_usage_memory_gigabyte_hours" - rate_by_source = self._rate_sum_by_source(self._MEM_HOUR_METRICS) - - # Per-source rate annotation: Case(When(source_uuid=X, then=rate_X), ..., default=0) - rate_whens = [ - When(**{"source_uuid": uuid, "then": Value(rate, output_field=_dec)}) - for uuid, rate in rate_by_source.items() - if rate - ] - rate_ann = ( - Case(*rate_whens, default=Value(Decimal("0"), output_field=_dec), output_field=_dec) - if rate_whens - else Value(Decimal("0"), output_field=_dec) - ) - - row_infra = Coalesce(F("infrastructure_raw_cost"), Value(0, output_field=_dec)) - row_markup = Coalesce(F("infrastructure_markup_cost"), Value(0, output_field=_dec)) - row_u = Coalesce(F(usage_field), Value(0, output_field=_dec)) - - # pod_effective_usage_*_hours stores SUM(MAX(usage_i, request_i)) aggregated - # across all raw intervals in the day. This preserves the per-interval clamping - # so that hours where a pod is over-utilised contribute MAX(u_i, r_i) = u_i - # (no waste for that interval) rather than collapsing to MAX(SUM(u), SUM(r)) - # which would incorrectly cancel within-day over/under utilisation swings. - # waste = (effective - usage) * rate = SUM(MAX(0, r_i - u_i)) * rate ✓ - row_eff = Coalesce(F(effective_field), Value(0, output_field=_dec)) - - per_row_cost = ( - # Cloud infrastructure cost converted to the report currency - (row_infra + row_markup) * Coalesce(F("infra_exchange_rate"), Value(1, output_field=_dec)) - # Cost model contribution: rate × effective_usage → report currency. - # effective_usage = SUM(MAX(u_i, r_i)) already encodes the per-interval - # clamp, giving the correct cost basis for the waste formula. - + row_eff * F("_wc_rate") * Coalesce(F("exchange_rate"), Value(1, output_field=_dec)) - ) - # waste = per_row_cost × (1 − usage/effective) = (effective − usage) × rate - # Uses effective in the denominator (not request) so the ratio is always in [0,1] - # and the result equals the fixture's SUM(MAX(0, r_i − u_i)) × rate per group. - per_row_waste = Coalesce( - Greatest( - per_row_cost * (Value(1, output_field=_dec) - row_u / NullIf(row_eff, Value(0, output_field=_dec))), - Value(0, output_field=_dec), - ), - Value(0, output_field=_dec), - ) - waste_ann = Coalesce(Sum(per_row_waste), Value(0, output_field=_dec), output_field=_dec) - - # Original rows only: cost model application creates additional rows in the same - # table with cost_model_rate_type set. Querying only original rows avoids double- - # counting and gives the correct per-pod-group grain. - li_q = OCPUsageLineItemDailySummary.objects.filter(self.query_filter) - li_q = li_q.filter(cost_model_rate_type__isnull=True) - li_q = li_q.exclude(namespace__in=self._SENTINEL_NAMESPACES) - if self.query_exclusions: - li_q = li_q.exclude(self.query_exclusions) - - # Build a minimal annotation set to avoid GROUP BY pollution. - # Applying self.annotations directly adds cluster, currency_annotation, etc. - # as non-aggregate expressions which Django may include in GROUP BY, - # fragmenting groups beyond the intended dimensions. - fresh_ann: dict = { - "date": self.date_trunc("usage_start"), - "_wc_rate": rate_ann, - **self.exchange_rate_annotation_dict, - } - for gb in group_by: - if gb != "date" and gb in self.annotations: - fresh_ann[gb] = self.annotations[gb] - li_q = li_q.annotate(**fresh_ann) - - total = li_q.aggregate(wasted_cost=waste_ann)["wasted_cost"] or Decimal(0) - by_group = { - tuple(row[g] for g in group_by): row["wasted_cost"] - for row in li_q.values(*group_by).annotate(wasted_cost=waste_ann) - } - by_group["__total__"] = total - return by_group - def execute_query(self): # noqa: C901 """Execute query and return provided data. From 23b85505959344352caffeaa131dc5c139d01f87 Mon Sep 17 00:00:00 2001 From: myersCody Date: Thu, 14 May 2026 14:08:12 -0400 Subject: [PATCH 14/18] Unify the packing strategy for wasted cost --- koku/api/report/ocp/provider_map.py | 6 +++++ koku/api/report/ocp/query_handler.py | 35 +++++++++++----------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index dfc8e2d768..61f22b2fff 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -490,6 +490,9 @@ def __init__(self, provider, report_type, schema_name): output_field=IntegerField(), ), "wasted_cost": self.wasted_cpu_cost_expr, + "wasted_cost_units": Coalesce( + "currency_annotation", Value("USD", output_field=CharField()) + ), "capacity": Max("cluster_capacity_cpu_core_hours"), # overwritten in capacity aggregation "clusters": ArrayAgg( Coalesce("cluster_alias", "cluster_id"), distinct=True, default=Value([]) @@ -682,6 +685,9 @@ def __init__(self, provider, report_type, schema_name): output_field=IntegerField(), ), "wasted_cost": self.wasted_memory_cost_expr, + "wasted_cost_units": Coalesce( + "currency_annotation", Value("USD", output_field=CharField()) + ), "capacity": Max( "cluster_capacity_memory_gigabyte_hours" ), # This is to keep the order, overwritten with capacity aggregate diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index 54d7f3f23a..ab4aa7a6b9 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -112,6 +112,14 @@ def __init__(self, parameters): ocp_pack_definitions["compute"] = {"keys": ["compute"], "units": "mig_compute_units"} ocp_pack_definitions["memory"] = {"keys": ["memory"], "units": "mig_memory_units"} ocp_pack_definitions["gpu_count"] = {"keys": ["gpu_count"], "units": "gpu_count_units"} + ocp_pack_definitions["score_efficiency"] = { + "keys": {"usage_efficiency": {"key": "usage_efficiency_percent", "group": "score"}}, + "units": None, + } + ocp_pack_definitions["score_wasted"] = { + "keys": {"wasted_cost": {"key": "wasted_cost", "group": "score"}}, + "units": "wasted_cost_units", + } # super() needs to be called after _mapper and _limit is set super().__init__(parameters) @@ -238,25 +246,6 @@ def _format_query_response(self): return output - def _pack_score(self, row, should_compute): - """Shape efficiency annotations into the score response object. - - wasted_cost comes from the provider map's pre-computed column sum and is always - included. usage_efficiency_percent is a ratio-of-sums that is only meaningful - when should_compute is True (single GROUP BY dimension, no tag interaction). - """ - score: dict = {} - if should_compute: - score["usage_efficiency_percent"] = row.pop("usage_efficiency", 0) - else: - row.pop("usage_efficiency", None) - - wasted = row.pop("wasted_cost", None) - if wasted is not None: - score["wasted_cost"] = {"value": wasted, "units": self.currency} - - row["score"] = score - def execute_query(self): # noqa: C901 """Execute query and return provided data. @@ -306,16 +295,18 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): has_tag_interaction = self._tag_group_by or self.get_tag_filter_keys() should_compute = not has_tag_interaction and len(group_by_value) <= 1 - self._pack_score(query_sum, should_compute) + query_sum["wasted_cost_units"] = self.currency + if not should_compute: + query_sum.pop("usage_efficiency", None) if self._delta: query_data = self.add_deltas(query_data, query_sum) query_data = self.order_by(query_data, query_order_by) - if self._report_type in ("cpu", "memory"): + if self._report_type in ("cpu", "memory") and not should_compute: for row in query_data: - self._pack_score(row, should_compute) + row.pop("usage_efficiency", None) for row in query_data: if tag_iterable := row.get("tags"): From ea8a3ea83b3d7afa78800372e713927cd6ee91cb Mon Sep 17 00:00:00 2001 From: myersCody Date: Thu, 14 May 2026 14:28:24 -0400 Subject: [PATCH 15/18] Improve readability --- koku/api/report/ocp/provider_map.py | 86 +++++++++++++++++++---------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/koku/api/report/ocp/provider_map.py b/koku/api/report/ocp/provider_map.py index 61f22b2fff..c58d5b629a 100644 --- a/koku/api/report/ocp/provider_map.py +++ b/koku/api/report/ocp/provider_map.py @@ -417,7 +417,20 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": self.wasted_cpu_cost_expr, + "wasted_cost": Coalesce( + Sum( + Coalesce( + F("wasted_cpu_cost"), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Coalesce( + F("exchange_rate"), + Value(1, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + output_field=DecimalField(max_digits=33, decimal_places=15), + ), }, "capacity_aggregate": { "cluster": { @@ -489,7 +502,20 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": self.wasted_cpu_cost_expr, + "wasted_cost": Coalesce( + Sum( + Coalesce( + F("wasted_cpu_cost"), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Coalesce( + F("exchange_rate"), + Value(1, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + output_field=DecimalField(max_digits=33, decimal_places=15), + ), "wasted_cost_units": Coalesce( "currency_annotation", Value("USD", output_field=CharField()) ), @@ -606,7 +632,20 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": self.wasted_memory_cost_expr, + "wasted_cost": Coalesce( + Sum( + Coalesce( + F("wasted_memory_cost"), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Coalesce( + F("exchange_rate"), + Value(1, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + output_field=DecimalField(max_digits=33, decimal_places=15), + ), }, "capacity_aggregate": { "cluster": { @@ -684,7 +723,20 @@ def __init__(self, provider, report_type, schema_name): Value(0), output_field=IntegerField(), ), - "wasted_cost": self.wasted_memory_cost_expr, + "wasted_cost": Coalesce( + Sum( + Coalesce( + F("wasted_memory_cost"), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + * Coalesce( + F("exchange_rate"), + Value(1, output_field=DecimalField(max_digits=33, decimal_places=15)), + ) + ), + Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), + output_field=DecimalField(max_digits=33, decimal_places=15), + ), "wasted_cost_units": Coalesce( "currency_annotation", Value("USD", output_field=CharField()) ), @@ -1460,32 +1512,6 @@ def cost_model_gpu_cost(self): """Return all GPU cost model costs.""" return self.__cost_model_gpu_cost() - @cached_property - def wasted_cpu_cost_expr(self): - """Sum of pre-computed wasted CPU cost, converted to display currency.""" - return Coalesce( - Sum( - Coalesce(F("wasted_cpu_cost"), Value(0, output_field=DecimalField(max_digits=33, decimal_places=15))) - * Coalesce(F("exchange_rate"), Value(1, output_field=DecimalField(max_digits=33, decimal_places=15))) - ), - Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), - output_field=DecimalField(max_digits=33, decimal_places=15), - ) - - @cached_property - def wasted_memory_cost_expr(self): - """Sum of pre-computed wasted memory cost, converted to display currency.""" - return Coalesce( - Sum( - Coalesce( - F("wasted_memory_cost"), Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)) - ) - * Coalesce(F("exchange_rate"), Value(1, output_field=DecimalField(max_digits=33, decimal_places=15))) - ), - Value(0, output_field=DecimalField(max_digits=33, decimal_places=15)), - output_field=DecimalField(max_digits=33, decimal_places=15), - ) - @cached_property def cloud_infrastructure_cost(self): """Return ORM term for cloud infra costs.""" From 966c527fc21ea322fae197d0ba4becd6d889fb1a Mon Sep 17 00:00:00 2001 From: myersCody Date: Fri, 15 May 2026 12:00:24 -0400 Subject: [PATCH 16/18] Fix unittests --- .../report/test/ocp/test_ocp_provider_map.py | 48 ------- .../report/test/ocp/test_ocp_query_handler.py | 120 ------------------ 2 files changed, 168 deletions(-) diff --git a/koku/api/report/test/ocp/test_ocp_provider_map.py b/koku/api/report/test/ocp/test_ocp_provider_map.py index 80e0758062..78f7551cd1 100644 --- a/koku/api/report/test/ocp/test_ocp_provider_map.py +++ b/koku/api/report/test/ocp/test_ocp_provider_map.py @@ -8,7 +8,6 @@ from decimal import Decimal from django.db.models import DecimalField -from django.db.models import Sum from django.db.models import Value from django_tenants.utils import tenant_context @@ -186,53 +185,6 @@ def test_wasted_cost_null_column_contributes_zero(self): result = qs.aggregate(wasted_cost=aggregates["wasted_cost"]) self.assertEqual(result["wasted_cost"], Decimal("0")) - def test_per_row_cost_cpu_sums_to_aggregate_cost_total(self): - """Sanity: Sum(row infra + markup + cm cpu) matches existing cost_total aggregate.""" - cluster_id = "s1a-cost-consistency" - usage_date = self.dh.yesterday.date() - _dec = DecimalField(max_digits=33, decimal_places=15) - one = Value(Decimal("1"), output_field=_dec) - - with tenant_context(self.tenant): - OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() - - self.baker.make( - OCPUsageLineItemDailySummary, - cluster_id=cluster_id, - data_source="Pod", - namespace="s1a-test", - usage_start=usage_date, - usage_end=usage_date, - infrastructure_raw_cost=Decimal("30"), - infrastructure_markup_cost=Decimal("7"), - cost_model_cpu_cost=Decimal("5"), - pod_request_cpu_core_hours=Decimal("1"), - pod_usage_cpu_core_hours=Decimal("1"), - ) - self.baker.make( - OCPUsageLineItemDailySummary, - cluster_id=cluster_id, - data_source="Pod", - namespace="s1a-test", - usage_start=usage_date, - usage_end=usage_date, - infrastructure_raw_cost=Decimal("10"), - infrastructure_markup_cost=Decimal("2"), - cost_model_cpu_cost=Decimal("1"), - pod_request_cpu_core_hours=Decimal("1"), - pod_usage_cpu_core_hours=Decimal("1"), - ) - - mapper = OCPProviderMap(provider=Provider.PROVIDER_OCP, report_type="cpu", schema_name=self.schema_name) - aggregates = mapper.report_type_map["aggregates"] - qs = OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id, data_source="Pod").annotate( - exchange_rate=one, - infra_exchange_rate=one, - ) - row_cost = mapper._per_row_cost_cpu_expr() - summed = qs.aggregate(from_rows=Sum(row_cost), cost_total=aggregates["cost_total"]) - self.assertEqual(summed["from_rows"], summed["cost_total"]) - def test_wasted_cost_order_by_desc_matches_annotation(self): """Grouped wasted_cost annotation is sortable (used by order_by[wasted_cost]).""" cluster_id = "s1a-waste-order" diff --git a/koku/api/report/test/ocp/test_ocp_query_handler.py b/koku/api/report/test/ocp/test_ocp_query_handler.py index db7851c130..0ca2db366b 100644 --- a/koku/api/report/test/ocp/test_ocp_query_handler.py +++ b/koku/api/report/test/ocp/test_ocp_query_handler.py @@ -2292,123 +2292,3 @@ def test_order_by_wasted_cost_with_pagination(self): query_output = handler.execute_query() self.assertIsNotNone(query_output.get("data")) self.assertIsNotNone(query_output.get("total")) - - def test_line_item_wasted_cost_corrects_pooling_bug(self): - """_line_item_wasted_cost uses per-pod daily grain, not cluster-bucket aggregates. - - Two pods in the same cluster/day pool to 100% utilisation at bucket level, - driving the naive bucket-level waste formula to zero. But per-pod, pod-A is - only 50% utilised and carries real waste. This test confirms the method - returns the per-pod sum (50) rather than the bucket result (0). - - Cost basis: infrastructure_raw_cost (OCP-on-cloud path, no cost model). - effective_usage = SUM(MAX(usage_i, request_i)) per interval — for a single-interval - pod this equals MAX(usage, request). No cost model → rate = 0, so: - per_row_cost = effective × 0 + infra_raw × 1.0 = 100. - Pod A: waste = 100 × (1 − usage/effective) = 100 × (1 − 5/10) = 50. - Pod B: waste = max(0, 100 × (1 − 15/15)) = 0. - """ - usage_date = self.dh.yesterday.date() - cluster_id = "pooling-test-cluster-waste" - - with tenant_context(self.tenant): - OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() - - # Pod A: 50% utilized → effective=MAX(5,10)=10, waste = 100 × (1 − 5/10) = 50 - self.baker.make( - OCPUsageLineItemDailySummary, - cluster_id=cluster_id, - data_source="Pod", - namespace="pooling-ns", - usage_start=usage_date, - usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("10"), - pod_usage_cpu_core_hours=Decimal("5"), - pod_effective_usage_cpu_core_hours=Decimal("10"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - raw_currency="USD", - cost_model_rate_type=None, - ) - # Pod B: 150% utilized → effective=MAX(15,10)=15, waste = max(0, 100×(1−15/15)) = 0 - self.baker.make( - OCPUsageLineItemDailySummary, - cluster_id=cluster_id, - data_source="Pod", - namespace="pooling-ns", - usage_start=usage_date, - usage_end=usage_date, - pod_request_cpu_core_hours=Decimal("10"), - pod_usage_cpu_core_hours=Decimal("15"), - pod_effective_usage_cpu_core_hours=Decimal("15"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - raw_currency="USD", - cost_model_rate_type=None, - ) - - url = f"?filter[cluster]={cluster_id}" - query_params = self.mocked_query_params(url, OCPCpuView) - handler = OCPReportQueryHandler(query_params) - query_group_by = ["date"] - - waste_map = handler._line_item_wasted_cost(query_group_by) - - # Per-pod sum: 50 + 0 = 50. Bucket-level would be 200 × (1 − 20/20) = 0. - self.assertEqual(waste_map["__total__"], Decimal("50")) - - def test_line_item_wasted_cost_memory_corrects_pooling_bug(self): - """Same pooling-bug correction for the memory report variant. - - Cost basis: infrastructure_raw_cost (OCP-on-cloud path, no cost model). - effective_usage = SUM(MAX(usage_i, request_i)) per interval. No cost model → rate = 0, so: - per_row_cost = effective × 0 + infra_raw × infra_exchange_rate = 100. - Pod A: waste = 100 × (1 − 5/10) = 50. Pod B: waste = max(0, 100×(1 − 15/15)) = 0. - """ - usage_date = self.dh.yesterday.date() - cluster_id = "pooling-test-cluster-waste-mem" - - with tenant_context(self.tenant): - OCPUsageLineItemDailySummary.objects.filter(cluster_id=cluster_id).delete() - - # Pod A: 50% utilized → effective=MAX(5,10)=10, waste = 100 × (1 − 5/10) = 50 - self.baker.make( - OCPUsageLineItemDailySummary, - cluster_id=cluster_id, - data_source="Pod", - namespace="pooling-ns-mem", - usage_start=usage_date, - usage_end=usage_date, - pod_request_memory_gigabyte_hours=Decimal("10"), - pod_usage_memory_gigabyte_hours=Decimal("5"), - pod_effective_usage_memory_gigabyte_hours=Decimal("10"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - raw_currency="USD", - cost_model_rate_type=None, - ) - # Pod B: 150% utilized → effective=MAX(15,10)=15, waste = max(0, 100×(1−15/15)) = 0 - self.baker.make( - OCPUsageLineItemDailySummary, - cluster_id=cluster_id, - data_source="Pod", - namespace="pooling-ns-mem", - usage_start=usage_date, - usage_end=usage_date, - pod_request_memory_gigabyte_hours=Decimal("10"), - pod_usage_memory_gigabyte_hours=Decimal("15"), - pod_effective_usage_memory_gigabyte_hours=Decimal("15"), - infrastructure_raw_cost=Decimal("100"), - infrastructure_markup_cost=Decimal("0"), - raw_currency="USD", - cost_model_rate_type=None, - ) - - url = f"?filter[cluster]={cluster_id}" - query_params = self.mocked_query_params(url, OCPMemoryView) - handler = OCPReportQueryHandler(query_params) - query_group_by = ["date"] - - waste_map = handler._line_item_wasted_cost(query_group_by) - - self.assertEqual(waste_map["__total__"], Decimal("50")) From 31ae1009fa17870312c06a41f14d4d59f080c24f Mon Sep 17 00:00:00 2001 From: myersCody Date: Mon, 18 May 2026 13:22:50 -0400 Subject: [PATCH 17/18] Add format logic for clean test pass --- koku/api/report/ocp/query_handler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index ab4aa7a6b9..38e93ab778 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -318,6 +318,12 @@ def execute_query(self): # noqa: C901 data = [{"date": date_string, "vm_names": query_data}] else: data = list(query_data) + if self._report_type in ("cpu", "memory"): + # Pack score fields so the CSV renderer emits score.usage_efficiency_percent + # and score.wasted_cost.* columns to match the JSON response structure. + score_pack = {k: v for k, v in self._mapper.PACK_DEFINITIONS.items() if k.startswith("score_")} + for row in data: + self._pack_data_object(row, **score_pack) else: # Pass in a copy of the group by without the added # tag column name prefix From fed30759bd20f38b38533fb73532bd0aabdb8587 Mon Sep 17 00:00:00 2001 From: myersCody Date: Tue, 19 May 2026 09:47:34 -0400 Subject: [PATCH 18/18] Fix key error for total_score --- koku/api/report/ocp/query_handler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/koku/api/report/ocp/query_handler.py b/koku/api/report/ocp/query_handler.py index 38e93ab778..2793a7cc08 100644 --- a/koku/api/report/ocp/query_handler.py +++ b/koku/api/report/ocp/query_handler.py @@ -237,8 +237,7 @@ def _format_query_response(self): output["data"] = self.query_data self.query_sum = self._pack_data_object(self.query_sum, **self._mapper.PACK_DEFINITIONS) - if "score" in self.query_sum: - self.query_sum["total_score"] = self.query_sum.pop("score") + self.query_sum["total_score"] = self.query_sum.pop("score", {}) output["total"] = self.query_sum if self._delta: @@ -295,9 +294,12 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory"): has_tag_interaction = self._tag_group_by or self.get_tag_filter_keys() should_compute = not has_tag_interaction and len(group_by_value) <= 1 - query_sum["wasted_cost_units"] = self.currency - if not should_compute: + if should_compute: + query_sum["wasted_cost_units"] = self.currency + else: query_sum.pop("usage_efficiency", None) + query_sum.pop("wasted_cost", None) + query_sum.pop("wasted_cost_units", None) if self._delta: query_data = self.add_deltas(query_data, query_sum) @@ -307,6 +309,8 @@ def execute_query(self): # noqa: C901 if self._report_type in ("cpu", "memory") and not should_compute: for row in query_data: row.pop("usage_efficiency", None) + row.pop("wasted_cost", None) + row.pop("wasted_cost_units", None) for row in query_data: if tag_iterable := row.get("tags"):