tsouza · tsouza · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
@@ -19,10 +19,10 @@
 # `demo_batch_last_success_timestamp_seconds`, `demo_intermittent_metric`).
 # When both ref Prom and cerberus return empty results for a missing-metric
 # query, the tester records that as PASS (zero diff) — acceptable noise
-# for the RC1 baseline.
+# for the compatibility baseline.
 #
 # Pointer-to-upstream: re-sync this file by re-copying the upstream and
-# re-applying the edit above. Track in a follow-up M1.x ticket so this
+# re-applying the edit above. Track in a follow-up issue so this
 # divergence stays auditable.
 
 test_cases:
@@ -38,7 +38,7 @@ test_cases:
   - query: 'NaN'
 
   # Vector selectors.
-  # TODO: Add tests for staleness support.
+  # TODO(upstream): Add tests for staleness support.
   - query: 'demo_memory_usage_bytes'
   - query: '{__name__="demo_memory_usage_bytes"}'
   - query: 'demo_memory_usage_bytes{type="free"}'
@@ -110,7 +110,7 @@ test_cases:
     # Check that vector-scalar binops set output timestamps correctly.
   - query: 'timestamp(demo_memory_usage_bytes * 1)'
     # Check that unary minus sets timestamps correctly.
-    # TODO: Check this more systematically for every node type?
+    # TODO(upstream): Check this more systematically for every node type?
   - query: 'timestamp(-demo_memory_usage_bytes)'
   - query: 'demo_memory_usage_bytes {{.binOp}} on(instance, job, type) demo_memory_usage_bytes'
     variant_args: ['binOp']
@@ -125,8 +125,8 @@ test_cases:
   - query: 'sum without(job) (demo_memory_usage_bytes) / on(instance, type) group_left(job) demo_memory_usage_bytes'
   - query: 'demo_memory_usage_bytes / on(instance, job) group_left demo_num_cpus'
   - query: 'demo_memory_usage_bytes / on(instance, type, job, non_existent) demo_memory_usage_bytes'
-  # TODO: Add non-explicit many-to-one / one-to-many that errors.
-  # TODO: Add many-to-many match that errors.
+  # TODO(upstream): Add non-explicit many-to-one / one-to-many that errors.
+  # TODO(upstream): Add many-to-many match that errors.
 
   # NaN/Inf/-Inf support.
   - query: 'demo_num_cpus * Inf'

@@ -198,6 +198,29 @@ needs strengthening, (b) a functionally-equivalent mutation (`<` vs
 `append` regrows past), or (c) a missing test. The gremlins JSON
 artifact on each run names the file + line + mutation kind.
 
+### Surviving-mutant policy
+
+When a surviving mutant pushes a phase below its efficacy threshold,
+pick the remedy in this order — the goal is to keep production code
+clear and let the test suite carry the discipline:
+
+1. **PREFERRED — prove equivalent.** Add a comment in the source
+   explaining why the mutated branch is semantically identical to the
+   original, then lower the phase efficacy threshold in `.gremlins.yaml`
+   by 1 percentage point to absorb the equivalent mutant. The mutation
+   count is now defensible and the source stays clear.
+2. **ACCEPTABLE — add a distinguishing test.** Write a unit / property
+   test whose output differs between the original and the mutated
+   branch. This is the right call when the mutation reveals real
+   under-tested behaviour.
+3. **REJECTED — refactor production code to make the mutant
+   distinguishable.** This is pattern #11 (DEFEAT-MUTANT) — the
+   codebase loses clarity to satisfy the mutation tool. Don't do it.
+
+Prior PRs #504 and #664 carry remedy-#3 (pattern #11) refactors. They
+are not reverted (their diffs are now load-bearing for the published
+thresholds), but new surviving mutants should use remedy #1 or #2.
+
 ## Regression meta-tests
 
 `test/regression/` pins past CI failures so they can't silently

@@ -893,7 +893,7 @@ func TestBuilder_Expr(t *testing.T) {
 	}
 }
 
-// --- typed operator / punctuation Frag constructors (R6.11a) -----------
+// --- typed operator / punctuation Frag constructors -------------------
 
 // TestOperatorFrags_BinaryOps — each comparison + arithmetic operator
 // renders "<l> <op> <r>" with single spaces around the op token, and

@@ -190,7 +190,7 @@ func TestPropertyOptimizerSemanticEquivalence(t *testing.T) {
 //
 // Depth budget: at depth 0 the generator picks any node type; once
 // depth ≥ 3 it bottoms out into a Scan to keep trees small. This
-// covers the three RC1-relevant shapes:
+// covers the three plan shapes the optimizer is expected to handle:
 //
 //	Scan(table)
 //	Filter(<expr>, Scan(table))

@@ -104,6 +104,19 @@ pre-push:
       tags: discipline
       run: |
         scripts/test-forbid-skip.sh
+    # Mirrors the CI `forbid-skip` job's `Guard new should_skip entries`
+    # step. Rejects net-new `should_skip:` entries in compatibility
+    # overlays that lack a tracking ref (jira / link / #NNN in reason).
+    # Background: PRs #429 + #537 added skip rows to silence failing
+    # cases instead of fixing them, and the skips lingered for weeks.
+    # Local guard catches the anti-pattern before CI does.
+    forbid-should-skip-additions:
+      tags: discipline
+      env:
+        BASE_REF: origin/main
+      run: |
+        scripts/check-skip-additions.sh --self-test
+        scripts/check-skip-additions.sh
     forbid-soft-assert:
       tags: discipline
       run: |

@@ -1,3 +1,7 @@
+# empty: rate(...) > rate(...) on the same selector compares each
+# series against itself, so the strict greater-than yields zero
+# samples — every join pair has lhs == rhs.
+
 -- query.logql --
 rate({service_name="api"}[5m]) > rate({service_name="api"}[5m])
 -- seed --

@@ -1,3 +1,7 @@
+# empty: vector-vs-vector `<` joined on identical label sets compares
+# each series against itself, so the strict less-than yields no true
+# samples — every join pair has lhs == rhs.
+
 -- query.promql --
 demo_memory_usage_bytes < on(instance, job, type) demo_memory_usage_bytes
 -- seed --

@@ -1,3 +1,7 @@
+# empty: seeded spans all carry service.name in {backend, db, cache};
+# the resource.service.name = "frontend" matcher selects zero spans,
+# so no trace is left for the count() > 0 trace-level filter to keep.
+
 -- query.traceql --
 { resource.service.name = "frontend" } | count() > 0
 -- seed --

@@ -1,3 +1,8 @@
+# empty: traces t1+t2 contain frontend spans (count >= 1) and t3 has
+# none, but the inner matcher drops t3 entirely before grouping; so
+# every surviving trace has count >= 1 and the = 0 filter excludes
+# them all.
+
 -- query.traceql --
 { resource.service.name = "frontend" } | count() = 0
 -- seed --

@@ -1,3 +1,8 @@
+# empty: the intersect (service.name == a AND service.name == b on the
+# same span) selects zero spans because resource.service.name is
+# single-valued per span, so no descendant chain can be rooted in the
+# intersect set.
+
 -- query.traceql --
 ({ resource.service.name = "a" } && { resource.service.name = "b" }) >> { resource.service.name = "c" }
 -- sql --