Remove redundant comments

Gasoonjia · Gasoonjia · commit f6bdb2c6f0d8 · 2026-06-16T16:01:27.000-07:00
diff --git a/backends/cuda/tests/test_tq4_sdpa.py b/backends/cuda/tests/test_tq4_sdpa.py
@@ -20,7 +20,6 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
-
 from executorch.backends.cuda.cuda_backend import CudaBackend
 from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 from executorch.backends.cuda.triton.kernels.tq4_sdpa import tq4_sdpa
@@ -440,38 +439,6 @@ def test_output_shape_and_dtype(self):
 
     # ------------------------------------------------------------------
     # 128k code path: kv_len clamp (decode) + mask_is_causal (prefill)
-    #
-    # Every test above calls tq4_sdpa WITHOUT kv_len and WITHOUT
-    # mask_is_causal, so they only exercise the kv_len=None fallback
-    # (full-Lk loop) at short KV. The cases below drive the actual
-    # long-context paths used in production by the Gemma-4 31B global
-    # layers (head_dim=512, GQA 8:4) and Qwen 3.5 MoE (head_dim=256,
-    # GQA 16:2):
-    #   * the on-device kv_len scalar that bounds the KV loop to the
-    #     filled context (decode), and
-    #   * the mask_is_causal per-tile causal block-skip (prefill).
-    #
-    # "GARBAGE TAIL": in production the KV cache is a fixed buffer
-    # pre-allocated to max_seq_len (e.g. 131072). At any step only the
-    # first kv_len positions hold real K/V; the rest is stale /
-    # uninitialized memory that attention must ignore. We simulate that
-    # tail by writing large-magnitude (x1000) values into [kv_len:]. If
-    # the clamp / block-skip works the kernel never reads the tail and
-    # the output matches a reference built from [0, kv_len) only; if it
-    # is broken the huge tail values dominate the softmax and the cosine
-    # collapses to ~0. So the garbage tail is a built-in negative control
-    # (verified: dropping kv_len drives the cosine to ~-0.01 and fails).
-    #
-    # CAUSAL ALIGNMENT (top-left vs bottom-right): when L_q < L_kv (a
-    # chunked prefill / decode, where the Lq new queries sit at the END
-    # of a kv_len-long context) there are two ways to place the causal
-    # triangle. PyTorch F.sdpa(is_causal=True) uses TOP-LEFT alignment
-    # (query row i attends to keys [0, i]) -- wrong for a KV cache. This
-    # kernel and gemma4_31b/model.py::_build_masks use BOTTOM-RIGHT
-    # alignment: query row i is absolute position (kv_len - Lq + i) and
-    # attends to keys [0, kv_len - Lq + i]. So the reference below builds
-    # an explicit bottom-right mask (q_pos >= cache_pos) rather than
-    # passing is_causal=True, which would otherwise mismatch the kernel.
     # ------------------------------------------------------------------
 
     def _run_long_kv_test(