phu0ngng · pull · May 23, 2026 · May 22, 2026 · May 23, 2026
diff --git a/docs/envvars.rst b/docs/envvars.rst
@@ -287,6 +287,30 @@ Kernel Configuration
    :Default: ``0``
    :Description: Enable row-scaled NVFP4 tensors for forward activation quantizers in the ``NVFP4BlockScaling`` recipe. When set to ``1`` (or when ``NVFP4BlockScaling(row_scaled_activation=True)`` is used), rowwise ``amax`` metadata is stored as one FP32 value per tensor row instead of a single scalar.
 
+.. envvar:: NVTE_NVFP4_4OVER6
+
+   :Type: ``str`` (``none``, ``weights``, ``activations``, or ``all``)
+   :Default: ``none``
+   :Description: Enable 4over6 adaptive NVFP4 block scaling for weights, activations, or both in the ``NVFP4BlockScaling`` recipe. For each selected FP4 block, quantization compares the map-to-4 and map-to-6 candidates and stores the one with the lower error under the configured metric (see :envvar:`NVTE_NVFP4_4OVER6_ERR_MODE`). ``none`` keeps standard NVFP4. Current 4over6 support targets RL and post-training scenarios; pre-training paths that combine 4over6 with RHT are not yet implemented.
+
+.. envvar:: NVTE_NVFP4_4OVER6_E4M3_USE_256
+
+   :Type: ``str`` (``none``, ``weights``, ``activations``, or ``all``)
+   :Default: ``all``
+   :Description: Select which NVFP4 4over6 quantizers use 256 instead of 448 as the global E4M3 scale bound. By default, all 4over6 quantizers use 256. Set the env var to ``none`` (or set ``NVFP4BlockScaling(nvfp4_4over6_e4m3_use_256="none")``) to use the standard NVFP4 bound of 448 for all 4over6 quantizers. This option only takes effect for tensor roles that also enable 4over6 via :envvar:`NVTE_NVFP4_4OVER6`.
+
+.. envvar:: NVTE_NVFP4_4OVER6_ERR_MODE
+
+   :Type: ``str`` (``MAE`` or ``MSE``)
+   :Default: ``MAE``
+   :Description: Select the input-domain error metric used by NVFP4 4over6 map-to-4 versus map-to-6 candidate selection in the ``NVFP4BlockScaling`` recipe.
+
+.. envvar:: NVTE_NVFP4_4OVER6_ERR_USE_FAST_MATH
+
+   :Type: ``int`` (0 or 1)
+   :Default: ``0``
+   :Description: Allow the NVFP4 4over6 candidate error computation to use faster non-strict floating-point expressions. By default, 4over6 error comparison uses strict expressions; ``NVTE_USE_FAST_MATH`` does not control this error-comparison path.
+
 Torch Compilation and Fusion
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/qa/L1_pytorch_distributed_unittest/test.sh b/qa/L1_pytorch_distributed_unittest/test.sh
@@ -22,14 +22,31 @@ mkdir -p "$XML_LOG_DIR"
 
 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
+# Run CP tests (deterministic + non-deterministic) first so they can be parallelized.
+# Each needs 4 GPUs, so >=8 GPUs allows them to run concurrently on disjoint GPU sets.
+NUM_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())")
+echo "Detected $NUM_GPUS GPU(s)"
+if [ "$NUM_GPUS" -ge 8 ]; then
+    echo "Running CP tests in parallel: non-deterministic on GPUs 0-3, deterministic on GPUs 4-7"  # NOTE(review): both jobs run the same test file at once; confirm its launcher does not rely on a fixed port or shared temp path
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py &
+    PID_CP_NONDET=$!  # PID of the background non-deterministic run, consumed by wait below
+    CUDA_VISIBLE_DEVICES=4,5,6,7 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_deterministic_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py &
+    PID_CP_DET=$!  # PID of the background deterministic run
+    wait $PID_CP_NONDET || test_fail "test_attention_with_cp.py"  # wait returns that job's exit status, so each run is reported separately
+    wait $PID_CP_DET || test_fail "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention_with_cp.py"
+else
+    echo "Running CP tests sequentially: need >=8 GPUs for parallel execution"
+    python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
+    NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_deterministic_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 test_attention_with_cp.py"
+fi
+
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/distributed/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics_exact.xml $TE_PATH/tests/pytorch/distributed/test_numerics_exact.py || test_fail "test_numerics_exact.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py || test_fail "test_fusible_ops.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_torch_fsdp2.xml $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py || test_fail "test_torch_fsdp2.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_comm_gemm_overlap.xml $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py || test_fail "test_comm_gemm_overlap.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_userbuffers.xml $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py || test_fail "test_fusible_ops_with_userbuffers.py"
-python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_attention_with_cp.xml $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cp_utils.xml $TE_PATH/tests/pytorch/attention/test_cp_utils.py || test_fail "test_cp_utils.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_newton_schulz.xml $TE_PATH/tests/pytorch/distributed/test_newton_schulz.py || test_fail "test_newton_schulz.py"

diff --git a/qa/L3_pytorch_FA_versions_test/test.sh b/qa/L3_pytorch_FA_versions_test/test.sh
@@ -2,26 +2,44 @@
 #
 # See LICENSE for license information.
 
-set -e
+function error_exit() {  # Fatal error: print the message given in $1 and abort the script with status 1.
+    echo "Error: $1"
+    exit 1  # stops immediately; later tests are not run
+}
+
+function test_fail() {  # Record the failed sub-test named by $1 without stopping the run.
+    RET=1  # sticky failure flag, checked once at the end of the script
+    FAILED_CASES="$FAILED_CASES $1"  # space-separated list echoed in the final summary
+    echo "Error: sub-test failed: $1"
+}
+
+RET=0  # overall status; stays 0 unless test_fail is called
+FAILED_CASES=""  # names of failed sub-tests, appended by test_fail
 
 : ${TE_PATH:=/opt/transformerengine}
 : ${XML_LOG_DIR:=/logs}
 mkdir -p "$XML_LOG_DIR"
 
-pip3 install pytest==8.2.1
+pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
 
 # Limit parallel build jobs to avoid overwhelming system resources
 export MAX_JOBS=32
 
 # Iterate over Flash Attention versions
 sm_arch=`python3 -c "import torch; sm = torch.cuda.get_device_capability(0); print(sm[0]*10+sm[1])"`
 export FLASH_ATTN_CUDA_ARCHS=$sm_arch
+# CP tests are expensive and run only once per arch:
+#   - sm90 (H100):  FA3 (3.0.0b1) - context_parallel.py only supports FA3 on Hopper
+#   - sm>90 (B200): latest FA4    - FA3 is not built/installed for sm>90
+# Non-CP tests still run for every FA version in the array.
 if [ $sm_arch -gt 90 ]
 then
-  FA_versions=(2.8.3 4.0.0b8)
+  FA_versions=(2.8.3 4.0.0b11)
+  CP_FA_VERSION="${FA_versions[-1]}"  # last array entry (4.0.0b11 here)
 elif [ $sm_arch -eq 90 ]
 then
-  FA_versions=(2.7.3 2.8.3 3.0.0b1 4.0.0b8)
+  FA_versions=(2.8.3 3.0.0b1 4.0.0b11)
+  CP_FA_VERSION="3.0.0b1"  # must equal an entry of FA_versions, otherwise the loop below never runs the CP tests
 fi
 
 for fa_version in "${FA_versions[@]}"
@@ -35,12 +53,63 @@ do
   then
     pip3 install flash-attn-4==${fa_version} nvidia-cutlass-dsl[cu13]==4.4.2 --no-build-isolation
   else
-    git clone https://github.com/Dao-AILab/flash-attention.git
-    cd flash-attention/hopper && python setup.py install
-    cd ../../
+    # FA3 source build (~20 min). Skip if FA3 is already installed.
+    if python3 -c "import flash_attn_3" 2>/dev/null; then
+      echo "FA3 already installed (from base image); skipping source build"
+    else
+      # No `set -e` in this script: abort explicitly on failure. The subshell leaves the caller's working directory unchanged.
+      git clone https://github.com/Dao-AILab/flash-attention.git || error_exit "Failed to clone flash-attention"
+      (cd flash-attention/hopper && python setup.py install) || error_exit "Failed to build FA3 from source"
+    fi
   fi
 
+  # Put local test utils ahead of nvidia-cutlass-dsl's utils package; avoid a trailing ':' (an empty entry means the current directory)
+  export PYTHONPATH=$TE_PATH/tests/pytorch${PYTHONPATH:+:$PYTHONPATH}
+
   # Run tests
-  NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest.xml $TE_PATH/tests/pytorch/attention/test_attention.py
+  NUM_GPUS=$(nvidia-smi -L | wc -l)
+  echo "Detected $NUM_GPUS GPU(s)"
+
+  # Suffix junit XMLs with the FA version so per-iteration results are preserved
+  # (otherwise pytest.xml is overwritten on each loop iteration and we lose timing
+  # data for all but the last FA version).
+  fa_tag="${fa_version//./_}"
+  XML_ATTN="$XML_LOG_DIR/pytest_test_attention_fa${fa_tag}.xml"
+  XML_CP="$XML_LOG_DIR/pytest_test_attention_with_cp_fa${fa_tag}.xml"
+
+  if [ "$fa_version" = "$CP_FA_VERSION" ]; then
+    echo "Running CP tests with FA $fa_version (CP version for sm$sm_arch)"
+    if [ "$NUM_GPUS" -ge 5 ]; then
+      CP_NUM_GPUS=$(( NUM_GPUS - 1 > 4 ? 4 : NUM_GPUS - 1 ))  # min(NUM_GPUS - 1, 4); with the >=5 guard above this is always 4
+      CP_GPUS=$(seq -s, 1 $CP_NUM_GPUS)  # comma-separated indices starting at 1, leaving GPU 0 for test_attention.py
+      echo "Running tests in parallel: test_attention.py on GPU 0, test_attention_with_cp.py on GPUs $CP_GPUS ($CP_NUM_GPUS GPUs)"
+
+      CUDA_VISIBLE_DEVICES=0 NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s \
+        --junitxml=$XML_ATTN \
+        $TE_PATH/tests/pytorch/attention/test_attention.py &
+      PID_ATTN=$!  # PID of the background test_attention.py run
 
+      CUDA_VISIBLE_DEVICES=$CP_GPUS NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s \
+        --junitxml=$XML_CP \
+        $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py &
+      PID_CP=$!  # PID of the background test_attention_with_cp.py run
+
+      wait $PID_ATTN || test_fail "test_attention.py (FA $fa_version)"  # wait returns that job's exit status, so failures are recorded per test file
+      wait $PID_CP || test_fail "test_attention_with_cp.py (FA $fa_version)"
+    else
+      echo "Running tests sequentially: need >=5 GPUs for parallel execution (1 for test_attention + 4 for test_attention_with_cp)"
+      NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_ATTN $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py (FA $fa_version)"
+      NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_CP $TE_PATH/tests/pytorch/attention/test_attention_with_cp.py || test_fail "test_attention_with_cp.py (FA $fa_version)"
+    fi
+  else
+    echo "Skipping CP tests for FA $fa_version (CP only runs with FA $CP_FA_VERSION on sm$sm_arch)"
+    NVTE_TORCH_COMPILE=0 python3 -m pytest -v -s --junitxml=$XML_ATTN $TE_PATH/tests/pytorch/attention/test_attention.py || test_fail "test_attention.py (FA $fa_version)"
+  fi
 done
+
+if [ "$RET" -eq 0 ]; then
+    echo "All tests passed"
+    exit 0
+fi
+echo "Error in the following test cases:$FAILED_CASES"
+exit 1