portable: accumulate in fp32 for Half/BFloat16 in softmax, log_softmax, mean, and sum (#20090)

vacu9708 · web-flow · commit 8bb71cfca464 · 2026-06-17T13:53:36.000-07:00
This PR follows up on #19117 (`op_grid_sampler_2d`).

### Motivation

softmax, log_softmax, mean, and sum all accumulate their reduction in the input dtype. For BFloat16, a running sum of terms around 1.0 saturates at 256: once it gets there, each added 1.0 is rounded away and the total gets stuck. A uniform softmax over 512 elements in BFloat16 gives `~1/256` per output instead of `1/512`.

### Why FP32 accumulation is needed

BFloat16 has the same exponent width as Float32, so it has a similar range. However, it has far fewer fraction bits, which makes its representable spacing much coarser as values grow.

| Type | Exponent bits | Fraction bits | Practical effect |
| --- | ---: | ---: | --- |
| `BFloat16` | 8 | 7 | Similar range to `Float32`, but coarse spacing |
| `Float32` | 8 | 23 | Similar range, much finer spacing |

For BFloat16, the gap between consecutive representable values (i.e., the smallest step size) doubles at each power-of-two range:

| Range | BFloat16 step size | Representable examples |
| --- | ---: | --- |
| `[128, 256)` | `1` | `128, 129, 130, ..., 255` |
| `[256, 512)` | `2` | `256, 258, 260, ..., 510` |

As a result, once a BFloat16 running sum reaches `256`, adding `1.0` no longer changes the value:

| Operation | Exact result | BFloat16 result | Reason |
| --- | ---: | ---: | --- |
| `256 + 1` | `257` | `256` | `257` is not representable; it lies exactly halfway between `256` and `258`, and IEEE 754 round-to-nearest, ties-to-even rounds it back to `256` |

This directly affects all four ops for large inputs. For a softmax over 512 zeros, each `exp(0)` contributes `1.0`, so the denominator should be `512`. If the BFloat16 accumulation gets stuck at `256`, the output becomes approximately `1/256` instead of the correct `1/512`.

| Case | Expected denominator | BFloat16 accumulated denominator | Output |
| --- | ---: | ---: | ---: |
| Correct accumulation | `512` | `512` | `1/512` |
| BFloat16 accumulation | `512` | `~256` | `~1/256` |

### Tests

```
$ cmake --build cmake-out --target portable_kernels_test -j$(nproc)
[100%] Built target portable_kernels_test

# Post-fix — new tests:
[ OK ] OpSoftmaxOutTest.BFloat16LargeDimAccumulatesInFloat
[ OK ] OpLogSoftmaxOutTest.BFloat16LargeDimAccumulatesInFloat
[ OK ] OpMeanOutTest.BFloat16LargeDimAccumulatesInFloat
[ OK ] OpSumOutTest.BFloat16LargeDimAccumulatesInFloat

# Pre-fix (reverted op files):
[ FAILED ] OpSoftmaxOutTest.BFloat16LargeDimAccumulatesInFloat
[ FAILED ] OpLogSoftmaxOutTest.BFloat16LargeDimAccumulatesInFloat
[ FAILED ] OpMeanOutTest.BFloat16LargeDimAccumulatesInFloat
[ FAILED ] OpSumOutTest.BFloat16LargeDimAccumulatesInFloat

$ lintrunner op_softmax.cpp op_log_softmax.cpp op_mean.cpp op_sum.cpp \
    op_softmax_test.cpp op_log_softmax_test.cpp op_mean_test.cpp op_sum_test.cpp
ok No lint issues.
```

cc @larryliu0820 @manuelcandales
diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp
@@ -98,20 +98,27 @@ void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
   return;
 }
 
-// OUT_T is the corresponding C++ type for out.scalar_type(). Only takes float
-// or double.
-template <
-    typename OUT_T,
-    std::enable_if_t<std::is_floating_point<OUT_T>::value, bool> = true>
+// OUT_T is the corresponding C++ type for out.scalar_type().
+template <typename OUT_T>
 bool log_softmax_wrapper(const Tensor& X, int64_t dim, Tensor& out) {
-  auto input_scalar_type = X.scalar_type();
-  switch (input_scalar_type) {
-    // TODO: support Double as well
-    case ScalarType::Float:
-      log_softmax_kernel<float, OUT_T>(X, dim, out);
-      return true;
-    default:
-      return false; // Unsupported input dtype
+  if constexpr (
+      std::is_same_v<OUT_T, executorch::aten::BFloat16> ||
+      std::is_same_v<OUT_T, executorch::aten::Half>) {
+    // Input dtype equals output dtype (enforced by check_log_softmax_args).
+    // Use if constexpr to avoid instantiating cross-type combinations that
+    // the ATen vectorized functions do not support.
+    log_softmax_kernel<OUT_T, OUT_T>(X, dim, out);
+    return true;
+  } else {
+    auto input_scalar_type = X.scalar_type();
+    switch (input_scalar_type) {
+      // TODO: support Double as well
+      case ScalarType::Float:
+        log_softmax_kernel<float, OUT_T>(X, dim, out);
+        return true;
+      default:
+        return false; // Unsupported input dtype
+    }
   }
 }
 } // namespace
@@ -148,6 +155,18 @@ Tensor& opt_log_softmax_out(
       ET_KERNEL_CHECK(context, success, InvalidArgument, out);
       break;
     }
+    case ScalarType::BFloat16: {
+      bool success =
+          log_softmax_wrapper<executorch::aten::BFloat16>(self, dim, out);
+      ET_KERNEL_CHECK(context, success, InvalidArgument, out);
+      break;
+    }
+    case ScalarType::Half: {
+      bool success =
+          log_softmax_wrapper<executorch::aten::Half>(self, dim, out);
+      ET_KERNEL_CHECK(context, success, InvalidArgument, out);
+      break;
+    }
     default:
       ET_KERNEL_CHECK(context, false, InvalidArgument, out);
   }
diff --git a/kernels/portable/cpu/op_log_softmax.cpp b/kernels/portable/cpu/op_log_softmax.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <type_traits>
 
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -42,8 +43,16 @@ Tensor& log_softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
+  // For half-precision inputs, the exp-sum is accumulated in float to avoid
+  // saturation (BFloat16 saturates near 256, Half near 2048). Matches ATen's
+  // acc_type behavior. See also op_grid_sampler_2d.cpp.
   ET_SWITCH_FLOATHBF16_TYPES(
       in.scalar_type(), ctx, "_log_softmax.out", CTYPE, [&]() {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
         CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -61,11 +70,12 @@ Tensor& log_softmax_out(
                   size,
                   stride);
 
-              CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+              ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
-                    return std::exp(val_in - max_in);
+                    return std::exp(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
                   },
-                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                  [](const ACC mapped_in, ACC val_accum) {
                     return val_accum + mapped_in;
                   },
                   in_data + base,
@@ -75,7 +85,9 @@ Tensor& log_softmax_out(
 
               apply_unary_map_fn(
                   [max_in, temp_sum](const CTYPE val_in) {
-                    return val_in - max_in - temp_sum;
+                    return static_cast<CTYPE>(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in) -
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,
diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp
@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -58,17 +60,24 @@ Tensor& mean_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "mean.out";
+      // For half-precision inputs, accumulate in float to avoid saturation.
+      // Matches ATen's acc_type behavior.
       ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-        const CTYPE denom = static_cast<CTYPE>(reduce_size);
+        const ACC denom = static_cast<ACC>(reduce_size);
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc / denom;
+          out_data[i] = static_cast<CTYPE>(acc / denom);
         }
       });
       return out;
@@ -83,19 +92,25 @@ Tensor& mean_dim_out(
   static constexpr const char op_name[] = "mean.out";
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      using ACC = std::conditional_t<
+          std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+              std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+          float,
+          CTYPE_OUT>;
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
       const size_t num = get_reduced_dim_product(in, dim_list);
       const bool success = parallel_for_each_reduce_over_dim_list_output_index(
           in, dim_list, out, [&](const auto begin, const auto end) {
             for (const auto out_ix : c10::irange(begin, end)) {
-              CTYPE_OUT sum = 0;
+              ACC sum = 0;
               if (plan.has_value()) {
-                sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                sum = plan->execute<CTYPE_IN, ACC>(
+                    [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                    [](ACC outv, ACC acc) { return acc + outv; },
                     out_ix);
               }
-              out_data[out_ix] = sum / static_cast<float>(num);
+              out_data[out_ix] =
+                  static_cast<CTYPE_OUT>(sum / static_cast<float>(num));
             }
           });
       ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
diff --git a/kernels/portable/cpu/op_softmax.cpp b/kernels/portable/cpu/op_softmax.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <type_traits>
 
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -42,8 +43,16 @@ Tensor& softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
+  // For half-precision inputs, the exp-sum is accumulated in float to avoid
+  // saturation (BFloat16 saturates near 256, Half near 2048). Matches ATen's
+  // acc_type behavior. See also op_grid_sampler_2d.cpp.
   ET_SWITCH_FLOATHBF16_TYPES(
       in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
         CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -61,11 +70,12 @@ Tensor& softmax_out(
                   size,
                   stride);
 
-              const CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+              const ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
-                    return std::exp(val_in - max_in);
+                    return std::exp(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
                   },
-                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                  [](const ACC mapped_in, ACC val_accum) {
                     return val_accum + mapped_in;
                   },
                   in_data + base,
@@ -74,7 +84,11 @@ Tensor& softmax_out(
 
               apply_unary_map_fn(
                   [max_in, temp_sum](const CTYPE val_in) {
-                    return std::exp(val_in - max_in) / temp_sum;
+                    return static_cast<CTYPE>(
+                        std::exp(
+                            static_cast<ACC>(val_in) -
+                            static_cast<ACC>(max_in)) /
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -60,16 +62,23 @@ Tensor& sum_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "sum.IntList_out";
+      // For half-precision inputs, accumulate in float to avoid saturation.
+      // Matches ATen's acc_type behavior. See also op_grid_sampler_2d.cpp.
       ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc;
+          out_data[i] = static_cast<CTYPE>(acc);
         }
       });
       return out;
@@ -108,23 +117,24 @@ Tensor& sum_dim_out(
     ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
       ET_SWITCH_REALHBBF16_TYPES(
           out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+            using ACC = std::conditional_t<
+                std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+                    std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+                float,
+                CTYPE_OUT>;
             CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
             const bool success =
                 parallel_for_each_reduce_over_dim_list_output_index(
                     in, dim_list, out, [&](const auto begin, const auto end) {
                       for (const auto out_ix : c10::irange(begin, end)) {
-                        CTYPE_OUT sum = 0;
+                        ACC sum = 0;
                         if (plan.has_value()) {
-                          sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                              [](CTYPE_IN v) {
-                                return static_cast<CTYPE_OUT>(v);
-                              },
-                              [](CTYPE_OUT outv, CTYPE_OUT acc) {
-                                return acc + outv;
-                              },
+                          sum = plan->execute<CTYPE_IN, ACC>(
+                              [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                              [](ACC outv, ACC acc) { return acc + outv; },
                               out_ix);
                         }
-                        out_data[out_ix] = sum;
+                        out_data[out_ix] = static_cast<CTYPE_OUT>(sum);
                       }
                     });
             ET_KERNEL_CHECK_MSG(
diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp
@@ -369,6 +369,19 @@ TEST_F(OpLogSoftmaxOutTest, SimpleGeneratedCase) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpLogSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // N=512: without fp32 accumulation, the exp-sum saturates at BFloat16's
+  // precision limit (~256), so the output is ~-log(256) instead of -log(512).
+  // atol=1e-1 can catch pre-fix error: |log(512) - log(256)| = log(2)
+  constexpr int N = 512;
+  Tensor x = tf.zeros({1, N});
+  Tensor out = tf.zeros({1, N});
+  op_log_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out);
+  Tensor expected = tf.full({1, N}, -std::log(static_cast<float>(N)));
+  EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-1);
+}
+
 TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
   TensorFactory<ScalarType::Float> tf;
 
diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp
@@ -263,6 +263,35 @@ void OpMeanOutTest::
   test_mean_dim_out_bool<ScalarType::Double>();
 }
 
+TEST_F(OpMeanOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Reducing dim=0 of {512, 1} is not the last dim, so the generic path is
+  // taken. Without fp32 accumulation the sum saturates at ~256, giving
+  // 256/512 = 0.5 instead of 1.0.
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpMeanOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // N=512, all-ones input: without fp32 accumulation the sum saturates at
+  // ~256 in BFloat16, giving 256/512 = 0.5 instead of 1.0.
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpMeanOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,
diff --git a/kernels/test/op_softmax_test.cpp b/kernels/test/op_softmax_test.cpp
@@ -251,6 +251,19 @@ TEST_F(OpSoftmaxOutTest, SimpleGeneratedCase) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // N=512: without fp32 accumulation the exp-sum saturates at BFloat16's
+  // precision limit (~256), so the output is ~1/256 instead of 1/512.
+  // 1e-3 is tight enough to catch pre-fix error: |1/256 - 1/512| ≈ 0.00195
+  constexpr int N = 512;
+  Tensor x = tf.zeros({1, N});
+  Tensor out = tf.zeros({1, N});
+  op_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out);
+  Tensor expected = tf.full({1, N}, 1.0f / N);
+  EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-3);
+}
+
 TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
   TensorFactory<ScalarType::Float> tf;
 
diff --git a/kernels/test/op_sum_test.cpp b/kernels/test/op_sum_test.cpp