portable: accumulate in fp32 for Half/BFloat16 in mean and sum

vacu9708 · vacu9708 · commit 16ba018adb97 · 2026-06-16T12:27:29.000+09:00
Problem: The fast-path and generic reduction loops in mean.out and sum.IntList_out accumulated the running sum in the tensor dtype. BFloat16 keeps only 8 significant bits, so the running sum stalls at 256 (256 + 1 rounds back to 256); as a result, a mean over N=512 all-ones elements gives 0.5 instead of 1.0, and summing 512 all-ones elements gives 256 instead of 512. Changes: Accumulate in float for Half/BFloat16 by promoting the loop accumulator to ACC (float when the dtype is Half or BFloat16, otherwise the dtype itself) in both the fast path and the generic path. The fast path selects ACC from the input dtype; the generic path selects it from the output dtype. The final result is cast back to the tensor dtype on store. Continues the fp32-accumulation work in #19117.
diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp
@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -58,17 +60,24 @@ Tensor& mean_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "mean.out";
+      // For half-precision inputs, accumulate in float to avoid saturation.
+      // Matches ATen's acc_type behavior.
       ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-        const CTYPE denom = static_cast<CTYPE>(reduce_size);
+        const ACC denom = static_cast<ACC>(reduce_size);
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc / denom;
+          out_data[i] = static_cast<CTYPE>(acc / denom);
         }
       });
       return out;
@@ -83,19 +92,25 @@ Tensor& mean_dim_out(
   static constexpr const char op_name[] = "mean.out";
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      using ACC = std::conditional_t<
+          std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+              std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+          float,
+          CTYPE_OUT>;
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
       const size_t num = get_reduced_dim_product(in, dim_list);
       const bool success = parallel_for_each_reduce_over_dim_list_output_index(
           in, dim_list, out, [&](const auto begin, const auto end) {
             for (const auto out_ix : c10::irange(begin, end)) {
-              CTYPE_OUT sum = 0;
+              ACC sum = 0;
               if (plan.has_value()) {
-                sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                sum = plan->execute<CTYPE_IN, ACC>(
+                    [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                    [](ACC outv, ACC acc) { return acc + outv; },
                     out_ix);
               }
-              out_data[out_ix] = sum / static_cast<float>(num);
+              out_data[out_ix] =
+                  static_cast<CTYPE_OUT>(sum / static_cast<float>(num));
             }
           });
       ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -60,16 +62,23 @@ Tensor& sum_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "sum.IntList_out";
+      // For half-precision inputs, accumulate in float to avoid saturation.
+      // Matches ATen's acc_type behavior. See also op_grid_sampler_2d.cpp.
       ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc;
+          out_data[i] = static_cast<CTYPE>(acc);
         }
       });
       return out;
@@ -108,23 +117,24 @@ Tensor& sum_dim_out(
     ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
       ET_SWITCH_REALHBBF16_TYPES(
           out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+            using ACC = std::conditional_t<
+                std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+                    std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+                float,
+                CTYPE_OUT>;
             CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
             const bool success =
                 parallel_for_each_reduce_over_dim_list_output_index(
                     in, dim_list, out, [&](const auto begin, const auto end) {
                       for (const auto out_ix : c10::irange(begin, end)) {
-                        CTYPE_OUT sum = 0;
+                        ACC sum = 0;
                         if (plan.has_value()) {
-                          sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                              [](CTYPE_IN v) {
-                                return static_cast<CTYPE_OUT>(v);
-                              },
-                              [](CTYPE_OUT outv, CTYPE_OUT acc) {
-                                return acc + outv;
-                              },
+                          sum = plan->execute<CTYPE_IN, ACC>(
+                              [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                              [](ACC outv, ACC acc) { return acc + outv; },
                               out_ix);
                         }
-                        out_data[out_ix] = sum;
+                        out_data[out_ix] = static_cast<CTYPE_OUT>(sum);
                       }
                     });
             ET_KERNEL_CHECK_MSG(
diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp
@@ -263,6 +263,35 @@ void OpMeanOutTest::
   test_mean_dim_out_bool<ScalarType::Double>();
 }
 
+TEST_F(OpMeanOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Regression test: reducing dim 0 of a {512, 1} tensor is not a last-dim
+  // reduction, so the generic path runs. A BFloat16 accumulator would stall at
+  // 256 (256 + 1 is not representable), yielding 256/512 = 0.5, not 1.0.
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpMeanOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Last-dim reduction over 512 ones. A BFloat16 accumulator would stall at
+  // 256 (256 + 1 is not representable), yielding 256/512 = 0.5, not 1.0.
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpMeanOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,
diff --git a/kernels/test/op_sum_test.cpp b/kernels/test/op_sum_test.cpp
@@ -307,6 +307,35 @@ class OpSumOutTest : public OperatorTest {
   }
 };
 
+TEST_F(OpSumOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Regression test: reducing dim 0 of a {512, 1} tensor is not a last-dim
+  // reduction, so the generic path runs. A BFloat16 accumulator would stall at
+  // 256 instead of reaching 512 (= 2^9, exactly representable in BFloat16).
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_sum_intlist_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, static_cast<float>(N));
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpSumOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // Last-dim reduction over 512 ones. A BFloat16 accumulator would stall at
+  // 256 instead of reaching 512 (= 2^9, exactly representable in BFloat16).
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_sum_intlist_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, static_cast<float>(N));
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpSumOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,