From 79f265bd6614190a1a6d18469cf1c976a39dfde2 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Tue, 9 Jun 2026 17:15:10 +0200
Subject: [PATCH 01/26] Native: add tensor bucketize operator

---
 kernels/portable/cpu/op_bucketize.cpp | 135 ++++++++++++++++++++
 kernels/portable/functions.yaml       |  10 ++
 kernels/test/CMakeLists.txt           |   1 +
 kernels/test/op_bucketize_test.cpp    | 174 ++++++++++++++++++++++++++
 4 files changed, 320 insertions(+)
 create mode 100644 kernels/portable/cpu/op_bucketize.cpp
 create mode 100644 kernels/test/op_bucketize_test.cpp
diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
new file mode 100644
index 00000000000..abe536b5b90
--- /dev/null
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -0,0 +1,135 @@
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
+#include <cstdio>
+#include <iostream>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+namespace {
+
+template <typename CTYPE>
+int64_t
+cus_lower_bound(int64_t start, int64_t end, const CTYPE val, const CTYPE* bd) {
+  while (start < end) {
+    const int64_t mid = start + ((end - start) >> 1);
+    if (bd[mid] < val) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+  return start;
+}
+
+template <typename CTYPE>
+int64_t
+cus_upper_bound(int64_t start, int64_t end, const CTYPE val, const CTYPE* bd) {
+  while (start < end) {
+    const int64_t mid = start + ((end - start) >> 1);
+    if (bd[mid] <= val) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+  return start;
+}
+
+template <typename CTYPE, typename OUT_CTYPE>
+void searchsorted_cpu(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Tensor& boundaries,
+    const bool& right,
+    Tensor& out) {
+  const auto bd_data = boundaries.const_data_ptr<CTYPE>();
+  const auto in_data = input.const_data_ptr<CTYPE>();
+  OUT_CTYPE* out_data = out.mutable_data_ptr<OUT_CTYPE>();
+  int64_t end_bd = boundaries.sizes().back();
+
+  const bool success = parallel_for(
+      0, input.numel(), 200, [&](const auto begin, const auto end) {
+        for (const auto out_i : c10::irange(begin, end)) {
+          int64_t pos = right
+              ? cus_upper_bound(0, end_bd, in_data[out_i], bd_data)
+              : cus_lower_bound(0, end_bd, in_data[out_i], bd_data);
+          out_data[out_i] = pos;
+        }
+      });
+  ET_KERNEL_CHECK_MSG(context, success, Internal, , "parallel_for failed");
+}
+
+void bucketize_pre_check(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Tensor& boundaries,
+    bool out_int32,
+    Tensor& out) {
+  ET_KERNEL_CHECK_MSG(
+      context,
+      boundaries.dim() == 1,
+      InvalidArgument,
+      ,
+      "boundaries tensor must be 1 dimension, but got dim(",
+      boundaries.dim(),
+      ")");
+
+  ScalarType out_dtype = out.scalar_type();
+  ET_KERNEL_CHECK_MSG(
+      context,
+      (out_dtype == ScalarType::Long && !out_int32) ||
+          (out_dtype == ScalarType::Int && out_int32),
+      InvalidArgument,
+      ,
+      "torch.bucketize(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ",
+      "whether out_int32 flag is True, but we got output tensor's dtype ",
+      out_dtype,
+      " and out_int32 flag is ",
+      (out_int32 ? "True" : "False"));
+
+  ET_KERNEL_CHECK(
+      context, tensors_have_same_shape(input, out), InvalidArgument, );
+}
+
+} // namespace
+
+Tensor& bucketize_tensor_out(
+    KernelRuntimeContext& context,
+    const Tensor& self,
+    const Tensor& boundaries,
+    bool out_int32,
+    bool right,
+    Tensor& out) {
+  bucketize_pre_check(context, self, boundaries, out_int32, out);
+
+  ScalarType common_type =
+      promoteTypes(self.scalar_type(), boundaries.scalar_type());
+
+  ET_SWITCH_REALHBF16_TYPES(
+      common_type, context, "bucketize.Tensor_out", CTYPE, [&]() {
+        if (out_int32) {
+          searchsorted_cpu<CTYPE, int32_t>(
+              context, self, boundaries, right, out);
+        } else {
+          searchsorted_cpu<CTYPE, int64_t>(
+              context, self, boundaries, right, out);
+        }
+      });
+  return out;
+}
+Tensor& bucketize_scalar_out(
+    KernelRuntimeContext& context,
+    const Scalar& self,
+    const Tensor& boundaries,
+    bool out_int32,
+    bool right,
+    Tensor& out) {
+  return out;
+
+} // namespace
+} // namespace native
+} // namespace executor
+} // namespace torch
\ No newline at end of file
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index ecf62ee3606..8f4dcf6e4bd 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -242,6 +242,16 @@
     - arg_meta: null
       kernel_name: torch::executor::bmm_out
 
+- op: bucketize.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::bucketize_tensor_out
+
+- op: bucketize.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::bucketize_scalar_out
+
 - op: cat.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index 2707ba5db71..e45fed272ef 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -184,6 +184,7 @@ set(all_test_sources
     "op_bitwise_right_shift_test.cpp"
     "op_bitwise_xor_test.cpp"
     "op_bmm_test.cpp"
+    "op_bucketize_test.cpp"
     "op_cat_test.cpp"
     "op_cdist_forward_test.cpp"
     "op_ceil_test.cpp"
diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
new file mode 100644
index 00000000000..4efe63d1ba8
--- /dev/null
+++ b/kernels/test/op_bucketize_test.cpp
@@ -0,0 +1,174 @@
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using torch::executor::testing::TensorFactory;
+
+class OpBucketizeTest : public OperatorTest {
+ protected:
+  Tensor& op_bucketize_out(
+      const Tensor& in,
+      const Tensor& boundaries,
+      bool out_int32,
+      bool right,
+      Tensor& out) {
+    return torch::executor::aten::bucketize_outf(
+        context_, in, boundaries, out_int32, right, out);
+  }
+
+  template <ScalarType dtype>
+  void run_smoke_test_int64() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<dtype> tf_dtype;
+
+    Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+    Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+    Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+    Tensor out = tf_out.zeros({2, 2});
+
+    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+    EXPECT_TENSOR_EQ(ret, expected);
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+
+  template <ScalarType dtype>
+  void run_smoke_test_int32() {
+    TensorFactory<ScalarType::Int> tf_out;
+    TensorFactory<dtype> tf_dtype;
+
+    Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+    Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+    Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+    Tensor out = tf_out.zeros({2, 2});
+
+    Tensor ret = op_bucketize_out(values, boundaries, true, true, out);
+
+    EXPECT_TENSOR_EQ(ret, expected);
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+
+  template <ScalarType dtype>
+  void run_smoke_test_non_int_out() {
+    TensorFactory<dtype> tf_out;
+    TensorFactory<ScalarType::Float> tf_dtype;
+
+    Tensor values = tf_dtype.make({2, 2}, {1.5, 2.5, 3.5, 4.5});
+    Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+    Tensor expected = tf_dtype.make({2, 2}, {1, 2, 3, 4});
+    Tensor out = tf_out.zeros({2, 2});
+
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_bucketize_out(values, boundaries, true, false, out));
+  }
+};
+
+TEST_F(OpBucketizeTest, SmokeTestInt64) {
+#define RUN_SMOKE_TEST(ctype, dtype) run_smoke_test_int64<ScalarType::dtype>();
+  ET_FORALL_REALHBF16_TYPES(RUN_SMOKE_TEST);
+#undef RUN_SMOKE_TEST
+}
+
+TEST_F(OpBucketizeTest, SmokeTestInt32) {
+#define RUN_SMOKE_TEST(ctype, dtype) run_smoke_test_int32<ScalarType::dtype>();
+  ET_FORALL_REALHBF16_TYPES(RUN_SMOKE_TEST);
+#undef RUN_SMOKE_TEST
+}
+
+TEST_F(OpBucketizeTest, RightTest) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {1, 2, 3, 4});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, LeftTest) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {1, 2, 3, 4});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {0, 1, 2, 3});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, OutOfBoundaryTest) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {0, 0, 5, 5});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, Boundaries1DTest) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {0, 0, 5, 5});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, BoundariesNDimTest) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({3, 2}, {1, 2, 3, 4, 5, 6});
+  Tensor out = tf_out.zeros({2, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, false, false, out));
+}
+
+TEST_F(OpBucketizeTest, MismatchingInOutTest) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor out = tf_out.zeros({2, 3});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, false, false, out));
+}
+
+TEST_F(OpBucketizeTest, NonIntOutTest) {
+#define RUN_SMOKE_TEST(ctype, dtype) \
+  run_smoke_test_non_int_out<ScalarType::dtype>();
+  ET_FORALL_FLOAT_TYPES(RUN_SMOKE_TEST);
+#undef RUN_SMOKE_TEST
+}
\ No newline at end of file

From ef17bca0ed373f4c851944d3dd49c90499ff91e1 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Wed, 10 Jun 2026 22:28:17 +0200
Subject: [PATCH 02/26] Bucketize: handle input and boundaries with different
 types

---
 kernels/portable/cpu/op_bucketize.cpp | 100 +++++++++++++++++---------
 1 file changed, 68 insertions(+), 32 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index abe536b5b90..284446a9338 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -1,8 +1,7 @@
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>
-#include <cstdio>
-#include <iostream>
 
 namespace torch {
 namespace executor {
@@ -10,12 +9,23 @@ namespace native {
 
 namespace {
 
+using namespace torch::executor::native::utils::internal;
+using namespace torch::executor::native::utils;
+
 template <typename CTYPE>
-int64_t
-cus_lower_bound(int64_t start, int64_t end, const CTYPE val, const CTYPE* bd) {
+int64_t cus_lower_bound(
+    int64_t end,
+    const CTYPE val,
+    const char* bd,
+    load_to_compute_fn<CTYPE> bd_load_fn,
+    ssize_t bd_elem_size) {
+  int64_t start = 0;
+
   while (start < end) {
     const int64_t mid = start + ((end - start) >> 1);
-    if (bd[mid] < val) {
+    CTYPE mid_bd = bd_load_fn(&bd[mid * bd_elem_size]);
+
+    if (mid_bd < val) {
       start = mid + 1;
     } else {
       end = mid;
@@ -25,11 +35,19 @@ cus_lower_bound(int64_t start, int64_t end, const CTYPE val, const CTYPE* bd) {
 }
 
 template <typename CTYPE>
-int64_t
-cus_upper_bound(int64_t start, int64_t end, const CTYPE val, const CTYPE* bd) {
+int64_t cus_upper_bound(
+    int64_t end,
+    const CTYPE val,
+    const char* bd,
+    load_to_compute_fn<CTYPE> bd_load_fn,
+    ssize_t bd_elem_size) {
+  int64_t start = 0;
+
   while (start < end) {
     const int64_t mid = start + ((end - start) >> 1);
-    if (bd[mid] <= val) {
+    CTYPE mid_bd = bd_load_fn(&bd[mid * bd_elem_size]);
+
+    if (mid_bd <= val) {
       start = mid + 1;
     } else {
       end = mid;
@@ -38,33 +56,44 @@ cus_upper_bound(int64_t start, int64_t end, const CTYPE val, const CTYPE* bd) {
   return start;
 }
 
-template <typename CTYPE, typename OUT_CTYPE>
-void searchsorted_cpu(
+template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
+void bucketize_tensor(
     KernelRuntimeContext& context,
-    const Tensor& input,
+    const Tensor& self,
     const Tensor& boundaries,
     const bool& right,
     Tensor& out) {
-  const auto bd_data = boundaries.const_data_ptr<CTYPE>();
-  const auto in_data = input.const_data_ptr<CTYPE>();
-  OUT_CTYPE* out_data = out.mutable_data_ptr<OUT_CTYPE>();
-  int64_t end_bd = boundaries.sizes().back();
-
-  const bool success = parallel_for(
-      0, input.numel(), 200, [&](const auto begin, const auto end) {
-        for (const auto out_i : c10::irange(begin, end)) {
+  auto in_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
+      context, self, SupportedTensorDtypes::REALHBF16);
+  const ssize_t in_size = self.element_size();
+  auto in_data = reinterpret_cast<const char*>(self.const_data_ptr());
+
+  auto bd_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
+      context, boundaries, SupportedTensorDtypes::REALHBF16);
+  const ssize_t bd_elem_size = boundaries.element_size();
+  auto bd_data = reinterpret_cast<const char*>(boundaries.const_data_ptr());
+  int64_t bd_end = boundaries.sizes().back();
+
+  auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
+
+  const bool success =
+      parallel_for(0, self.numel(), 200, [&](const auto begin, const auto end) {
+        for (const auto i : c10::irange(begin, end)) {
+          auto compute_val = in_load_fn(&in_data[i * in_size]);
           int64_t pos = right
-              ? cus_upper_bound(0, end_bd, in_data[out_i], bd_data)
-              : cus_lower_bound(0, end_bd, in_data[out_i], bd_data);
-          out_data[out_i] = pos;
+              ? cus_upper_bound(
+                    bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size)
+              : cus_lower_bound(
+                    bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size);
+          out_data[i] = pos;
         }
       });
+
   ET_KERNEL_CHECK_MSG(context, success, Internal, , "parallel_for failed");
 }
 
-void bucketize_pre_check(
+void bucketize_common_pre_checks(
     KernelRuntimeContext& context,
-    const Tensor& input,
     const Tensor& boundaries,
     bool out_int32,
     Tensor& out) {
@@ -89,9 +118,6 @@ void bucketize_pre_check(
       out_dtype,
       " and out_int32 flag is ",
       (out_int32 ? "True" : "False"));
-
-  ET_KERNEL_CHECK(
-      context, tensors_have_same_shape(input, out), InvalidArgument, );
 }
 
 } // namespace
@@ -103,23 +129,33 @@ Tensor& bucketize_tensor_out(
     bool out_int32,
     bool right,
     Tensor& out) {
-  bucketize_pre_check(context, self, boundaries, out_int32, out);
+  bucketize_common_pre_checks(context, boundaries, out_int32, out);
+  // Check manually as bucketize_common_pre_checks do not return
+  if (context.failure_state() != Error::Ok) {
+    return out;
+  }
+  ET_KERNEL_CHECK(
+      context, tensors_have_same_shape(self, out), InvalidArgument, out);
 
   ScalarType common_type =
       promoteTypes(self.scalar_type(), boundaries.scalar_type());
+  ScalarType compute_type = utils::get_compute_type(common_type);
+
+  static constexpr const char op_name[] = "bucketize.Tensor_out";
 
   ET_SWITCH_REALHBF16_TYPES(
-      common_type, context, "bucketize.Tensor_out", CTYPE, [&]() {
+      compute_type, context, op_name, CTYPE_COMPUTE, [&]() {
         if (out_int32) {
-          searchsorted_cpu<CTYPE, int32_t>(
+          bucketize_tensor<CTYPE_COMPUTE, int32_t, op_name>(
               context, self, boundaries, right, out);
         } else {
-          searchsorted_cpu<CTYPE, int64_t>(
+          bucketize_tensor<CTYPE_COMPUTE, int64_t, op_name>(
               context, self, boundaries, right, out);
         }
       });
   return out;
 }
+
 Tensor& bucketize_scalar_out(
     KernelRuntimeContext& context,
     const Scalar& self,
@@ -128,8 +164,8 @@ Tensor& bucketize_scalar_out(
     bool right,
     Tensor& out) {
   return out;
+}
 
-} // namespace
 } // namespace native
 } // namespace executor
 } // namespace torch
\ No newline at end of file

From 9fe3c3fa16c0d51ecbba693d8507141ee4242af0 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Wed, 10 Jun 2026 22:31:43 +0200
Subject: [PATCH 03/26] Bucketize: add scalar implementation

---
 kernels/portable/cpu/op_bucketize.cpp | 47 +++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 284446a9338..e8d60fa4dbd 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -92,6 +92,29 @@ void bucketize_tensor(
   ET_KERNEL_CHECK_MSG(context, success, Internal, , "parallel_for failed");
 }
 
+template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
+void bucketize_scalar(
+    KernelRuntimeContext& context,
+    const Scalar self,
+    const Tensor& boundaries,
+    const bool& right,
+    Tensor& out) {
+  CTYPE_COMPUTE compute_val = utils::scalar_to<CTYPE_COMPUTE>(self);
+
+  auto bd_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
+      context, boundaries, SupportedTensorDtypes::REALHBF16);
+  const ssize_t bd_elem_size = boundaries.element_size();
+  auto bd_data = reinterpret_cast<const char*>(boundaries.const_data_ptr());
+  int64_t bd_end = boundaries.sizes().back();
+
+  auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
+
+  int64_t pos = right
+      ? cus_upper_bound(bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size)
+      : cus_lower_bound(bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size);
+  out_data[0] = pos;
+}
+
 void bucketize_common_pre_checks(
     KernelRuntimeContext& context,
     const Tensor& boundaries,
@@ -163,6 +186,30 @@ Tensor& bucketize_scalar_out(
     bool out_int32,
     bool right,
     Tensor& out) {
+  bucketize_common_pre_checks(context, boundaries, out_int32, out);
+  // Check manually as bucketize_common_pre_checks do not return
+  if (context.failure_state() != Error::Ok) {
+    return out;
+  }
+  ET_KERNEL_CHECK(context, out.sizes().back() == 1, InvalidArgument, out);
+
+  ScalarType common_type =
+      utils::promote_type_with_scalar(boundaries.scalar_type(), self);
+  ScalarType compute_type = utils::get_compute_type(common_type);
+
+  static constexpr const char op_name[] = "bucketize.Scalar_out";
+
+  ET_SWITCH_REALHBF16_TYPES(
+      compute_type, context, op_name, CTYPE_COMPUTE, [&]() {
+        if (out_int32) {
+          bucketize_scalar<CTYPE_COMPUTE, int32_t, op_name>(
+              context, self, boundaries, right, out);
+        } else {
+          bucketize_scalar<CTYPE_COMPUTE, int64_t, op_name>(
+              context, self, boundaries, right, out);
+        }
+      });
+
   return out;
 }
 

From 6f204306cfcc86e5a5150a84a22fc3ca8d685871 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Wed, 10 Jun 2026 22:32:56 +0200
Subject: [PATCH 04/26] Bucketize: add scalar, types and mismatching out_int32
 tests

---
 kernels/test/op_bucketize_test.cpp | 227 +++++++++++++++++++++++------
 1 file changed, 179 insertions(+), 48 deletions(-)

diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 4efe63d1ba8..f0da3c3b916 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -1,3 +1,4 @@
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
 #include <executorch/kernels/test/TestUtil.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
@@ -7,82 +8,193 @@
 #include <gtest/gtest.h>
 
 using namespace ::testing;
+using executorch::aten::Scalar;
 using executorch::aten::ScalarType;
 using executorch::aten::Tensor;
 using torch::executor::testing::TensorFactory;
 
-class OpBucketizeTest : public OperatorTest {
+class OpBucketizeScalarTest : public OperatorTest {
  protected:
   Tensor& op_bucketize_out(
-      const Tensor& in,
+      const Scalar& self,
       const Tensor& boundaries,
       bool out_int32,
       bool right,
       Tensor& out) {
     return torch::executor::aten::bucketize_outf(
-        context_, in, boundaries, out_int32, right, out);
+        context_, self, boundaries, out_int32, right, out);
   }
 
-  template <ScalarType dtype>
-  void run_smoke_test_int64() {
+  template <ScalarType BOUND_DTYPE>
+  void test_bucketize_types() {
     TensorFactory<ScalarType::Long> tf_out;
-    TensorFactory<dtype> tf_dtype;
+    TensorFactory<BOUND_DTYPE> tf_bound;
 
-    Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
-    Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
-    Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
-    Tensor out = tf_out.zeros({2, 2});
+    Scalar value = 2;
+    Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+    Tensor expected = tf_out.make({1}, {1});
+    Tensor out = tf_out.zeros({1});
 
-    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+    Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
 
     EXPECT_TENSOR_EQ(ret, expected);
     EXPECT_TENSOR_EQ(out, expected);
   }
 
-  template <ScalarType dtype>
-  void run_smoke_test_int32() {
-    TensorFactory<ScalarType::Int> tf_out;
-    TensorFactory<dtype> tf_dtype;
+  void test_bucketize_bound_types() {
+#define RUN_TEST(ctype, dtype) test_bucketize_types<ScalarType::dtype>();
+    ET_FORALL_REALHBF16_TYPES(RUN_TEST)
+#undef RUN_TEST
+  }
+};
+
+TEST_F(OpBucketizeScalarTest, SanityCheck) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+
+  Scalar value = 2.5;
+  Tensor boundaries = tf_bound.make({10}, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18});
+  Tensor expected = tf_out.make({1}, {2});
+  Tensor out = tf_out.zeros({1});
+
+  Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarBoundaryTypes) {
+  test_bucketize_bound_types();
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarOut1DFails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+
+  Scalar value = 2;
+  Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({5});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(value, boundaries, false, true, out));
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarOutNDFails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
 
-    Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
-    Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Scalar value = 2;
+  Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({5, 5});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(value, boundaries, false, true, out));
+}
+
+class OpBucketizeTest : public OperatorTest {
+ protected:
+  Tensor& op_bucketize_out(
+      const Tensor& in,
+      const Tensor& boundaries,
+      bool out_int32,
+      bool right,
+      Tensor& out) {
+    return torch::executor::aten::bucketize_outf(
+        context_, in, boundaries, out_int32, right, out);
+  }
+
+  template <ScalarType IN_DTYPE, ScalarType BOUND_DTYPE>
+  void test_bucketize_types() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<IN_DTYPE> tf_in;
+    TensorFactory<BOUND_DTYPE> tf_bound;
+
+    Tensor values = tf_in.make({2, 2}, {1, 4, 6, 8});
+    Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
     Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
     Tensor out = tf_out.zeros({2, 2});
 
-    Tensor ret = op_bucketize_out(values, boundaries, true, true, out);
+    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
 
     EXPECT_TENSOR_EQ(ret, expected);
     EXPECT_TENSOR_EQ(out, expected);
   }
 
-  template <ScalarType dtype>
-  void run_smoke_test_non_int_out() {
-    TensorFactory<dtype> tf_out;
-    TensorFactory<ScalarType::Float> tf_dtype;
-
-    Tensor values = tf_dtype.make({2, 2}, {1.5, 2.5, 3.5, 4.5});
-    Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
-    Tensor expected = tf_dtype.make({2, 2}, {1, 2, 3, 4});
-    Tensor out = tf_out.zeros({2, 2});
+  template <ScalarType IN_DTYPE>
+  void test_bucketize_bound_types() {
+#define RUN_TEST(ctype, dtype) \
+  test_bucketize_types<IN_DTYPE, ScalarType::dtype>();
+    ET_FORALL_REALHBF16_TYPES(RUN_TEST)
+#undef RUN_TEST
+  }
 
-    ET_EXPECT_KERNEL_FAILURE(
-        context_, op_bucketize_out(values, boundaries, true, false, out));
+  void test_bucketize_in_types() {
+#define RUN_TEST(ctype, dtype) test_bucketize_bound_types<ScalarType::dtype>();
+    ET_FORALL_REALHBF16_TYPES(RUN_TEST)
+#undef RUN_TEST
   }
 };
 
-TEST_F(OpBucketizeTest, SmokeTestInt64) {
-#define RUN_SMOKE_TEST(ctype, dtype) run_smoke_test_int64<ScalarType::dtype>();
-  ET_FORALL_REALHBF16_TYPES(RUN_SMOKE_TEST);
-#undef RUN_SMOKE_TEST
+TEST_F(OpBucketizeTest, SanityCheck) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_comp;
+
+  Tensor values =
+      tf_comp.make({2, 4, 4}, {1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8,
+
+                               1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8});
+
+  Tensor boundaries = tf_comp.make({5}, {0, 3, 5, 7, 9});
+
+  Tensor expected =
+      tf_out.make({2, 4, 4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+
+                              1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4});
+
+  Tensor out = tf_out.zeros({2, 4, 4});
+
+  // The execution of the operator
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, InAndBoundaryTypes) {
+  test_bucketize_in_types();
 }
 
-TEST_F(OpBucketizeTest, SmokeTestInt32) {
-#define RUN_SMOKE_TEST(ctype, dtype) run_smoke_test_int32<ScalarType::dtype>();
-  ET_FORALL_REALHBF16_TYPES(RUN_SMOKE_TEST);
-#undef RUN_SMOKE_TEST
+TEST_F(OpBucketizeTest, Int64Out) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, Int32Out) {
+  TensorFactory<ScalarType::Int> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, true, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeTest, RightTest) {
+TEST_F(OpBucketizeTest, BoundariesRight) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
@@ -97,7 +209,7 @@ TEST_F(OpBucketizeTest, RightTest) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeTest, LeftTest) {
+TEST_F(OpBucketizeTest, BoundariesLeft) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
@@ -112,7 +224,7 @@ TEST_F(OpBucketizeTest, LeftTest) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeTest, OutOfBoundaryTest) {
+TEST_F(OpBucketizeTest, OutOfBoundary) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
@@ -127,7 +239,7 @@ TEST_F(OpBucketizeTest, OutOfBoundaryTest) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeTest, Boundaries1DTest) {
+TEST_F(OpBucketizeTest, Boundaries1D) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
@@ -142,7 +254,9 @@ TEST_F(OpBucketizeTest, Boundaries1DTest) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeTest, BoundariesNDimTest) {
+TEST_F(OpBucketizeTest, BoundaryTypeNonRealHBF16Fails) {}
+
+TEST_F(OpBucketizeTest, BoundariesNDFails) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
@@ -154,7 +268,7 @@ TEST_F(OpBucketizeTest, BoundariesNDimTest) {
       context_, op_bucketize_out(values, boundaries, false, false, out));
 }
 
-TEST_F(OpBucketizeTest, MismatchingInOutTest) {
+TEST_F(OpBucketizeTest, MismatchingInOutDimsFails) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
@@ -166,9 +280,26 @@ TEST_F(OpBucketizeTest, MismatchingInOutTest) {
       context_, op_bucketize_out(values, boundaries, false, false, out));
 }
 
-TEST_F(OpBucketizeTest, NonIntOutTest) {
-#define RUN_SMOKE_TEST(ctype, dtype) \
-  run_smoke_test_non_int_out<ScalarType::dtype>();
-  ET_FORALL_FLOAT_TYPES(RUN_SMOKE_TEST);
-#undef RUN_SMOKE_TEST
+TEST_F(OpBucketizeTest, MismatchingIntArg32Fails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor out = tf_out.zeros({2, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, true, false, out));
+}
+
+TEST_F(OpBucketizeTest, MismatchingIntArg64Fails) {
+  TensorFactory<ScalarType::Int> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor out = tf_out.zeros({2, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, false, false, out));
 }
\ No newline at end of file

From c7095ebfbc47b2f2594c1f4bb45de5636244a3ea Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Wed, 10 Jun 2026 23:08:16 +0200
Subject: [PATCH 05/26] Bucketize: improve error messages and pre check flow

---
 kernels/portable/cpu/op_bucketize.cpp | 45 ++++++++++++---------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index e8d60fa4dbd..f7ba43bf50b 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -1,5 +1,6 @@
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>
 
@@ -115,32 +116,26 @@ void bucketize_scalar(
   out_data[0] = pos;
 }
 
-void bucketize_common_pre_checks(
-    KernelRuntimeContext& context,
+Error bucketize_common_pre_checks(
     const Tensor& boundaries,
     bool out_int32,
     Tensor& out) {
-  ET_KERNEL_CHECK_MSG(
-      context,
+  ET_CHECK_OR_RETURN_ERROR(
       boundaries.dim() == 1,
       InvalidArgument,
-      ,
-      "boundaries tensor must be 1 dimension, but got dim(",
-      boundaries.dim(),
-      ")");
+      "boundaries tensor must be 1 dimension, but got dim(%zu)",
+      boundaries.dim());
 
   ScalarType out_dtype = out.scalar_type();
-  ET_KERNEL_CHECK_MSG(
-      context,
+  ET_CHECK_OR_RETURN_ERROR(
       (out_dtype == ScalarType::Long && !out_int32) ||
           (out_dtype == ScalarType::Int && out_int32),
       InvalidArgument,
-      ,
-      "torch.bucketize(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ",
-      "whether out_int32 flag is True, but we got output tensor's dtype ",
-      out_dtype,
-      " and out_int32 flag is ",
+      "torch.bucketize(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on whether out_int32 flag is True, but we got output tensor dtype %s and out_int32 flag is %s",
+      toString(out_dtype),
       (out_int32 ? "True" : "False"));
+
+  return Error::Ok;
 }
 
 } // namespace
@@ -152,11 +147,11 @@ Tensor& bucketize_tensor_out(
     bool out_int32,
     bool right,
     Tensor& out) {
-  bucketize_common_pre_checks(context, boundaries, out_int32, out);
-  // Check manually as bucketize_common_pre_checks do not return
-  if (context.failure_state() != Error::Ok) {
-    return out;
-  }
+  ET_KERNEL_CHECK(
+      context,
+      bucketize_common_pre_checks(boundaries, out_int32, out) == Error::Ok,
+      InvalidArgument,
+      out);
   ET_KERNEL_CHECK(
       context, tensors_have_same_shape(self, out), InvalidArgument, out);
 
@@ -186,11 +181,11 @@ Tensor& bucketize_scalar_out(
     bool out_int32,
     bool right,
     Tensor& out) {
-  bucketize_common_pre_checks(context, boundaries, out_int32, out);
-  // Check manually as bucketize_common_pre_checks do not return
-  if (context.failure_state() != Error::Ok) {
-    return out;
-  }
+  ET_KERNEL_CHECK(
+      context,
+      bucketize_common_pre_checks(boundaries, out_int32, out) == Error::Ok,
+      InvalidArgument,
+      out);
   ET_KERNEL_CHECK(context, out.sizes().back() == 1, InvalidArgument, out);
 
   ScalarType common_type =

From 97f19bcb455029f0dd2699c8baca32f7dcf15143 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 13:06:23 +0200
Subject: [PATCH 06/26] Bucketize: expect 0 dimensional output in scalar
 version

---
 kernels/portable/cpu/op_bucketize.cpp | 2 +-
 kernels/test/op_bucketize_test.cpp    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index f7ba43bf50b..17492a541b7 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -186,7 +186,7 @@ Tensor& bucketize_scalar_out(
       bucketize_common_pre_checks(boundaries, out_int32, out) == Error::Ok,
       InvalidArgument,
       out);
-  ET_KERNEL_CHECK(context, out.sizes().back() == 1, InvalidArgument, out);
+  ET_KERNEL_CHECK(context, out.dim() == 0, InvalidArgument, out);
 
   ScalarType common_type =
       utils::promote_type_with_scalar(boundaries.scalar_type(), self);
diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index f0da3c3b916..3f6dedebeca 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -32,8 +32,8 @@ class OpBucketizeScalarTest : public OperatorTest {
 
     Scalar value = 2;
     Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
-    Tensor expected = tf_out.make({1}, {1});
-    Tensor out = tf_out.zeros({1});
+    Tensor expected = tf_out.make({}, {1});
+    Tensor out = tf_out.zeros({});
 
     Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
 
@@ -54,8 +54,8 @@ TEST_F(OpBucketizeScalarTest, SanityCheck) {
 
   Scalar value = 2.5;
   Tensor boundaries = tf_bound.make({10}, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18});
-  Tensor expected = tf_out.make({1}, {2});
-  Tensor out = tf_out.zeros({1});
+  Tensor expected = tf_out.make({}, {2});
+  Tensor out = tf_out.zeros({});
 
   Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
 

From 8c26f1e146d6af074993f14be80f122adb1ab2b9 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 13:24:29 +0200
Subject: [PATCH 07/26] Bucketize: add input and boundaries realhbf16 dtypes
 check

---
 kernels/portable/cpu/op_bucketize.cpp | 12 +++++++
 kernels/test/op_bucketize_test.cpp    | 46 +++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 17492a541b7..ee35eecf421 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -1,6 +1,7 @@
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>
 
@@ -8,6 +9,8 @@ namespace torch {
 namespace executor {
 namespace native {
 
+using namespace executorch::runtime;
+
 namespace {
 
 using namespace torch::executor::native::utils::internal;
@@ -135,6 +138,13 @@ Error bucketize_common_pre_checks(
       toString(out_dtype),
       (out_int32 ? "True" : "False"));
 
+  ScalarType bound_dtype = boundaries.scalar_type();
+  ET_CHECK_OR_RETURN_ERROR(
+      isRealHBF16Type(bound_dtype),
+      InvalidArgument,
+      "boundaries tensor of type %s is not supported",
+      toString(bound_dtype));
+
   return Error::Ok;
 }
 
@@ -154,6 +164,8 @@ Tensor& bucketize_tensor_out(
       out);
   ET_KERNEL_CHECK(
       context, tensors_have_same_shape(self, out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      context, tensor_is_realhbf16_type(self), InvalidArgument, out);
 
   ScalarType common_type =
       promoteTypes(self.scalar_type(), boundaries.scalar_type());
diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 3f6dedebeca..8d3c314bc99 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -120,6 +120,38 @@ class OpBucketizeTest : public OperatorTest {
     EXPECT_TENSOR_EQ(out, expected);
   }
 
+  template <typename CTYPE, ScalarType DTYPE>
+  void test_bucketize_complex_boundary() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<ScalarType::Float> tf_in;
+    TensorFactory<DTYPE> tf_bound;
+
+    Tensor values = tf_in.make({2, 2}, {1, 4, 6, 8});
+    Tensor boundaries = tf_bound.make({1}, {CTYPE(0, 1)});
+    Tensor out = tf_out.zeros({2, 2});
+
+    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_bucketize_out(values, boundaries, false, false, out));
+  }
+
+  template <typename CTYPE, ScalarType DTYPE>
+  void test_bucketize_complex_input() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<DTYPE> tf_in;
+    TensorFactory<ScalarType::Float> tf_bound;
+
+    Tensor values = tf_in.make({1}, {CTYPE(0, 1)});
+    Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+    Tensor out = tf_out.zeros({2, 2});
+
+    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_bucketize_out(values, boundaries, false, false, out));
+  }
+
   template <ScalarType IN_DTYPE>
   void test_bucketize_bound_types() {
 #define RUN_TEST(ctype, dtype) \
@@ -302,4 +334,18 @@ TEST_F(OpBucketizeTest, MismatchingIntArg64Fails) {
 
   ET_EXPECT_KERNEL_FAILURE(
       context_, op_bucketize_out(values, boundaries, false, false, out));
+}
+
+TEST_F(OpBucketizeTest, ComplexBoundaryTypesFails) {
+#define RUN_TEST(ctype, dtype) \
+  test_bucketize_complex_boundary<ctype, ScalarType::dtype>();
+  ET_FORALL_COMPLEXH_TYPES(RUN_TEST)
+#undef RUN_TEST
+}
+
+TEST_F(OpBucketizeTest, ComplexInputTypesFails) {
+#define RUN_TEST(ctype, dtype) \
+  test_bucketize_complex_input<ctype, ScalarType::dtype>();
+  ET_FORALL_COMPLEXH_TYPES(RUN_TEST)
+#undef RUN_TEST
 }
\ No newline at end of file

From b89f3eb8e2027237e887a3cdc722e3ae0e12552d Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 14:17:17 +0200
Subject: [PATCH 08/26] Bucketize: fix typo

---
 kernels/portable/cpu/op_bucketize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index ee35eecf421..ad259badc40 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -45,7 +45,7 @@ int64_t cus_upper_bound(
     const char* bd,
     load_to_compute_fn<CTYPE> bd_load_fn,
     ssize_t bd_elem_size) {
-  ino64_t start = 0;
+  int64_t start = 0;
 
   while (start < end) {
     const int64_t mid = start + ((end - start) >> 1);

From 394f463ae59ea0a90ca2f100f29ca12c7d9c2bef Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 14:37:16 +0200
Subject: [PATCH 09/26] Bucketize: add python tests

---
 kernels/test/targets.bzl       |   1 +
 kernels/test/test_bucketize.py | 135 +++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 kernels/test/test_bucketize.py

diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 5212d691c5b..93bc17c036d 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -210,6 +210,7 @@ def define_common_targets():
     _common_op_test("op_bitwise_or_test", ["aten", "portable"])
     _common_op_test("op_bitwise_right_shift_test", ["portable"])
     _common_op_test("op_bitwise_xor_test", ["aten", "portable"])
+    _common_op_test("op_bucketize_test", ["portable"])
     _common_op_test("op_bmm_test", ["aten", "portable", "optimized"])
     _common_op_test("op_cat_test", ["aten", "portable"])
     _common_op_test("op_cdist_forward_test", ["aten", "portable"])
diff --git a/kernels/test/test_bucketize.py b/kernels/test/test_bucketize.py
new file mode 100644
index 00000000000..50f6f307406
--- /dev/null
+++ b/kernels/test/test_bucketize.py
@@ -0,0 +1,135 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Test for bucketize operations in ExecuTorch.
+
+This test validates that the bucketize operator work correctly
+by creating simple models that use the operation and running inference.
+"""
+
+import tempfile
+import unittest
+from pathlib import Path
+
+import torch
+from executorch.exir import (
+    EdgeCompileConfig,
+    ExecutorchBackendConfig,
+    to_edge_transform_and_lower,
+)
+from executorch.extension.export_util.utils import save_pte_program
+from executorch.runtime import Runtime
+
+
+class BucketizeModule(torch.nn.Module):
+    """Module that uses bucketize"""
+
+    def __init__(self, out_int32: bool, right: bool):
+        super().__init__()
+        self.out_int32 = out_int32
+        self.right = right
+
+    def forward(self, x, bounds: torch.Tensor) -> torch.Tensor:
+        return torch.bucketize(x, bounds, out_int32=self.out_int32, right=self.right)
+
+
+def export_and_generate_pte(model, example_inputs, output_path):
+    """Export a model and generate a .pte file."""
+    exported_program = torch.export.export(model, example_inputs)
+    edge_program_manager = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=None,
+        compile_config=EdgeCompileConfig(
+            _core_aten_ops_exception_list=[
+                torch.ops.aten.bucketize.Tensor,
+                torch.ops.aten.bucketize.Scalar,
+            ]
+        ),
+    )
+    executorch_program_manager = edge_program_manager.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=False)
+    )
+    save_pte_program(executorch_program_manager, str(output_path))
+
+
+class TestBucketizeOperator(unittest.TestCase):
+    """Test bucketize operator in ExecuTorch."""
+
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.temp_path = Path(self.temp_dir.name)
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
+    def _run_and_compare(self, model, inputs, pte_name):
+        """Helper to export, run, and compare outputs."""
+        model.eval()
+        expected = model(*inputs)
+
+        pte_path = self.temp_path / pte_name
+        export_and_generate_pte(model, inputs, pte_path)
+
+        runtime = Runtime.get()
+        method = runtime.load_program(pte_path).load_method("forward")
+        outputs = method.execute(list(inputs))
+
+        self.assertEqual(len(outputs), 1)
+        print(outputs[0])
+        print(expected)
+        torch.testing.assert_close(outputs[0], expected)
+        return outputs[0]
+
+    # ==========================================================================
+    # Core tests: one per operator signature
+    # ==========================================================================
+
+    def test_bucketize_tensor_out_int64(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, False)
+        x = torch.tensor([[1, 4, 6, 8]], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_out_int64.pte")
+
+    def test_bucketize_tensor_out_int32(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(True, False)
+        x = torch.tensor([[1, 4, 6, 8]], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_out_int32.pte")
+
+    def test_bucketize_tensor_right(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, True)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_right.pte")
+
+    def test_bucketize_tensor_left(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, False)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
+
+    def test_bucketize_scalar_out_int64(self):
+        """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, False)
+        x = 1
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int64.pte")
+
+    def test_bucketize_scalar_out_int32(self):
+        """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, False)
+        x = 1
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int32.pte")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 6b8b5148159201cf465bef7ca59ff05bd0a7cf19 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 14:44:04 +0200
Subject: [PATCH 10/26] Bucketize: add python edge cases tests

---
 kernels/test/test_bucketize.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernels/test/test_bucketize.py b/kernels/test/test_bucketize.py
index 50f6f307406..8c7115bfb65 100644
--- a/kernels/test/test_bucketize.py
+++ b/kernels/test/test_bucketize.py
@@ -130,6 +130,24 @@ def test_bucketize_scalar_out_int32(self):
         bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int32.pte")
 
+    # ==========================================================================
+    # Edge cases tests
+    # ==========================================================================
+
+    def test_bucketize_tensor_empty_boundary(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, False)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
+
+    def test_bucketize_tensor_empty_input(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(False, False)
+        x = torch.tensor([[]], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
+
 
 if __name__ == "__main__":
     unittest.main()

From 7a18241ff5ca07a5fa1fda7708fb7b992763469b Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 21:02:52 +0200
Subject: [PATCH 11/26] Bucketize: add empty inputs tests

---
 kernels/test/op_bucketize_test.cpp | 62 +++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 8d3c314bc99..64263a682a0 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -48,7 +48,7 @@ class OpBucketizeScalarTest : public OperatorTest {
   }
 };
 
-TEST_F(OpBucketizeScalarTest, SanityCheck) {
+TEST_F(OpBucketizeScalarTest, ScalarEmptyBoundaries) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_bound;
 
@@ -63,6 +63,21 @@ TEST_F(OpBucketizeScalarTest, SanityCheck) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
+TEST_F(OpBucketizeScalarTest, SanityCheck) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+
+  Scalar value = 2.5;
+  Tensor boundaries = tf_bound.make({0}, {});
+  Tensor expected = tf_out.make({}, {0});
+  Tensor out = tf_out.zeros({});
+
+  Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
 TEST_F(OpBucketizeScalarTest, ScalarBoundaryTypes) {
   test_bucketize_bound_types();
 }
@@ -286,6 +301,51 @@ TEST_F(OpBucketizeTest, Boundaries1D) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
+TEST_F(OpBucketizeTest, EmptyBoundaries) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor boundaries = tf_dtype.make({0}, {});
+  Tensor expected = tf_out.make({2, 2}, {0, 0, 0, 0});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, EmptyInput) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({0}, {});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({0}, {});
+  Tensor out = tf_out.zeros({0});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, EmptyAll) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({0}, {});
+  Tensor boundaries = tf_dtype.make({0}, {});
+  Tensor expected = tf_out.make({0}, {});
+  Tensor out = tf_out.zeros({0});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
 TEST_F(OpBucketizeTest, BoundaryTypeNonRealHBF16Fails) {}
 
 TEST_F(OpBucketizeTest, BoundariesNDFails) {

From 52b347f98d63d9f4bf2e994cd6716aee95b6cf23 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 21:08:20 +0200
Subject: [PATCH 12/26] Bucketize: replace using directives with using
 declarations

---
 kernels/portable/cpu/op_bucketize.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index ad259badc40..2de8d2e0146 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -8,13 +8,12 @@
 namespace torch {
 namespace executor {
 namespace native {
-
-using namespace executorch::runtime;
-
 namespace {
 
-using namespace torch::executor::native::utils::internal;
-using namespace torch::executor::native::utils;
+using executorch::runtime::isRealHBF16Type;
+using torch::executor::native::utils::SupportedTensorDtypes;
+using torch::executor::native::utils::internal::get_load_to_compute_fn;
+using torch::executor::native::utils::internal::load_to_compute_fn;
 
 template <typename CTYPE>
 int64_t cus_lower_bound(
@@ -150,6 +149,8 @@ Error bucketize_common_pre_checks(
 
 } // namespace
 
+using executorch::runtime::tensor_is_realhbf16_type;
+
 Tensor& bucketize_tensor_out(
     KernelRuntimeContext& context,
     const Tensor& self,

From de0b4d244e240803ff85bb1c345991974d90fb39 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sat, 13 Jun 2026 21:09:32 +0200
Subject: [PATCH 13/26] Bucketize: rename bucketize_scalar and
 bucketize_tensor to bucketize_scalar_impl and bucketize_tensor_impl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 kernels/portable/cpu/op_bucketize.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 2de8d2e0146..4c58d8da66a 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -60,7 +60,7 @@ int64_t cus_upper_bound(
 }
 
 template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
-void bucketize_tensor(
+void bucketize_tensor_impl(
     KernelRuntimeContext& context,
     const Tensor& self,
     const Tensor& boundaries,
@@ -96,7 +96,7 @@ void bucketize_tensor(
 }
 
 template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
-void bucketize_scalar(
+void bucketize_scalar_impl(
     KernelRuntimeContext& context,
     const Scalar self,
     const Tensor& boundaries,
@@ -177,10 +177,10 @@ Tensor& bucketize_tensor_out(
   ET_SWITCH_REALHBF16_TYPES(
       compute_type, context, op_name, CTYPE_COMPUTE, [&]() {
         if (out_int32) {
-          bucketize_tensor<CTYPE_COMPUTE, int32_t, op_name>(
+          bucketize_tensor_impl<CTYPE_COMPUTE, int32_t, op_name>(
               context, self, boundaries, right, out);
         } else {
-          bucketize_tensor<CTYPE_COMPUTE, int64_t, op_name>(
+          bucketize_tensor_impl<CTYPE_COMPUTE, int64_t, op_name>(
               context, self, boundaries, right, out);
         }
       });
@@ -210,10 +210,10 @@ Tensor& bucketize_scalar_out(
   ET_SWITCH_REALHBF16_TYPES(
       compute_type, context, op_name, CTYPE_COMPUTE, [&]() {
         if (out_int32) {
-          bucketize_scalar<CTYPE_COMPUTE, int32_t, op_name>(
+          bucketize_scalar_impl<CTYPE_COMPUTE, int32_t, op_name>(
               context, self, boundaries, right, out);
         } else {
-          bucketize_scalar<CTYPE_COMPUTE, int64_t, op_name>(
+          bucketize_scalar_impl<CTYPE_COMPUTE, int64_t, op_name>(
               context, self, boundaries, right, out);
         }
       });

From 76641c89970468916482deb0fa5fd22fc7af256f Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sun, 14 Jun 2026 15:10:35 +0200
Subject: [PATCH 14/26] Bucketize: remove Boundaries1D test, reformat, improve
 test values

---
 kernels/test/op_bucketize_test.cpp | 57 ++++++++++--------------------
 1 file changed, 18 insertions(+), 39 deletions(-)

diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 64263a682a0..327365cd867 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -48,12 +48,12 @@ class OpBucketizeScalarTest : public OperatorTest {
   }
 };
 
-TEST_F(OpBucketizeScalarTest, ScalarEmptyBoundaries) {
+TEST_F(OpBucketizeScalarTest, SanityCheck) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_bound;
 
   Scalar value = 2.5;
-  Tensor boundaries = tf_bound.make({10}, {0, 2, 4, 6, 8, 10, 12, 14, 16, 18});
+  Tensor boundaries = tf_bound.make({5}, {0, 2, 4, 6, 8});
   Tensor expected = tf_out.make({}, {2});
   Tensor out = tf_out.zeros({});
 
@@ -63,7 +63,7 @@ TEST_F(OpBucketizeScalarTest, ScalarEmptyBoundaries) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeScalarTest, SanityCheck) {
+TEST_F(OpBucketizeScalarTest, ScalarEmptyBoundaries) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_bound;
 
@@ -145,8 +145,6 @@ class OpBucketizeTest : public OperatorTest {
     Tensor boundaries = tf_bound.make({1}, {CTYPE(0, 1)});
     Tensor out = tf_out.zeros({2, 2});
 
-    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
-
     ET_EXPECT_KERNEL_FAILURE(
         context_, op_bucketize_out(values, boundaries, false, false, out));
   }
@@ -161,8 +159,6 @@ class OpBucketizeTest : public OperatorTest {
     Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
     Tensor out = tf_out.zeros({2, 2});
 
-    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
-
     ET_EXPECT_KERNEL_FAILURE(
         context_, op_bucketize_out(values, boundaries, false, false, out));
   }
@@ -186,17 +182,17 @@ TEST_F(OpBucketizeTest, SanityCheck) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_comp;
 
-  Tensor values =
-      tf_comp.make({2, 4, 4}, {1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8,
+  Tensor values = tf_comp.make(
+      {2, 4, 4}, {0, 4, 6, 8, 1, 4, 5, 8, 1,  5, 6, 8, -1, 4, 6, 9,
 
-                               1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8, 1, 4, 6, 8});
+                  1, 4, 6, 8, 1, 4, 7, 8, -2, 4, 6, 8, 1,  4, 6, 8});
 
   Tensor boundaries = tf_comp.make({5}, {0, 3, 5, 7, 9});
 
   Tensor expected =
-      tf_out.make({2, 4, 4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+      tf_out.make({2, 4, 4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 3, 3, 4, 0, 2, 3, 5,
 
-                              1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4});
+                              1, 2, 3, 4, 1, 2, 4, 4, 0, 2, 3, 4, 1, 2, 3, 4});
 
   Tensor out = tf_out.zeros({2, 4, 4});
 
@@ -275,22 +271,7 @@ TEST_F(OpBucketizeTest, OutOfBoundary) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
-  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
-  Tensor expected = tf_out.make({2, 2}, {0, 0, 5, 5});
-  Tensor out = tf_out.zeros({2, 2});
-
-  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
-
-  EXPECT_TENSOR_EQ(ret, expected);
-  EXPECT_TENSOR_EQ(out, expected);
-}
-
-TEST_F(OpBucketizeTest, Boundaries1D) {
-  TensorFactory<ScalarType::Long> tf_out;
-  TensorFactory<ScalarType::Float> tf_dtype;
-
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 6, 40});
   Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
   Tensor expected = tf_out.make({2, 2}, {0, 0, 5, 5});
   Tensor out = tf_out.zeros({2, 2});
@@ -305,7 +286,7 @@ TEST_F(OpBucketizeTest, EmptyBoundaries) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
   Tensor boundaries = tf_dtype.make({0}, {});
   Tensor expected = tf_out.make({2, 2}, {0, 0, 0, 0});
   Tensor out = tf_out.zeros({2, 2});
@@ -346,14 +327,12 @@ TEST_F(OpBucketizeTest, EmptyAll) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
-TEST_F(OpBucketizeTest, BoundaryTypeNonRealHBF16Fails) {}
-
 TEST_F(OpBucketizeTest, BoundariesNDFails) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
-  Tensor boundaries = tf_dtype.make({3, 2}, {1, 2, 3, 4, 5, 6});
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({3, 2}, {0, 3, 5, 7, 9, 11});
   Tensor out = tf_out.zeros({2, 2});
 
   ET_EXPECT_KERNEL_FAILURE(
@@ -364,8 +343,8 @@ TEST_F(OpBucketizeTest, MismatchingInOutDimsFails) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
-  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
   Tensor out = tf_out.zeros({2, 3});
 
   ET_EXPECT_KERNEL_FAILURE(
@@ -376,8 +355,8 @@ TEST_F(OpBucketizeTest, MismatchingIntArg32Fails) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
-  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
   Tensor out = tf_out.zeros({2, 2});
 
   ET_EXPECT_KERNEL_FAILURE(
@@ -388,8 +367,8 @@ TEST_F(OpBucketizeTest, MismatchingIntArg64Fails) {
   TensorFactory<ScalarType::Int> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
 
-  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 30, 40});
-  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
   Tensor out = tf_out.zeros({2, 2});
 
   ET_EXPECT_KERNEL_FAILURE(

From 0e78a6c68eab7203cf4ac56a3303c8bfb1aec233 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sun, 14 Jun 2026 15:28:10 +0200
Subject: [PATCH 15/26] Bucketize: use keyword arguments in model creation, fix
 typos

---
 kernels/test/test_bucketize.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/kernels/test/test_bucketize.py b/kernels/test/test_bucketize.py
index 8c7115bfb65..037411bed8d 100644
--- a/kernels/test/test_bucketize.py
+++ b/kernels/test/test_bucketize.py
@@ -7,7 +7,7 @@
 """
 Test for bucketize operations in ExecuTorch.
 
-This test validates that the bucketize operator work correctly
+This test validates that the bucketize operator works correctly
 by creating simple models that use the operation and running inference.
 """
 
@@ -90,42 +90,42 @@ def _run_and_compare(self, model, inputs, pte_name):
 
     def test_bucketize_tensor_out_int64(self):
         """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, False)
+        model = BucketizeModule(out_int32=False, right=False)
         x = torch.tensor([[1, 4, 6, 8]], dtype=torch.float)
         bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_out_int64.pte")
 
     def test_bucketize_tensor_out_int32(self):
         """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(True, False)
+        model = BucketizeModule(out_int32=True, right=False)
         x = torch.tensor([[1, 4, 6, 8]], dtype=torch.float)
         bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_out_int32.pte")
 
     def test_bucketize_tensor_right(self):
         """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, True)
+        model = BucketizeModule(out_int32=False, right=True)
         x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
         bounds = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_right.pte")
 
     def test_bucketize_tensor_left(self):
         """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, False)
+        model = BucketizeModule(out_int32=False, right=False)
         x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
         bounds = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
 
     def test_bucketize_scalar_out_int64(self):
         """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, False)
+        model = BucketizeModule(out_int32=False, right=False)
         x = 1
         bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int64.pte")
 
     def test_bucketize_scalar_out_int32(self):
         """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, False)
+        model = BucketizeModule(out_int32=True, right=False)
         x = 1
         bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
         self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int32.pte")
@@ -136,17 +136,21 @@ def test_bucketize_scalar_out_int32(self):
 
     def test_bucketize_tensor_empty_boundary(self):
         """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, False)
+        model = BucketizeModule(out_int32=False, right=False)
         x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
         bounds = torch.tensor([], dtype=torch.float)
-        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
+        self._run_and_compare(
+            model, (x, bounds), "test_bucketize_tensor_empty_boundary.pte"
+        )
 
     def test_bucketize_tensor_empty_input(self):
         """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
-        model = BucketizeModule(False, False)
-        x = torch.tensor([[]], dtype=torch.float)
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([], dtype=torch.float)
         bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
-        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
+        self._run_and_compare(
+            model, (x, bounds), "test_bucketize_tensor_empty_input.pte"
+        )
 
 
 if __name__ == "__main__":

From c6626b945f15e046e5282b0531d0cea56521b808 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sun, 14 Jun 2026 15:33:28 +0200
Subject: [PATCH 16/26] Bucketize: extract parallel_for grain size to variable

---
 kernels/portable/cpu/op_bucketize.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 4c58d8da66a..76fa5330d15 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -15,6 +15,8 @@ using torch::executor::native::utils::SupportedTensorDtypes;
 using torch::executor::native::utils::internal::get_load_to_compute_fn;
 using torch::executor::native::utils::internal::load_to_compute_fn;
 
+constexpr int64_t BUCKETIZE_GRAIN_SIZE = 200;
+
 template <typename CTYPE>
 int64_t cus_lower_bound(
     int64_t end,
@@ -79,8 +81,11 @@ void bucketize_tensor_impl(
 
   auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
 
-  const bool success =
-      parallel_for(0, self.numel(), 200, [&](const auto begin, const auto end) {
+  const bool success = parallel_for(
+      0,
+      self.numel(),
+      BUCKETIZE_GRAIN_SIZE,
+      [&](const auto begin, const auto end) {
         for (const auto i : c10::irange(begin, end)) {
           auto compute_val = in_load_fn(&in_data[i * in_size]);
           int64_t pos = right

From 711d9378b1cad848665b4f694b35c1d31ed52412 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sun, 14 Jun 2026 15:59:48 +0200
Subject: [PATCH 17/26] Bucketize: add comments to pre checks

---
 kernels/portable/cpu/op_bucketize.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 76fa5330d15..db89b0d02a6 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -123,6 +123,10 @@ void bucketize_scalar_impl(
   out_data[0] = pos;
 }
 
+// Performs checks which are common to both tensor and scalar implementations:
+// - Boundaries must be 1D
+// - Out type must be consistent with out_int32 parameter
+// - Boundaries type must be realhbf16
 Error bucketize_common_pre_checks(
     const Tensor& boundaries,
     bool out_int32,

From 989caa3833776e4841094dd3884c9b3ce37a169f Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Sun, 14 Jun 2026 16:05:06 +0200
Subject: [PATCH 18/26] Bucketize: comment on out type missing check

---
 kernels/portable/cpu/op_bucketize.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index db89b0d02a6..51dd14b99a2 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -127,6 +127,8 @@ void bucketize_scalar_impl(
 // - Boundaries must be 1D
 // - Out type must be consistent with out_int32 parameter
 // - Boundaries type must be realhbf16
+// Boundaries size is not checked against out type as SizesType always fits into
+// an int32_t.
 Error bucketize_common_pre_checks(
     const Tensor& boundaries,
     bool out_int32,

From afb0d909b1b7e34eee8ec1e11153b08743abae46 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Tue, 16 Jun 2026 12:43:12 +0200
Subject: [PATCH 19/26] Bucketize: add tests for inf inputs and boundaries

---
 kernels/test/op_bucketize_test.cpp | 53 +++++++++++++++++++++++++++++-
 kernels/test/test_bucketize.py     | 23 +++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 327365cd867..5ae4d29cd05 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -4,8 +4,8 @@
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
-
 #include <gtest/gtest.h>
+#include <limits>
 
 using namespace ::testing;
 using executorch::aten::Scalar;
@@ -78,6 +78,21 @@ TEST_F(OpBucketizeScalarTest, ScalarEmptyBoundaries) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
+TEST_F(OpBucketizeScalarTest, ScalarInfInput) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Int> tf_bound;
+
+  Scalar value = std::numeric_limits<float>::infinity();
+  Tensor boundaries = tf_bound.make({5}, {0, 2, 4, 6, 8});
+  Tensor expected = tf_out.make({}, {5});
+  Tensor out = tf_out.zeros({});
+
+  Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
 TEST_F(OpBucketizeScalarTest, ScalarBoundaryTypes) {
   test_bucketize_bound_types();
 }
@@ -327,6 +342,42 @@ TEST_F(OpBucketizeTest, EmptyAll) {
   EXPECT_TENSOR_EQ(out, expected);
 }
 
+TEST_F(OpBucketizeTest, InfInput) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make(
+      {2},
+      {-std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity()});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor expected = tf_out.make({2}, {0, 5});
+  Tensor out = tf_out.zeros({2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, InfBoundaries) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make(
+      {2},
+      {-std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity()});
+  Tensor expected = tf_out.ones({2, 2});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
 TEST_F(OpBucketizeTest, BoundariesNDFails) {
   TensorFactory<ScalarType::Long> tf_out;
   TensorFactory<ScalarType::Float> tf_dtype;
diff --git a/kernels/test/test_bucketize.py b/kernels/test/test_bucketize.py
index 037411bed8d..c4ecc97afcd 100644
--- a/kernels/test/test_bucketize.py
+++ b/kernels/test/test_bucketize.py
@@ -152,6 +152,29 @@ def test_bucketize_tensor_empty_input(self):
             model, (x, bounds), "test_bucketize_tensor_empty_input.pte"
         )
 
+    def test_bucketize_tensor_inf_input(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([-torch.inf, torch.inf], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_inf_input.pte")
+
+    def test_bucketize_tensor_inf_boundary(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([-torch.inf, torch.inf], dtype=torch.float)
+        self._run_and_compare(
+            model, (x, bounds), "test_bucketize_tensor_inf_boundary.pte"
+        )
+
+    def test_bucketize_scalar_inf_input(self):
+        """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.inf
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_inf_input.pte")
+
 
 if __name__ == "__main__":
     unittest.main()

From 0744f2f7d42b946c8fc78d114c5032cfaf612969 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Wed, 17 Jun 2026 13:21:53 +0200
Subject: [PATCH 20/26] Bucketize: check input and output dim order match

---
 kernels/portable/cpu/op_bucketize.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 51dd14b99a2..dbfbe955ef2 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -176,6 +176,8 @@ Tensor& bucketize_tensor_out(
       out);
   ET_KERNEL_CHECK(
       context, tensors_have_same_shape(self, out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      context, tensors_have_same_dim_order(self, out), InvalidArgument, out);
   ET_KERNEL_CHECK(
       context, tensor_is_realhbf16_type(self), InvalidArgument, out);
 

From 30576429f55392b723119d46680e4d4c97d2e267 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Wed, 17 Jun 2026 13:22:33 +0200
Subject: [PATCH 21/26] Bucketize: use 4D tensors in sanity check test

---
 kernels/test/op_bucketize_test.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 5ae4d29cd05..82591e1f3cd 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -198,18 +198,18 @@ TEST_F(OpBucketizeTest, SanityCheck) {
   TensorFactory<ScalarType::Float> tf_comp;
 
   Tensor values = tf_comp.make(
-      {2, 4, 4}, {0, 4, 6, 8, 1, 4, 5, 8, 1,  5, 6, 8, -1, 4, 6, 9,
+      {2, 2, 2, 4}, {0, 4, 6, 8, 1, 4, 5, 8, 1,  5, 6, 8, -1, 4, 6, 9,
 
-                  1, 4, 6, 8, 1, 4, 7, 8, -2, 4, 6, 8, 1,  4, 6, 8});
+                     1, 4, 6, 8, 1, 4, 7, 8, -2, 4, 6, 8, 1,  4, 6, 8});
 
   Tensor boundaries = tf_comp.make({5}, {0, 3, 5, 7, 9});
 
-  Tensor expected =
-      tf_out.make({2, 4, 4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 3, 3, 4, 0, 2, 3, 5,
+  Tensor expected = tf_out.make(
+      {2, 2, 2, 4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 3, 3, 4, 0, 2, 3, 5,
 
-                              1, 2, 3, 4, 1, 2, 4, 4, 0, 2, 3, 4, 1, 2, 3, 4});
+                     1, 2, 3, 4, 1, 2, 4, 4, 0, 2, 3, 4, 1, 2, 3, 4});
 
-  Tensor out = tf_out.zeros({2, 4, 4});
+  Tensor out = tf_out.zeros({2, 2, 2, 4});
 
   // The execution of the operator
   Tensor ret = op_bucketize_out(values, boundaries, false, true, out);

From bcff767f79c1f6c01197a79c90db4aaf0e8afe29 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Mon, 22 Jun 2026 19:52:36 +0200
Subject: [PATCH 22/26] Bucketize: add missing license headers

---
 kernels/portable/cpu/op_bucketize.cpp | 8 ++++++++
 kernels/test/op_bucketize_test.cpp    | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index dbfbe955ef2..51a12a98841 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -1,3 +1,11 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
index 82591e1f3cd..3d3e6df881c 100644
--- a/kernels/test/op_bucketize_test.cpp
+++ b/kernels/test/op_bucketize_test.cpp
@@ -1,3 +1,11 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
 #include <executorch/kernels/test/TestUtil.h>

From 081b2ae733bad8d93d2bc6fbc4cb27afaa104e2d Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Mon, 22 Jun 2026 20:00:44 +0200
Subject: [PATCH 23/26] Bucketize: pass right by value in bucketize_scalar_impl
 and bucketize_tensor_impl

---
 kernels/portable/cpu/op_bucketize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 51a12a98841..478baa3145f 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -74,7 +74,7 @@ void bucketize_tensor_impl(
     KernelRuntimeContext& context,
     const Tensor& self,
     const Tensor& boundaries,
-    const bool& right,
+    bool right,
     Tensor& out) {
   auto in_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
       context, self, SupportedTensorDtypes::REALHBF16);
@@ -113,7 +113,7 @@ void bucketize_scalar_impl(
     KernelRuntimeContext& context,
     const Scalar self,
     const Tensor& boundaries,
-    const bool& right,
+    bool right,
     Tensor& out) {
   CTYPE_COMPUTE compute_val = utils::scalar_to<CTYPE_COMPUTE>(self);
 

From ab1697dd779b18dc2ead8222e9e319e7e445eb0c Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Mon, 22 Jun 2026 20:04:28 +0200
Subject: [PATCH 24/26] Bucketize: pass scalar by reference in
 bucketize_scalar_impl

---
 kernels/portable/cpu/op_bucketize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 478baa3145f..75f2f37ecf1 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -111,7 +111,7 @@ void bucketize_tensor_impl(
 template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
 void bucketize_scalar_impl(
     KernelRuntimeContext& context,
-    const Scalar self,
+    const Scalar& self,
     const Tensor& boundaries,
     bool right,
     Tensor& out) {

From 7431b290d999bb995b5bd9bccd82c44a93a978d4 Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Mon, 22 Jun 2026 20:10:31 +0200
Subject: [PATCH 25/26] Bucketize: use numel() for the boundaries count

---
 kernels/portable/cpu/op_bucketize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 75f2f37ecf1..3c496d0b36f 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -85,7 +85,7 @@ void bucketize_tensor_impl(
       context, boundaries, SupportedTensorDtypes::REALHBF16);
   const ssize_t bd_elem_size = boundaries.element_size();
   auto bd_data = reinterpret_cast<const char*>(boundaries.const_data_ptr());
-  int64_t bd_end = boundaries.sizes().back();
+  int64_t bd_end = boundaries.numel();
 
   auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
 
@@ -121,7 +121,7 @@ void bucketize_scalar_impl(
       context, boundaries, SupportedTensorDtypes::REALHBF16);
   const ssize_t bd_elem_size = boundaries.element_size();
   auto bd_data = reinterpret_cast<const char*>(boundaries.const_data_ptr());
-  int64_t bd_end = boundaries.sizes().back();
+  int64_t bd_end = boundaries.numel();
 
   auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
 

From 4f7a6551632a251a202963dea8f837ce370f46ce Mon Sep 17 00:00:00 2001
From: Gallinator <lcgallinator4@gmail.com>
Date: Mon, 22 Jun 2026 20:26:53 +0200
Subject: [PATCH 26/26] Bucketize: resize the out tensor instead of checking
 its shape

---
 kernels/portable/cpu/op_bucketize.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
index 3c496d0b36f..5da44c7ed1d 100644
--- a/kernels/portable/cpu/op_bucketize.cpp
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -183,7 +183,10 @@ Tensor& bucketize_tensor_out(
       InvalidArgument,
       out);
   ET_KERNEL_CHECK(
-      context, tensors_have_same_shape(self, out), InvalidArgument, out);
+      context,
+      resize_tensor(out, self.sizes()) == Error::Ok,
+      InvalidArgument,
+      out);
   ET_KERNEL_CHECK(
       context, tensors_have_same_dim_order(self, out), InvalidArgument, out);
   ET_KERNEL_CHECK(