diff --git a/kernels/portable/cpu/op_bucketize.cpp b/kernels/portable/cpu/op_bucketize.cpp
new file mode 100644
index 00000000000..5da44c7ed1d
--- /dev/null
+++ b/kernels/portable/cpu/op_bucketize.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
+
+namespace torch {
+namespace executor {
+namespace native {
+namespace {
+
+using executorch::runtime::isRealHBF16Type;
+using torch::executor::native::utils::SupportedTensorDtypes;
+using torch::executor::native::utils::internal::get_load_to_compute_fn;
+using torch::executor::native::utils::internal::load_to_compute_fn;
+
+constexpr int64_t BUCKETIZE_GRAIN_SIZE = 200;
+
+template <typename CTYPE>
+int64_t cus_lower_bound(
+    int64_t end,
+    const CTYPE val,
+    const char* bd,
+    load_to_compute_fn<CTYPE> bd_load_fn,
+    ssize_t bd_elem_size) {
+  int64_t start = 0;
+  // Binary search over [0, end) for the first boundary that is >= val.
+  while (start < end) {
+    const int64_t mid = start + ((end - start) >> 1);
+    const CTYPE mid_bd = bd_load_fn(&bd[mid * bd_elem_size]);
+    // Negated `>=` instead of `<`: a NaN val then lands at `end`, as in ATen.
+    if (!(mid_bd >= val)) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+  return start;
+}
+
+template <typename CTYPE>
+int64_t cus_upper_bound(
+    int64_t end,
+    const CTYPE val,
+    const char* bd,
+    load_to_compute_fn<CTYPE> bd_load_fn,
+    ssize_t bd_elem_size) {
+  int64_t start = 0;
+  // Binary search over [0, end) for the first boundary that is > val.
+  while (start < end) {
+    const int64_t mid = start + ((end - start) >> 1);
+    const CTYPE mid_bd = bd_load_fn(&bd[mid * bd_elem_size]);
+    // Negated `>` instead of `<=`: a NaN val then lands at `end`, as in ATen.
+    if (!(mid_bd > val)) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+  return start;
+}
+
+template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
+void bucketize_tensor_impl(
+    KernelRuntimeContext& context,
+    const Tensor& self,
+    const Tensor& boundaries,
+    bool right,
+    Tensor& out) {
+  auto in_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
+      context, self, SupportedTensorDtypes::REALHBF16);
+  const ssize_t in_size = self.element_size();
+  auto in_data = reinterpret_cast<const char*>(self.const_data_ptr());
+  // Boundaries are walked as raw bytes as well, through their own loader.
+  auto bd_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
+      context, boundaries, SupportedTensorDtypes::REALHBF16);
+  const ssize_t bd_elem_size = boundaries.element_size();
+  auto bd_data = reinterpret_cast<const char*>(boundaries.const_data_ptr());
+  int64_t bd_end = boundaries.numel();
+
+  auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
+  // out_data[i] gets the upper bound of element i if `right`, else the lower.
+  const bool success = parallel_for(
+      0,
+      self.numel(),
+      BUCKETIZE_GRAIN_SIZE,
+      [&](const auto begin, const auto end) {
+        for (const auto i : c10::irange(begin, end)) {
+          auto compute_val = in_load_fn(&in_data[i * in_size]);
+          int64_t pos = right
+              ? cus_upper_bound(
+                    bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size)
+              : cus_lower_bound(
+                    bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size);
+          out_data[i] = pos;
+        }
+      });
+
+  ET_KERNEL_CHECK_MSG(context, success, Internal, , "parallel_for failed");
+}
+
+template <typename CTYPE_COMPUTE, typename CTYPE_OUT, const char* op_name>
+void bucketize_scalar_impl(
+    KernelRuntimeContext& context,
+    const Scalar& self,
+    const Tensor& boundaries,
+    bool right,
+    Tensor& out) {
+  CTYPE_COMPUTE compute_val = utils::scalar_to<CTYPE_COMPUTE>(self);
+  // Boundary elements are read from raw bytes via bd_load_fn.
+  auto bd_load_fn = get_load_to_compute_fn<CTYPE_COMPUTE, op_name>(
+      context, boundaries, SupportedTensorDtypes::REALHBF16);
+  const ssize_t bd_elem_size = boundaries.element_size();
+  auto bd_data = reinterpret_cast<const char*>(boundaries.const_data_ptr());
+  int64_t bd_end = boundaries.numel();
+
+  auto out_data = out.mutable_data_ptr<CTYPE_OUT>();
+  // The single search result is written to the first element of out.
+  int64_t pos = right
+      ? cus_upper_bound(bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size)
+      : cus_lower_bound(bd_end, compute_val, bd_data, bd_load_fn, bd_elem_size);
+  out_data[0] = pos;
+}
+
+// Performs the checks common to both tensor and scalar implementations:
+// - Boundaries must be 1D
+// - Out type must be consistent with out_int32 parameter
+// - Boundaries type must be realhbf16
+// Boundaries size is not checked against out type as SizesType always fits into
+// an int32_t.
+Error bucketize_common_pre_checks(
+    const Tensor& boundaries,
+    bool out_int32,
+    Tensor& out) {
+  ET_CHECK_OR_RETURN_ERROR(
+      boundaries.dim() == 1,
+      InvalidArgument,
+      "boundaries tensor must be 1 dimension, but got dim(%zd)",
+      static_cast<ssize_t>(boundaries.dim()));
+
+  ScalarType out_dtype = out.scalar_type();
+  ET_CHECK_OR_RETURN_ERROR(
+      (out_dtype == ScalarType::Long && !out_int32) ||
+          (out_dtype == ScalarType::Int && out_int32),
+      InvalidArgument,
+      "torch.bucketize(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on whether out_int32 flag is True, but we got output tensor dtype %s and out_int32 flag is %s",
+      toString(out_dtype),
+      (out_int32 ? "True" : "False"));
+
+  ScalarType bound_dtype = boundaries.scalar_type();
+  ET_CHECK_OR_RETURN_ERROR(
+      isRealHBF16Type(bound_dtype),
+      InvalidArgument,
+      "boundaries tensor of type %s is not supported",
+      toString(bound_dtype));
+
+  return Error::Ok;
+}
+
+} // namespace
+
+using executorch::runtime::tensor_is_realhbf16_type;
+
+Tensor& bucketize_tensor_out(
+    KernelRuntimeContext& context,
+    const Tensor& self,
+    const Tensor& boundaries,
+    bool out_int32,
+    bool right,
+    Tensor& out) {
+  ET_KERNEL_CHECK(
+      context,
+      bucketize_common_pre_checks(boundaries, out_int32, out) == Error::Ok,
+      InvalidArgument,
+      out);
+  // Reject an unsupported self dtype before out is resized.
+  ET_KERNEL_CHECK(
+      context, tensor_is_realhbf16_type(self), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      context,
+      resize_tensor(out, self.sizes()) == Error::Ok,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      context, tensors_have_same_dim_order(self, out), InvalidArgument, out);
+
+  ScalarType common_type =
+      promoteTypes(self.scalar_type(), boundaries.scalar_type());
+  ScalarType compute_type = utils::get_compute_type(common_type);
+  static constexpr const char op_name[] = "bucketize.Tensor_out";
+
+  ET_SWITCH_REALHBF16_TYPES(
+      compute_type, context, op_name, CTYPE_COMPUTE, [&]() {
+        if (out_int32) {
+          bucketize_tensor_impl<CTYPE_COMPUTE, int32_t, op_name>(
+              context, self, boundaries, right, out);
+        } else {
+          bucketize_tensor_impl<CTYPE_COMPUTE, int64_t, op_name>(
+              context, self, boundaries, right, out);
+        }
+      });
+  return out;
+}
+
+Tensor& bucketize_scalar_out(
+    KernelRuntimeContext& context,
+    const Scalar& self,
+    const Tensor& boundaries,
+    bool out_int32,
+    bool right,
+    Tensor& out) {
+  ET_KERNEL_CHECK(
+      context,
+      bucketize_common_pre_checks(boundaries, out_int32, out) == Error::Ok,
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(context, out.dim() == 0, InvalidArgument, out);
+  // out is required to be 0-D; it is not resized here.
+  ScalarType common_type =
+      utils::promote_type_with_scalar(boundaries.scalar_type(), self);
+  ScalarType compute_type = utils::get_compute_type(common_type);
+
+  static constexpr const char op_name[] = "bucketize.Scalar_out";
+
+  ET_SWITCH_REALHBF16_TYPES(
+      compute_type, context, op_name, CTYPE_COMPUTE, [&]() {
+        if (out_int32) {
+          bucketize_scalar_impl<CTYPE_COMPUTE, int32_t, op_name>(
+              context, self, boundaries, right, out);
+        } else {
+          bucketize_scalar_impl<CTYPE_COMPUTE, int64_t, op_name>(
+              context, self, boundaries, right, out);
+        }
+      });
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
\ No newline at end of file
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index ecf62ee3606..8f4dcf6e4bd 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -242,6 +242,16 @@
     - arg_meta: null
       kernel_name: torch::executor::bmm_out
 
+- op: bucketize.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::bucketize_tensor_out
+
+- op: bucketize.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::bucketize_scalar_out
+
 - op: cat.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt
index 2707ba5db71..e45fed272ef 100644
--- a/kernels/test/CMakeLists.txt
+++ b/kernels/test/CMakeLists.txt
@@ -184,6 +184,7 @@ set(all_test_sources
     "op_bitwise_right_shift_test.cpp"
     "op_bitwise_xor_test.cpp"
     "op_bmm_test.cpp"
+    "op_bucketize_test.cpp"
     "op_cat_test.cpp"
     "op_cdist_forward_test.cpp"
     "op_ceil_test.cpp"
diff --git a/kernels/test/op_bucketize_test.cpp b/kernels/test/op_bucketize_test.cpp
new file mode 100644
index 00000000000..3d3e6df881c
--- /dev/null
+++ b/kernels/test/op_bucketize_test.cpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <gtest/gtest.h>
+#include <limits>
+
+using namespace ::testing;
+using executorch::aten::Scalar;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using torch::executor::testing::TensorFactory;
+
+class OpBucketizeScalarTest : public OperatorTest {
+ protected:
+  Tensor& op_bucketize_out(
+      const Scalar& self,
+      const Tensor& boundaries,
+      bool out_int32,
+      bool right,
+      Tensor& out) {
+    return torch::executor::aten::bucketize_outf(
+        context_, self, boundaries, out_int32, right, out);
+  }
+  // Checks the scalar overload against boundaries of dtype BOUND_DTYPE.
+  template <ScalarType BOUND_DTYPE>
+  void test_bucketize_types() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<BOUND_DTYPE> tf_bound;
+    // right=true: 2 lies between boundaries 0 and 3, so the index is 1.
+    Scalar value = 2;
+    Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+    Tensor expected = tf_out.make({}, {1});
+    Tensor out = tf_out.zeros({});
+
+    Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+    EXPECT_TENSOR_EQ(ret, expected);
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+  // Runs test_bucketize_types once per dtype listed by the REALHBF16 macro.
+  void test_bucketize_bound_types() {
+#define RUN_TEST(ctype, dtype) test_bucketize_types<ScalarType::dtype>();
+    ET_FORALL_REALHBF16_TYPES(RUN_TEST)
+#undef RUN_TEST
+  }
+};
+
+TEST_F(OpBucketizeScalarTest, SanityCheck) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+  // right=true: 2.5 is past boundaries 0 and 2, so the expected index is 2.
+  Scalar value = 2.5;
+  Tensor boundaries = tf_bound.make({5}, {0, 2, 4, 6, 8});
+  Tensor expected = tf_out.make({}, {2});
+  Tensor out = tf_out.zeros({});
+
+  Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarEmptyBoundaries) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+  // No boundaries at all: the expected index is 0.
+  Scalar value = 2.5;
+  Tensor boundaries = tf_bound.make({0}, {});
+  Tensor expected = tf_out.make({}, {0});
+  Tensor out = tf_out.zeros({});
+
+  Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarInfInput) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Int> tf_bound;
+  // +inf is above every (integer) boundary, so the index is the count, 5.
+  Scalar value = std::numeric_limits<float>::infinity();
+  Tensor boundaries = tf_bound.make({5}, {0, 2, 4, 6, 8});
+  Tensor expected = tf_out.make({}, {5});
+  Tensor out = tf_out.zeros({});
+
+  Tensor ret = op_bucketize_out(value, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarBoundaryTypes) {
+  test_bucketize_bound_types();
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarOut1DFails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+  // out is 1-D here; the scalar overload is expected to reject it.
+  Scalar value = 2;
+  Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({5});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(value, boundaries, false, true, out));
+}
+
+TEST_F(OpBucketizeScalarTest, ScalarOutNDFails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_bound;
+  // out is 2-D here; the scalar overload is expected to reject it.
+  Scalar value = 2;
+  Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({5, 5});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(value, boundaries, false, true, out));
+}
+
+class OpBucketizeTest : public OperatorTest {
+ protected:
+  Tensor& op_bucketize_out(
+      const Tensor& in,
+      const Tensor& boundaries,
+      bool out_int32,
+      bool right,
+      Tensor& out) {
+    return torch::executor::aten::bucketize_outf(
+        context_, in, boundaries, out_int32, right, out);
+  }
+  // Checks one (input dtype, boundaries dtype) pair with right=true.
+  template <ScalarType IN_DTYPE, ScalarType BOUND_DTYPE>
+  void test_bucketize_types() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<IN_DTYPE> tf_in;
+    TensorFactory<BOUND_DTYPE> tf_bound;
+
+    Tensor values = tf_in.make({2, 2}, {1, 4, 6, 8});
+    Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+    Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+    Tensor out = tf_out.zeros({2, 2});
+
+    Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+    EXPECT_TENSOR_EQ(ret, expected);
+    EXPECT_TENSOR_EQ(out, expected);
+  }
+  // Expects failure when boundaries has the complex dtype DTYPE.
+  template <typename CTYPE, ScalarType DTYPE>
+  void test_bucketize_complex_boundary() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<ScalarType::Float> tf_in;
+    TensorFactory<DTYPE> tf_bound;
+
+    Tensor values = tf_in.make({2, 2}, {1, 4, 6, 8});
+    Tensor boundaries = tf_bound.make({1}, {CTYPE(0, 1)});
+    Tensor out = tf_out.zeros({2, 2});
+
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_bucketize_out(values, boundaries, false, false, out));
+  }
+  // Expects failure when the input has the complex dtype DTYPE.
+  template <typename CTYPE, ScalarType DTYPE>
+  void test_bucketize_complex_input() {
+    TensorFactory<ScalarType::Long> tf_out;
+    TensorFactory<DTYPE> tf_in;
+    TensorFactory<ScalarType::Float> tf_bound;
+    // out has the same shape as values, so only the complex dtype can fail.
+    Tensor values = tf_in.make({1}, {CTYPE(0, 1)});
+    Tensor boundaries = tf_bound.make({5}, {0, 3, 5, 7, 9});
+    Tensor out = tf_out.zeros({1});
+
+    ET_EXPECT_KERNEL_FAILURE(
+        context_, op_bucketize_out(values, boundaries, false, false, out));
+  }
+
+  template <ScalarType IN_DTYPE>
+  void test_bucketize_bound_types() {
+#define RUN_TEST(ctype, dtype) \
+  test_bucketize_types<IN_DTYPE, ScalarType::dtype>();
+    ET_FORALL_REALHBF16_TYPES(RUN_TEST)
+#undef RUN_TEST
+  }
+
+  void test_bucketize_in_types() {
+#define RUN_TEST(ctype, dtype) test_bucketize_bound_types<ScalarType::dtype>();
+    ET_FORALL_REALHBF16_TYPES(RUN_TEST)
+#undef RUN_TEST
+  }
+};
+
+TEST_F(OpBucketizeTest, SanityCheck) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_comp;
+  // right=true on a 4-D input; each value maps to its upper-bound index.
+  Tensor values = tf_comp.make(
+      {2, 2, 2, 4}, {0, 4, 6, 8, 1, 4, 5, 8, 1,  5, 6, 8, -1, 4, 6, 9,
+
+                     1, 4, 6, 8, 1, 4, 7, 8, -2, 4, 6, 8, 1,  4, 6, 8});
+
+  Tensor boundaries = tf_comp.make({5}, {0, 3, 5, 7, 9});
+
+  Tensor expected = tf_out.make(
+      {2, 2, 2, 4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 3, 3, 4, 0, 2, 3, 5,
+
+                     1, 2, 3, 4, 1, 2, 4, 4, 0, 2, 3, 4, 1, 2, 3, 4});
+
+  Tensor out = tf_out.zeros({2, 2, 2, 4});
+
+  // The execution of the operator
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, InAndBoundaryTypes) {
+  test_bucketize_in_types();
+}
+
+TEST_F(OpBucketizeTest, Int64Out) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // out_int32=false pairs with a Long (int64) out tensor.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, Int32Out) {
+  TensorFactory<ScalarType::Int> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // out_int32=true pairs with an Int (int32) out tensor.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, true, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, BoundariesRight) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // right=true: a value equal to a boundary goes to the index after it.
+  Tensor values = tf_dtype.make({2, 2}, {1, 2, 3, 4});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {1, 2, 3, 4});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, BoundariesLeft) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // right=false: a value equal to a boundary gets that boundary's index.
+  Tensor values = tf_dtype.make({2, 2}, {1, 2, 3, 4});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {0, 1, 2, 3});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, OutOfBoundary) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // Values below all boundaries map to 0, values above all map to 5.
+  Tensor values = tf_dtype.make({2, 2}, {-1, -2, 6, 40});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({2, 2}, {0, 0, 5, 5});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, EmptyBoundaries) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // With no boundaries every value maps to index 0.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({0}, {});
+  Tensor expected = tf_out.make({2, 2}, {0, 0, 0, 0});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, EmptyInput) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // A zero-element input yields a zero-element output.
+  Tensor values = tf_dtype.make({0}, {});
+  Tensor boundaries = tf_dtype.make({5}, {1, 2, 3, 4, 5});
+  Tensor expected = tf_out.make({0}, {});
+  Tensor out = tf_out.zeros({0});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, EmptyAll) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // Both input and boundaries are empty; out stays empty.
+  Tensor values = tf_dtype.make({0}, {});
+  Tensor boundaries = tf_dtype.make({0}, {});
+  Tensor expected = tf_out.make({0}, {});
+  Tensor out = tf_out.zeros({0});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, false, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, InfInput) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // -inf sits before every boundary (0); +inf sits after all of them (5).
+  Tensor values = tf_dtype.make(
+      {2},
+      {-std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity()});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor expected = tf_out.make({2}, {0, 5});
+  Tensor out = tf_out.zeros({2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, InfBoundaries) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // Finite values fall between -inf and +inf, so every index is 1.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make(
+      {2},
+      {-std::numeric_limits<float>::infinity(),
+       std::numeric_limits<float>::infinity()});
+  Tensor expected = tf_out.ones({2, 2});
+  Tensor out = tf_out.zeros({2, 2});
+
+  Tensor ret = op_bucketize_out(values, boundaries, false, true, out);
+
+  EXPECT_TENSOR_EQ(ret, expected);
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpBucketizeTest, BoundariesNDFails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // boundaries is 2-D; only 1-D boundaries are expected to be accepted.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({3, 2}, {0, 3, 5, 7, 9, 11});
+  Tensor out = tf_out.zeros({2, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, false, false, out));
+}
+
+TEST_F(OpBucketizeTest, MismatchingInOutDimsFails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // out is {2, 3} while values is {2, 2}; the call is expected to fail.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({2, 3});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, false, false, out));
+}
+
+TEST_F(OpBucketizeTest, MismatchingIntArg32Fails) {
+  TensorFactory<ScalarType::Long> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // out_int32=true but out is Long, so the call is expected to fail.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({2, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, true, false, out));
+}
+
+TEST_F(OpBucketizeTest, MismatchingIntArg64Fails) {
+  TensorFactory<ScalarType::Int> tf_out;
+  TensorFactory<ScalarType::Float> tf_dtype;
+  // out_int32=false but out is Int, so the call is expected to fail.
+  Tensor values = tf_dtype.make({2, 2}, {1, 4, 6, 8});
+  Tensor boundaries = tf_dtype.make({5}, {0, 3, 5, 7, 9});
+  Tensor out = tf_out.zeros({2, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_bucketize_out(values, boundaries, false, false, out));
+}
+
+TEST_F(OpBucketizeTest, ComplexBoundaryTypesFails) {
+#define RUN_TEST(ctype, dtype) \
+  test_bucketize_complex_boundary<ctype, ScalarType::dtype>();
+  ET_FORALL_COMPLEXH_TYPES(RUN_TEST)
+#undef RUN_TEST
+}
+
+TEST_F(OpBucketizeTest, ComplexInputTypesFails) {
+#define RUN_TEST(ctype, dtype) \
+  test_bucketize_complex_input<ctype, ScalarType::dtype>();
+  ET_FORALL_COMPLEXH_TYPES(RUN_TEST)
+#undef RUN_TEST
+}
\ No newline at end of file
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 431ec96b447..71bf225288f 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -211,6 +211,7 @@ def define_common_targets():
     _common_op_test("op_bitwise_or_test", ["aten", "portable"])
     _common_op_test("op_bitwise_right_shift_test", ["portable"])
     _common_op_test("op_bitwise_xor_test", ["aten", "portable"])
+    _common_op_test("op_bucketize_test", ["portable"])
     _common_op_test("op_bmm_test", ["aten", "portable", "optimized"])
     _common_op_test("op_cat_test", ["aten", "portable"])
     _common_op_test("op_cdist_forward_test", ["aten", "portable"])
diff --git a/kernels/test/test_bucketize.py b/kernels/test/test_bucketize.py
new file mode 100644
index 00000000000..c4ecc97afcd
--- /dev/null
+++ b/kernels/test/test_bucketize.py
@@ -0,0 +1,180 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Test for bucketize operations in ExecuTorch.
+
+This test validates that the bucketize operator works correctly
+by creating simple models that use the operation and running inference.
+"""
+
+import tempfile
+import unittest
+from pathlib import Path
+
+import torch
+from executorch.exir import (
+    EdgeCompileConfig,
+    ExecutorchBackendConfig,
+    to_edge_transform_and_lower,
+)
+from executorch.extension.export_util.utils import save_pte_program
+from executorch.runtime import Runtime
+
+
+class BucketizeModule(torch.nn.Module):
+    """Wraps torch.bucketize with fixed out_int32 and right flags."""
+
+    def __init__(self, out_int32: bool, right: bool):
+        super().__init__()
+        self.out_int32 = out_int32
+        self.right = right
+
+    def forward(self, x, bounds: torch.Tensor) -> torch.Tensor:
+        return torch.bucketize(x, bounds, out_int32=self.out_int32, right=self.right)
+
+
+def export_and_generate_pte(model, example_inputs, output_path):
+    """Export model, lower it to an ExecuTorch program, and save it at output_path."""
+    exported_program = torch.export.export(model, example_inputs)
+    edge_program_manager = to_edge_transform_and_lower(
+        exported_program,
+        partitioner=None,
+        compile_config=EdgeCompileConfig(
+            _core_aten_ops_exception_list=[
+                torch.ops.aten.bucketize.Tensor,
+                torch.ops.aten.bucketize.Scalar,
+            ]
+        ),
+    )
+    executorch_program_manager = edge_program_manager.to_executorch(
+        config=ExecutorchBackendConfig(extract_delegate_segments=False)
+    )
+    save_pte_program(executorch_program_manager, str(output_path))
+
+
+class TestBucketizeOperator(unittest.TestCase):
+    """Test bucketize operator in ExecuTorch."""
+
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.temp_path = Path(self.temp_dir.name)
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
+    def _run_and_compare(self, model, inputs, pte_name):
+        """Helper to export, run, and compare outputs."""
+        model.eval()
+        expected = model(*inputs)
+
+        pte_path = self.temp_path / pte_name
+        export_and_generate_pte(model, inputs, pte_path)
+
+        runtime = Runtime.get()
+        method = runtime.load_program(pte_path).load_method("forward")
+        outputs = method.execute(list(inputs))
+
+        self.assertEqual(len(outputs), 1)
+        print(outputs[0])
+        print(expected)
+        torch.testing.assert_close(outputs[0], expected)
+        return outputs[0]
+
+    # ==========================================================================
+    # Core tests: one per operator signature
+    # ==========================================================================
+
+    def test_bucketize_tensor_out_int64(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([[1, 4, 6, 8]], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_out_int64.pte")
+
+    def test_bucketize_tensor_out_int32(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=True, right=False)
+        x = torch.tensor([[1, 4, 6, 8]], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_out_int32.pte")
+
+    def test_bucketize_tensor_right(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=True)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_right.pte")
+
+    def test_bucketize_tensor_left(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_left.pte")
+
+    def test_bucketize_scalar_out_int64(self):
+        """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = 1
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int64.pte")
+
+    def test_bucketize_scalar_out_int32(self):
+        """Test bucketize.Tensor_out: (Scalar, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = 1
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_out_int32.pte")
+
+    # ==========================================================================
+    # Edge cases tests
+    # ==========================================================================
+
+    def test_bucketize_tensor_empty_boundary(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([], dtype=torch.float)
+        self._run_and_compare(
+            model, (x, bounds), "test_bucketize_tensor_empty_boundary.pte"
+        )
+
+    def test_bucketize_tensor_empty_input(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(
+            model, (x, bounds), "test_bucketize_tensor_empty_input.pte"
+        )
+
+    def test_bucketize_tensor_inf_input(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([-torch.inf, torch.inf], dtype=torch.float)
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_tensor_inf_input.pte")
+
+    def test_bucketize_tensor_inf_boundary(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.tensor([[1, 2, 3, 4]], dtype=torch.float)
+        bounds = torch.tensor([-torch.inf, torch.inf], dtype=torch.float)
+        self._run_and_compare(
+            model, (x, bounds), "test_bucketize_tensor_inf_boundary.pte"
+        )
+
+    def test_bucketize_scalar_inf_input(self):
+        """Test bucketize.Tensor_out: (Tensor, Tensor, bool, bool) -> Tensor."""
+        model = BucketizeModule(out_int32=False, right=False)
+        x = torch.inf
+        bounds = torch.tensor([0, 3, 5, 7, 9], dtype=torch.float)
+        self._run_and_compare(model, (x, bounds), "test_bucketize_scalar_inf_input.pte")
+
+
+if __name__ == "__main__":
+    unittest.main()