From ca7197d71f51da97c2785604528f36687cd31d17 Mon Sep 17 00:00:00 2001
From: Logan Riggs <logan.riggs@dremio.com>
Date: Sat, 24 Jan 2026 23:45:30 -0800
Subject: [PATCH] GH-48160: [C++][Gandiva] Pass CPU attributes to LLVM (#48161)

The CPU attributes are not passed to the LLVM layer, which means potential optimizations could be missed leading to inefficient code. This feature was lost as part of the refactoring in https://github.com/apache/arrow/commit/83cba25017a5c3a03e47f1851f242fa284f93533

I also discovered a bug with decimal alignments that was exposed by this change and was only reproducible in our test environment.

Pass the CPU attributes to the LLVM code generation, and a unit test.
Fix the 16 bit vs 8 bit decimal alignment problem. This was causing a crash sometimes on certain architectures with certain queries. Added a unit test.

Yes.

No.

* GitHub Issue: #48160

Lead-authored-by: Logan Riggs <logan.riggs@dremio.com>
Co-authored-by: lriggs <logan.riggs@gmail.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
---
 cpp/src/gandiva/llvm_generator.cc             |  16 +-
 cpp/src/gandiva/tests/CMakeLists.txt          |   1 +
 .../gandiva/tests/decimal_alignment_test.cc   | 252 ++++++++++++++++++
 3 files changed, 266 insertions(+), 3 deletions(-)
 create mode 100644 cpp/src/gandiva/tests/decimal_alignment_test.cc

diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc
index feaae336166..139f441f8c6 100644
--- a/cpp/src/gandiva/llvm_generator.cc
+++ b/cpp/src/gandiva/llvm_generator.cc
@@ -401,8 +401,13 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count,
   auto output_type_id = output->Type()->id();
   if (output_type_id == arrow::Type::BOOL) {
     SetPackedBitValue(output_ref, loop_var, output_value->data());
-  } else if (arrow::is_primitive(output_type_id) ||
-             output_type_id == arrow::Type::DECIMAL) {
+  } else if (output_type_id == arrow::Type::DECIMAL) {
+    // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned.
+    // Use CreateAlignedStore with 8-byte alignment to match Arrow's actual alignment.
+    auto slot_offset =
+        builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var);
+    builder->CreateAlignedStore(output_value->data(), slot_offset, llvm::MaybeAlign(8));
+  } else if (arrow::is_primitive(output_type_id)) {
     auto slot_offset =
         builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var);
     builder->CreateStore(output_value->data(), slot_offset);
@@ -643,7 +648,12 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) {
 
     case arrow::Type::DECIMAL: {
       auto slot_offset = builder->CreateGEP(types->i128_type(), slot_ref, slot_index);
-      slot_value = builder->CreateLoad(types->i128_type(), slot_offset, dex.FieldName());
+      // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned.
+      // Using CreateLoad with default alignment (16 for i128) causes crashes on
+      // misaligned data. Use CreateAlignedLoad with 8-byte alignment to match Arrow's
+      // actual alignment.
+      slot_value = builder->CreateAlignedLoad(
+          types->i128_type(), slot_offset, llvm::MaybeAlign(8), false, dex.FieldName());
       lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType());
       break;
     }
diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt
index 690609146ae..09428a87056 100644
--- a/cpp/src/gandiva/tests/CMakeLists.txt
+++ b/cpp/src/gandiva/tests/CMakeLists.txt
@@ -21,6 +21,7 @@ add_gandiva_test(projector-test
                  binary_test.cc
                  boolean_expr_test.cc
                  date_time_test.cc
+                 decimal_alignment_test.cc
                  decimal_single_test.cc
                  decimal_test.cc
                  filter_project_test.cc
diff --git a/cpp/src/gandiva/tests/decimal_alignment_test.cc b/cpp/src/gandiva/tests/decimal_alignment_test.cc
new file mode 100644
index 00000000000..3028ec81f31
--- /dev/null
+++ b/cpp/src/gandiva/tests/decimal_alignment_test.cc
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Test for decimal128 alignment issue fix.
+// Arrow decimal128 data may be 8-byte aligned but not 16-byte aligned.
+// This test verifies that Gandiva handles such data correctly.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::Decimal128;
+
+namespace gandiva {
+
+class TestDecimalAlignment : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Create a decimal128 array with data at a specific alignment offset
+// This simulates the real-world scenario where Arrow data from external sources
+// (like JNI/Java) may not be 16-byte aligned.
+std::shared_ptr<arrow::Array> MakeMisalignedDecimalArray(
+    const std::shared_ptr<arrow::Decimal128Type>& type,
+    const std::vector<Decimal128>& values, int alignment_offset) {
+  // Allocate buffer with extra space for misalignment
+  int64_t data_size = values.size() * 16;  // 16 bytes per Decimal128
+  int64_t buffer_size = data_size + 16;    // Extra space for offset
+
+  std::shared_ptr<arrow::Buffer> buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer));
+
+  // Calculate the starting offset to achieve desired alignment
+  // We want the data to be 8-byte aligned but NOT 16-byte aligned
+  uint8_t* raw_data = buffer->mutable_data();
+  uintptr_t addr = reinterpret_cast<uintptr_t>(raw_data);
+
+  // Find offset to get to 8-byte aligned but not 16-byte aligned address
+  int offset_to_8 = (8 - (addr % 8)) % 8;
+  int current_16_alignment = (addr + offset_to_8) % 16;
+
+  int final_offset;
+  if (alignment_offset == 8) {
+    // Want 8-byte aligned but NOT 16-byte aligned
+    if (current_16_alignment == 0) {
+      final_offset = offset_to_8 + 8;  // Add 8 to break 16-byte alignment
+    } else {
+      final_offset = offset_to_8;
+    }
+  } else {
+    // Want 16-byte aligned
+    final_offset = (16 - (addr % 16)) % 16;
+  }
+
+  // Copy decimal values to the offset location
+  uint8_t* data_start = raw_data + final_offset;
+  for (size_t i = 0; i < values.size(); i++) {
+    memcpy(data_start + i * 16, values[i].ToBytes().data(), 16);
+  }
+
+  // Verify alignment
+  uintptr_t data_addr = reinterpret_cast<uintptr_t>(data_start);
+  EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned";
+  if (alignment_offset == 8) {
+    EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned";
+  }
+
+  // Create a sliced buffer starting at our offset
+  auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size);
+
+  // Create validity buffer (all valid)
+  std::shared_ptr<arrow::Buffer> validity_buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer((values.size() + 7) / 8).Value(&validity_buffer));
+  memset(validity_buffer->mutable_data(), 0xFF, validity_buffer->size());
+
+  // Create the array with our misaligned data buffer
+  auto array_data = arrow::ArrayData::Make(type, static_cast<int64_t>(values.size()),
+                                           {validity_buffer, sliced_buffer});
+
+  return std::make_shared<arrow::Decimal128Array>(array_data);
+}
+
+// Test that decimal operations work correctly with 8-byte aligned (but not 16-byte
+// aligned) data
+TEST_F(TestDecimalAlignment, TestMisalignedDecimalSubtract) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 17;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = arrow::field("a", decimal_type);
+  auto field_b = arrow::field("b", decimal_type);
+  auto schema = arrow::schema({field_a, field_b});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpSubtract, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  auto res = arrow::field("res", output_type);
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto subtract =
+      TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(subtract, res);
+
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create test data
+  std::vector<Decimal128> values_a = {Decimal128(100), Decimal128(200), Decimal128(300)};
+  std::vector<Decimal128> values_b = {Decimal128(10), Decimal128(20), Decimal128(30)};
+
+  // Create arrays with 8-byte alignment (but NOT 16-byte aligned)
+  auto array_a = MakeMisalignedDecimalArray(decimal_type, values_a, 8);
+  auto array_b = MakeMisalignedDecimalArray(decimal_type, values_b, 8);
+
+  auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b});
+
+  // This should NOT crash even with misaligned data
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Verify results: 100-10=90, 200-20=180, 300-30=270
+  auto result = std::dynamic_pointer_cast<arrow::Decimal128Array>(outputs[0]);
+  ASSERT_NE(result, nullptr);
+  EXPECT_EQ(result->length(), 3);
+}
+
+// Create a misaligned output buffer for decimal128
+std::shared_ptr<arrow::ArrayData> MakeMisalignedDecimalOutput(
+    const std::shared_ptr<arrow::Decimal128Type>& type, int64_t num_records,
+    int alignment_offset) {
+  // Allocate data buffer with extra space for misalignment
+  int64_t data_size = num_records * 16;  // 16 bytes per Decimal128
+  int64_t buffer_size = data_size + 16;  // Extra space for offset
+
+  std::shared_ptr<arrow::Buffer> buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer));
+
+  uint8_t* raw_data = const_cast<uint8_t*>(buffer->data());
+  uintptr_t addr = reinterpret_cast<uintptr_t>(raw_data);
+
+  // Find offset to get to 8-byte aligned but not 16-byte aligned address
+  int offset_to_8 = (8 - (addr % 8)) % 8;
+  int current_16_alignment = (addr + offset_to_8) % 16;
+
+  int final_offset;
+  if (alignment_offset == 8) {
+    if (current_16_alignment == 0) {
+      final_offset = offset_to_8 + 8;
+    } else {
+      final_offset = offset_to_8;
+    }
+  } else {
+    final_offset = (16 - (addr % 16)) % 16;
+  }
+
+  // Verify alignment
+  uintptr_t data_addr = reinterpret_cast<uintptr_t>(raw_data + final_offset);
+  EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned";
+  if (alignment_offset == 8) {
+    EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned";
+  }
+
+  auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size);
+
+  // Create validity buffer
+  int64_t bitmap_size = (num_records + 7) / 8;
+  std::shared_ptr<arrow::Buffer> validity_buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(bitmap_size).Value(&validity_buffer));
+  memset(const_cast<uint8_t*>(validity_buffer->data()), 0xFF, validity_buffer->size());
+
+  return arrow::ArrayData::Make(type, num_records, {validity_buffer, sliced_buffer});
+}
+
+// Test that decimal STORES work correctly with 8-byte aligned (but not 16-byte aligned)
+// output
+TEST_F(TestDecimalAlignment, TestMisalignedDecimalStore) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 17;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = arrow::field("a", decimal_type);
+  auto field_b = arrow::field("b", decimal_type);
+  auto schema = arrow::schema({field_a, field_b});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpSubtract, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  auto res = arrow::field("res", output_type);
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto subtract =
+      TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(subtract, res);
+
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create ALIGNED input arrays (using standard Arrow allocation)
+  auto array_a = MakeArrowArrayDecimal(
+      decimal_type, {Decimal128(100), Decimal128(200), Decimal128(300)},
+      {true, true, true});
+  auto array_b = MakeArrowArrayDecimal(
+      decimal_type, {Decimal128(10), Decimal128(20), Decimal128(30)}, {true, true, true});
+
+  auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b});
+
+  // Create MISALIGNED output buffer (8-byte aligned but NOT 16-byte aligned)
+  auto output_data = MakeMisalignedDecimalOutput(output_type, 3, 8);
+
+  // This should NOT crash even with misaligned output buffer
+  status = projector->Evaluate(*in_batch, {output_data});
+  ASSERT_OK(status);
+
+  // Verify the output was written correctly
+  auto result = std::make_shared<arrow::Decimal128Array>(output_data);
+  EXPECT_EQ(result->length(), 3);
+}
+
+}  // namespace gandiva