dremio · lriggs · Feb 10, 2026 · Jan 25, 2026
diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc
@@ -401,8 +401,13 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count,
   auto output_type_id = output->Type()->id();
   if (output_type_id == arrow::Type::BOOL) {
     SetPackedBitValue(output_ref, loop_var, output_value->data());
-  } else if (arrow::is_primitive(output_type_id) ||
-             output_type_id == arrow::Type::DECIMAL) {
+  } else if (output_type_id == arrow::Type::DECIMAL) {
+    // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned.
+    // Use CreateAlignedStore with 8-byte alignment to match Arrow's actual alignment.
+    auto slot_offset =
+        builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var);
+    builder->CreateAlignedStore(output_value->data(), slot_offset, llvm::MaybeAlign(8));
+  } else if (arrow::is_primitive(output_type_id)) {
     auto slot_offset =
         builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var);
     builder->CreateStore(output_value->data(), slot_offset);
@@ -643,7 +648,12 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) {
 
     case arrow::Type::DECIMAL: {
       auto slot_offset = builder->CreateGEP(types->i128_type(), slot_ref, slot_index);
-      slot_value = builder->CreateLoad(types->i128_type(), slot_offset, dex.FieldName());
+      // Arrow decimal128 data is only 8-byte aligned, not 16-byte aligned.
+      // Using CreateLoad with default alignment (16 for i128) causes crashes on
+      // misaligned data. Use CreateAlignedLoad with 8-byte alignment to match Arrow's
+      // actual alignment.
+      slot_value = builder->CreateAlignedLoad(
+          types->i128_type(), slot_offset, llvm::MaybeAlign(8), false, dex.FieldName());
       lvalue = generator_->BuildDecimalLValue(slot_value, dex.FieldType());
       break;
     }

diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt
@@ -21,6 +21,7 @@ add_gandiva_test(projector-test
                  binary_test.cc
                  boolean_expr_test.cc
                  date_time_test.cc
+                 decimal_alignment_test.cc
                  decimal_single_test.cc
                  decimal_test.cc
                  filter_project_test.cc

diff --git a/cpp/src/gandiva/tests/decimal_alignment_test.cc b/cpp/src/gandiva/tests/decimal_alignment_test.cc
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Test for decimal128 alignment issue fix.
+// Arrow decimal128 data may be 8-byte aligned but not 16-byte aligned.
+// This test verifies that Gandiva handles such data correctly.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+
+#include "gandiva/decimal_type_util.h"
+#include "gandiva/projector.h"
+#include "gandiva/tests/test_util.h"
+#include "gandiva/tree_expr_builder.h"
+
+using arrow::Decimal128;
+
+namespace gandiva {
+
+class TestDecimalAlignment : public ::testing::Test {
+ public:
+  void SetUp() { pool_ = arrow::default_memory_pool(); }
+
+ protected:
+  arrow::MemoryPool* pool_;
+};
+
+// Create a decimal128 array with data at a specific alignment offset
+// This simulates the real-world scenario where Arrow data from external sources
+// (like JNI/Java) may not be 16-byte aligned.
+std::shared_ptr<arrow::Array> MakeMisalignedDecimalArray(
+    const std::shared_ptr<arrow::Decimal128Type>& type,
+    const std::vector<Decimal128>& values, int alignment_offset) {
+  // Allocate buffer with extra space for misalignment
+  int64_t data_size = values.size() * 16;  // 16 bytes per Decimal128
+  int64_t buffer_size = data_size + 16;    // Extra space for offset
+
+  std::shared_ptr<arrow::Buffer> buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer));
+
+  // Calculate the starting offset to achieve desired alignment
+  // We want the data to be 8-byte aligned but NOT 16-byte aligned
+  uint8_t* raw_data = buffer->mutable_data();
+  uintptr_t addr = reinterpret_cast<uintptr_t>(raw_data);
+
+  // Find offset to get to 8-byte aligned but not 16-byte aligned address
+  int offset_to_8 = (8 - (addr % 8)) % 8;
+  int current_16_alignment = (addr + offset_to_8) % 16;
+
+  int final_offset;
+  if (alignment_offset == 8) {
+    // Want 8-byte aligned but NOT 16-byte aligned
+    if (current_16_alignment == 0) {
+      final_offset = offset_to_8 + 8;  // Add 8 to break 16-byte alignment
+    } else {
+      final_offset = offset_to_8;
+    }
+  } else {
+    // Want 16-byte aligned
+    final_offset = (16 - (addr % 16)) % 16;
+  }
+
+  // Copy decimal values to the offset location
+  uint8_t* data_start = raw_data + final_offset;
+  for (size_t i = 0; i < values.size(); i++) {
+    memcpy(data_start + i * 16, values[i].ToBytes().data(), 16);
+  }
+
+  // Verify alignment
+  uintptr_t data_addr = reinterpret_cast<uintptr_t>(data_start);
+  EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned";
+  if (alignment_offset == 8) {
+    EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned";
+  }
+
+  // Create a sliced buffer starting at our offset
+  auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size);
+
+  // Create validity buffer (all valid)
+  std::shared_ptr<arrow::Buffer> validity_buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer((values.size() + 7) / 8).Value(&validity_buffer));
+  memset(validity_buffer->mutable_data(), 0xFF, validity_buffer->size());
+
+  // Create the array with our misaligned data buffer
+  auto array_data = arrow::ArrayData::Make(type, static_cast<int64_t>(values.size()),
+                                           {validity_buffer, sliced_buffer});
+
+  return std::make_shared<arrow::Decimal128Array>(array_data);
+}
+
+// Test that decimal operations work correctly with 8-byte aligned (but not 16-byte
+// aligned) data
+TEST_F(TestDecimalAlignment, TestMisalignedDecimalSubtract) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 17;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = arrow::field("a", decimal_type);
+  auto field_b = arrow::field("b", decimal_type);
+  auto schema = arrow::schema({field_a, field_b});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpSubtract, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  auto res = arrow::field("res", output_type);
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto subtract =
+      TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(subtract, res);
+
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create test data
+  std::vector<Decimal128> values_a = {Decimal128(100), Decimal128(200), Decimal128(300)};
+  std::vector<Decimal128> values_b = {Decimal128(10), Decimal128(20), Decimal128(30)};
+
+  // Create arrays with 8-byte alignment (but NOT 16-byte aligned)
+  auto array_a = MakeMisalignedDecimalArray(decimal_type, values_a, 8);
+  auto array_b = MakeMisalignedDecimalArray(decimal_type, values_b, 8);
+
+  auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b});
+
+  // This should NOT crash even with misaligned data
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  ASSERT_OK(status);
+
+  // Verify results: 100-10=90, 200-20=180, 300-30=270
+  auto result = std::dynamic_pointer_cast<arrow::Decimal128Array>(outputs[0]);
+  ASSERT_NE(result, nullptr);
+  EXPECT_EQ(result->length(), 3);
+}
+
+// Create a misaligned output buffer for decimal128
+std::shared_ptr<arrow::ArrayData> MakeMisalignedDecimalOutput(
+    const std::shared_ptr<arrow::Decimal128Type>& type, int64_t num_records,
+    int alignment_offset) {
+  // Allocate data buffer with extra space for misalignment
+  int64_t data_size = num_records * 16;  // 16 bytes per Decimal128
+  int64_t buffer_size = data_size + 16;  // Extra space for offset
+
+  std::shared_ptr<arrow::Buffer> buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(buffer_size).Value(&buffer));
+
+  uint8_t* raw_data = const_cast<uint8_t*>(buffer->data());
+  uintptr_t addr = reinterpret_cast<uintptr_t>(raw_data);
+
+  // Find offset to get to 8-byte aligned but not 16-byte aligned address
+  int offset_to_8 = (8 - (addr % 8)) % 8;
+  int current_16_alignment = (addr + offset_to_8) % 16;
+
+  int final_offset;
+  if (alignment_offset == 8) {
+    if (current_16_alignment == 0) {
+      final_offset = offset_to_8 + 8;
+    } else {
+      final_offset = offset_to_8;
+    }
+  } else {
+    final_offset = (16 - (addr % 16)) % 16;
+  }
+
+  // Verify alignment
+  uintptr_t data_addr = reinterpret_cast<uintptr_t>(raw_data + final_offset);
+  EXPECT_EQ(data_addr % 8, 0) << "Data should be 8-byte aligned";
+  if (alignment_offset == 8) {
+    EXPECT_NE(data_addr % 16, 0) << "Data should NOT be 16-byte aligned";
+  }
+
+  auto sliced_buffer = arrow::SliceBuffer(buffer, final_offset, data_size);
+
+  // Create validity buffer
+  int64_t bitmap_size = (num_records + 7) / 8;
+  std::shared_ptr<arrow::Buffer> validity_buffer;
+  ARROW_EXPECT_OK(arrow::AllocateBuffer(bitmap_size).Value(&validity_buffer));
+  memset(const_cast<uint8_t*>(validity_buffer->data()), 0xFF, validity_buffer->size());
+
+  return arrow::ArrayData::Make(type, num_records, {validity_buffer, sliced_buffer});
+}
+
+// Test that decimal STORES work correctly with 8-byte aligned (but not 16-byte aligned)
+// output
+TEST_F(TestDecimalAlignment, TestMisalignedDecimalStore) {
+  constexpr int32_t precision = 38;
+  constexpr int32_t scale = 17;
+  auto decimal_type = std::make_shared<arrow::Decimal128Type>(precision, scale);
+  auto field_a = arrow::field("a", decimal_type);
+  auto field_b = arrow::field("b", decimal_type);
+  auto schema = arrow::schema({field_a, field_b});
+
+  Decimal128TypePtr output_type;
+  auto status = DecimalTypeUtil::GetResultType(
+      DecimalTypeUtil::kOpSubtract, {decimal_type, decimal_type}, &output_type);
+  ASSERT_OK(status);
+
+  auto res = arrow::field("res", output_type);
+  auto node_a = TreeExprBuilder::MakeField(field_a);
+  auto node_b = TreeExprBuilder::MakeField(field_b);
+  auto subtract =
+      TreeExprBuilder::MakeFunction("subtract", {node_a, node_b}, output_type);
+  auto expr = TreeExprBuilder::MakeExpression(subtract, res);
+
+  std::shared_ptr<Projector> projector;
+  status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
+  ASSERT_OK(status);
+
+  // Create ALIGNED input arrays (using standard Arrow allocation)
+  auto array_a = MakeArrowArrayDecimal(
+      decimal_type, {Decimal128(100), Decimal128(200), Decimal128(300)},
+      {true, true, true});
+  auto array_b = MakeArrowArrayDecimal(
+      decimal_type, {Decimal128(10), Decimal128(20), Decimal128(30)}, {true, true, true});
+
+  auto in_batch = arrow::RecordBatch::Make(schema, 3, {array_a, array_b});
+
+  // Create MISALIGNED output buffer (8-byte aligned but NOT 16-byte aligned)
+  auto output_data = MakeMisalignedDecimalOutput(output_type, 3, 8);
+
+  // This should NOT crash even with misaligned output buffer
+  status = projector->Evaluate(*in_batch, {output_data});
+  ASSERT_OK(status);
+
+  // Verify the output was written correctly
+  auto result = std::make_shared<arrow::Decimal128Array>(output_data);
+  EXPECT_EQ(result->length(), 3);
+}
+
+}  // namespace gandiva