Conversation

@folkertdev
Contributor

The `_mm256_madd_epi16` intrinsic first performs a pointwise widening multiplication and then adds adjacent elements. In SIMD versions of the adler32 checksum algorithm, a trivial multiplication by an all-ones vector is used to get just the widening and addition behavior.
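
For reference, the adler32-style use of the intrinsic looks roughly like the following sketch (the function name is just for illustration): with an all-ones second operand, `vpmaddwd` degenerates into a widening pairwise add of the 16-bit lanes.

```cpp
#include <immintrin.h>

// Minimal sketch: with an all-ones second operand, each 32-bit output lane is
// x[2i]*1 + x[2i+1]*1, i.e. the sum of two sign-extended 16-bit elements.
__m256i widening_pairwise_add(__m256i x) {
  const __m256i ones = _mm256_set1_epi16(1);
  return _mm256_madd_epi16(x, ones);
}
```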

In the Rust standard library, we like to implement intrinsics in terms of simpler building blocks, so that all backends can implement a small set of primitives instead of supporting all of LLVM's intrinsics. When we try that for `_mm256_madd_epi16`, it works in isolation, but when one of the arguments is an all-ones vector, the multiplication is optimized out long before the `vpmaddwd` instruction can be selected.
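
Roughly, the building-block decomposition is: widen both operands, multiply, then add adjacent pairs. The scalar reference below is a sketch only (`madd_epi16_reference` is a hypothetical name, not code from stdarch or this PR); it shows why, with an all-ones `b`, the multiply is a no-op that a generic optimizer removes before instruction selection ever sees a multiply to match.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Scalar semantics of _mm256_madd_epi16: widening multiply, then pairwise add.
// (The one wrapping edge case, 0x8000*0x8000 + 0x8000*0x8000, is ignored here.)
std::array<int32_t, 8> madd_epi16_reference(const std::array<int16_t, 16> &a,
                                            const std::array<int16_t, 16> &b) {
  std::array<int32_t, 8> out{};
  for (std::size_t i = 0; i < 8; ++i) {
    int32_t lo = int32_t{a[2 * i]} * int32_t{b[2 * i]};         // widen, multiply
    int32_t hi = int32_t{a[2 * i + 1]} * int32_t{b[2 * i + 1]};
    out[i] = lo + hi;                                           // add adjacent pair
  }
  return out;
}
```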

This PR directly recognizes the widening adjacent-addition pattern that adler32 uses, and manually inserts a trivial multiplication by an all-ones vector so that `vpmaddwd` can be selected. Experimentally, this optimization increases adler32 throughput from 41 GB/s to 67 GB/s (rust-lang/rust#150560 (comment)).

cc rust-lang/stdarch#1985 rust-lang/rust#150560

@llvmbot
Member

llvmbot commented Jan 1, 2026

@llvm/pr-subscribers-backend-x86

Author: Folkert de Vries (folkertdev)

Changes


Full diff: https://github.com/llvm/llvm-project/pull/174149.diff

2 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+28-11)
  • (modified) llvm/test/CodeGen/X86/combine-pmadd.ll (+29)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50df19b3e6e47..0bfbc3f47d12b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58033,7 +58033,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
   //               (extract_elt Mul, 3),
   //               (extract_elt Mul, 5),
   //                   ...
-  // and identify Mul.
+  // and identify Mul. Mul must be either ISD::MUL, or can be ISD::SIGN_EXTEND
+  // in which case we add a trivial multiplication by an all-ones vector.
   SDValue Mul;
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
     SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
@@ -58064,7 +58065,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
       // with 2X number of vector elements than the BUILD_VECTOR.
       // Both extracts must be from same MUL.
       Mul = Vec0L;
-      if (Mul.getOpcode() != ISD::MUL ||
+      if ((Mul.getOpcode() != ISD::MUL &&
+           Mul.getOpcode() != ISD::SIGN_EXTEND) ||
           Mul.getValueType().getVectorNumElements() != 2 * e)
         return SDValue();
     }
@@ -58073,16 +58075,31 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
       return SDValue();
   }
 
-  // Check if the Mul source can be safely shrunk.
-  ShrinkMode Mode;
-  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
-      Mode == ShrinkMode::MULU16)
-    return SDValue();
+  SDValue N0, N1;
+  if (Mul.getOpcode() == ISD::MUL) {
+    // Check if the Mul source can be safely shrunk.
+    ShrinkMode Mode;
+    if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+        Mode == ShrinkMode::MULU16)
+      return SDValue();
+
+    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+                                   VT.getVectorNumElements() * 2);
+    N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+    N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+  } else {
+    assert(Mul.getOpcode() == ISD::SIGN_EXTEND);
+
+    // Add a trivial multiplication with an all-ones vector so that we can make
+    // use of VPMADDWD.
+    N0 = Mul.getOperand(0);
+    EVT SrcVT = N0.getValueType();
+    N1 = DAG.getSplatVector(SrcVT, DL, DAG.getConstant(1, DL, MVT::i16));
 
-  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                                 VT.getVectorNumElements() * 2);
-  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
-  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+    if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i16 ||
+        SrcVT.getVectorNumElements() != 2 * VT.getVectorNumElements())
+      return SDValue();
+  }
 
   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
diff --git a/llvm/test/CodeGen/X86/combine-pmadd.ll b/llvm/test/CodeGen/X86/combine-pmadd.ll
index d9283aa8591fc..53f1374669ca5 100644
--- a/llvm/test/CodeGen/X86/combine-pmadd.ll
+++ b/llvm/test/CodeGen/X86/combine-pmadd.ll
@@ -331,3 +331,32 @@ define i1 @pmaddwd_pcmpgt_infinite_loop() {
   %8 = icmp eq i4 %7, 0
   ret i1 %8
 }
+
+; If the shuffle matches, but there is no multiply, introduce a trivial multiply by an all-ones vector.
+define <8 x i32> @introduce_trivial_multiply(<16 x i16> %x) {
+; SSE-LABEL: introduce_trivial_multiply:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; SSE-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE-NEXT:    pmaddwd %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: introduce_trivial_multiply:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: introduce_trivial_multiply:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    retq
+  %1 = sext <16 x i16> %x to <16 x i32>
+  %2 = shufflevector <16 x i32> %1, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %3 = shufflevector <16 x i32> %1, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %4 = add nsw <8 x i32> %2, %3
+  ret <8 x i32> %4
+}

@RKSimon self-requested a review January 1, 2026 16:21