NVIDIA
diff --git a/‎transformer_engine/jax/cpp_extensions/gemm.py‎
Lines changed: 81 additions & 134 deletions b/‎transformer_engine/jax/cpp_extensions/gemm.py‎
Lines changed: 81 additions & 134 deletions
@@ -41,93 +41,73 @@ class GroupedGemmPrimitive(BasePrimitive):
 
     name = "te_grouped_gemm_ffi"
     multiple_results = True
-    impl_static_args = (6, 7, 8, 9)
+    impl_static_args = ()
     inner_primitive = None
     outer_primitive = None
 
     @staticmethod
-    def abstract(
-        lhs_contig_aval,
-        lhs_scale_contig_aval,
-        rhs_contig_aval,
-        rhs_scale_contig_aval,
-        bias_contig_aval,
-        dim_list_aval,
-        *,
-        num_gemms,
-        scaling_mode,
-        out_dtype,
-        out_flat_size,
-    ):
-        del lhs_contig_aval, lhs_scale_contig_aval
-        del rhs_contig_aval, rhs_scale_contig_aval
-        del bias_contig_aval, dim_list_aval
-        del num_gemms, scaling_mode
-        out_flat_aval = jax.core.ShapedArray(shape=(out_flat_size,), dtype=out_dtype)
-        wkspace_size = get_cublas_workspace_size_bytes() * num_cublas_streams
-        wkspace_aval = jax.core.ShapedArray(shape=(wkspace_size,), dtype=jnp.uint8)
-        return (out_flat_aval, wkspace_aval)
+    def abstract(*args, num_gemms, scaling_mode, out_dtype, has_bias):
+        """
+        Args:
+            *args: Size num_gemms * 4 or num_gemms * 5 depending on has_bias:
+                args[  0         :   num_gemms] are the lhs tensors,
+                args[  num_gemms : 2*num_gemms] are the rhs tensors,
+                args[2*num_gemms : 3*num_gemms] are the lhs scale_inv tensors,
+                args[3*num_gemms : 4*num_gemms] are the rhs scale_inv tensors,
+                args[4*num_gemms : 5*num_gemms] are the bias tensors if has_bias is True.
+            num_gemms: Number of GEMM operations to perform.
+            scaling_mode: Scaling mode for the GEMM operations.
+            out_dtype: Data type of the output tensors.
+            has_bias: Boolean indicating if bias tensors are provided.
+
+        Returns:
+           A tuple of ShapedArray objects of size num_gemms+1:
+               ret[0 : num_gemms]: GEMM output tensors,
+               ret[num_gemms]:workspace tensor.
+        """
+        del scaling_mode
+        expected_num_args = 5 * num_gemms if has_bias else 4 * num_gemms
+        assert (
+            len(args) == expected_num_args
+        ), f"Expected {expected_num_args} input arguments, but got {len(args)}"
+        A_list = args[0:num_gemms]
+        B_list = args[num_gemms : 2 * num_gemms]
+        # A and B have shapes [1, m, k] and [1, n, k]
+        out_list_aval = tuple(
+            jax.core.ShapedArray((A.shape[1], B.shape[1]), dtype=out_dtype)
+            for A, B in zip(A_list, B_list)
+        )
+        workspace_size = get_cublas_workspace_size_bytes() * num_cublas_streams
+        workspace_aval = jax.core.ShapedArray(shape=(workspace_size,), dtype=jnp.uint8)
+        return (*out_list_aval, workspace_aval)
 
     @staticmethod
     def outer_abstract(*args, **kwargs):
         (out_aval, _) = GroupedGemmPrimitive.abstract(*args, **kwargs)
         return out_aval
 
     @staticmethod
-    def lowering(
-        ctx,
-        lhs_contig,
-        lhs_scale_inv_contig,
-        rhs_contig,
-        rhs_scale_inv_contig,
-        bias_contig,
-        dim_list,
-        *,
-        num_gemms,
-        scaling_mode,
-        out_dtype,
-        out_flat_size,
-    ) -> jnp.ndarray:
-        del out_dtype, out_flat_size
+    def lowering(ctx, *args, num_gemms, scaling_mode, out_dtype, has_bias):
+        del out_dtype
         return jax.ffi.ffi_lowering(GroupedGemmPrimitive.name)(
             ctx,
-            lhs_contig,
-            lhs_scale_inv_contig,
-            rhs_contig,
-            rhs_scale_inv_contig,
-            bias_contig,
-            dim_list,
+            *args,
             num_gemms=num_gemms,
-            scaling_mode=scaling_mode.value,
+            scaling_mode=int(scaling_mode),
+            has_bias=has_bias,
         )
 
     @staticmethod
-    def impl(
-        lhs_contig,
-        lhs_scale_inv_contig,
-        rhs_contig,
-        rhs_scale_inv_contig,
-        bias_contig,
-        dim_list,
-        num_gemms,
-        scaling_mode,
-        out_dtype,
-        out_flat_size,
-    ) -> jnp.ndarray:
+    def impl(*args, num_gemms, scaling_mode, out_dtype, has_bias):
         assert GroupedGemmPrimitive.inner_primitive is not None
         out = GroupedGemmPrimitive.inner_primitive.bind(
-            lhs_contig,
-            lhs_scale_inv_contig,
-            rhs_contig,
-            rhs_scale_inv_contig,
-            bias_contig,
-            dim_list,
+            *args,
             num_gemms=num_gemms,
-            scaling_mode=scaling_mode,
+            scaling_mode=scaling_mode.value,
             out_dtype=out_dtype,
-            out_flat_size=out_flat_size,
+            has_bias=has_bias,
         )
-        return out[0]  # out is [out_flat, wkspace], only return out_flat
+        return out[:-1]  # out is [out_list, wkspace], only return out_list
 
 
 register_primitive(GroupedGemmPrimitive)
@@ -366,6 +346,7 @@ def swizzled_scale(scales):
     rows, cols = scales.shape
     scales = scales.reshape(rows // 128, 4, 32, cols // 4, 4)
     scales = jnp.transpose(scales, (0, 3, 2, 1, 4))
+    scales = scales.reshape(rows, cols)
     return scales
 
 
@@ -380,18 +361,12 @@ def grouped_gemm(
         len(lhs_list) == len(rhs_list) == len(contracting_dims_list)
     ), "lhs_list, rhs_list, contracting_dims_list must have the same length"
 
-    # Flatten inputs and save their shapes
-    num_gemms = len(lhs_list)
-    out_flat_size = 0
-    dims = []
-    lhs_contig_ = []
-    rhs_contig_ = []
-    lhs_scale_inv_contig_ = []
-    rhs_scale_inv_contig_ = []
-    bias_contig_ = []
-    out_offsets = []
-    remain_shape_list = []
     num_gemms = len(lhs_list)
+    lhs_list_ = []
+    rhs_list_ = []
+    lhs_sinv_list_ = []
+    rhs_sinv_list_ = []
+    bias_list_ = []
     for i in range(num_gemms):
         lhs = lhs_list[i]
         rhs = rhs_list[i]
@@ -402,7 +377,7 @@ def grouped_gemm(
             lhs_shape = lhs.data.shape
             rhs_shape = rhs.data.shape
             out_dtype = lhs.dq_dtype
-            # For ScaledTensors and NVTE_DELAYED_TENSOR_SCALING, need to handle internal data_layout
+            # For ScaledTensors and DELAYED_TENSOR_SCALING, need to handle internal data_layout
             if lhs.scaling_mode == ScalingMode.DELAYED_TENSOR_SCALING:
                 assert not (
                     lhs.data.dtype == jnp.float8_e5m2 and rhs.data.dtype == jnp.float8_e5m2
@@ -427,6 +402,7 @@ def grouped_gemm(
         lhs_remain_shape = _calculate_remaining_shape(lhs_shape, lhs_contract)
         rhs_remain_shape = _calculate_remaining_shape(rhs_shape, rhs_contract)
 
+        # Note: do not squeeze() for {lhs, rhs}_3d, it will trigger a D2D memcpy
         if scaling_mode == ScalingMode.NO_SCALING:
             lhs_3d = _shape_normalization(lhs, lhs_dn)
             rhs_3d = _shape_normalization(rhs, rhs_dn)
@@ -438,13 +414,13 @@ def grouped_gemm(
             rhs_3d = _shape_normalization(rhs.data, rhs_dn)
             lhs_scale_inv = _shape_normalization(lhs.scale_inv, lhs_dn)
             rhs_scale_inv = _shape_normalization(rhs.scale_inv, rhs_dn)
+            # swizzled_scale requires a matrix
             lhs_scale_inv = swizzled_scale(lhs_scale_inv.squeeze())
             rhs_scale_inv = swizzled_scale(rhs_scale_inv.squeeze())
         else:
             raise NotImplementedError("Unsupported ScalingMode: {scaling_mode}")
 
-        # Note: if _shape_normalization() is updated to support non-TN, need to update here
-        # already_transposed doesn't matter for the output shape
+        # Note: already_transposed doesn't matter for the output shape
         # x.shape = [B, D1, D2]
         # contracting_dims = (2, )    --> output.shape = [1, B * D1, D2]
         # contracting_dims = (0, 1, ) --> output.shape = [1, D2, B * D1]
@@ -455,66 +431,37 @@ def grouped_gemm(
         bn = rhs_remain_shape[0]
         kl = lhs_3d.shape[-1]
         kr = rhs_3d.shape[-1]
-        remain_shape_list.append(((bm,), (bn,)))
-        assert kl == kr, f"lhs_3d.shape[-1] ({kl}) != rhs_3d.shape[-1] ({kr})"
-        k = kl
-
-        if (bm % 16 != 0) or (bn % 16 != 0) or (k % 16 != 0):
-            print(f"grouped_gemm input pair {i} has invalid problem shape for lowering: ")
-            print(
-                f"m = {bm}, n = {bn}, k = {k}; cuBLAS requires the problem shapes being multiples"
-                " of 16"
-            )
-            assert bm % 16 == 0 and bn % 16 == 0 and k % 16 == 0
-
-        dims.append((bm, bn, k))
-        lhs_contig_.append(lhs_3d.reshape(-1))
-        rhs_contig_.append(rhs_3d.reshape(-1))
+        assert kl == kr, f"After shape normalization, contracting dim size mismatch: {kl} != {kr}"
+        if (bm % 16 != 0) or (bn % 16 != 0) or (kl % 16 != 0):
+            print("grouped_gemm input pair {i} has invalid problem shape for lowering: ")
+            print(f"m = {bm}, n = {bn}, k = {kl}; ")
+            print("cuBLAS requires the problem shapes being multiples of 16")
+            assert (bm % 16 == 0) and (bn % 16 == 0) and (kl % 16 == 0)
+
+        lhs_list_.append(lhs_3d)
+        rhs_list_.append(rhs_3d)
         if scaling_mode == ScalingMode.NO_SCALING:
-            lhs_scale_inv_contig_.append(jnp.ones(1, dtype=jnp.float32))
-            rhs_scale_inv_contig_.append(jnp.ones(1, dtype=jnp.float32))
+            lhs_sinv_list_.append(jnp.ones(1, dtype=jnp.float32))
+            rhs_sinv_list_.append(jnp.ones(1, dtype=jnp.float32))
         if scaling_mode == ScalingMode.DELAYED_TENSOR_SCALING:
-            lhs_scale_inv_contig_.append(lhs.scale_inv.reshape(-1))
-            rhs_scale_inv_contig_.append(rhs.scale_inv.reshape(-1))
+            lhs_sinv_list_.append(lhs.scale_inv)
+            rhs_sinv_list_.append(rhs.scale_inv)
         if scaling_mode == ScalingMode.MXFP8_1D_SCALING:
-            lhs_scale_inv_contig_.append(lhs_scale_inv.reshape(-1))
-            rhs_scale_inv_contig_.append(rhs_scale_inv.reshape(-1))
+            lhs_sinv_list_.append(lhs_scale_inv)
+            rhs_sinv_list_.append(rhs_scale_inv)
         if bias_list is not None:
-            bias_contig_.append(bias_list[i].reshape(-1))
-        out_flat_size += bm * bn
-        out_offsets.append(out_flat_size)
-
-    lhs_contig = jnp.concatenate(lhs_contig_)
-    rhs_contig = jnp.concatenate(rhs_contig_)
-    lhs_scale_inv_contig = jnp.concatenate(lhs_scale_inv_contig_)
-    rhs_scale_inv_contig = jnp.concatenate(rhs_scale_inv_contig_)
-    bias_contig = jnp.empty(0) if bias_list is None else jnp.concatenate(bias_contig_)
-    dim_list = jnp.array(dims, dtype=jnp.int32)
-
-    # TE/common does not support NVTE_NO_SCALING yet
-    # It expects NVTE_DELAYED_TENSOR_SCALING as default for FP32, BF16, FP16
-    if scaling_mode == ScalingMode.NO_SCALING:
-        scaling_mode = ScalingMode.DELAYED_TENSOR_SCALING
-
-    # Perform batched GEMM on flattened inputs
-    out_contig = GroupedGemmPrimitive.outer_primitive.bind(
-        lhs_contig,
-        lhs_scale_inv_contig,
-        rhs_contig,
-        rhs_scale_inv_contig,
-        bias_contig,
-        dim_list,
+            bias_list_.append(bias_list[i])
+
+    out_list = GroupedGemmPrimitive.outer_primitive.bind(
+        *lhs_list_,
+        *rhs_list_,
+        *lhs_sinv_list_,
+        *rhs_sinv_list_,
+        *bias_list_,
         num_gemms=num_gemms,
-        scaling_mode=scaling_mode.value,
+        scaling_mode=scaling_mode,
         out_dtype=out_dtype,
-        out_flat_size=out_flat_size,
+        has_bias=1 if bias_list is not None else 0,
     )
 
-    # Split the output back into tensors
-    out_offsets = jnp.array(out_offsets)
-    out_flat_list = jnp.split(out_contig, out_offsets[:-1])
-    out_tensors = []
-    for out_flat, (lhs_remain_shape, rhs_remain_shape) in zip(out_flat_list, remain_shape_list):
-        out_tensors.append(out_flat.reshape(*lhs_remain_shape, *rhs_remain_shape))
-
-    return out_tensors
+    return out_list