From 883eac9ac829f1f39ec66a32f75152831fc0c164 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Wed, 15 Apr 2026 12:18:05 +0200 Subject: [PATCH 01/27] feat: add `awkward_IndexedArray_overlay_mask` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 3 ++- src/awkward/_connect/cuda/_compute.py | 9 +++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index 50768614b2..7c6778bbeb 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -19,7 +19,7 @@ "awkward_ListArray_broadcast_tooffsets", "awkward_ListArray_compact_offsets", "awkward_ListOffsetArray_flatten_offsets", - "awkward_IndexedArray_overlay_mask", + # "awkward_IndexedArray_overlay_mask", "awkward_ByteMaskedArray_numnull", "awkward_IndexedArray_numnull", "awkward_IndexedArray_numnull_parents", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 668e737581..bfeb9a78e8 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -100,6 +100,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_reduce_prod_bool", "awkward_reduce_count_64", "awkward_reduce_countnonzero", + "awkward_IndexedArray_overlay_mask", ) def _get_cuda_compute_impl(self, kernel_name: str): @@ -147,7 +148,7 @@ def _get_cuda_compute_impl(self, kernel_name: str): if kernel_name == "awkward_reduce_countnonzero": return cuda_compute.awkward_reduce_countnonzero - return None + return getattr(cuda_compute, kernel_name, None) def prepare_reducer(self, reducer: ak._reducers.Reducer) -> ak._reducers.Reducer: from awkward._connect.cuda import get_cuda_compute_reducer diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 0c78a3d392..efa129428a 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -629,3 +629,12 @@ def 
segment_reduce_count_nonzero(segment_id): segment_ids = CountingIterator(type_wrapper(0)) # TODO: try using segmented_reduce instead when https://github.com/NVIDIA/cccl/issues/6171 is fixed unary_transform(segment_ids, result, segment_reduce_count_nonzero, outlength) + + +# Overlays a mask onto an index array: masked positions become -1, unmasked positions keep their original index value. +def awkward_IndexedArray_overlay_mask(toindex, mask, fromindex, length): + def transform(i): + return -1 if mask[i] else fromindex[i] + + indices = CountingIterator(cp.int64(0)) + unary_transform(indices, toindex, transform, length) From ce870c79f7ef8dd4929f18dffadc278d6d303207 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Thu, 16 Apr 2026 12:28:58 +0200 Subject: [PATCH 02/27] feat: add `awkward_IndexedArray_reduce_next_64` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 1 + src/awkward/_connect/cuda/_compute.py | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index 7c6778bbeb..a2c940f7d3 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -97,7 +97,7 @@ "awkward_IndexedArray_local_preparenext_64", "awkward_IndexedArray_ranges_next_64", "awkward_IndexedArray_ranges_carry_next_64", - "awkward_IndexedArray_reduce_next_64", + # "awkward_IndexedArray_reduce_next_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64", "awkward_IndexedOptionArray_rpad_and_clip_mask_axis1", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index bfeb9a78e8..a3abac6cfd 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -101,6 +101,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_reduce_count_64", "awkward_reduce_countnonzero", 
"awkward_IndexedArray_overlay_mask", + "awkward_IndexedArray_reduce_next_64", ) def _get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index efa129428a..21b60610a9 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -638,3 +638,28 @@ def transform(i): indices = CountingIterator(cp.int64(0)) unary_transform(indices, toindex, transform, length) + + +# Skips masked (-1) entries and packs the remaining valid entries into nextcarry and nextparents, tracking where each ended up in outindex. +def awkward_IndexedArray_reduce_next_64( + nextcarry, nextparents, outindex, index, parents, length +): + if length == 0: + return + + # Compute cumulative count of valid (non-negative) indices to determine compact output positions + # this needs to be done before going through all the indices in parallel later + scan = cp.cumsum(index >= 0) + + def scatter_and_fill(i): + if index[i] >= 0: + # Map valid entry to its compacted position + k = scan[i] - 1 + nextcarry[k] = index[i] + nextparents[k] = parents[i] + return k + # Masked entries get -1 in outindex + return -1 + + indices = CountingIterator(cp.int64(0)) + unary_transform(indices, outindex, scatter_and_fill, length) From ed40791b716aba6b7bfcd6d9416cb940dcdf42ea Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Fri, 17 Apr 2026 12:15:27 +0200 Subject: [PATCH 03/27] feat: add `IndexedArray_reduce_next_nonlocal_nextshifts_64` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 1 + src/awkward/_connect/cuda/_compute.py | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index a2c940f7d3..c1c6b2de50 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -98,7 +98,7 @@ "awkward_IndexedArray_ranges_next_64", 
"awkward_IndexedArray_ranges_carry_next_64", # "awkward_IndexedArray_reduce_next_64", - "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", + # "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_fromshifts_64", "awkward_IndexedOptionArray_rpad_and_clip_mask_axis1", "awkward_ListOffsetArray_local_preparenext_64", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index a3abac6cfd..3522bdda99 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -102,6 +102,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_reduce_countnonzero", "awkward_IndexedArray_overlay_mask", "awkward_IndexedArray_reduce_next_64", + "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", ) def _get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 21b60610a9..60108732cd 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -663,3 +663,28 @@ def scatter_and_fill(i): indices = CountingIterator(cp.int64(0)) unary_transform(indices, outindex, scatter_and_fill, length) + + +# For each valid (non-negative) entry at position i, records the number of null (negative) entries +# that appeared before it. The k-th valid entry gets nextshifts[k] = count of nulls before position i. +# For example, for index = [0, 1, 2, -1, 3, -1, 4] → nextshifts = [0, 0, 0, 1, 2]. +def awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64(nextshifts, index, length): + if length == 0: + return + + index_slice = index[:length] + + # cumsum of (index < 0) gives the running null count at each position. 
+ # this is basically equivalent to calling cuda.compute.inclusive_scan on index_slice < 0 + null_cumsum = cp.cumsum(index_slice < 0) + _ = cp.empty(length, dtype=cp.int64) + + def scatter(i): + null_count = null_cumsum[i] + if index_slice[i] >= 0: + nextshifts[i - null_count] = null_count # output slot = i - null_count + # return a dummy value otherwise + return cp.int64(0) + + indices = CountingIterator(cp.int64(0)) + unary_transform(indices, _, scatter, length) From ed9f4ee0f57345c15c35e7da188884d8589f3f46 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Fri, 17 Apr 2026 17:45:59 +0200 Subject: [PATCH 04/27] feat: add `ByteMaskedArray_getitem_nextcarry` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 5 +++++ src/awkward/_connect/cuda/_compute.py | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index c1c6b2de50..65fa9ec70b 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -79,7 +79,7 @@ "awkward_NumpyArray_reduce_adjust_starts_shifts_64", "awkward_RegularArray_getitem_next_at", "awkward_BitMaskedArray_to_IndexedOptionArray", - "awkward_ByteMaskedArray_getitem_nextcarry", + # "awkward_ByteMaskedArray_getitem_nextcarry", "awkward_ByteMaskedArray_getitem_nextcarry_outindex", "awkward_ByteMaskedArray_reduce_next_64", "awkward_ByteMaskedArray_reduce_next_nonlocal_nextshifts_64", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 3522bdda99..6cdae87c9e 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -75,6 +75,10 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: Other kernels that are currently supported: - awkward_sort - awkward_argsort (future) + - awkward_IndexedArray_overlay_mask + - awkward_IndexedArray_reduce_next_64 + - awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64 + - 
awkward_ByteMaskedArray_getitem_nextcarry These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -103,6 +107,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_IndexedArray_overlay_mask", "awkward_IndexedArray_reduce_next_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", + "awkward_ByteMaskedArray_getitem_nextcarry", ) def _get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 60108732cd..1a9427936f 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -688,3 +688,20 @@ def scatter(i): indices = CountingIterator(cp.int64(0)) unary_transform(indices, _, scatter, length) + + +# Packs valid entries (where (mask[i] != 0) == validwhen) into tocarry in order. +# mask = [0, 1, 0, 1, 1], validwhen=True → tocarry = [1, 3, 4] +# mask = [0, 1, 0, 1, 1], validwhen=False → tocarry = [0, 2] +# mask = [0, 1, 0, 1, 1, -1, 1], validwhen=True → tocarry = [1, 3, 4, 5, 6] +def awkward_ByteMaskedArray_getitem_nextcarry(tocarry, mask, length, validwhen): + if length == 0: + return + + # valid = ((mask[:length] != 0) == validwhen) + # valid[i] is 1 when the masked element passes the validwhen condition. 
+ + # get the indices of the valid entries using cp.nonzero + valid_indices = cp.nonzero((mask[:length] != 0) == validwhen)[0] + # in case tocarry is not exactly the right size, allocate it in two steps like this + tocarry[: len(valid_indices)] = valid_indices From 394d017f12fcc3a40e430f7b0fdf082a834dfa8f Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Mon, 20 Apr 2026 13:52:02 +0200 Subject: [PATCH 05/27] feat: add `awkward_ByteMaskedArray_numnull` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 2 ++ src/awkward/_connect/cuda/_compute.py | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index 65fa9ec70b..de88fb08c4 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -20,7 +20,7 @@ "awkward_ListArray_compact_offsets", "awkward_ListOffsetArray_flatten_offsets", # "awkward_IndexedArray_overlay_mask", - "awkward_ByteMaskedArray_numnull", + # "awkward_ByteMaskedArray_numnull", "awkward_IndexedArray_numnull", "awkward_IndexedArray_numnull_parents", "awkward_IndexedArray_numnull_unique_64", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 6cdae87c9e..408cc85fc4 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -79,6 +79,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_IndexedArray_reduce_next_64 - awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64 - awkward_ByteMaskedArray_getitem_nextcarry + - awkward_ByteMaskedArray_numnull These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -108,6 +109,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_IndexedArray_reduce_next_64", "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", "awkward_ByteMaskedArray_getitem_nextcarry", + "awkward_ByteMaskedArray_numnull", ) def 
_get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 1a9427936f..d02edc46c4 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -691,6 +691,7 @@ def scatter(i): # Packs valid entries (where (mask[i] != 0) == validwhen) into tocarry in order. +# Examples: # mask = [0, 1, 0, 1, 1], validwhen=True → tocarry = [1, 3, 4] # mask = [0, 1, 0, 1, 1], validwhen=False → tocarry = [0, 2] # mask = [0, 1, 0, 1, 1, -1, 1], validwhen=True → tocarry = [1, 3, 4, 5, 6] @@ -705,3 +706,11 @@ def awkward_ByteMaskedArray_getitem_nextcarry(tocarry, mask, length, validwhen): valid_indices = cp.nonzero((mask[:length] != 0) == validwhen)[0] # in case tocarry is not exactly the right size, allocate it in two steps like this tocarry[: len(valid_indices)] = valid_indices + + +# Counts null (invalid) entries: positions where (mask[i] != 0) != validwhen. +# Examples: +# mask = [0, 1, 0, 1, 1], validwhen=True → numnull = 2 (positions 0 and 2 are null) +# mask = [0, 1, 0, 1, 1], validwhen=False → numnull = 3 (positions 1, 3 and 4 are null) +def awkward_ByteMaskedArray_numnull(numnull, mask, length, validwhen): + numnull[0] = cp.count_nonzero((mask[:length] != 0) != validwhen) From 0ed21758dc233c5dc864c19de575ea783e6e5e74 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 21 Apr 2026 11:26:51 +0200 Subject: [PATCH 06/27] feat: add `awkward_RegularArray_getitem_jagged_expand` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 2 ++ src/awkward/_connect/cuda/_compute.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index de88fb08c4..d6a840e3ad 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -51,7 +51,7 @@ "awkward_RegularArray_reduce_local_nextparents_64", 
"awkward_RegularArray_reduce_nonlocal_preparenext_64", "awkward_missing_repeat", - "awkward_RegularArray_getitem_jagged_expand", + # "awkward_RegularArray_getitem_jagged_expand", "awkward_ListArray_combinations_length", "awkward_ListArray_combinations", "awkward_RegularArray_combinations_64", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 408cc85fc4..a3525870ee 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -80,6 +80,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64 - awkward_ByteMaskedArray_getitem_nextcarry - awkward_ByteMaskedArray_numnull + - awkward_RegularArray_getitem_jagged_expand These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -110,6 +111,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_IndexedArray_reduce_next_nonlocal_nextshifts_64", "awkward_ByteMaskedArray_getitem_nextcarry", "awkward_ByteMaskedArray_numnull", + "awkward_RegularArray_getitem_jagged_expand", ) def _get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index d02edc46c4..74a9123f81 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -714,3 +714,20 @@ def awkward_ByteMaskedArray_getitem_nextcarry(tocarry, mask, length, validwhen): # mask = [0, 1, 0, 1, 1], validwhen=False → numnull = 3 (positions 1, 3 and 4 are null) def awkward_ByteMaskedArray_numnull(numnull, mask, length, validwhen): numnull[0] = cp.count_nonzero((mask[:length] != 0) != validwhen) + + +# Broadcasts a single jagged offset array across all rows of a regular array +# Example: +# singleoffsets = [0, 2, 5], regularsize = 2, regularlength = 3 +# multistarts = [0, 2, 0, 2, 0, 2] +# multistops = [2, 5, 2, 5, 2, 5] +def awkward_RegularArray_getitem_jagged_expand( + multistarts, 
multistops, singleoffsets, regularsize, regularlength +): + if regularlength == 0 or regularsize == 0: + return + + # Reshape as (regularlength, regularsize) views (no copy) and broadcast-assign + # singleoffsets[:-1] / singleoffsets[1:] across all rows. + multistarts.reshape(regularlength, regularsize)[:] = singleoffsets[:regularsize] + multistops.reshape(regularlength, regularsize)[:] = singleoffsets[1:] From 2aa4c39dfcc54591588625abef6ccc9d575f45d5 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 21 Apr 2026 11:59:00 +0200 Subject: [PATCH 07/27] add an upper bound --- src/awkward/_connect/cuda/_compute.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 74a9123f81..6f1eb714ce 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -731,3 +731,5 @@ def awkward_RegularArray_getitem_jagged_expand( # singleoffsets[:-1] / singleoffsets[1:] across all rows. multistarts.reshape(regularlength, regularsize)[:] = singleoffsets[:regularsize] multistops.reshape(regularlength, regularsize)[:] = singleoffsets[1:] + multistops.reshape(regularlength, regularsize)[:] = singleoffsets[1:regularsize + 1] + From f117e7030ac9b72907235c9fef3a20dd19a70595 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Apr 2026 10:01:21 +0000 Subject: [PATCH 08/27] style: pre-commit fixes --- src/awkward/_connect/cuda/_compute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 6f1eb714ce..6a2276f168 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -731,5 +731,6 @@ def awkward_RegularArray_getitem_jagged_expand( # singleoffsets[:-1] / singleoffsets[1:] across all rows. 
multistarts.reshape(regularlength, regularsize)[:] = singleoffsets[:regularsize] multistops.reshape(regularlength, regularsize)[:] = singleoffsets[1:] - multistops.reshape(regularlength, regularsize)[:] = singleoffsets[1:regularsize + 1] - + multistops.reshape(regularlength, regularsize)[:] = singleoffsets[ + 1 : regularsize + 1 + ] From d491d372a8023aad33a21fad83d1951814c25e05 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 21 Apr 2026 12:32:42 +0200 Subject: [PATCH 09/27] feat: add `awkward_RegularArray_getitem_jagged_expand` kernel using cuda.compute --- src/awkward/_connect/cuda/_compute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 6a2276f168..66d6de2c6d 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -730,7 +730,6 @@ def awkward_RegularArray_getitem_jagged_expand( # Reshape as (regularlength, regularsize) views (no copy) and broadcast-assign # singleoffsets[:-1] / singleoffsets[1:] across all rows. 
multistarts.reshape(regularlength, regularsize)[:] = singleoffsets[:regularsize] - multistops.reshape(regularlength, regularsize)[:] = singleoffsets[1:] multistops.reshape(regularlength, regularsize)[:] = singleoffsets[ 1 : regularsize + 1 ] From 167bc9404741a8eef3900d80ffb8f9aa374c044f Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 21 Apr 2026 17:37:43 +0200 Subject: [PATCH 10/27] feat: add `awkward_UnionArray_simplify_one` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 2 ++ src/awkward/_connect/cuda/_compute.py | 24 ++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index d6a840e3ad..f455300c02 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -118,7 +118,7 @@ "awkward_UnionArray_nestedfill_tags_index", "awkward_UnionArray_regular_index_getsize", "awkward_UnionArray_simplify", - "awkward_UnionArray_simplify_one", + # "awkward_UnionArray_simplify_one", "awkward_RecordArray_reduce_nonlocal_outoffsets_64", # "awkward_reduce_count_64", # "awkward_reduce_max", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index a3525870ee..c58d880c4d 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -81,6 +81,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_ByteMaskedArray_getitem_nextcarry - awkward_ByteMaskedArray_numnull - awkward_RegularArray_getitem_jagged_expand + - awkward_UnionArray_simplify_one These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -112,6 +113,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_ByteMaskedArray_getitem_nextcarry", "awkward_ByteMaskedArray_numnull", "awkward_RegularArray_getitem_jagged_expand", + "awkward_UnionArray_simplify_one", ) def _get_cuda_compute_impl(self, kernel_name: 
str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 66d6de2c6d..7dd7b45e4e 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -4,6 +4,7 @@ from cuda.compute import ( CountingIterator, + DiscardIterator, gpu_struct, reduce_into, unary_transform, @@ -733,3 +734,26 @@ def awkward_RegularArray_getitem_jagged_expand( multistops.reshape(regularlength, regularsize)[:] = singleoffsets[ 1 : regularsize + 1 ] + + +# For each position i where fromtags[i] == fromwhich, sets totags[i] = towhich and +# toindex[i] = fromindex[i] + base. Other positions are left unchanged. +# Example: +# fromtags = [0, 1, 0, 1, 0], fromindex = [0, 0, 1, 1, 2] +# fromwhich=1, towhich=2, base=10 +# totags = [0, 2, 0, 2, 0] +# toindex = [0, 10, 1, 11, 2] +def awkward_UnionArray_simplify_one( + totags, toindex, fromtags, fromindex, towhich, fromwhich, length, base +): + if length == 0: + return + + def transform(i): + if fromtags[i] == fromwhich: + totags[i] = towhich + toindex[i] = fromindex[i] + base + return 0 # discarded + + indices = CountingIterator(cp.int64(0)) + unary_transform(indices, DiscardIterator(), transform, length) From 93104c69d0189ae756e963ec82b0238415874d6d Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Mon, 27 Apr 2026 14:18:51 +0200 Subject: [PATCH 11/27] feat: add `awkward_ListArray_broadcast_tooffsets` kernel using cuda.compute --- src/awkward/_backends/cupy.py | 2 ++ src/awkward/_connect/cuda/_compute.py | 39 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index c58d880c4d..bad687b256 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -82,6 +82,8 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_ByteMaskedArray_numnull - awkward_RegularArray_getitem_jagged_expand - awkward_UnionArray_simplify_one + TODO: fix the tests for this kernel 
--> + - awkward_ListArray_broadcast_tooffsets These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 7dd7b45e4e..8c62a8fd5f 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -757,3 +757,42 @@ def transform(i): indices = CountingIterator(cp.int64(0)) unary_transform(indices, DiscardIterator(), transform, length) + + +# TODO: fix tests for this kernel that are deliberately raising an error +# producing a carry index that maps each output element back to its position in the original content +# Example input: +# fromoffsets = [0, 3, 5], fromstarts = [10, 20], fromstops = [13, 22], lencontent = 25 +# Example output: +# i=0: range [10, 13) → [10, 11, 12] +# i=1: range [20, 22) → [20, 21] +# tocarry = [10, 11, 12, 20, 21] +def awkward_ListArray_broadcast_tooffsets( + tocarry, fromoffsets, offsetslength, fromstarts, fromstops, lencontent +): + if offsetslength <= 1: + return + + length = offsetslength - 1 + starts = fromstarts[:length] + stops = fromstops[:length] + # counts[i] = how many elements list i should have + counts = fromoffsets[1:offsetslength] - fromoffsets[:length] + + if int(cp.any(counts < 0)): + raise ValueError("broadcast's offsets must be monotonically increasing") + if int(cp.any(stops - starts != counts)): + raise ValueError("cannot broadcast nested list") + if int(cp.any((starts != stops) & (stops > lencontent))): + raise ValueError("stops[i] > len(content)") + + # For each segment i, write the content indices starts[i], starts[i]+1, ..., stops[i]-1 + # into the contiguous output slice tocarry[fromoffsets[i] : fromoffsets[i+1]]. 
+ def fill_list(i): + start = starts[i] + stop = stops[i] + for j in range(start, stop): + tocarry[fromoffsets[i] + j - start] = j + return 0 + + unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill_list, length) From 219419c1beef583daf9f13cf19b095b941c3dbd0 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Mon, 27 Apr 2026 15:23:32 +0200 Subject: [PATCH 12/27] feat: add `awkward_ListArray_localindex` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 2 ++ src/awkward/_connect/cuda/_compute.py | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index f455300c02..307c768959 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -72,7 +72,7 @@ "awkward_UnionArray_regular_index", "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64", "awkward_ListArray_getitem_next_range_spreadadvanced", - "awkward_ListArray_localindex", + # "awkward_ListArray_localindex", "awkward_NumpyArray_pad_zero_to_length", "awkward_NumpyArray_reduce_adjust_starts_64", "awkward_NumpyArray_rearrange_shifted", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index bad687b256..6de59b46a3 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -84,6 +84,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_UnionArray_simplify_one TODO: fix the tests for this kernel --> - awkward_ListArray_broadcast_tooffsets + - awkward_ListArray_localindex These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -116,6 +117,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_ByteMaskedArray_numnull", "awkward_RegularArray_getitem_jagged_expand", "awkward_UnionArray_simplify_one", + "awkward_ListArray_localindex", ) def _get_cuda_compute_impl(self, kernel_name: str): 
diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 8c62a8fd5f..fc9dada1e6 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -796,3 +796,23 @@ def fill_list(i): return 0 unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill_list, length) + + +# For each segment i, it fills toindex with the local position of each element within that segment — i.e. 0, 1, 2, ... +# Example: +# offsets = [0, 3, 5] +# toindex = [0, 1, 2, 0, 1] +def awkward_ListArray_localindex(toindex, offsets, length): + if length == 0: + return + + starts = offsets[:length] + stops = offsets[1 : length + 1] + + def fill(i): + start = starts[i] + stop = stops[i] + toindex[start:stop] = cp.arange(stop - start, dtype=toindex.dtype) + return 0 + + unary_transform(CountingIterator(cp.int8(0)), DiscardIterator(), fill, length) From 731e572a1f56a5805f64427d8388660babaa8f19 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Mon, 27 Apr 2026 15:32:29 +0200 Subject: [PATCH 13/27] fix the impl --- src/awkward/_connect/cuda/_compute.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index fc9dada1e6..ac2ff2227e 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -812,7 +812,8 @@ def awkward_ListArray_localindex(toindex, offsets, length): def fill(i): start = starts[i] stop = stops[i] - toindex[start:stop] = cp.arange(stop - start, dtype=toindex.dtype) + for j in range(start, stop): + toindex[j] = j - start return 0 - unary_transform(CountingIterator(cp.int8(0)), DiscardIterator(), fill, length) + unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill, length) From 0cb3721e559d0b2a83a0e0f5e9788267543bbde1 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 28 Apr 2026 12:55:26 +0200 Subject: [PATCH 14/27] feat: add 
`awkward_ListArray_compact_offsets` kernel using cuda.compute --- src/awkward/_backends/cupy.py | 2 ++ src/awkward/_connect/cuda/_compute.py | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 6de59b46a3..af816f2954 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -85,6 +85,8 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: TODO: fix the tests for this kernel --> - awkward_ListArray_broadcast_tooffsets - awkward_ListArray_localindex + TODO: fix the tests for this kernel --> + - awkward_ListArray_compact_offsets These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index ac2ff2227e..2f89fcbc32 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -6,6 +6,7 @@ CountingIterator, DiscardIterator, gpu_struct, + inclusive_scan, reduce_into, unary_transform, ) @@ -817,3 +818,29 @@ def fill(i): return 0 unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill, length) + + +# TODO: fix tests for this kernel that are deliberately raising an error +# Converts a ListArray's (starts, stops) pairs into offsets. 
+# tooffsets[0] = 0, tooffsets[i+1] = tooffsets[i] + (fromstops[i] - fromstarts[i]) +# Example: +# fromstarts = [10, 20], fromstops = [13, 22], length = 2 +# tooffsets = [0, 3, 5] +def awkward_ListArray_compact_offsets(tooffsets, fromstarts, fromstops, length): + tooffsets[0] = 0 + if length == 0: + return + + sizes = fromstops[:length] - fromstarts[:length] + + if cp.any(sizes < 0): + raise ValueError("stops[i] < starts[i]") + + # the same as `tooffsets[1 : length + 1] = cp.cumsum(sizes)` + inclusive_scan( + sizes, + tooffsets[1 : length + 1], + lambda a, b: a + b, + cp.array([0], dtype=tooffsets.dtype), + length, + ) From 598219780045e8ba199f9997110577475daa9366 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 28 Apr 2026 17:30:56 +0200 Subject: [PATCH 15/27] feat: add `awkward_ListArray_combinations_length` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 2 + src/awkward/_connect/cuda/_compute.py | 59 +++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index 307c768959..654b9de7e9 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -52,7 +52,7 @@ "awkward_RegularArray_reduce_nonlocal_preparenext_64", "awkward_missing_repeat", # "awkward_RegularArray_getitem_jagged_expand", - "awkward_ListArray_combinations_length", + # "awkward_ListArray_combinations_length", "awkward_ListArray_combinations", "awkward_RegularArray_combinations_64", "awkward_ListArray_getitem_jagged_apply", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index af816f2954..2027073676 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -87,6 +87,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_ListArray_localindex TODO: fix the tests for this kernel --> - awkward_ListArray_compact_offsets + - 
awkward_ListArray_combinations_length These kernels should be moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -120,6 +121,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_RegularArray_getitem_jagged_expand", "awkward_UnionArray_simplify_one", "awkward_ListArray_localindex", + "awkward_ListArray_combinations_length", ) def _get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 2f89fcbc32..2671f511cc 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -844,3 +844,62 @@ def awkward_ListArray_compact_offsets(tooffsets, fromstarts, fromstops, length): cp.array([0], dtype=tooffsets.dtype), length, ) + + +# For each list i, counts the number of n-combinations of its elements +# (with or without replacement) and builds an offsets array into tooffsets. +# totallen[0] is set to the total number of combinations across all lists. +# +# Example (n=2, replacement=False): +# starts=[0, 0, 0], stops=[2, 3, 4] +# sizes = [2, 3, 4] +# C(2,2)=1, C(3,2)=3, C(4,2)=6 +# Then the output will be: tooffsets = [0, 1, 4, 10] +# totallen = 10 +def awkward_ListArray_combinations_length( + totallen, tooffsets, n, replacement, starts, stops, length +): + tooffsets[0] = 0 + if length == 0: + totallen[0] = 0 + return + + def combinations_len(i): + size = stops[i] - starts[i] + if replacement: + size = size + (n - 1) + thisn = n + if thisn > size: + return 0 + elif thisn == size: + return 1 + else: + # C(size, n) == C(size, size-n), so use the smaller one + # of the two to minimise the number of loop iterations + if thisn * 2 > size: + thisn = size - thisn + + # Compute C(size, thisn) = size! / (thisn! * (size-thisn)!) incrementally: + # result = size * (size-1) * ... * (size-thisn+1) / thisn! 
+ result = size + for j in range(2, thisn + 1): + result = result * (size - j + 1) + result = result // j + return result + + # Compute the number of combinations for each list + counts = cp.empty(length, dtype=tooffsets.dtype) + unary_transform(CountingIterator(cp.int64(0)), counts, combinations_len, length) + + # Convert counts to offsets: + # tooffsets[i+1] = sum(counts[0..i]) + inclusive_scan( + counts, + tooffsets[1 : length + 1], + lambda a, b: a + b, + cp.array([0], dtype=tooffsets.dtype), + length, + ) + + # Total number of combinations across all lists + totallen[0] = tooffsets[length] From 4479ce255f730015f3df4781918c3f6f6ba18099 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Mon, 4 May 2026 17:04:25 +0200 Subject: [PATCH 16/27] feat: add `awkward_ListArray_combinations` kernel using cuda.compute --- dev/generate-kernel-signatures.py | 2 +- src/awkward/_backends/cupy.py | 2 + src/awkward/_connect/cuda/_compute.py | 158 ++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 1 deletion(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index 654b9de7e9..d569039815 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -53,7 +53,7 @@ "awkward_missing_repeat", # "awkward_RegularArray_getitem_jagged_expand", # "awkward_ListArray_combinations_length", - "awkward_ListArray_combinations", + # "awkward_ListArray_combinations", "awkward_RegularArray_combinations_64", "awkward_ListArray_getitem_jagged_apply", "awkward_ListArray_getitem_jagged_carrylen", diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 2027073676..414db2f38e 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -88,6 +88,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: TODO: fix the tests for this kernel --> - awkward_ListArray_compact_offsets - awkward_ListArray_combinations_length + - awkward_ListArray_combinations These kernels should be 
moved to awkward/_connect/cuda/reducers.py too in the next PR: - awkward_sum @@ -122,6 +123,7 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_UnionArray_simplify_one", "awkward_ListArray_localindex", "awkward_ListArray_combinations_length", + "awkward_ListArray_combinations", ) def _get_cuda_compute_impl(self, kernel_name: str): diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 2671f511cc..ef7ea38f1b 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -903,3 +903,161 @@ def combinations_len(i): # Total number of combinations across all lists totallen[0] = tooffsets[length] + + +# For each list i, enumerates all n-combinations (with or without replacement) +# of its elements and writes the indices into n output carry arrays. +# +# tocarry_ptrs is a CuPy int64 array of length n holding raw device pointers; +# each pointer refers to a pre-allocated int64 array of length totallen. +# +# Example (n=2, replacement=False): +# starts=[0], stops=[3] → elements [0,1,2] +# C(3,2) = 3 combinations in total +# combinations: (0,1),(0,2),(1,2) +# +# Output: +# tocarry_ptrs[0] → [0, 0, 1], tocarry_ptrs[1] → [1, 2, 2] +# toindex: [3, 3] +def awkward_ListArray_combinations( + tocarry_ptrs, toindex, fromindex, n, replacement, starts, stops, length +): + if length == 0: + return + + # Step 1: compute per-list combination counts (same as combinations_length!!) 
+ # TODO: we can just pass combination offsets directly in the future (from src/awkward/contents/listoffsetarray.py:1405) + def combinations_len(i): + size = stops[i] - starts[i] + if replacement: + size = size + (n - 1) + thisn = n + if thisn > size: + return 0 + elif thisn == size: + return 1 + else: + if thisn * 2 > size: + thisn = size - thisn + result = size + for j in range(2, thisn + 1): + result = result * (size - j + 1) + result = result // j + return result + + counts = cp.empty(length, dtype=cp.int64) + unary_transform(CountingIterator(cp.int64(0)), counts, combinations_len, length) + + offsets = cp.empty(length + 1, dtype=cp.int64) + offsets[0] = 0 + inclusive_scan( + counts, + offsets[1:], + lambda a, b: a + b, + cp.array([0], dtype=cp.int64), + length, + ) + + totallen = int(offsets[length]) + if totallen == 0: + return + + # Step 2: wrap raw pointers from tocarry_ptrs into CuPy arrays + # raw int64 pointer values from tocarry_ptrs[k] can't be dereferenced inside a Numba closure, so + # we need this intermediate step + # + # (the pointers themselves are allocated at src/awkward/contents/listoffsetarray.py:1456-1464) + carry_arrays = [] + for k in range(n): + ptr_val = int(tocarry_ptrs[k]) + mem = cp.cuda.UnownedMemory(ptr_val, totallen * 8, None) + memptr = cp.cuda.MemoryPointer(mem, 0) + carry_arrays.append(cp.ndarray(totallen, dtype=cp.int64, memptr=memptr)) + + # ------------------------------------------------------------------------- + # Step 3: fill carry_arrays[k] for each combination position k in turn. + # + # For each output slot g in [0, totallen): + # + # a) Binary search offsets to find which source list i owns slot g, + # and compute the rank of this combination within that list + # (rank = g - offsets[i], i.e. the 0-based index among all combinations + # of list i in lexicographic order). + # + # b) Unrank: decode the rank back into the actual combination tuple using + # a combinatorial number system. 
Iterating over positions pos=0..n-1, + # at each position scan forward through candidate values j, counting + # how many combinations start with values < j at this position + # (= C(effective_size-j-1, n-pos-1)). Subtract from remaining rank + # until we find the j where remaining < count — that j is the value + # at position pos. + # + # c) Early exit: once pos==k we have the value for position k and write + # it to carry_k[g], skipping the rest of the unranking. This is why + # we do n separate passes (one per k) rather than one pass writing all + # n positions: each pass only needs to unrank up to position k. + # + # d) Content index: add start (the list's base offset into content) to + # convert from a within-list index to an absolute content index. + # For replacement, subtract pos to undo the stars-and-bars shift. + # ------------------------------------------------------------------------- + def make_pass(k, carry_k): + def fill_pos(g): + # a) Find source list i via binary search on offsets + lo = 0 + hi = length - 1 + while lo < hi: + mid = (lo + hi) >> 1 + if offsets[mid + 1] <= g: + lo = mid + 1 + else: + hi = mid + list_i = lo + start = starts[list_i] + size = stops[list_i] - starts[list_i] + rank = g - offsets[list_i] + # For replacement use stars-and-bars effective size + effective_size = size + n - 1 if replacement else size + + # b) Unrank: decode rank into the combination tuple + lower = 0 # lower bound for j at each position (enforces ordering) + remaining = rank + for pos in range(n): + for j in range(lower, effective_size - (n - pos - 1)): + # Count combinations where position pos has value j: + # = C(effective_size - j - 1, n - pos - 1) + top = effective_size - j - 1 + choose = n - pos - 1 + if choose == 0: + count = 1 + else: + if choose * 2 > top: # use smaller equivalent + choose = top - choose + c = top + for q in range(2, choose + 1): + c = c * (top - q + 1) + c = c // q + count = c + if remaining < count: + # c) j is the value at position 
pos + if pos == k: + # d) write absolute content index and exit early + carry_k[g] = (j - pos if replacement else j) + start + return 0 + lower = j + 1 # next position must be >= j+1 (no repeat) + break + remaining -= count + return 0 + + return fill_pos + + # One parallel pass per combination position k + for k in range(n): + unary_transform( + CountingIterator(cp.int64(0)), + DiscardIterator(), + make_pass(k, carry_arrays[k]), + totallen, + ) + + toindex[:n] = totallen From 1d1cc3d67967c6f2f1d907741f1c969af6de3bf2 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 5 May 2026 11:44:37 +0200 Subject: [PATCH 17/27] feat: add `awkward_UnionArray_nestedfill_tags_index` kernel using cuda.compute --- src/awkward/_connect/cuda/_compute.py | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index ef7ea38f1b..1524745a47 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -737,6 +737,68 @@ def awkward_RegularArray_getitem_jagged_expand( ] +# THIS KERNEL IS NOT USED (just for archive) +# Fills a tagged index for one union type: assigns a constant tag and +# sequential index into each segment defined by the starts/counts ranges +# Example input: +# tmpstarts = [0, 3], tag = 1, fromcounts = [3, 2] +# Example output: +# totags = [1, 1, 1, 1, 1] +# toindex = [0, 1, 2, 0, 1] +# also, the tmpstarts get rewritten with stops: tmpstarts = [3, 5] +def awkward_UnionArray_nestedfill_tags_index( + totags, toindex, tmpstarts, tag, fromcounts, length +): + if length == 0: + return + + starts = tmpstarts[:length] + counts = fromcounts[:length] + + # Total span of the output arrays we need to touch: + # the last segment's start + its count gives the furthest written position + total_size = int(starts[length - 1]) + int(counts[length - 1]) + + if total_size == 0: + return + + # +1 at each segment start, -1 just past each segment end. 
+ # cumsum of this will later yield 1 inside any covered range, 0 in gaps. + diff = cp.zeros(total_size + 1, dtype=cp.int8) + + def scatter_and_update(i): + start = starts[i] + count = counts[i] + # Mark this segment's range in the difference array + diff[start] += cp.int8(1) + diff[start + count] -= cp.int8(1) + # update tmpstarts (for the next call of this kernel (for a different union type))? + tmpstarts[i] = start + count + return 0 + + # Scatter segment's ranges and update tmpstarts + unary_transform( + CountingIterator(cp.int64(0)), DiscardIterator(), scatter_and_update, length + ) + + # coverage[j] == 1 if position j falls inside any segment's range, 0 otherwise + coverage = cp.cumsum(diff[:total_size]) + + # scan[j] == local index of element j within its segment + # Since it's a cumsum, the first index starts from 1, 2, 3 ... + # so we'll have to -1 before writing it in toindex + scan = cp.cumsum(coverage, dtype=cp.int64) + + def fill(j): + if coverage[j]: + # Mark this position as belonging to the current tag + totags[j] = tag + toindex[j] = scan[j] - 1 + return 0 + + unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill, total_size) + + # For each position i where fromtags[i] == fromwhich, sets totags[i] = towhich and # toindex[i] = fromindex[i] + base. Other positions are left unchanged. 
# Example: From 8935bab28b1d4cbc443df4d1007bd0ed13375dd9 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Tue, 5 May 2026 16:28:20 +0200 Subject: [PATCH 18/27] fix the tests for kernels that are deliberately raising errors --- dev/generate-kernel-signatures.py | 4 +-- dev/generate-tests.py | 43 ++++++++++++++++++++------- src/awkward/_backends/cupy.py | 4 +-- src/awkward/_connect/cuda/_compute.py | 17 ++++++----- 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index d569039815..10dd718941 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -16,8 +16,8 @@ "awkward_ListArray_min_range", "awkward_ListArray_validity", "awkward_BitMaskedArray_to_ByteMaskedArray", - "awkward_ListArray_broadcast_tooffsets", - "awkward_ListArray_compact_offsets", + # "awkward_ListArray_broadcast_tooffsets", + # "awkward_ListArray_compact_offsets", "awkward_ListOffsetArray_flatten_offsets", # "awkward_IndexedArray_overlay_mask", # "awkward_ByteMaskedArray_numnull", diff --git a/dev/generate-tests.py b/dev/generate-tests.py index f398e6988e..25f1cff771 100644 --- a/dev/generate-tests.py +++ b/dev/generate-tests.py @@ -1567,19 +1567,40 @@ def gencudaunittests(specdict): count += 1 else: args += ", " + arg.name - f.write(" " * 4 + "funcC(" + args + ")\n") + # Determine if this is a cuda.compute kernel (raises errors eagerly) + # or compiled CUDA kernel (raises errors after `ak_cu.synchronize_cuda()`) + CUDA_COPUTE_KERNELS = { + "awkward_ListArray_compact_offsets", + "awkward_ListArray_broadcast_tooffsets", + } + + raises_error_eagerly = ( + spec.templatized_kernel_name in CUDA_COPUTE_KERNELS + ) + if test["error"]: - f.write( - f""" - error_message = re.escape("{test["message"]} in compiled CUDA code ({spec.templatized_kernel_name})") -""" - ) - f.write( - """ with pytest.raises(ValueError, match=rf"{error_message}"): - ak_cu.synchronize_cuda() -""" - ) + 
error_message_line = f' error_message = re.escape("{test["message"]} in compiled CUDA code ({spec.templatized_kernel_name})")\n' + if raises_error_eagerly: + # call a kernel directly inside `pytest.raises()` + f.write( + "\n" + + error_message_line + + ' with pytest.raises(ValueError, match=rf"{error_message}"):\n' + + " " * 8 + + "funcC(" + + args + + ")\n" + ) + else: + f.write(" " * 4 + "funcC(" + args + ")\n") + f.write( + "\n" + + error_message_line + + ' with pytest.raises(ValueError, match=rf"{error_message}"):\n' + " ak_cu.synchronize_cuda()\n" + ) else: + f.write(" " * 4 + "funcC(" + args + ")\n") f.write( """ try: diff --git a/src/awkward/_backends/cupy.py b/src/awkward/_backends/cupy.py index 414db2f38e..1981593310 100644 --- a/src/awkward/_backends/cupy.py +++ b/src/awkward/_backends/cupy.py @@ -82,10 +82,8 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: - awkward_ByteMaskedArray_numnull - awkward_RegularArray_getitem_jagged_expand - awkward_UnionArray_simplify_one - TODO: fix the tests for this kernel --> - awkward_ListArray_broadcast_tooffsets - awkward_ListArray_localindex - TODO: fix the tests for this kernel --> - awkward_ListArray_compact_offsets - awkward_ListArray_combinations_length - awkward_ListArray_combinations @@ -121,7 +119,9 @@ def _supports_cuda_compute(self, kernel_name: str) -> bool: "awkward_ByteMaskedArray_numnull", "awkward_RegularArray_getitem_jagged_expand", "awkward_UnionArray_simplify_one", + "awkward_ListArray_broadcast_tooffsets", "awkward_ListArray_localindex", + "awkward_ListArray_compact_offsets", "awkward_ListArray_combinations_length", "awkward_ListArray_combinations", ) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 1524745a47..ed3d3b1199 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -822,7 +822,6 @@ def transform(i): unary_transform(indices, DiscardIterator(), transform, length) -# TODO: fix tests for 
this kernel that are deliberately raising an error # producing a carry index that maps each output element back to its position in the original content # Example input: # fromoffsets = [0, 3, 5], fromstarts = [10, 20], fromstops = [13, 22], lencontent = 25 @@ -842,12 +841,15 @@ def awkward_ListArray_broadcast_tooffsets( # counts[i] = how many elements list i should have counts = fromoffsets[1:offsetslength] - fromoffsets[:length] + _K = "awkward_ListArray_broadcast_tooffsets" + if int(cp.any((starts != stops) & (stops > lencontent))): + raise ValueError(f"stops[i] > len(content) in compiled CUDA code ({_K})") if int(cp.any(counts < 0)): - raise ValueError("broadcast's offsets must be monotonically increasing") + raise ValueError( + f"broadcast's offsets must be monotonically increasing in compiled CUDA code ({_K})" + ) if int(cp.any(stops - starts != counts)): - raise ValueError("cannot broadcast nested list") - if int(cp.any((starts != stops) & (stops > lencontent))): - raise ValueError("stops[i] > len(content)") + raise ValueError(f"cannot broadcast nested list in compiled CUDA code ({_K})") # For each segment i, write the content indices starts[i], starts[i]+1, ..., stops[i]-1 # into the contiguous output slice tocarry[fromoffsets[i] : fromoffsets[i+1]]. @@ -882,7 +884,6 @@ def fill(i): unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill, length) -# TODO: fix tests for this kernel that are deliberately raising an error # Converts a ListArray's (starts, stops) pairs into offsets. 
# tooffsets[0] = 0, tooffsets[i+1] = tooffsets[i] + (fromstops[i] - fromstarts[i]) # Example: @@ -896,7 +897,9 @@ def awkward_ListArray_compact_offsets(tooffsets, fromstarts, fromstops, length): sizes = fromstops[:length] - fromstarts[:length] if cp.any(sizes < 0): - raise ValueError("stops[i] < starts[i]") + raise ValueError( + "stops[i] < starts[i] in compiled CUDA code (awkward_ListArray_compact_offsets)" + ) # the same as `tooffsets[1 : length + 1] = cp.cumsum(sizes)` inclusive_scan( From 2428e5c06e1b39ec30f51389758eca020f25daba Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Wed, 6 May 2026 11:42:20 +0200 Subject: [PATCH 19/27] compare `starts` and `stops` separately --- src/awkward/_connect/cuda/_compute.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index ed3d3b1199..276ddfbe0e 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -842,13 +842,13 @@ def awkward_ListArray_broadcast_tooffsets( counts = fromoffsets[1:offsetslength] - fromoffsets[:length] _K = "awkward_ListArray_broadcast_tooffsets" - if int(cp.any((starts != stops) & (stops > lencontent))): + if cp.any((starts != stops) & (stops > lencontent)): raise ValueError(f"stops[i] > len(content) in compiled CUDA code ({_K})") - if int(cp.any(counts < 0)): + if cp.any(counts < 0): raise ValueError( f"broadcast's offsets must be monotonically increasing in compiled CUDA code ({_K})" ) - if int(cp.any(stops - starts != counts)): + if cp.any(stops - starts != counts): raise ValueError(f"cannot broadcast nested list in compiled CUDA code ({_K})") # For each segment i, write the content indices starts[i], starts[i]+1, ..., stops[i]-1 @@ -894,13 +894,16 @@ def awkward_ListArray_compact_offsets(tooffsets, fromstarts, fromstops, length): if length == 0: return - sizes = fromstops[:length] - fromstarts[:length] + starts = fromstarts[:length] + stops = 
fromstops[:length] - if cp.any(sizes < 0): + if cp.any(stops < starts): raise ValueError( "stops[i] < starts[i] in compiled CUDA code (awkward_ListArray_compact_offsets)" ) + sizes = stops - starts + # the same as `tooffsets[1 : length + 1] = cp.cumsum(sizes)` inclusive_scan( sizes, From 28d62061aa14a6ef948b5a36bcd7f766f983ec0d Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Wed, 6 May 2026 12:02:37 +0200 Subject: [PATCH 20/27] ignore `memptr` argument for pylint --- src/awkward/_connect/cuda/_compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 1685ba4e6e..b061c479cd 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -1068,7 +1068,7 @@ def combinations_len(i): ptr_val = int(tocarry_ptrs[k]) mem = cp.cuda.UnownedMemory(ptr_val, totallen * 8, None) memptr = cp.cuda.MemoryPointer(mem, 0) - carry_arrays.append(cp.ndarray(totallen, dtype=cp.int64, memptr=memptr)) + carry_arrays.append(cp.ndarray(totallen, dtype=cp.int64, memptr=memptr)) # pylint: disable=unexpected-keyword-arg # ------------------------------------------------------------------------- # Step 3: fill carry_arrays[k] for each combination position k in turn. 
From 5b75fc407facac26530d54df8f5974e8f9fc9e8e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 13:24:03 +0000 Subject: [PATCH 21/27] style: pre-commit fixes --- src/awkward/_connect/cuda/_compute.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index bad38017d4..4f574393cc 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -5,10 +5,8 @@ from cuda.compute import ( CountingIterator, DiscardIterator, - gpu_struct, - inclusive_scan, - reduce_into, OpKind, + inclusive_scan, segmented_reduce, unary_transform, ) From 2d7b6b81254847e7d646517915970fa0b620e431 Mon Sep 17 00:00:00 2001 From: Ianna Osborne Date: Mon, 18 May 2026 16:02:41 +0200 Subject: [PATCH 22/27] Add functions for indexing and repeating arrays --- src/awkward/_connect/cuda/_compute.py | 91 +++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 4f574393cc..e7437ed491 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -1261,3 +1261,94 @@ def fill_pos(g): ) toindex[:n] = totallen + + + +def awkward_index_rpad_and_clip_axis0(toindex, target, length): + """ + Fill ``toindex[0..target)`` with the identity mapping ``[0..shorter)`` + followed by ``target - shorter`` entries of ``-1``, where + ``shorter = min(target, length)``. + + Called from ``Content._pad_none_axis0`` in + ``src/awkward/contents/content.py``. 
+ """ + dtype = toindex.dtype.type + shorter = min(target, length) + + def fill(i): + return dtype(i) if i < shorter else dtype(-1) + + counters = CountingIterator(dtype(0)) + unary_transform( + d_in=counters, + d_out=toindex, + op=fill, + num_items=target, + ) + + +def awkward_index_rpad_and_clip_axis1(tostarts, tostops, target, length): + """ + Fills `tostarts` and `tostops` with rpad/clip offsets for axis=1 lists. + Each list is padded or clipped to length `target`. + """ + + def fill(i): + start = i * target + end = start + target + + tostarts[i] = tostarts.dtype.type(start) + return tostarts.dtype.type(end) + + segment_ids = CountingIterator(tostarts.dtype.type(0)) + + unary_transform( + d_in=segment_ids, + d_out=tostops, + op=fill, + num_items=length, + ) + + +def awkward_missing_repeat( + outindex, + index, + indexlength, + repetitions, + regularsize, +): + """ + Repeats an index array `repetitions` times, adjusting valid (non-negative) + indices by an offset of `regularsize` per repetition. + Missing values (-1) are preserved. + """ + index_dtype = outindex.dtype.type + output_size = repetitions * indexlength + + reg_size = index_dtype(regularsize) + idx_len = index_dtype(indexlength) + + def fill(counter): + # Position in the original index array + j = counter % idx_len + # Which repetition block are we in? 
+ i = counter // idx_len + + base = index[j] + + # Awkward convention: -1 and lower are masked/missing + if base >= 0: + return index_dtype(base + i * reg_size) + else: + # Preserve the exact missing value (usually -1) + return base + + counters = CountingIterator(index_dtype(0)) + + unary_transform( + d_in=counters, + d_out=outindex, + num_items=output_size, + op=fill, + ) From 972a45608c4e63a7be3534e91737fe31dce55dd4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 14:03:07 +0000 Subject: [PATCH 23/27] style: pre-commit fixes --- src/awkward/_connect/cuda/_compute.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index e7437ed491..632e0038da 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -1263,7 +1263,6 @@ def fill_pos(g): toindex[:n] = totallen - def awkward_index_rpad_and_clip_axis0(toindex, target, length): """ Fill ``toindex[0..target)`` with the identity mapping ``[0..shorter)`` From ac3bb2115525d08672284eb1e27a3ae851f95bf8 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk <70752300+maxymnaumchyk@users.noreply.github.com> Date: Mon, 18 May 2026 22:07:55 +0300 Subject: [PATCH 24/27] return unary_transform call for segment_ids --- src/awkward/_connect/cuda/_compute.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 632e0038da..f266a572d3 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -763,6 +763,15 @@ def segment_reduce_countnonzero(segment_id): count += 1 return count + + segment_ids = CountingIterator(index_dtype(0)) + + unary_transform( + d_in=segment_ids, + d_out=result, + op=segment_reduce_countnonzero, + num_items=outlength, + ) # Overlays a mask onto an index array: masked positions become -1, unmasked 
positions keep their original index value. From 9bd7682fd9506010a6f2c7d3621ea13bbad5c7ba Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 19:08:18 +0000 Subject: [PATCH 25/27] style: pre-commit fixes --- src/awkward/_connect/cuda/_compute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index f266a572d3..793b8be82b 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -763,7 +763,7 @@ def segment_reduce_countnonzero(segment_id): count += 1 return count - + segment_ids = CountingIterator(index_dtype(0)) unary_transform( From 8d859681237347e833cb4c20520ab4ea8517a5b5 Mon Sep 17 00:00:00 2001 From: maxymnaumchyk Date: Mon, 18 May 2026 21:35:20 +0200 Subject: [PATCH 26/27] update the `awkward_IndexedArray_reduce_next_64` to work with offsets --- src/awkward/_connect/cuda/_compute.py | 145 +++++++++++++++++++------- 1 file changed, 106 insertions(+), 39 deletions(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index 793b8be82b..d8919c984b 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -780,32 +780,71 @@ def transform(i): return -1 if mask[i] else fromindex[i] indices = CountingIterator(cp.int64(0)) - unary_transform(indices, toindex, transform, length) + unary_transform(d_in=indices, d_out=toindex, op=transform, num_items=length) -# Skips masked (-1) entries and packs the remaining valid entries into nextcarry and nextparents, tracking where each ended up in outindex. +# Skips masked (-1) entries and packs remaining valid entries into nextcarry, tracking where +# each ended up in outindex. Builds nextoffsets[j+1] = cumulative count of valid entries in +# segments 0..j as defined by the offsets array. 
+# +# Example: +# index = [3, -1, 5, -1, 2, -1, 4] +# offsets = [0, 4, 7] (2 segments: positions 0-3 and 4-6) +# outlength = 2 +# +# nextcarry = [3, 5, 2, 4] (valid index values, compacted) +# nextoffsets = [0, 2, 4] (segment 0 has 2 valid, segment 1 has 2 valid) +# outindex = [0, -1, 1, -1, 2, -1, 3] (position in nextcarry, or -1 if masked) def awkward_IndexedArray_reduce_next_64( - nextcarry, nextparents, outindex, index, parents, length + nextcarry, nextoffsets, outindex, index, offsets, outlength ): - if length == 0: + nextoffsets[0] = 0 + if outlength == 0: + return + + index_length = int(offsets[outlength]) + if index_length == 0: + nextoffsets[1 : outlength + 1] = 0 return - # Compute cumulative count of valid (non-negative) indices to determine compact output positions - # this needs to be done before going through all the indices in parallel later - scan = cp.cumsum(index >= 0) + idx_dtype = index.dtype + valid = (index[:index_length] >= 0).astype(idx_dtype) + scan = cp.empty(index_length, dtype=idx_dtype) + inclusive_scan( + d_in=valid, + d_out=scan, + op=lambda a, b: a + b, + init_value=cp.array([0], dtype=idx_dtype), + num_items=index_length, + ) def scatter_and_fill(i): if index[i] >= 0: - # Map valid entry to its compacted position k = scan[i] - 1 nextcarry[k] = index[i] - nextparents[k] = parents[i] return k - # Masked entries get -1 in outindex return -1 - indices = CountingIterator(cp.int64(0)) - unary_transform(indices, outindex, scatter_and_fill, length) + unary_transform( + d_in=CountingIterator(idx_dtype.type(0)), + d_out=outindex, + op=scatter_and_fill, + num_items=index_length, + ) + + off_dtype = offsets.dtype.type + + def fill_nextoffsets(j): + stop = offsets[j + 1] + nextoffsets[j + 1] = idx_dtype.type(0) if stop == 0 else scan[stop - 1] + return off_dtype(0) + + unary_transform( + d_in=CountingIterator(off_dtype(0)), + d_out=DiscardIterator(), + op=fill_nextoffsets, + num_items=outlength, + ) # For each valid (non-negative) entry at 
position i, records the number of null (negative) entries @@ -830,7 +869,7 @@ def scatter(i): return cp.int64(0) indices = CountingIterator(cp.int64(0)) - unary_transform(indices, _, scatter, length) + unary_transform(d_in=indices, d_out=_, op=scatter, num_items=length) # Packs valid entries (where (mask[i] != 0) == validwhen) into tocarry in order. @@ -919,7 +958,10 @@ def scatter_and_update(i): # Scatter segment's ranges and update tmpstarts unary_transform( - CountingIterator(cp.int64(0)), DiscardIterator(), scatter_and_update, length + d_in=CountingIterator(cp.int64(0)), + d_out=DiscardIterator(), + op=scatter_and_update, + num_items=length, ) # coverage[j] == 1 if position j falls inside any segment's range, 0 otherwise @@ -937,7 +979,12 @@ def fill(j): toindex[j] = scan[j] - 1 return 0 - unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill, total_size) + unary_transform( + d_in=CountingIterator(cp.int64(0)), + d_out=DiscardIterator(), + op=fill, + num_items=total_size, + ) # For each position i where fromtags[i] == fromwhich, sets totags[i] = towhich and @@ -960,7 +1007,7 @@ def transform(i): return 0 # discarded indices = CountingIterator(cp.int64(0)) - unary_transform(indices, DiscardIterator(), transform, length) + unary_transform(d_in=indices, d_out=DiscardIterator(), op=transform, num_items=length) # producing a carry index that maps each output element back to its position in the original content @@ -1001,7 +1048,12 @@ def fill_list(i): tocarry[fromoffsets[i] + j - start] = j return 0 - unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill_list, length) + unary_transform( + d_in=CountingIterator(cp.int64(0)), + d_out=DiscardIterator(), + op=fill_list, + num_items=length, + ) # For each segment i, it fills toindex with the local position of each element within that segment — i.e. 0, 1, 2, ... 
@@ -1022,7 +1074,12 @@ def fill(i): toindex[j] = j - start return 0 - unary_transform(CountingIterator(cp.int64(0)), DiscardIterator(), fill, length) + unary_transform( + d_in=CountingIterator(cp.int64(0)), + d_out=DiscardIterator(), + op=fill, + num_items=length, + ) # Converts a ListArray's (starts, stops) pairs into offsets. @@ -1047,11 +1104,11 @@ def awkward_ListArray_compact_offsets(tooffsets, fromstarts, fromstops, length): # the same as `tooffsets[1 : length + 1] = cp.cumsum(sizes)` inclusive_scan( - sizes, - tooffsets[1 : length + 1], - lambda a, b: a + b, - cp.array([0], dtype=tooffsets.dtype), - length, + d_in=sizes, + d_out=tooffsets[1 : length + 1], + op=lambda a, b: a + b, + init_value=cp.array([0], dtype=tooffsets.dtype), + num_items=length, ) @@ -1098,16 +1155,21 @@ def combinations_len(i): # Compute the number of combinations for each list counts = cp.empty(length, dtype=tooffsets.dtype) - unary_transform(CountingIterator(cp.int64(0)), counts, combinations_len, length) + unary_transform( + d_in=CountingIterator(cp.int64(0)), + d_out=counts, + op=combinations_len, + num_items=length, + ) # Convert counts to offsets: # tooffsets[i+1] = sum(counts[0..i]) inclusive_scan( - counts, - tooffsets[1 : length + 1], - lambda a, b: a + b, - cp.array([0], dtype=tooffsets.dtype), - length, + d_in=counts, + d_out=tooffsets[1 : length + 1], + op=lambda a, b: a + b, + init_value=cp.array([0], dtype=tooffsets.dtype), + num_items=length, ) # Total number of combinations across all lists @@ -1155,16 +1217,21 @@ def combinations_len(i): return result counts = cp.empty(length, dtype=cp.int64) - unary_transform(CountingIterator(cp.int64(0)), counts, combinations_len, length) + unary_transform( + d_in=CountingIterator(cp.int64(0)), + d_out=counts, + op=combinations_len, + num_items=length, + ) offsets = cp.empty(length + 1, dtype=cp.int64) offsets[0] = 0 inclusive_scan( - counts, - offsets[1:], - lambda a, b: a + b, - cp.array([0], dtype=cp.int64), - length, + 
d_in=counts, + d_out=offsets[1:], + op=lambda a, b: a + b, + init_value=cp.array([0], dtype=cp.int64), + num_items=length, ) totallen = int(offsets[length]) @@ -1263,10 +1330,10 @@ def fill_pos(g): # One parallel pass per combination position k for k in range(n): unary_transform( - CountingIterator(cp.int64(0)), - DiscardIterator(), - make_pass(k, carry_arrays[k]), - totallen, + d_in=CountingIterator(cp.int64(0)), + d_out=DiscardIterator(), + op=make_pass(k, carry_arrays[k]), + num_items=totallen, ) toindex[:n] = totallen From 11930839b7e46768f9e101e229c96f0d53f59fef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 19:37:11 +0000 Subject: [PATCH 27/27] style: pre-commit fixes --- src/awkward/_connect/cuda/_compute.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/awkward/_connect/cuda/_compute.py b/src/awkward/_connect/cuda/_compute.py index d8919c984b..fe30305386 100644 --- a/src/awkward/_connect/cuda/_compute.py +++ b/src/awkward/_connect/cuda/_compute.py @@ -1007,7 +1007,9 @@ def transform(i): return 0 # discarded indices = CountingIterator(cp.int64(0)) - unary_transform(d_in=indices, d_out=DiscardIterator(), op=transform, num_items=length) + unary_transform( + d_in=indices, d_out=DiscardIterator(), op=transform, num_items=length + ) # producing a carry index that maps each output element back to its position in the original content