From a75ff59d909a166f586f9e59a796ad89761d1635 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Fri, 8 Nov 2024 14:51:12 +0100
Subject: [PATCH 1/6] Add GPU kernel using shared memory

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
---
 src/nhs_grid.jl | 150 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)

diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index 3b5dd266..2b4c9c93 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -395,6 +395,156 @@ end
     end
 end
 
+# for cell in cells
+#     for point in cell
+#         for neighbor_cell in neighbor_cells
+
+#             for neighbor in neighbor_cell
+@inline function foreach_point_neighbor_localmem(f, system_coords, neighbor_coords,
+                                             neighborhood_search; search_radius = search_radius(neighborhood_search))
+    backend = KernelAbstractions.get_backend(system_coords)
+    max_particles_per_cell = 64
+    nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    linear_indices = neighborhood_search.cell_list.linear_indices
+    cartesian_indices = CartesianIndices(size(linear_indices))
+    lengths = Array(neighborhood_search.cell_list.cells.lengths)
+    # max_particles_per_cell = maximum(lengths)
+    nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
+    ndrange = max_particles_per_cell * length(nonempty_cells)
+    kernel = foreach_neighbor_localmem(backend, (max_particles_per_cell,))
+    kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+@kernel cpu=false function foreach_neighbor_localmem(f::F, system_coords, neighbor_system_coords,
+                               neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    local_points = @localmem Int32 MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+
+    pv = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(pv)
+    if particleidx <= n_particles_in_current_cell
+        point = @inbounds pv[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 point)
+        # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n")
+    else
+        point = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+    end
+
+    for i in -1:1, j in -1:1, k in -1:1
+        neighbor_cell = (cell[1] + i, cell[2] + j, cell[3] + k)
+    # for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
+    #     neighbor_cell = Tuple(neighbor_cell_)
+        points_view = points_in_cell(neighbor_cell, neighborhood_search)
+        n_particles_in_neighbor_cell = length(points_view)
+        # if n_particles_in_neighbor_cell
+        #     continue
+        # end
+
+        # First use all threads to load the neighbors into local memory in parallel
+        if particleidx <= n_particles_in_neighbor_cell
+            @inbounds p = local_points[particleidx] = points_view[particleidx]
+            # KernelAbstractions.@print("Point $point, neighbor $p with coords ($(neighbor_system_coords[1, p]), $(neighbor_system_coords[2, p]))\n")
+            for d in 1:ndims(neighborhood_search)
+                @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
+            end
+        end
+        @synchronize()
+        # Now each thread works on one point again
+        if particleidx <= n_particles_in_current_cell
+            for local_neighbor in 1:n_particles_in_neighbor_cell
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                  Val(ndims(neighborhood_search)), local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if distance2 <= search_radius^2
+                    # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n")
+                    distance = sqrt(distance2) # TODO: eventuell fastmath
+
+                    # Inline to avoid loss of performance
+                    # compared to not using `foreach_point_neighbor`.
+                    @inline f(point, neighbor, pos_diff, distance)
+                end
+            end
+        end
+        @synchronize()
+    end
+end
+
+@inline function foreach_point_neighbor_cell_blocks(f, system_coords, neighbor_coords,
+                                             neighborhood_search)
+    backend = KernelAbstractions.get_backend(system_coords)
+    max_particles_per_cell = 64
+    nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    ndrange = max_particles_per_cell * length(cells)
+    kernel = foreach_neighbor_cell_blocks(backend, (max_particles_per_cell,))
+    kernel(f, system_coords, neighbor_coords, neighborhood_search, Val(max_particles_per_cell); ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+@kernel cpu=false function foreach_neighbor_cell_blocks(f::F, system_coords, neighbor_coords,
+                               neighborhood_search, ::Val{MAX}) where {F, MAX}
+    cell_ = @index(Group)
+    nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    @inbounds cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    pv = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(pv)
+    if particleidx <= n_particles_in_current_cell
+        point = @inbounds pv[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 point)
+        # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n")
+
+        for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
+            neighbor_cell = Tuple(neighbor_cell_)
+            points_view = points_in_cell(neighbor_cell, neighborhood_search)
+
+            for neighbor in points_view
+                @inbounds neighbor_coords = extract_svector(neighbor_system_coords,
+                                                Val(ndims(neighborhood_search)), neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if distance2 <= search_radius^2
+                    # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n")
+                    distance = sqrt(distance2) # TODO: eventuell fastmath
+
+                    # Inline to avoid loss of performance
+                    # compared to not using `foreach_point_neighbor`.
+                    @inline f(point, neighbor, pos_diff, distance)
+                end
+            end
+        end
+    end
+end
+
 @inline function neighboring_cells(cell, neighborhood_search)
     NDIMS = ndims(neighborhood_search)
 

From 8371415d36f4f803f4b261bb9059a1cc25ec6217 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Fri, 8 Nov 2024 19:42:43 +0100
Subject: [PATCH 2/6] Add missing imports

---
 src/PointNeighbors.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl
index 22ef3893..21bbc30b 100644
--- a/src/PointNeighbors.jl
+++ b/src/PointNeighbors.jl
@@ -6,7 +6,7 @@ using Adapt: Adapt
 using Atomix: Atomix
 using Base: @propagate_inbounds
 using GPUArraysCore: AbstractGPUArray
-using KernelAbstractions: KernelAbstractions, @kernel, @index
+using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize
 using LinearAlgebra: dot
 using Polyester: Polyester
 @reexport using StaticArrays: SVector

From eee0a34c81394e45ed8b68b0d85517bdc8c061c7 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 8 Nov 2024 20:18:24 +0100
Subject: [PATCH 3/6] WIP: Implement double buffering

---
 src/nhs_grid.jl | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index 2b4c9c93..fc94df16 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -429,6 +429,9 @@ end
 
     local_points = @localmem Int32 MAX
     local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+    
+    next_local_points = @localmem Int32 MAX
+    next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
 
     pv = points_in_cell(cell, neighborhood_search)
     n_particles_in_current_cell = length(pv)
@@ -436,31 +439,38 @@ end
         point = @inbounds pv[particleidx]
         point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
                                                  point)
-        # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n")
     else
         point = zero(Int32)
         point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
     end
 
-    for i in -1:1, j in -1:1, k in -1:1
-        neighbor_cell = (cell[1] + i, cell[2] + j, cell[3] + k)
-    # for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
-    #     neighbor_cell = Tuple(neighbor_cell_)
+    @inline function stage!(local_points, local_neighbor_coords, neighbor_cell)
         points_view = points_in_cell(neighbor_cell, neighborhood_search)
         n_particles_in_neighbor_cell = length(points_view)
-        # if n_particles_in_neighbor_cell
-        #     continue
-        # end
 
         # First use all threads to load the neighbors into local memory in parallel
         if particleidx <= n_particles_in_neighbor_cell
             @inbounds p = local_points[particleidx] = points_view[particleidx]
-            # KernelAbstractions.@print("Point $point, neighbor $p with coords ($(neighbor_system_coords[1, p]), $(neighbor_system_coords[2, p]))\n")
             for d in 1:ndims(neighborhood_search)
                 @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
             end
         end
-        @synchronize()
+        return n_particles_in_neighbor_cell
+    end
+
+    neighborhood = neighboring_cells(cell, neighbor_search)
+    (neighbor_cell, state) = iterate(neighborhood)
+
+    n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell))
+    @synchronize()
+
+    while true
+        next = iterate(neighborhood, state)
+        if next !== nothing
+            (next_neighbor_cell, state) = next
+            next_n_particles_in_neighbor_cell = stage!(next_local_points, next_local_neighbor_coords, Tuple(next_neighbor_cell))
+        end
+
         # Now each thread works on one point again
         if particleidx <= n_particles_in_current_cell
             for local_neighbor in 1:n_particles_in_neighbor_cell
@@ -483,7 +493,16 @@ end
                 end
             end
         end
+        next === nothing && break
         @synchronize()
+        # swap variables
+        n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell
+        temp = local_points
+        local_points = next_local_points
+        next_local_points = temp
+        temp = local_neighbor_coords
+        local_neighbor_coords = next_local_neighbor_coords
+        next_local_neighbor_coords = temp
     end
 end
 

From 843116c518fb8034b5804b31c7f243615a51112b Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Fri, 15 Nov 2024 10:21:15 +0100
Subject: [PATCH 4/6] Fix double buffered kernel

---
 src/nhs_grid.jl | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index fc94df16..7b994bd0 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -429,7 +429,7 @@ end
 
     local_points = @localmem Int32 MAX
     local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
-    
+
     next_local_points = @localmem Int32 MAX
     next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
 
@@ -446,28 +446,36 @@ end
 
     @inline function stage!(local_points, local_neighbor_coords, neighbor_cell)
         points_view = points_in_cell(neighbor_cell, neighborhood_search)
-        n_particles_in_neighbor_cell = length(points_view)
+        n_particles_in_neighbor_cell_ = length(points_view)
 
         # First use all threads to load the neighbors into local memory in parallel
-        if particleidx <= n_particles_in_neighbor_cell
+        if particleidx <= n_particles_in_neighbor_cell_
             @inbounds p = local_points[particleidx] = points_view[particleidx]
             for d in 1:ndims(neighborhood_search)
                 @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
             end
         end
-        return n_particles_in_neighbor_cell
+        return n_particles_in_neighbor_cell_
     end
 
-    neighborhood = neighboring_cells(cell, neighbor_search)
-    (neighbor_cell, state) = iterate(neighborhood)
+    neighborhood = neighboring_cells(cell, neighborhood_search)
+    # (neighbor_cell, state) = iterate(neighborhood)
+    neighbor_cell = first(neighborhood)
 
     n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell))
     @synchronize()
 
-    while true
-        next = iterate(neighborhood, state)
-        if next !== nothing
-            (next_neighbor_cell, state) = next
+    for neighbor_ in 1:length(neighborhood)
+        neighbor_cell = @inbounds neighborhood[neighbor_]
+
+    # while true
+    #     next = iterate(neighborhood, state)
+    #     if next !== nothing
+        # n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell))
+        # @synchronize
+        if neighbor_ < length(neighborhood)
+            next_neighbor_cell = neighborhood[neighbor_ + 1]
+            # (next_neighbor_cell, state) = next
             next_n_particles_in_neighbor_cell = stage!(next_local_points, next_local_neighbor_coords, Tuple(next_neighbor_cell))
         end
 
@@ -493,7 +501,8 @@ end
                 end
             end
         end
-        next === nothing && break
+        # next === nothing && break
+        neighbor_ >= length(neighborhood) && break
         @synchronize()
         # swap variables
         n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell

From 02b22f06b2084a3432599e20b253b0ec151c18e3 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Fri, 15 Nov 2024 14:24:43 +0100
Subject: [PATCH 5/6] Improve readability

---
 src/nhs_grid.jl | 161 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 132 insertions(+), 29 deletions(-)

diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index 7b994bd0..19d2a52f 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -420,6 +420,37 @@ end
     return nothing
 end
 
+@inline function copy_to_localmem!(local_points, local_neighbor_coords,
+                                   neighbor_cell, neighbor_system_coords,
+                                   neighborhood_search, particleidx)
+    points_view = points_in_cell(neighbor_cell, neighborhood_search)
+    n_particles_in_neighbor_cell = length(points_view)
+
+    # First use all threads to load the neighbors into local memory in parallel
+    if particleidx <= n_particles_in_neighbor_cell
+        @inbounds p = local_points[particleidx] = points_view[particleidx]
+        for d in 1:ndims(neighborhood_search)
+            @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
+        end
+    end
+    return n_particles_in_neighbor_cell
+end
+
+# @parallel(block) for cell in cells
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell
+#             copy_coordinates_to_localmem(neighbor)
+#
+#         # Make sure all threads finished the copying
+#         @synchronize
+#
+#         @parallel(thread) for particle in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we continue with copying
+#         @synchronize
 @kernel cpu=false function foreach_neighbor_localmem(f::F, system_coords, neighbor_system_coords,
                                neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX}
     cell_ = @index(Group)
@@ -427,16 +458,16 @@ end
     particleidx = @index(Local)
     @assert 1 <= particleidx <= MAX
 
+    # Coordinate buffer in local memory
     local_points = @localmem Int32 MAX
     local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
 
-    next_local_points = @localmem Int32 MAX
-    next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+    points = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
 
-    pv = points_in_cell(cell, neighborhood_search)
-    n_particles_in_current_cell = length(pv)
+    # Extract point coordinates if a point lies on this thread
     if particleidx <= n_particles_in_current_cell
-        point = @inbounds pv[particleidx]
+        point = @inbounds points[particleidx]
         point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
                                                  point)
     else
@@ -444,39 +475,113 @@ end
         point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
     end
 
-    @inline function stage!(local_points, local_neighbor_coords, neighbor_cell)
-        points_view = points_in_cell(neighbor_cell, neighborhood_search)
-        n_particles_in_neighbor_cell_ = length(points_view)
+    for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
+        neighbor_cell = Tuple(neighbor_cell_)
+
+        n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords,
+                                                         neighbor_cell, neighbor_system_coords,
+                                                         neighborhood_search, particleidx)
+
+        # Make sure all threads finished the copying
+        @synchronize
 
-        # First use all threads to load the neighbors into local memory in parallel
-        if particleidx <= n_particles_in_neighbor_cell_
-            @inbounds p = local_points[particleidx] = points_view[particleidx]
-            for d in 1:ndims(neighborhood_search)
-                @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
+        # Now each thread works on one point again
+        if particleidx <= n_particles_in_current_cell
+            for local_neighbor in 1:n_particles_in_neighbor_cell
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if distance2 <= search_radius^2
+                    distance = sqrt(distance2) # TODO: eventuell fastmath
+
+                    # Inline to avoid loss of performance
+                    # compared to not using `foreach_point_neighbor`.
+                    @inline f(point, neighbor, pos_diff, distance)
+                end
             end
         end
-        return n_particles_in_neighbor_cell_
+
+        # Make sure all threads finished computing before we continue with copying
+        @synchronize()
+    end
+end
+
+# @parallel(block) for cell in cells
+#     @parallel(thread) for neighbor in first_neighbor_cell
+#         copy_coordinates_to_localmem!(local_coords, neighbor)
+#
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell + 1
+#             copy_coordinates_to_localmem!(next_local_coords, neighbor)
+#
+#         # No synchronize needed. The following loop works on `local_coords`.
+#
+#         @parallel(thread) for particle in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we switch variables
+#         @synchronize
+#         local_coords, next_local_coords = next_local_coords, local_coords
+@kernel cpu=false function foreach_neighbor_double_buffer(f::F, system_coords, neighbor_system_coords,
+                               neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+
+    # Next coordinate buffer in local memory
+    next_local_points = @localmem Int32 MAX
+    next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+
+    points = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        point = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 point)
+    else
+        point = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
     end
 
     neighborhood = neighboring_cells(cell, neighborhood_search)
     # (neighbor_cell, state) = iterate(neighborhood)
-    neighbor_cell = first(neighborhood)
+    neighbor_cell = Tuple(first(neighborhood))
+
+    n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords,
+                                                     neighbor_cell, neighbor_system_coords,
+                                                     neighborhood_search, particleidx)
 
-    n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell))
     @synchronize()
 
     for neighbor_ in 1:length(neighborhood)
-        neighbor_cell = @inbounds neighborhood[neighbor_]
+        neighbor_cell = @inbounds Tuple(neighborhood[neighbor_])
 
     # while true
     #     next = iterate(neighborhood, state)
     #     if next !== nothing
-        # n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell))
-        # @synchronize
+
         if neighbor_ < length(neighborhood)
-            next_neighbor_cell = neighborhood[neighbor_ + 1]
+            next_neighbor_cell = @inbounds Tuple(neighborhood[neighbor_ + 1])
             # (next_neighbor_cell, state) = next
-            next_n_particles_in_neighbor_cell = stage!(next_local_points, next_local_neighbor_coords, Tuple(next_neighbor_cell))
+            next_n_particles_in_neighbor_cell = copy_to_localmem!(next_local_points, next_local_neighbor_coords,
+                                                             next_neighbor_cell, neighbor_system_coords,
+                                                             neighborhood_search, particleidx)
         end
 
         # Now each thread works on one point again
@@ -484,7 +589,8 @@ end
             for local_neighbor in 1:n_particles_in_neighbor_cell
                 @inbounds neighbor = local_points[local_neighbor]
                 @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
-                                                  Val(ndims(neighborhood_search)), local_neighbor)
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
 
                 pos_diff = point_coords - neighbor_coords
                 distance2 = dot(pos_diff, pos_diff)
@@ -492,7 +598,6 @@ end
                 # TODO periodic
 
                 if distance2 <= search_radius^2
-                    # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n")
                     distance = sqrt(distance2) # TODO: eventuell fastmath
 
                     # Inline to avoid loss of performance
@@ -501,17 +606,15 @@ end
                 end
             end
         end
+
         # next === nothing && break
         neighbor_ >= length(neighborhood) && break
         @synchronize()
+
         # swap variables
         n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell
-        temp = local_points
-        local_points = next_local_points
-        next_local_points = temp
-        temp = local_neighbor_coords
-        local_neighbor_coords = next_local_neighbor_coords
-        next_local_neighbor_coords = temp
+        local_points, next_local_points = next_local_points, local_points
+        local_neighbor_coords, next_local_neighbor_coords = next_local_neighbor_coords, local_neighbor_coords
     end
 end
 

From 3b681aadf1ab0b61ce10c98eb0c9ae6358856449 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Mon, 27 Oct 2025 15:57:45 +0100
Subject: [PATCH 6/6] Add another version of localmem kernel

---
 src/nhs_grid.jl | 161 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index 19d2a52f..eeff0713 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -676,6 +676,167 @@ end
     end
 end
 
+# Another version of the above that allows for groups smaller than cells
+# # for cell in cells
+# #     for point in cell
+# #         for neighbor_cell in neighbor_cells
+
+# #             for neighbor in neighbor_cell
+# function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search::GridNeighborhoodSearch;
+#                                 parallelization_backend::ParallelizationBackend = default_backend(system_coords),
+#                                 points = axes(system_coords, 2)) where {T}
+#     # The type annotation above is to make Julia specialize on the type of the function.
+#     # Otherwise, unspecialized code will cause a lot of allocations
+#     # and heavily impact performance.
+#     # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
+
+#     if system_coords !== neighbor_coords
+#         foreach_point_neighbor_fallback(f, system_coords, neighbor_coords,
+#                                         neighborhood_search;
+#                                         parallelization_backend,
+#                                         points)
+#         return nothing
+#     end
+
+#     # Explicit bounds check before the GPU kernel
+#     @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
+
+#     linear_indices = neighborhood_search.cell_list.linear_indices
+#     cartesian_indices = CartesianIndices(size(linear_indices))
+#     lengths = Array(neighborhood_search.cell_list.cells.lengths)
+#     # max_particles_per_cell = maximum(lengths)
+#     nonempty_cells = Adapt.adapt(parallelization_backend,
+#                                  filter(index -> lengths[linear_indices[index]] > 0,
+#                                         cartesian_indices))
+
+#     # Launch: one workgroup per non-empty cell
+#     workgroupsize = 16
+#     n_groups = length(nonempty_cells)
+#     ndrange  = n_groups * workgroupsize
+
+#     kernel = foreach_neighbor_gpu(parallelization_backend, workgroupsize)
+#     kernel(f, system_coords, neighbor_coords, neighborhood_search, points, nonempty_cells, Val(workgroupsize); ndrange)
+
+#     KernelAbstractions.synchronize(parallelization_backend)
+
+#     return nothing
+# end
+
+# function foreach_point_neighbor_fallback(f::T, system_coords, neighbor_coords, neighborhood_search;
+#                                 parallelization_backend::ParallelizationBackend = default_backend(system_coords),
+#                                 points = axes(system_coords, 2)) where {T}
+#     # The type annotation above is to make Julia specialize on the type of the function.
+#     # Otherwise, unspecialized code will cause a lot of allocations
+#     # and heavily impact performance.
+#     # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
+
+#     # Explicit bounds check before the hot loop (or GPU kernel)
+#     @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
+
+#     @threaded parallelization_backend for point in points
+#         # Now we can safely assume that `point` is inbounds
+#         @inbounds foreach_neighbor(f, system_coords, neighbor_coords,
+#                                    neighborhood_search, point)
+#     end
+
+#     return nothing
+# end
+
+# @kernel function foreach_neighbor_gpu(f, system_coords, neighbor_system_coords,
+#                                       neighborhood_search::GridNeighborhoodSearch, points,
+#                                       cells, ::Val{workgroupsize}) where {workgroupsize}
+#     gid = @index(Group)
+#     lid = @index(Local)
+
+#     ELTYPE = eltype(neighborhood_search)
+
+#     # Shared (local) memory to stage one tile of neighbor attributes
+#     # 2D shown here. For 3D, add z buffer likewise.
+#     sh_x = @localmem eltype(neighbor_system_coords) (workgroupsize,)
+#     sh_y = @localmem eltype(neighbor_system_coords) (workgroupsize,)
+#     # neighbor particle indices (use Int32 if your indices are 32-bit)
+#     sh_id = @localmem Int32 (workgroupsize,)
+
+#     # Each group processes exactly one cell
+#     @assert gid <= length(cells)
+#     this_cell = @inbounds Tuple(cells[gid])
+
+#     points_in_cell_ = @inbounds points_in_cell(this_cell, neighborhood_search)
+#     @assert length(points_in_cell_) != 0
+
+#     neighboring_cells_ = neighboring_cells(this_cell, neighborhood_search)
+#     search_radius2 = search_radius(neighborhood_search)^2
+
+#     # Tile particles in cell by workgroup size
+#     n_tiles = ceil(Int, length(points_in_cell_) / workgroupsize)
+
+#     for tile_i in 1:n_tiles
+#         cell_i = (tile_i - 1) * workgroupsize + lid
+
+#         if cell_i <= length(points_in_cell_)
+#             # Load one point from this cell
+#             point = @inbounds points_in_cell_[cell_i]
+#             point_coords = @inbounds extract_svector(system_coords,
+#                                                         Val(ndims(neighborhood_search)),
+#                                                         point)
+#         else
+#             point = zero(Int32)
+#             point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+#         end
+
+#         # Loop over neighboring cells
+#         for neighbor_cell_ in neighboring_cells_
+#             neighbor_cell = Tuple(neighbor_cell_)
+
+#             # We can use `@inbounds` here, assuming that the initialize/update steps verified
+#             # that all cells have valid neighbors.
+#             neighbors = @inbounds points_in_cell(neighbor_cell, neighborhood_search)
+#             n_neighbors = length(neighbors)
+
+#             # Tile neighbors by WG and stage to local memory
+#             j = 1
+#             # This loop should only have one iteration if workgroupsize is larger than cell size
+#             while j <= n_neighbors
+#                 # Cooperative load: each thread brings one neighbor into shared memory
+#                 lane_idx = j + (lid - 1)
+#                 if lane_idx <= n_neighbors
+#                     nb = @inbounds neighbors[lane_idx]
+#                     @inbounds sh_id[lid] = Int32(nb)
+#                     nb_coords = @inbounds extract_svector(neighbor_system_coords,
+#                                                         Val(ndims(neighborhood_search)), nb)
+#                     @inbounds sh_x[lid] = nb_coords[1]
+#                     @inbounds sh_y[lid] = nb_coords[2]
+#                 end
+
+#                 @synchronize
+
+#                 if cell_i <= length(points_in_cell_)
+#                     # Reuse the staged tile for all threads' i
+#                     # Determine how many valid neighbors were staged in this tile
+#                     tile_count = min(workgroupsize, n_neighbors - j + 1)
+
+#                     @inbounds for t in 1:tile_count
+#                         neighbor = @inbounds sh_id[t]
+#                         # Compute pairwise interaction
+#                         pos_diff1 = @inbounds convert(ELTYPE, point_coords[1] - sh_x[t])
+#                         pos_diff2 = @inbounds convert(ELTYPE, point_coords[2] - sh_y[t])
+#                         distance2 = pos_diff1 * pos_diff1 + pos_diff2 * pos_diff2
+#                         if distance2 <= search_radius2
+#                             distance = sqrt(distance2)
+
+#                             # Call the user-supplied functor
+#                             @inline f(point, neighbor, SVector(pos_diff1, pos_diff2), distance)
+#                         end
+#                     end
+#                 end
+
+#                 @synchronize
+#                 j += workgroupsize
+#             end
+#         end
+#     end
+# end
+
 @inline function neighboring_cells(cell, neighborhood_search)
     NDIMS = ndims(neighborhood_search)