diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl
index 6e7ad93e..4cc16576 100644
--- a/src/PointNeighbors.jl
+++ b/src/PointNeighbors.jl
@@ -6,7 +6,7 @@ using Adapt: Adapt
 using Atomix: Atomix
 using Base: @propagate_inbounds
 using GPUArraysCore: AbstractGPUArray
-using KernelAbstractions: KernelAbstractions, @kernel, @index
+using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize
 using LinearAlgebra: dot
 using Polyester: Polyester
 @reexport using StaticArrays: SVector
diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index ba613ecd..df2ecca4 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -535,6 +535,448 @@ end
 end
 end
+# for cell in cells
+#     for point in cell
+#         for neighbor_cell in neighbor_cells
+#             for neighbor in neighbor_cell
+@inline function foreach_point_neighbor_localmem(f, system_coords, neighbor_coords,
+                                                 neighborhood_search;
+                                                 search_radius = search_radius(neighborhood_search))
+    backend = KernelAbstractions.get_backend(system_coords)
+    max_particles_per_cell = 64
+    nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    linear_indices = neighborhood_search.cell_list.linear_indices
+    cartesian_indices = CartesianIndices(size(linear_indices))
+    lengths = Array(neighborhood_search.cell_list.cells.lengths)
+    # max_particles_per_cell = maximum(lengths)
+    nonempty_cells = Adapt.adapt(backend,
+                                 filter(index -> lengths[linear_indices[index]] > 0,
+                                        cartesian_indices))
+    ndrange = max_particles_per_cell * length(nonempty_cells)
+    kernel = foreach_neighbor_localmem(backend, (max_particles_per_cell,))
+    kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells,
+           Val(max_particles_per_cell), search_radius; ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
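+
+# Usage sketch (illustrative only, not part of this PR): count the neighbors of
+# each point with the local-memory kernel. `coords`, `nhs` and `counts` are
+# hypothetical names for a coordinate matrix on the GPU and an already updated
+# neighborhood search whose cell list provides `linear_indices`.
+#
+# backend = KernelAbstractions.get_backend(coords)
+# counts = KernelAbstractions.zeros(backend, Int32, size(coords, 2))
+# foreach_point_neighbor_localmem(coords, coords, nhs) do point, neighbor, pos_diff, distance
+#     Atomix.@atomic counts[point] += Int32(1)
+# end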
+
+@inline function copy_to_localmem!(local_points, local_neighbor_coords,
+                                   neighbor_cell, neighbor_system_coords,
+                                   neighborhood_search, particleidx)
+    points_view = points_in_cell(neighbor_cell, neighborhood_search)
+    n_particles_in_neighbor_cell = length(points_view)
+
+    # First use all threads to load the neighbors into local memory in parallel
+    if particleidx <= n_particles_in_neighbor_cell
+        @inbounds p = local_points[particleidx] = points_view[particleidx]
+        for d in 1:ndims(neighborhood_search)
+            @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
+        end
+    end
+
+    return n_particles_in_neighbor_cell
+end
+
+# @parallel(block) for cell in cells
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell
+#             copy_coordinates_to_localmem(neighbor)
+#
+#         # Make sure all threads finished the copying
+#         @synchronize
+#
+#         @parallel(thread) for point in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we continue with copying
+#         @synchronize
+@kernel cpu=false function foreach_neighbor_localmem(f::F, system_coords,
+                                                     neighbor_system_coords,
+                                                     neighborhood_search, cells,
+                                                     ::Val{MAX}, search_radius) where {F, MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+
+    points = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        point = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords,
+                                                 Val(ndims(neighborhood_search)), point)
+    else
+        point = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+    end
+
+    for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
+        neighbor_cell = Tuple(neighbor_cell_)
+
+        n_particles_in_neighbor_cell = copy_to_localmem!(local_points,
+                                                         local_neighbor_coords,
+                                                         neighbor_cell,
+                                                         neighbor_system_coords,
+                                                         neighborhood_search, particleidx)
+
+        # Make sure all threads finished the copying
+        @synchronize
+
+        # Now each thread works on one point again
+        if particleidx <= n_particles_in_current_cell
+            for local_neighbor in 1:n_particles_in_neighbor_cell
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if distance2 <= search_radius^2
+                    distance = sqrt(distance2) # TODO: maybe use fastmath
+
+                    # Inline to avoid loss of performance
+                    # compared to not using `foreach_point_neighbor`.
+                    @inline f(point, neighbor, pos_diff, distance)
+                end
+            end
+        end
+
+        # Make sure all threads finished computing before we continue with copying
+        @synchronize
+    end
+end
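+
+# Sanity check sketch (illustrative): the kernel should visit the same pairs as
+# the package's existing `foreach_point_neighbor`. Continuing the hypothetical
+# `coords`/`nhs`/`counts` names from the usage sketch above, compared on the host:
+#
+# counts_ref = KernelAbstractions.zeros(backend, Int32, size(coords, 2))
+# foreach_point_neighbor(coords, coords, nhs) do point, neighbor, pos_diff, distance
+#     Atomix.@atomic counts_ref[point] += Int32(1)
+# end
+# @assert Array(counts_ref) == Array(counts)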
+
+# @parallel(block) for cell in cells
+#     @parallel(thread) for neighbor in first_neighbor_cell
+#         copy_coordinates_to_localmem!(local_coords, neighbor)
+#
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in next_neighbor_cell
+#             copy_coordinates_to_localmem!(next_local_coords, neighbor)
+#
+#         # No synchronize needed. The following loop works on `local_coords`.
+#
+#         @parallel(thread) for point in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we swap the buffers
+#         @synchronize
+#         local_coords, next_local_coords = next_local_coords, local_coords
+@kernel cpu=false function foreach_neighbor_double_buffer(f::F, system_coords,
+                                                          neighbor_system_coords,
+                                                          neighborhood_search, cells,
+                                                          ::Val{MAX}, search_radius) where {F, MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+
+    # Next coordinate buffer in local memory
+    next_local_points = @localmem Int32 MAX
+    next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+
+    points = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        point = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords,
+                                                 Val(ndims(neighborhood_search)), point)
+    else
+        point = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+    end
+
+    neighborhood = neighboring_cells(cell, neighborhood_search)
+    # (neighbor_cell, state) = iterate(neighborhood)
+    neighbor_cell = Tuple(first(neighborhood))
+
+    n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords,
+                                                     neighbor_cell, neighbor_system_coords,
+                                                     neighborhood_search, particleidx)
+
+    @synchronize
+
+    for neighbor_ in 1:length(neighborhood)
+        neighbor_cell = @inbounds Tuple(neighborhood[neighbor_])
+
+        # while true
+        #     next = iterate(neighborhood, state)
+        #     if next !== nothing
+
+        # Prefetch the next neighbor cell into the second buffer
+        # while the current one is being processed
+        if neighbor_ < length(neighborhood)
+            next_neighbor_cell = @inbounds Tuple(neighborhood[neighbor_ + 1])
+            # (next_neighbor_cell, state) = next
+            next_n_particles_in_neighbor_cell = copy_to_localmem!(next_local_points,
+                                                                  next_local_neighbor_coords,
+                                                                  next_neighbor_cell,
+                                                                  neighbor_system_coords,
+                                                                  neighborhood_search,
+                                                                  particleidx)
+        end
+
+        # Now each thread works on one point again
+        if particleidx <= n_particles_in_current_cell
+            for local_neighbor in 1:n_particles_in_neighbor_cell
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if distance2 <= search_radius^2
+                    distance = sqrt(distance2) # TODO: maybe use fastmath
+
+                    # Inline to avoid loss of performance
+                    # compared to not using `foreach_point_neighbor`.
+                    @inline f(point, neighbor, pos_diff, distance)
+                end
+            end
+        end
+
+        # next === nothing && break
+        neighbor_ >= length(neighborhood) && break
+        @synchronize
+
+        # Swap the buffers
+        n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell
+        local_points, next_local_points = next_local_points, local_points
+        local_neighbor_coords, next_local_neighbor_coords = next_local_neighbor_coords, local_neighbor_coords
+    end
+end
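+
+# Launcher sketch (illustrative): the double-buffer kernel takes the same
+# arguments as `foreach_neighbor_localmem`, so it can be launched by swapping
+# the kernel name in `foreach_point_neighbor_localmem`:
+#
+# kernel = foreach_neighbor_double_buffer(backend, (max_particles_per_cell,))
+# kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells,
+#        Val(max_particles_per_cell), search_radius; ndrange)
+# KernelAbstractions.synchronize(backend)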
+
+@inline function foreach_point_neighbor_cell_blocks(f, system_coords, neighbor_coords,
+                                                    neighborhood_search)
+    backend = KernelAbstractions.get_backend(system_coords)
+    max_particles_per_cell = 64
+    nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    ndrange = max_particles_per_cell * length(cells)
+    kernel = foreach_neighbor_cell_blocks(backend, (max_particles_per_cell,))
+    kernel(f, system_coords, neighbor_coords, neighborhood_search,
+           Val(max_particles_per_cell), search_radius(neighborhood_search); ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+@kernel cpu=false function foreach_neighbor_cell_blocks(f::F, system_coords,
+                                                        neighbor_system_coords,
+                                                        neighborhood_search,
+                                                        ::Val{MAX}, search_radius) where {F, MAX}
+    cell_ = @index(Group)
+    nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    pv = points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(pv)
+    if particleidx <= n_particles_in_current_cell
+        point = @inbounds pv[particleidx]
+        point_coords = @inbounds extract_svector(system_coords,
+                                                 Val(ndims(neighborhood_search)), point)
+        # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n")
+
+        for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
+            neighbor_cell = Tuple(neighbor_cell_)
+            points_view = points_in_cell(neighbor_cell, neighborhood_search)
+
+            for neighbor in points_view
+                @inbounds neighbor_coords = extract_svector(neighbor_system_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if distance2 <= search_radius^2
+                    # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n")
+                    distance = sqrt(distance2) # TODO: maybe use fastmath
+
+                    # Inline to avoid loss of performance
+                    # compared to not using `foreach_point_neighbor`.
+                    @inline f(point, neighbor, pos_diff, distance)
+                end
+            end
+        end
+    end
+end
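+
+# Note on the hardcoded `max_particles_per_cell = 64` above: it must be an upper
+# bound on the cell occupancy, or points in overfull cells are skipped (and the
+# local-memory kernels read past their buffers). A sketch of sizing it from the
+# cell list instead, reusing the `lengths` array already gathered in
+# `foreach_point_neighbor_localmem` and rounding up to a multiple of 32 to keep
+# full warps:
+#
+# lengths = Array(neighborhood_search.cell_list.cells.lengths)
+# max_particles_per_cell = 32 * cld(maximum(lengths), 32)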
+
+# Another version of the above that allows for groups smaller than cells
+# # for cell in cells
+# #     for point in cell
+# #         for neighbor_cell in neighbor_cells
+# #             for neighbor in neighbor_cell
+# function foreach_point_neighbor(f::T, system_coords, neighbor_coords,
+#                                 neighborhood_search::GridNeighborhoodSearch;
+#                                 parallelization_backend::ParallelizationBackend = default_backend(system_coords),
+#                                 points = axes(system_coords, 2)) where {T}
+#     # The type annotation above is to make Julia specialize on the type of the function.
+#     # Otherwise, unspecialized code will cause a lot of allocations
+#     # and heavily impact performance.
+#     # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
+
+#     if system_coords !== neighbor_coords
+#         foreach_point_neighbor_fallback(f, system_coords, neighbor_coords,
+#                                         neighborhood_search;
+#                                         parallelization_backend, points)
+#         return nothing
+#     end
+
+#     # Explicit bounds check before the GPU kernel
+#     @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
+
+#     linear_indices = neighborhood_search.cell_list.linear_indices
+#     cartesian_indices = CartesianIndices(size(linear_indices))
+#     lengths = Array(neighborhood_search.cell_list.cells.lengths)
+#     # max_particles_per_cell = maximum(lengths)
+#     nonempty_cells = Adapt.adapt(parallelization_backend,
+#                                  filter(index -> lengths[linear_indices[index]] > 0,
+#                                         cartesian_indices))
+
+#     # Launch: one workgroup per non-empty cell
+#     workgroupsize = 16
+#     n_groups = length(nonempty_cells)
+#     ndrange = n_groups * workgroupsize
+
+#     kernel = foreach_neighbor_gpu(parallelization_backend, workgroupsize)
+#     kernel(f, system_coords, neighbor_coords, neighborhood_search, points,
+#            nonempty_cells, Val(workgroupsize); ndrange)
+
+#     KernelAbstractions.synchronize(parallelization_backend)
+
+#     return nothing
+# end
+
+# function foreach_point_neighbor_fallback(f::T, system_coords, neighbor_coords,
+#                                          neighborhood_search;
+#                                          parallelization_backend::ParallelizationBackend = default_backend(system_coords),
+#                                          points = axes(system_coords, 2)) where {T}
+#     # The type annotation above is to make Julia specialize on the type of the function.
+#     # Otherwise, unspecialized code will cause a lot of allocations
+#     # and heavily impact performance.
+#     # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
+
+#     # Explicit bounds check before the hot loop (or GPU kernel)
+#     @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
+
+#     @threaded parallelization_backend for point in points
+#         # Now we can safely assume that `point` is inbounds
+#         @inbounds foreach_neighbor(f, system_coords, neighbor_coords,
+#                                    neighborhood_search, point)
+#     end
+
+#     return nothing
+# end
+
+# @kernel function foreach_neighbor_gpu(f, system_coords, neighbor_system_coords,
+#                                       neighborhood_search::GridNeighborhoodSearch, points,
+#                                       cells, ::Val{workgroupsize}) where {workgroupsize}
+#     gid = @index(Group)
+#     lid = @index(Local)
+
+#     ELTYPE = eltype(neighborhood_search)
+
+#     # Shared (local) memory to stage one tile of neighbor attributes.
+#     # 2D shown here. For 3D, add a z buffer likewise.
+#     sh_x = @localmem eltype(neighbor_system_coords) (workgroupsize,)
+#     sh_y = @localmem eltype(neighbor_system_coords) (workgroupsize,)
+#     # Neighbor particle indices (use Int32 if your indices are 32-bit)
+#     sh_id = @localmem Int32 (workgroupsize,)
+
+#     # Each group processes exactly one cell
+#     @assert gid <= length(cells)
+#     this_cell = @inbounds Tuple(cells[gid])
+
+#     points_in_cell_ = @inbounds points_in_cell(this_cell, neighborhood_search)
+#     @assert length(points_in_cell_) != 0
+
+#     neighboring_cells_ = neighboring_cells(this_cell, neighborhood_search)
+#     search_radius2 = search_radius(neighborhood_search)^2
+
+#     # Tile particles in cell by workgroup size
+#     n_tiles = ceil(Int, length(points_in_cell_) / workgroupsize)
+
+#     for tile_i in 1:n_tiles
+#         cell_i = (tile_i - 1) * workgroupsize + lid
+
+#         if cell_i <= length(points_in_cell_)
+#             # Load one point from this cell
+#             point = @inbounds points_in_cell_[cell_i]
+#             point_coords = @inbounds extract_svector(system_coords,
+#                                                      Val(ndims(neighborhood_search)),
+#                                                      point)
+#         else
+#             point = zero(Int32)
+#             point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+#         end
+
+#         # Loop over neighboring cells
+#         for neighbor_cell_ in neighboring_cells_
+#             neighbor_cell = Tuple(neighbor_cell_)
+
+#             # We can use `@inbounds` here, assuming that the initialize/update steps verified
+#             # that all cells have valid neighbors.
+#             neighbors = @inbounds points_in_cell(neighbor_cell, neighborhood_search)
+#             n_neighbors = length(neighbors)
+
+#             # Tile neighbors by workgroup size and stage them to local memory
+#             j = 1
+#             # This loop should only have one iteration if workgroupsize is larger than the cell size
+#             while j <= n_neighbors
+#                 # Cooperative load: each thread brings one neighbor into shared memory
+#                 lane_idx = j + (lid - 1)
+#                 if lane_idx <= n_neighbors
+#                     nb = @inbounds neighbors[lane_idx]
+#                     @inbounds sh_id[lid] = Int32(nb)
+#                     nb_coords = @inbounds extract_svector(neighbor_system_coords,
+#                                                           Val(ndims(neighborhood_search)), nb)
+#                     @inbounds sh_x[lid] = nb_coords[1]
+#                     @inbounds sh_y[lid] = nb_coords[2]
+#                 end
+
+#                 @synchronize
+
+#                 if cell_i <= length(points_in_cell_)
+#                     # Reuse the staged tile for all threads' points.
+#                     # Determine how many valid neighbors were staged in this tile
+#                     tile_count = min(workgroupsize, n_neighbors - j + 1)
+
+#                     @inbounds for t in 1:tile_count
+#                         neighbor = sh_id[t]
+#                         # Compute the pairwise interaction
+#                         pos_diff1 = convert(ELTYPE, point_coords[1] - sh_x[t])
+#                         pos_diff2 = convert(ELTYPE, point_coords[2] - sh_y[t])
+#                         distance2 = pos_diff1 * pos_diff1 + pos_diff2 * pos_diff2
+#                         if distance2 <= search_radius2
+#                             distance = sqrt(distance2)
+
+#                             # Call the user-supplied functor
+#                             @inline f(point, neighbor, SVector(pos_diff1, pos_diff2), distance)
+#                         end
+#                     end
+#                 end
+
+#                 @synchronize
+#                 j += workgroupsize
+#             end
+#         end
+#     end
+# end
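+
+# 3D extension sketch for the staging above (hypothetical, following the
+# "add a z buffer likewise" note): one more local buffer, the z component in the
+# cooperative load, and the z term in the distance computation.
+#
+# sh_z = @localmem eltype(neighbor_system_coords) (workgroupsize,)
+# # in the cooperative load:
+# sh_z[lid] = nb_coords[3]
+# # in the compute loop:
+# pos_diff3 = convert(ELTYPE, point_coords[3] - sh_z[t])
+# distance2 = pos_diff1 * pos_diff1 + pos_diff2 * pos_diff2 + pos_diff3 * pos_diff3
+# # and pass SVector(pos_diff1, pos_diff2, pos_diff3) to `f`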
+
 @inline function neighboring_cells(cell, neighborhood_search)
     NDIMS = ndims(neighborhood_search)