From a75ff59d909a166f586f9e59a796ad89761d1635 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:51:12 +0100 Subject: [PATCH 1/6] Add GPU kernel using shared memory Co-authored-by: Valentin Churavy --- src/nhs_grid.jl | 150 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 3b5dd266..2b4c9c93 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -395,6 +395,156 @@ end end end +# for cell in cells +# for point in cell +# for neighbor_cell in neighbor_cells + +# for neighbor in neighbor_cell +@inline function foreach_point_neighbor_localmem(f, system_coords, neighbor_coords, + neighborhood_search; search_radius = search_radius(neighborhood_search)) + backend = KernelAbstractions.get_backend(system_coords) + max_particles_per_cell = 64 + nhs_size = size(neighborhood_search.cell_list.linear_indices) + # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + linear_indices = neighborhood_search.cell_list.linear_indices + cartesian_indices = CartesianIndices(size(linear_indices)) + lengths = Array(neighborhood_search.cell_list.cells.lengths) + # max_particles_per_cell = maximum(lengths) + nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices)) + ndrange = max_particles_per_cell * length(nonempty_cells) + kernel = foreach_neighbor_localmem(backend, (max_particles_per_cell,)) + kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange) + + KernelAbstractions.synchronize(backend) + + return nothing +end + +@kernel cpu=false function foreach_neighbor_localmem(f::F, system_coords, neighbor_system_coords, + neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX} + cell_ = @index(Group) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + local_points = @localmem Int32 MAX + local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) + + pv = points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(pv) + if particleidx <= n_particles_in_current_cell + point = @inbounds pv[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + point) + # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n") + else + point = zero(Int32) + point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + end + + for i in -1:1, j in -1:1, k in -1:1 + neighbor_cell = (cell[1] + i, cell[2] + j, cell[3] + k) + # for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + # neighbor_cell = Tuple(neighbor_cell_) + points_view = points_in_cell(neighbor_cell, neighborhood_search) + n_particles_in_neighbor_cell = length(points_view) + # if n_particles_in_neighbor_cell + # continue + # end + + # First use all threads to load the neighbors into local memory in parallel + if particleidx <= n_particles_in_neighbor_cell + @inbounds p = local_points[particleidx] = points_view[particleidx] + # KernelAbstractions.@print("Point $point, neighbor $p with coords ($(neighbor_system_coords[1, p]), $(neighbor_system_coords[2, p]))\n") + for d in 1:ndims(neighborhood_search) + @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p] + end + end + @synchronize() + # Now each thread works on one point again + if particleidx <= n_particles_in_current_cell + for local_neighbor in 1:n_particles_in_neighbor_cell + @inbounds neighbor = local_points[local_neighbor] + @inbounds neighbor_coords = extract_svector(local_neighbor_coords, + Val(ndims(neighborhood_search)), local_neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if distance2 <= search_radius^2 + # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n") + distance = sqrt(distance2) # TODO: eventuell fastmath + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + @inline f(point, neighbor, pos_diff, distance) + end + end + end + @synchronize() + end +end + +@inline function foreach_point_neighbor_cell_blocks(f, system_coords, neighbor_coords, + neighborhood_search) + backend = KernelAbstractions.get_backend(system_coords) + max_particles_per_cell = 64 + nhs_size = size(neighborhood_search.cell_list.linear_indices) + cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + ndrange = max_particles_per_cell * length(cells) + kernel = foreach_neighbor_cell_blocks(backend, (max_particles_per_cell,)) + kernel(f, system_coords, neighbor_coords, neighborhood_search, Val(max_particles_per_cell); ndrange) + + KernelAbstractions.synchronize(backend) + + return nothing +end + +@kernel cpu=false function foreach_neighbor_cell_blocks(f::F, system_coords, neighbor_coords, + neighborhood_search, ::Val{MAX}) where {F, MAX} + cell_ = @index(Group) + nhs_size = size(neighborhood_search.cell_list.linear_indices) + @inbounds cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + pv = points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(pv) + if particleidx <= n_particles_in_current_cell + point = @inbounds pv[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + point) + # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n") + + for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + points_view = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor in points_view + @inbounds neighbor_coords = extract_svector(neighbor_system_coords, + Val(ndims(neighborhood_search)), neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if distance2 <= search_radius^2 + # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n") + distance = sqrt(distance2) # TODO: eventuell fastmath + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + @inline f(point, neighbor, pos_diff, distance) + end + end + end + end +end + @inline function neighboring_cells(cell, neighborhood_search) NDIMS = ndims(neighborhood_search) From 8371415d36f4f803f4b261bb9059a1cc25ec6217 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 8 Nov 2024 19:42:43 +0100 Subject: [PATCH 2/6] Add missing imports --- src/PointNeighbors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 22ef3893..21bbc30b 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -6,7 +6,7 @@ using Adapt: Adapt using Atomix: Atomix using Base: @propagate_inbounds using GPUArraysCore: AbstractGPUArray -using KernelAbstractions: KernelAbstractions, @kernel, @index +using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize using LinearAlgebra: dot using Polyester: Polyester @reexport using StaticArrays: SVector From eee0a34c81394e45ed8b68b0d85517bdc8c061c7 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 8 Nov 2024 20:18:24 +0100 Subject: [PATCH 3/6] WIP: Implement double buffering --- src/nhs_grid.jl | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 2b4c9c93..fc94df16 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -429,6 +429,9 @@ end local_points = @localmem Int32 MAX local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) + + next_local_points = @localmem Int32 MAX + next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) pv = points_in_cell(cell, neighborhood_search) n_particles_in_current_cell = length(pv) @@ -436,31 +439,38 @@ end point = @inbounds pv[particleidx] point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), point) - # KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n") else point = zero(Int32) point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) end - for i in -1:1, j in -1:1, k in -1:1 - neighbor_cell = (cell[1] + i, cell[2] + j, cell[3] + k) - # for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) - # neighbor_cell = Tuple(neighbor_cell_) + @inline function stage!(local_points, local_neighbor_coords, neighbor_cell) points_view = points_in_cell(neighbor_cell, neighborhood_search) n_particles_in_neighbor_cell = length(points_view) - # if n_particles_in_neighbor_cell - # continue - # end # First use all threads to load the neighbors into local memory in parallel if particleidx <= n_particles_in_neighbor_cell @inbounds p = local_points[particleidx] = points_view[particleidx] - # KernelAbstractions.@print("Point $point, neighbor $p with coords ($(neighbor_system_coords[1, p]), $(neighbor_system_coords[2, p]))\n") for d in 1:ndims(neighborhood_search) @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p] end end - @synchronize() + return n_particles_in_neighbor_cell + end + + neighborhood = neighboring_cells(cell, neighbor_search) + (neighbor_cell, state) = iterate(neighborhood) + + n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell)) + @synchronize() + + while true + next = iterate(neighborhood, state) + if next !== nothing + (next_neighbor_cell, state) = next + next_n_particles_in_neighbor_cell = stage!(next_local_points, next_local_neighbor_coords, Tuple(next_neighbor_cell)) + end + # Now each thread works on one point again if particleidx <= n_particles_in_current_cell for local_neighbor in 1:n_particles_in_neighbor_cell @@ -483,7 +493,16 @@ end end end end + next === nothing && break @synchronize() + # swap variables + n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell + temp = local_points + local_points = next_local_points + next_local_points = temp + temp = local_neighbor_coords + local_neighbor_coords = next_local_neighbor_coords + next_local_neighbor_coords = temp end end From 843116c518fb8034b5804b31c7f243615a51112b Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:21:15 +0100 Subject: [PATCH 4/6] Fix double buffered kernel --- src/nhs_grid.jl | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index fc94df16..7b994bd0 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -429,7 +429,7 @@ end local_points = @localmem Int32 MAX local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) - + next_local_points = @localmem Int32 MAX next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) @@ -446,28 +446,36 @@ end @inline function stage!(local_points, local_neighbor_coords, neighbor_cell) points_view = points_in_cell(neighbor_cell, neighborhood_search) - n_particles_in_neighbor_cell = length(points_view) + n_particles_in_neighbor_cell_ = length(points_view) # First use all threads to load the neighbors into local memory in parallel - if particleidx <= n_particles_in_neighbor_cell + if particleidx <= n_particles_in_neighbor_cell_ @inbounds p = local_points[particleidx] = points_view[particleidx] for d in 1:ndims(neighborhood_search) @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p] end end - return n_particles_in_neighbor_cell + return n_particles_in_neighbor_cell_ end - neighborhood = neighboring_cells(cell, neighbor_search) - (neighbor_cell, state) = iterate(neighborhood) + neighborhood = neighboring_cells(cell, neighborhood_search) + # (neighbor_cell, state) = iterate(neighborhood) + neighbor_cell = first(neighborhood) n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell)) @synchronize() - while true - next = iterate(neighborhood, state) - if next !== nothing - (next_neighbor_cell, state) = next + for neighbor_ in 1:length(neighborhood) + neighbor_cell = @inbounds neighborhood[neighbor_] + + # while true + # next = iterate(neighborhood, state) + # if next !== nothing + # n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell)) + # @synchronize + if neighbor_ < length(neighborhood) + next_neighbor_cell = neighborhood[neighbor_ + 1] + # (next_neighbor_cell, state) = next next_n_particles_in_neighbor_cell = stage!(next_local_points, next_local_neighbor_coords, Tuple(next_neighbor_cell)) end @@ -493,7 +501,8 @@ end end end end - next === nothing && break + # next === nothing && break + neighbor_ >= length(neighborhood) && break @synchronize() # swap variables n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell From 02b22f06b2084a3432599e20b253b0ec151c18e3 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:24:43 +0100 Subject: [PATCH 5/6] Improve readability --- src/nhs_grid.jl | 161 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 29 deletions(-) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 7b994bd0..19d2a52f 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -420,6 +420,37 @@ end return nothing end +@inline function copy_to_localmem!(local_points, local_neighbor_coords, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx) + points_view = points_in_cell(neighbor_cell, neighborhood_search) + n_particles_in_neighbor_cell = length(points_view) + + # First use all threads to load the neighbors into local memory in parallel + if particleidx <= n_particles_in_neighbor_cell + @inbounds p = local_points[particleidx] = points_view[particleidx] + for d in 1:ndims(neighborhood_search) + @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p] + end + end + return n_particles_in_neighbor_cell +end + +# @parallel(block) for cell in cells +# for neighbor_cell in neighboring_cells +# @parallel(thread) for neighbor in neighbor_cell +# copy_coordinates_to_localmem(neighbor) +# +# # Make sure all threads finished the copying +# @synchronize +# +# @parallel(thread) for particle in cell +# for neighbor in neighbor_cell +# # This uses the neighbor coordinates from the local memory +# compute(point, neighbor) +# +# # Make sure all threads finished computing before we continue with copying +# @synchronize @kernel cpu=false function foreach_neighbor_localmem(f::F, system_coords, neighbor_system_coords, neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX} cell_ = @index(Group) @@ -427,16 +458,16 @@ end particleidx = @index(Local) @assert 1 <= particleidx <= MAX + # Coordinate buffer in local memory local_points = @localmem Int32 MAX local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) - next_local_points = @localmem Int32 MAX - next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) + points = points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(points) - pv = points_in_cell(cell, neighborhood_search) - n_particles_in_current_cell = length(pv) + # Extract point coordinates if a point lies on this thread if particleidx <= n_particles_in_current_cell - point = @inbounds pv[particleidx] + point = @inbounds points[particleidx] point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), point) else @@ -444,39 +475,113 @@ end point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) end - @inline function stage!(local_points, local_neighbor_coords, neighbor_cell) - points_view = points_in_cell(neighbor_cell, neighborhood_search) - n_particles_in_neighbor_cell_ = length(points_view) + for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + + n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx) + + # Make sure all threads finished the copying + @synchronize - # First use all threads to load the neighbors into local memory in parallel - if particleidx <= n_particles_in_neighbor_cell_ - @inbounds p = local_points[particleidx] = points_view[particleidx] - for d in 1:ndims(neighborhood_search) - @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p] + # Now each thread works on one point again + if particleidx <= n_particles_in_current_cell + for local_neighbor in 1:n_particles_in_neighbor_cell + @inbounds neighbor = local_points[local_neighbor] + @inbounds neighbor_coords = extract_svector(local_neighbor_coords, + Val(ndims(neighborhood_search)), + local_neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if distance2 <= search_radius^2 + distance = sqrt(distance2) # TODO: eventuell fastmath + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + @inline f(point, neighbor, pos_diff, distance) + end end end - return n_particles_in_neighbor_cell_ + + # Make sure all threads finished computing before we continue with copying + @synchronize() + end +end + +# @parallel(block) for cell in cells +# @parallel(thread) for neighbor in first_neighbor_cell +# copy_coordinates_to_localmem!(local_coords, neighbor) +# +# for neighbor_cell in neighboring_cells +# @parallel(thread) for neighbor in neighbor_cell + 1 +# copy_coordinates_to_localmem!(next_local_coords, neighbor) +# +# # No synchronize needed. The following loop works on `local_coords`. +# +# @parallel(thread) for particle in cell +# for neighbor in neighbor_cell +# # This uses the neighbor coordinates from the local memory +# compute(point, neighbor) +# +# # Make sure all threads finished computing before we switch variables +# @synchronize +# local_coords, next_local_coords = next_local_coords, local_coords +@kernel cpu=false function foreach_neighbor_double_buffer(f::F, system_coords, neighbor_system_coords, + neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX} + cell_ = @index(Group) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + # Coordinate buffer in local memory + local_points = @localmem Int32 MAX + local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) + + # Next coordinate buffer in local memory + next_local_points = @localmem Int32 MAX + next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) + + points = points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(points) + + # Extract point coordinates if a point lies on this thread + if particleidx <= n_particles_in_current_cell + point = @inbounds points[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + point) + else + point = zero(Int32) + point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) end neighborhood = neighboring_cells(cell, neighborhood_search) # (neighbor_cell, state) = iterate(neighborhood) - neighbor_cell = first(neighborhood) + neighbor_cell = Tuple(first(neighborhood)) + + n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx) - n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell)) @synchronize() for neighbor_ in 1:length(neighborhood) - neighbor_cell = @inbounds neighborhood[neighbor_] + neighbor_cell = @inbounds Tuple(neighborhood[neighbor_]) # while true # next = iterate(neighborhood, state) # if next !== nothing - # n_particles_in_neighbor_cell = stage!(local_points, local_neighbor_coords, Tuple(neighbor_cell)) - # @synchronize + if neighbor_ < length(neighborhood) - next_neighbor_cell = neighborhood[neighbor_ + 1] + next_neighbor_cell = @inbounds Tuple(neighborhood[neighbor_ + 1]) # (next_neighbor_cell, state) = next - next_n_particles_in_neighbor_cell = stage!(next_local_points, next_local_neighbor_coords, Tuple(next_neighbor_cell)) + next_n_particles_in_neighbor_cell = copy_to_localmem!(next_local_points, next_local_neighbor_coords, + next_neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx) end # Now each thread works on one point again @@ -484,7 +589,8 @@ end for local_neighbor in 1:n_particles_in_neighbor_cell @inbounds neighbor = local_points[local_neighbor] @inbounds neighbor_coords = extract_svector(local_neighbor_coords, - Val(ndims(neighborhood_search)), local_neighbor) + Val(ndims(neighborhood_search)), + local_neighbor) pos_diff = point_coords - neighbor_coords distance2 = dot(pos_diff, pos_diff) @@ -492,7 +598,6 @@ end # TODO periodic if distance2 <= search_radius^2 - # KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n") distance = sqrt(distance2) # TODO: eventuell fastmath # Inline to avoid loss of performance @@ -501,17 +606,15 @@ end end end end + # next === nothing && break neighbor_ >= length(neighborhood) && break @synchronize() + # swap variables n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell - temp = local_points - local_points = next_local_points - next_local_points = temp - temp = local_neighbor_coords - local_neighbor_coords = next_local_neighbor_coords - next_local_neighbor_coords = temp + local_points, next_local_points = next_local_points, local_points + local_neighbor_coords, next_local_neighbor_coords = next_local_neighbor_coords, local_neighbor_coords end end From 3b681aadf1ab0b61ce10c98eb0c9ae6358856449 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 27 Oct 2025 15:57:45 +0100 Subject: [PATCH 6/6] Add another version of localmem kernel --- src/nhs_grid.jl | 161 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 19d2a52f..eeff0713 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -676,6 +676,167 @@ end end end +# Another version of the above that allows for groups smaller than cells +# # for cell in cells +# # for point in cell +# # for neighbor_cell in neighbor_cells + +# # for neighbor in neighbor_cell +# function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search::GridNeighborhoodSearch; +# parallelization_backend::ParallelizationBackend = default_backend(system_coords), +# points = axes(system_coords, 2)) where {T} +# # The type annotation above is to make Julia specialize on the type of the function. +# # Otherwise, unspecialized code will cause a lot of allocations +# # and heavily impact performance. +# # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing + +# if system_coords !== neighbor_coords +# foreach_point_neighbor_fallback(f, system_coords, neighbor_coords, +# neighborhood_search; +# parallelization_backend, +# points) +# return nothing +# end + +# # Explicit bounds check before the GPU kernel +# @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) + +# linear_indices = neighborhood_search.cell_list.linear_indices +# cartesian_indices = CartesianIndices(size(linear_indices)) +# lengths = Array(neighborhood_search.cell_list.cells.lengths) +# # max_particles_per_cell = maximum(lengths) +# nonempty_cells = Adapt.adapt(parallelization_backend, +# filter(index -> lengths[linear_indices[index]] > 0, +# cartesian_indices)) + +# # Launch: one workgroup per non-empty cell +# workgroupsize = 16 +# n_groups = length(nonempty_cells) +# ndrange = n_groups * workgroupsize + +# kernel = foreach_neighbor_gpu(parallelization_backend, workgroupsize) +# kernel(f, system_coords, neighbor_coords, neighborhood_search, points, nonempty_cells, Val(workgroupsize); ndrange) + +# KernelAbstractions.synchronize(parallelization_backend) + +# return nothing +# end + +# function foreach_point_neighbor_fallback(f::T, system_coords, neighbor_coords, neighborhood_search; +# parallelization_backend::ParallelizationBackend = default_backend(system_coords), +# points = axes(system_coords, 2)) where {T} +# # The type annotation above is to make Julia specialize on the type of the function. +# # Otherwise, unspecialized code will cause a lot of allocations +# # and heavily impact performance. +# # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing + +# # Explicit bounds check before the hot loop (or GPU kernel) +# @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) + +# @threaded parallelization_backend for point in points +# # Now we can safely assume that `point` is inbounds +# @inbounds foreach_neighbor(f, system_coords, neighbor_coords, +# neighborhood_search, point) +# end + +# return nothing +# end + +# @kernel function foreach_neighbor_gpu(f, system_coords, neighbor_system_coords, +# neighborhood_search::GridNeighborhoodSearch, points, +# cells, ::Val{workgroupsize}) where {workgroupsize} +# gid = @index(Group) +# lid = @index(Local) + +# ELTYPE = eltype(neighborhood_search) + +# # Shared (local) memory to stage one tile of neighbor attributes +# # 2D shown here. For 3D, add z buffer likewise. +# sh_x = @localmem eltype(neighbor_system_coords) (workgroupsize,) +# sh_y = @localmem eltype(neighbor_system_coords) (workgroupsize,) +# # neighbor particle indices (use Int32 if your indices are 32-bit) +# sh_id = @localmem Int32 (workgroupsize,) + +# # Each group processes exactly one cell +# @assert gid <= length(cells) +# this_cell = @inbounds Tuple(cells[gid]) + +# points_in_cell_ = @inbounds points_in_cell(this_cell, neighborhood_search) +# @assert length(points_in_cell_) != 0 + +# neighboring_cells_ = neighboring_cells(this_cell, neighborhood_search) +# search_radius2 = search_radius(neighborhood_search)^2 + +# # Tile particles in cell by workgroup size +# n_tiles = ceil(Int, length(points_in_cell_) / workgroupsize) + +# for tile_i in 1:n_tiles +# cell_i = (tile_i - 1) * workgroupsize + lid + +# if cell_i <= length(points_in_cell_) +# # Load one point from this cell +# point = @inbounds points_in_cell_[cell_i] +# point_coords = @inbounds extract_svector(system_coords, +# Val(ndims(neighborhood_search)), +# point) +# else +# point = zero(Int32) +# point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) +# end + +# # Loop over neighboring cells +# for neighbor_cell_ in neighboring_cells_ +# neighbor_cell = Tuple(neighbor_cell_) + +# # We can use `@inbounds` here, assuming that the initialize/update steps verified +# # that all cells have valid neighbors. +# neighbors = @inbounds points_in_cell(neighbor_cell, neighborhood_search) +# n_neighbors = length(neighbors) + +# # Tile neighbors by WG and stage to local memory +# j = 1 +# # This loop should only have one iteration if workgroupsize is larger than cell size +# while j <= n_neighbors +# # Cooperative load: each thread brings one neighbor into shared memory +# lane_idx = j + (lid - 1) +# if lane_idx <= n_neighbors +# nb = @inbounds neighbors[lane_idx] +# @inbounds sh_id[lid] = Int32(nb) +# nb_coords = @inbounds extract_svector(neighbor_system_coords, +# Val(ndims(neighborhood_search)), nb) +# @inbounds sh_x[lid] = nb_coords[1] +# @inbounds sh_y[lid] = nb_coords[2] +# end + +# @synchronize + +# if cell_i <= length(points_in_cell_) +# # Reuse the staged tile for all threads' i +# # Determine how many valid neighbors were staged in this tile +# tile_count = min(workgroupsize, n_neighbors - j + 1) + +# @inbounds for t in 1:tile_count +# neighbor = @inbounds sh_id[t] +# # Compute pairwise interaction +# pos_diff1 = @inbounds convert(ELTYPE, point_coords[1] - sh_x[t]) +# pos_diff2 = @inbounds convert(ELTYPE, point_coords[2] - sh_y[t]) +# distance2 = pos_diff1 * pos_diff1 + pos_diff2 * pos_diff2 +# if distance2 <= search_radius2 +# distance = sqrt(distance2) + +# # Call the user-supplied functor +# @inline f(point, neighbor, SVector(pos_diff1, pos_diff2), distance) +# end +# end +# end + +# @synchronize +# j += workgroupsize +# end +# end +# end +# end + @inline function neighboring_cells(cell, neighborhood_search) NDIMS = ndims(neighborhood_search)