diff --git a/Project.toml b/Project.toml index d4654124..32e1fdf5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PointNeighbors" uuid = "1c4d5385-0a27-49de-8e2c-43b175c8985c" authors = ["Erik Faulhaber "] -version = "0.6.6-dev" +version = "0.6.6" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl index ffabc805..ff08ebe3 100644 --- a/benchmarks/run_benchmarks.jl +++ b/benchmarks/run_benchmarks.jl @@ -50,10 +50,11 @@ run_benchmark(benchmark_count_neighbors, (10, 10), 3, ``` """ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches; + search_radius_factor = 3.0, parallelization_backend = PolyesterBackend(), names = ["Neighborhood search $i" for i in 1:length(neighborhood_searches)]', - seed = 1, perturbation_factor_position = 1.0) + seed = 1, perturbation_factor_position = 1.0, shuffle = false) # Multiply number of points in each iteration (roughly) by this factor scaling_factor = 4 per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension)) @@ -64,24 +65,29 @@ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborho times = zeros(iterations, length(neighborhood_searches)) for iter in 1:iterations - coordinates = point_cloud(sizes[iter]; seed, perturbation_factor_position) + coordinates_ = point_cloud(sizes[iter], search_radius_factor; + seed, perturbation_factor_position, shuffle) + coordinates = convert.(typeof(search_radius_factor), coordinates_) domain_size = maximum(sizes[iter]) + 1 # Normalize domain size to 1 coordinates ./= domain_size # Make this Float32 to make sure that Float32 benchmarks use Float32 exclusively - search_radius = 4.0f0 / domain_size + search_radius = search_radius_factor / domain_size n_particles = size(coordinates, 2) neighborhood_searches_copy = copy_neighborhood_search.(neighborhood_searches, search_radius, n_particles) for i in eachindex(neighborhood_searches_copy) - neighborhood_search = neighborhood_searches_copy[i] - PointNeighbors.initialize!(neighborhood_search, coordinates, coordinates) + neighborhood_search_ = neighborhood_searches_copy[i] + neighborhood_search = PointNeighbors.Adapt.adapt(parallelization_backend, + neighborhood_search_) + coords = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates) + PointNeighbors.initialize!(neighborhood_search, coords, coords) - time = benchmark(neighborhood_search, coordinates; parallelization_backend) + time = benchmark(neighborhood_search, coords; parallelization_backend) times[iter, i] = time time_string = BenchmarkTools.prettytime(time * 1e9) time_string_per_particle = BenchmarkTools.prettytime(time * 1e9 / n_particles) @@ -170,23 +176,74 @@ include("benchmarks/benchmarks.jl") run_benchmark_gpu(benchmark_n_body, (10, 10), 3) ``` """ -function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...) +function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) NDIMS = length(n_points_per_dimension) min_corner = 0.0f0 .* n_points_per_dimension max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) - neighborhood_searches = [GridNeighborhoodSearch{NDIMS}(search_radius = 0.0f0, - cell_list = FullGridCellList(; - search_radius = 0.0f0, - min_corner, - max_corner)) - PrecomputedNeighborhoodSearch{NDIMS}(search_radius = 0.0f0)] - - names = ["GridNeighborhoodSearch with FullGridCellList";; - "PrecomputedNeighborhoodSearch"] + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + transpose_backend = parallelization_backend isa PointNeighbors.KernelAbstractions.GPU + neighborhood_searches = [ + grid_nhs + PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, + update_neighborhood_search = grid_nhs, + transpose_backend)#, max_neighbors=128) + ] + + names = [ + "GridNeighborhoodSearch with FullGridCellList";; + "PrecomputedNeighborhoodSearch" + ] run_benchmark(benchmark, n_points_per_dimension, iterations, - neighborhood_searches; names, kwargs...) + neighborhood_searches; names, parallelization_backend, kwargs...) +end + +""" + run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; kwargs...) + +Shortcut to call [`run_benchmark`](@ref) with a `GridNeighborhoodSearch` with a +`FullGridCellList`. This is the neighborhood search implementation that is used +in TrixiParticles.jl when performance is important. +Use this function to benchmark and profile TrixiParticles.jl kernels. + +# Arguments +- `benchmark`: The benchmark function. See [`benchmark_count_neighbors`](@ref), + [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref), + [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref). +- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number + of points. For example, use `(100, 100)` for a 2D benchmark or + `(10, 10, 10)` for a 3D benchmark. +- `iterations`: Number of refinement iterations + +# Keywords +See [`run_benchmark`](@ref) for a list of available keywords. + +# Examples +```julia +include("benchmarks/benchmarks.jl") + +run_benchmark_full_grid(benchmark_n_body, (10, 10), 3) +``` +""" +function run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) + NDIMS = length(n_points_per_dimension) + + min_corner = 0.0f0 .* n_points_per_dimension + max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + neighborhood_searches = [grid_nhs] + + names = ["GridNeighborhoodSearch with FullGridCellList";;] + + run_benchmark(benchmark, n_points_per_dimension, iterations, + neighborhood_searches; names, parallelization_backend, kwargs...) end """ diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 7286baca..88e38508 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -1,4 +1,5 @@ using PointNeighbors +using PointNeighbors.Adapt using TrixiParticles using BenchmarkTools @@ -43,47 +44,30 @@ This method is used to simulate an incompressible fluid. """ function benchmark_wcsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - density = 1000.0 - particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1, particle_spacing) - - sound_speed = 10.0 - state_equation = StateEquationCole(; sound_speed, reference_density = density, - exponent = 1) - - viscosity = ArtificialViscosityMonaghan(alpha = 0.02, beta = 0.0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end - -""" - benchmark_wcsph_fp32(neighborhood_search, coordinates; - parallelization_backend = default_backend(coordinates)) - -Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers. -""" -function benchmark_wcsph_fp32(neighborhood_search, coordinates_; - parallelization_backend = default_backend(coordinates_)) - coordinates = convert(Matrix{Float32}, coordinates_) - density = 1000.0f0 + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + density = convert(ELTYPE, 1000.0) particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1f0, particle_spacing) + fluid = InitialCondition(; coordinates = coordinates_cpu, density, + mass = convert(ELTYPE, 0.1) * particle_spacing, + particle_spacing) - sound_speed = 10.0f0 + # Make sure that the computed forces are not all zero + for i in eachindex(fluid.density) + fluid.density[i] += rand(eltype(fluid.density)) + end + + sound_speed = convert(ELTYPE, 10.0) state_equation = StateEquationCole(; sound_speed, reference_density = density, exponent = 1) - viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0) + viscosity = ArtificialViscosityMonaghan(alpha = convert(ELTYPE, 0.02), + beta = convert(ELTYPE, 0.0)) + density_diffusion = DensityDiffusionMolteniColagrossi(delta = convert(ELTYPE, 0.1)) - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end - -function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_equation, - viscosity, density_diffusion, parallelization_backend) # Compact support == 2 * smoothing length for these kernels smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2 if ndims(neighborhood_search) == 1 @@ -92,23 +76,21 @@ function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_e smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end - fluid_system = WeaklyCompressibleSPHSystem(initial_condition, ContinuityDensity(), + fluid_system = WeaklyCompressibleSPHSystem(fluid, ContinuityDensity(), state_equation, smoothing_kernel, smoothing_length, viscosity = viscosity, density_diffusion = density_diffusion) - system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system) + system = Adapt.adapt(parallelization_backend, fluid_system) # Remove unnecessary data structures that are only used for initialization - neighborhood_search_ = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) - nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search_) semi = DummySemidiscretization(nhs, parallelization_backend, true) - v = PointNeighbors.Adapt.adapt(parallelization_backend, - vcat(initial_condition.velocity, - initial_condition.density')) - u = PointNeighbors.Adapt.adapt(parallelization_backend, initial_condition.coordinates) + v = Adapt.adapt(parallelization_backend, + vcat(fluid.velocity, fluid.density')) + u = Adapt.adapt(parallelization_backend, fluid.coordinates) dv = zero(v) # Initialize the system @@ -128,8 +110,15 @@ This method is used to simulate an elastic structure. """ function benchmark_tlsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - material = (density = 1000.0, E = 1.4e6, nu = 0.4) - solid = InitialCondition(; coordinates, density = material.density, mass = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) + + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + material = (density = convert(ELTYPE, 1000.0), E = convert(ELTYPE, 1.4e6), + nu = convert(ELTYPE, 0.4)) + solid = InitialCondition(; coordinates = coordinates_cpu, + density = material.density, mass = convert(ELTYPE, 0.1)) # Compact support == 2 * smoothing length for these kernels smoothing_length_ = PointNeighbors.search_radius(neighborhood_search) / 2 @@ -142,15 +131,20 @@ function benchmark_tlsph(neighborhood_search, coordinates; solid_system = TotalLagrangianSPHSystem(solid, smoothing_kernel, smoothing_length, material.E, material.nu) - semi = DummySemidiscretization(neighborhood_search, parallelization_backend, true) + system_ = Adapt.adapt(parallelization_backend, solid_system) - v = copy(solid.velocity) - u = copy(solid.coordinates) + # Remove unnecessary data structures that are only used for initialization + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + system = TrixiParticles.@set system_.self_interaction_nhs = nhs + + semi = DummySemidiscretization(nhs, parallelization_backend, true) + + v = Adapt.adapt(parallelization_backend, copy(solid.velocity)) + u = Adapt.adapt(parallelization_backend, copy(solid.coordinates)) dv = zero(v) # Initialize the system - TrixiParticles.initialize!(solid_system, semi) + TrixiParticles.initialize!(system, semi) - return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, - $solid_system, $solid_system, $semi) + return @belapsed TrixiParticles.interact_structure_structure2!($dv, $v, $system, $semi) end diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 702c19e4..58b38687 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -20,7 +20,8 @@ include("nhs_grid.jl") include("nhs_precomputed.jl") include("gpu.jl") -export foreach_point_neighbor, foreach_neighbor +export foreach_point_neighbor, foreach_point_neighbor_unsafe, + foreach_neighbor, foreach_neighbor_unsafe export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch export DictionaryCellList, FullGridCellList, SpatialHashingCellList export DynamicVectorOfVectors diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 5e99b9cd..cf02774e 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -178,7 +178,7 @@ Note that `system_coords` and `neighbor_coords` can be identical. automatically based on the type of `system_coords`. See [`@threaded`](@ref) for a list of available backends. -See also [`initialize!`](@ref), [`update!`](@ref). +See also [`foreach_point_neighbor_unsafe`](@ref), [`initialize!`](@ref), [`update!`](@ref). """ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search; parallelization_backend::ParallelizationBackend = default_backend(system_coords), @@ -200,41 +200,143 @@ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborho return nothing end -@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, +""" + foreach_point_neighbor_unsafe(f, system_coords, neighbor_coords, neighborhood_search; + parallelization_backend = default_backend(system_coords), + points = axes(system_coords, 2)) + +Like [`foreach_point_neighbor`](@ref), but skips **all** bounds checks inside the +threaded loop or GPU kernel. +See [`foreach_neighbor_unsafe`](@ref) for more details on which bounds checks are skipped. + +!!! warning + The `neighborhood_search` must have been initialized or updated with `system_coords` + as first coordinate array and `neighbor_coords` as second coordinate array. + This can be skipped for certain implementations. See [`requires_update`](@ref). + +!!! warning + Use this only when `neighborhood_search` is known to be initialized correctly for + `system_coords` and `neighbor_coords`. +""" +function foreach_point_neighbor_unsafe(f::T, system_coords, neighbor_coords, + neighborhood_search; + parallelization_backend::ParallelizationBackend = default_backend(system_coords), + points = axes(system_coords, 2)) where {T} + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) + + @threaded parallelization_backend for point in points + foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search, point) + end + + return nothing +end + +""" + foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, point; + search_radius = search_radius(neighborhood_search)) + +Loop over all neighbors of `point` and execute `f(i, j, pos_diff, d)` for every +neighbor within `search_radius`, where `i` is `point`, `j` is the neighbor index, +`pos_diff` is the vector from neighbor to point, and `d` is the distance. + +This method performs bounds checks, even when called with `@inbounds`. +`@inbounds` only skips the bounds check for loading the coordinates of `point` +from `system_coords`. +See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks. +""" +@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point; search_radius = search_radius(neighborhood_search)) # Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. # We have to use `@propagate_inbounds`, which will also remove boundschecks - # in the neighbor loop, which is not safe (see comment below). + # in the neighbor loop, which is not safe (that's what `foreach_neighbor_unsafe` is for). # To avoid this, we have to use a function barrier to disable the `@inbounds` again. point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) - foreach_neighbor(f, neighbor_system_coords, neighborhood_search, + foreach_neighbor(f, neighbor_coords, neighborhood_search, point, point_coords, search_radius) end +# This is a function barrier to prevent the `@inbounds` in `foreach_neighbor` +# from propagating into the neighbor loop, which is not safe. +@inline function foreach_neighbor(f, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point, point_coords, search_radius) + foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, + point, point_coords, search_radius) +end + +""" + foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, point; + search_radius = search_radius(neighborhood_search)) + +Like [`foreach_neighbor`](@ref), but skips **all** bounds checks. + +`foreach_neighbor` performs the following bounds checks that are skipped here: +- Check if `point` is in bounds of `system_coords`. This is the only bounds check + that is skipped when calling `foreach_neighbor` with `@inbounds`, and the only one that + is safe to skip when `point` is guaranteed to be in bounds of `system_coords`. +- Check if the neighbors of `point` are in bounds of `neighbor_coords`. + This is not safe to skip when the neighborhood search was not initialized correctly; + for example when initialized with coordinate arrays of different sizes than the ones + passed to this function or when the internal data structures have been tampered with. + In this case, the cell lists (for [`GridNeighborhoodSearch`](@ref)) or neighbor lists + (for [`PrecomputedNeighborhoodSearch`](@ref)) might contain indices that are out of + bounds for `neighbor_coords`. +- With `GridNeighborhoodSearch` and [`FullGridCellList`](@ref), check if the neighboring + cells are in bounds of the grid. Again, this is not safe to skip when the neighborhood + search might not have been initialized correctly. +- With `PrecomputedNeighborhoodSearch`, check if `point` is in bounds of the neighbor lists. + Again, this is not safe to skip when the neighborhood search was not initialized correctly. + +Note that all these bounds checks are safe to skip if +- `point` is guaranteed to be in bounds of `system_coords`, +- the neighborhood search was initialized correctly with `system_coords` + and `neighbor_coords` and has not been tampered with since then. + +!!! warning + Use this only when `point` is known to be in bounds of `system_coords` + and when `neighborhood_search` is known to be initialized correctly for + `system_coords` and `neighbor_coords`. +""" +@inline function foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point; + search_radius = search_radius(neighborhood_search)) + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + point) + + @inbounds foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, + point, point_coords, search_radius) +end + # This is the generic function that is called for `TrivialNeighborhoodSearch`. # For `GridNeighborhoodSearch`, a specialized function is used for slightly better # performance. `PrecomputedNeighborhoodSearch` can skip the distance check altogether. -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::AbstractNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box) = neighborhood_search for neighbor in eachneighbor(point_coords, neighborhood_search) - # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. - # But we don't know if `neighbor` (extracted from the cell list) is in bounds. - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, search_radius, - periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) if distance2 <= search_radius^2 distance = sqrt(distance2) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 21759ab2..8e3c1ff2 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -508,14 +508,19 @@ end # Specialized version of the function in `neighborhood_search.jl`, which is faster # than looping over `eachneighbor`. -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::GridNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius) (; cell_list, periodic_box) = neighborhood_search cell = cell_coords(point_coords, neighborhood_search) for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) neighbor_cell = Tuple(neighbor_cell_) + + # Making the following `@inbounds` is not safe because we don't know if + # `neighbor_cell` is in bounds of the grid. neighbors = points_in_cell(neighbor_cell, neighborhood_search) # Boolean to indicate if this cell has a collision (only with `SpatialHashingCellList`) @@ -525,17 +530,19 @@ end for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. - # But we don't know if `neighbor` (extracted from the cell list) is in bounds. - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + # Making the following `@inbounds` is not safe because we don't know + # if `neighbor` (extracted from the cell list) is in bounds. + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), + neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, - search_radius, periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, + search_radius, periodic_box) if distance2 <= search_radius^2 distance = sqrt(distance2) @@ -543,7 +550,7 @@ end # If this cell has a collision, check if this point belongs to this cell # (only with `SpatialHashingCellList`). if cell_collision && - check_collision(neighbor_cell_, neighbor_coords, cell_list, + check_collision(neighbor_cell_, neighbor_point_coords, cell_list, neighborhood_search) continue end diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 187ff0c5..c7ddceca 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -200,29 +200,32 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, end end -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::PrecomputedNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::PrecomputedNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box, neighbor_lists) = neighborhood_search - neighbors = @inbounds neighbor_lists[point] + # Making the following `@inbounds` is not safe because the neighbor list + # might not contain `point` if the NHS was not initialized correctly. + neighbors = neighbor_lists[point] for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making this `@inbounds` is not perfectly safe because + # Making this `@inbounds` is not safe because # `neighbor` (extracted from the neighbor list) is only guaranteed to be in bounds # if the neighbor lists were constructed correctly and have not been corrupted. - # However, adding this `@inbounds` yields a ~20% speedup for TLSPH on GPUs (A4500). - neighbor_coords = @inbounds extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), - neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, search_radius, - periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) distance = sqrt(distance2) diff --git a/test/cell_lists/spatial_hashing.jl b/test/cell_lists/spatial_hashing.jl index 28d13b12..6a8e8104 100644 --- a/test/cell_lists/spatial_hashing.jl +++ b/test/cell_lists/spatial_hashing.jl @@ -71,9 +71,11 @@ @testset "Cell Coordinates Hash Function" begin # 1D coordinates - @test coordinates_flattened([1]) == UInt128(reinterpret(UInt32, Int32(1))) - @test coordinates_flattened([-1]) == UInt128(reinterpret(UInt32, Int32(-1))) - @test coordinates_flattened([0]) == Int128(0) + @test PointNeighbors.coordinates_flattened([1]) == + UInt128(reinterpret(UInt32, Int32(1))) + @test PointNeighbors.coordinates_flattened([-1]) == + UInt128(reinterpret(UInt32, Int32(-1))) + @test PointNeighbors.coordinates_flattened([0]) == Int128(0) # 2D coordinates coord2 = [-1, 1] @@ -82,18 +84,20 @@ # The second coordinate gives 1 shifted by 32 bits, so 1 * 2^32. expected2 = UInt128(2^32 - 1 + 2^32) - @test coordinates_flattened(coord2) == expected2 + @test PointNeighbors.coordinates_flattened(coord2) == expected2 # 3D coordinates coord3 = [1, 0, -1] expected3 = UInt128(1 + 0 * 2^32 + (2^32 - 1) * Int128(2)^64) - @test coordinates_flattened(coord3) == expected3 + @test PointNeighbors.coordinates_flattened(coord3) == expected3 # Extreme Int32 bounds max_val = Int32(typemax(Int32)) min_val = Int32(typemin(Int32)) - @test coordinates_flattened((max_val,)) == UInt128(reinterpret(UInt32, max_val)) - @test coordinates_flattened((min_val,)) == UInt128(reinterpret(UInt32, min_val)) + @test PointNeighbors.coordinates_flattened((max_val,)) == + UInt128(reinterpret(UInt32, max_val)) + @test PointNeighbors.coordinates_flattened((min_val,)) == + UInt128(reinterpret(UInt32, min_val)) # 3D extreme Int32 bounds coord4 = [min_val, max_val, Int32(0)] @@ -102,19 +106,19 @@ # `typemax(Int32)` gives the unsigned value 2^31 - 1. expected4 = UInt128(2^31 + (2^31 - 1) * 2^32) - @test coordinates_flattened(coord4) == expected4 + @test PointNeighbors.coordinates_flattened(coord4) == expected4 # Passing non-Int32-coercible should error large_val = typemax(Int32) + 1 - @test_throws InexactError coordinates_flattened([large_val]) + @test_throws InexactError PointNeighbors.coordinates_flattened([large_val]) small_val = typemin(Int32) - 1 - @test_throws InexactError coordinates_flattened([small_val]) + @test_throws InexactError PointNeighbors.coordinates_flattened([small_val]) - @test_throws InexactError coordinates_flattened([Inf]) - @test_throws InexactError coordinates_flattened([NaN]) + @test_throws InexactError PointNeighbors.coordinates_flattened([Inf]) + @test_throws InexactError PointNeighbors.coordinates_flattened([NaN]) # Too many dimensions should throw assertion error - @test_throws AssertionError coordinates_flattened([1, 2, 3, 4]) + @test_throws AssertionError PointNeighbors.coordinates_flattened([1, 2, 3, 4]) end end diff --git a/test/neighborhood_search.jl b/test/neighborhood_search.jl index 65d3ed85..ed1a80dd 100644 --- a/test/neighborhood_search.jl +++ b/test/neighborhood_search.jl @@ -142,14 +142,14 @@ "($(seed == 1 ? "`initialize!`" : "`update!`"))" @testset verbose=true "$(name(cloud_size, seed)))" for cloud_size in cloud_sizes, seed in seeds - coords = point_cloud(cloud_size, seed = seed) + search_radius = 2.5 + coords = point_cloud(cloud_size, search_radius, seed = seed) NDIMS = length(cloud_size) n_points = size(coords, 2) - search_radius = 2.5 # Use different coordinates for `initialize!` and then `update!` with the # correct coordinates to make sure that `update!` is working as well. - coords_initialize = point_cloud(cloud_size, seed = 1) + coords_initialize = point_cloud(cloud_size, search_radius, seed = 1) # Compute expected neighbor lists by brute-force looping over all points # as potential neighbors (`TrivialNeighborhoodSearch`). @@ -254,7 +254,7 @@ names_copied = [name * " copied" for name in names] append!(names, names_copied) - @testset "$(names[i])" for i in eachindex(names) + @testset verbose=true "$(names[i])" for i in eachindex(names) nhs = neighborhood_searches[i] # Initialize with `seed = 1` @@ -267,17 +267,60 @@ update!(nhs, coords, coords) end - neighbors = [Int[] for _ in axes(coords, 2)] + # Test the regular `foreach_point_neighbor` + @testset "`foreach_point_neighbor`" begin + neighbors = [Int[] for _ in axes(coords, 2)] + foreach_point_neighbor(coords, coords, nhs, + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance + append!(neighbors[point], neighbor) + end + + @test sort.(neighbors) == neighbors_expected + end - foreach_point_neighbor(coords, coords, nhs, - parallelization_backend = SerialBackend()) do point, - neighbor, - pos_diff, - distance - append!(neighbors[point], neighbor) + # Test manual loop with `foreach_neighbor` + @testset "Manual Loop with `foreach_neighbor`" begin + neighbors_manual = [Int[] for _ in axes(coords, 2)] + for point in axes(coords, 2) + foreach_neighbor(coords, coords, nhs, + point) do point, neighbor, pos_diff, distance + append!(neighbors_manual[point], neighbor) + end + end + + @test sort.(neighbors_manual) == neighbors_expected + end + + # Repeat with foreach_point_neighbor_unsafe + @testset "`foreach_point_neighbor_unsafe`" begin + neighbors_unsafe = [Int[] for _ in axes(coords, 2)] + foreach_point_neighbor_unsafe(coords, coords, nhs, + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance + append!(neighbors_unsafe[point], neighbor) + end + + @test sort.(neighbors_unsafe) == neighbors_expected end - @test sort.(neighbors) == neighbors_expected + # Repeat with manual loop with `foreach_neighbor_unsafe` + @testset "Manual Loop with `foreach_neighbor_unsafe`" begin + neighbors_manual_unsafe = [Int[] for _ in axes(coords, 2)] + for point in axes(coords, 2) + foreach_neighbor_unsafe(coords, coords, nhs, + point) do point, neighbor, + pos_diff, distance + append!(neighbors_manual_unsafe[point], neighbor) + end + end + + @test sort.(neighbors_manual_unsafe) == neighbors_expected + end end end end diff --git a/test/point_cloud.jl b/test/point_cloud.jl index ba99b2e1..64c41bec 100644 --- a/test/point_cloud.jl +++ b/test/point_cloud.jl @@ -1,8 +1,9 @@ using Random # Generate a rectangular point cloud, optionally with a perturbation in the point positions -function point_cloud(n_points_per_dimension; - seed = 1, perturbation_factor_position = 1.0) +function point_cloud(n_points_per_dimension, search_radius; + seed = 1, perturbation_factor_position = 1.0, + sort = true, shuffle = false) # Fixed seed to ensure reproducibility Random.seed!(seed) @@ -10,8 +11,15 @@ function point_cloud(n_points_per_dimension; coordinates = Array{Float64}(undef, n_dims, prod(n_points_per_dimension)) cartesian_indices = CartesianIndices(n_points_per_dimension) + # Extra data structures for the sorting code below + cell_coords = Vector{SVector{n_dims, Int}}(undef, size(coordinates, 2)) + cell_size = ntuple(dim -> search_radius, n_dims) + for i in axes(coordinates, 2) - coordinates[:, i] .= Tuple(cartesian_indices[i]) + point_coords = SVector(Tuple(cartesian_indices[i])) + coordinates[:, i] .= point_coords + cell_coords[i] = PointNeighbors.cell_coords(point_coords, nothing, nothing, + cell_size) .+ 1 end # A standard deviation of 0.05 in the particle coordinates @@ -21,6 +29,23 @@ function point_cloud(n_points_per_dimension; # The benchmark results are also consistent with the timer output of the simulation. perturb!(coordinates, perturbation_factor_position * 0.05) + # Sort by linear cell index + if sort + if shuffle + throw(ArgumentError("cannot sort and shuffle at the same time")) + end + + # Sort by Z index (with `using Morton`) + # permutation = sortperm(cell_coords, by = c -> cartesian2morton(c)) + + permutation = sortperm(cell_coords) + coordinates .= coordinates[:, permutation] + elseif shuffle + # Sort randomly + permutation = shuffle(axes(coordinates, 2)) + coordinates .= coordinates[:, permutation] + end + return coordinates end diff --git a/test/unittest.jl b/test/unittest.jl index f0981614..262b1753 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -7,4 +7,5 @@ include("nhs_precomputed.jl") include("neighborhood_search.jl") include("cell_lists/full_grid.jl") + include("cell_lists/spatial_hashing.jl") end;