diff --git a/Project.toml b/Project.toml index 2b0ea58c92..b2bbe0b3b8 100644 --- a/Project.toml +++ b/Project.toml @@ -26,6 +26,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" @@ -37,14 +38,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [weakdeps] OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"] +TrixiParticlesCUDAExt = "CUDA" [compat] Accessors = "0.1.43" Adapt = "4" CSV = "0.10" +CUDA = "5.9.1" DataFrames = "1.6" DelimitedFiles = "1" DiffEqCallbacks = "4" @@ -62,6 +66,7 @@ Polyester = "0.7.10" ReadVTK = "0.2" RecipesBase = "1" Reexport = "1" +SIMD = "3.7.2" SciMLBase = "2" StaticArrays = "1" Statistics = "1" diff --git a/docs/literate/src/tut_custom_kernel.jl b/docs/literate/src/tut_custom_kernel.jl index 25a3bed683..053f0c74a0 100644 --- a/docs/literate/src/tut_custom_kernel.jl +++ b/docs/literate/src/tut_custom_kernel.jl @@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end # By looking at the implementation of existing kernels in TrixiParticles.jl, # we can see that a kernel implementation requires three functions. -# `TrixiParticles.kernel`, which is the kernel function itself, -# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function, -# and `TrixiParticles.compact_support`, which defines the compact support of the -# kernel in relation to the smoothing length. +# `TrixiParticles.kernel_unsafe`, which is the kernel function itself, +# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the +# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines +# the compact support of the kernel in relation to the smoothing length. # The latter is relevant for determining the search radius of the neighborhood search. -function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h) +# +# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since +# this avoids an extra division in the hot loop and is robust near ``r=0``. +# The public function `TrixiParticles.kernel_deriv` is defined automatically by +# TrixiParticles from this method (and multiplies by ``r`` again when needed). +# In the unsafe functions, we do not check the compact support; this is handled in the +# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the +# compact support defined in `TrixiParticles.compact_support`. +function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h) q = r / h - if q < 2 - return 1 / (pi * h^2) * exp(-q^2) - end - - return 0.0 + return 1 / (pi * h^2) * exp(-q^2) end -function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h) +function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h) q = r / h - if q < 2 - return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h - end - - return 0.0 + kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h + return kernel_deriv / r end TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h diff --git a/ext/TrixiParticlesCUDAExt.jl b/ext/TrixiParticlesCUDAExt.jl new file mode 100644 index 0000000000..c747bde3ef --- /dev/null +++ b/ext/TrixiParticlesCUDAExt.jl @@ -0,0 +1,32 @@ +module TrixiParticlesCUDAExt + +using CUDA: CUDA +using TrixiParticles: TrixiParticles + +# Use faster version of `div_fast` for `Float64` on CUDA. +# By default, `div_fast` translates to `Base.FastMath.div_fast`, but there is +# no fast division for `Float64` on CUDA, so we need to redefine it here to use the +# improved fast reciprocal defined below. +CUDA.@device_override TrixiParticles.div_fast(x, y::Float64) = x * fast_inv_cuda(y) + +# Improved fast reciprocal for `Float64` by @Mikolaj-A-Kowalski, which is significantly +# more accurate than just calling "llvm.nvvm.rcp.approx.ftz.d" without the cubic iteration, +# while still being much faster than a full division. +# This is copied from Oceananigans.jl, see https://github.com/CliMA/Oceananigans.jl/pull/5140. +@inline function fast_inv_cuda(a::Float64) + # Get the approximate reciprocal + # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 + # This instruction chops off last 32bits of mantissa and computes inverse + # while treating all subnormal numbers as 0.0 + # If reciprocal would be subnormal, underflows to 0.0 + # 32 least significant bits of the result are filled with 0s + inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a) + + # Approximate the missing 32bits of mantissa with a single cubic iteration + e = fma(inv_a, -a, 1.0) + e = fma(e, e, e) + inv_a = fma(e, inv_a, inv_a) + return inv_a +end + +end # module diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index faa1fd2956..ea3ee48bd3 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -15,7 +15,7 @@ using FileIO: FileIO using ForwardDiff: ForwardDiff using GPUArraysCore: AbstractGPUArray using JSON: JSON -using KernelAbstractions: KernelAbstractions, @kernel, @index +using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize using LinearAlgebra: norm, normalize, cross, dot, I, tr, inv, pinv, det using MuladdMacro: @muladd using Polyester: Polyester, @batch @@ -26,6 +26,7 @@ using Random: seed! using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!, get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!, add_tstop! +using SIMD: vloada, Vec @reexport using StaticArrays: SVector using StaticArrays: @SMatrix, SMatrix, setindex using Statistics: Statistics diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index 10ccb6261f..eeceaaac63 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -130,10 +130,30 @@ end return kernel(smoothing_kernel, distance, smoothing_length(system, particle)) end +@inline function skip_zero_distance(system, distance, almostzero) + return skip_zero_distance(system_correction(system), system, distance, almostzero) +end + +# Robust/safe version of the function below. In performance-critical code, manually check +# the kernel support, call `skip_zero_distance` and then `smoothing_kernel_grad_unsafe`. @inline function smoothing_kernel_grad(system, pos_diff, distance, particle) - return corrected_kernel_grad(system_smoothing_kernel(system), pos_diff, - distance, smoothing_length(system, particle), - system_correction(system), system, particle) + h = smoothing_length(system, particle) + compact_support_ = compact_support(system_smoothing_kernel(system), h) + + # Note that `sqrt(eps(h^2)) != eps(h)` + if distance > compact_support_ || skip_zero_distance(system, distance, sqrt(eps(h^2))) + return zero(pos_diff) + end + + return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) +end + +# Skip the zero distance and compact support checks for maximum performance. +# Only call this when you are sure that `0 < distance < compact_support`. +@inline function smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) + return corrected_kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff, + distance, smoothing_length(system, particle), + system_correction(system), system, particle) end # System updates do nothing by default, but can be dispatched if needed diff --git a/src/general/corrections.jl b/src/general/corrections.jl index 292523feb8..e7422916b1 100644 --- a/src/general/corrections.jl +++ b/src/general/corrections.jl @@ -35,15 +35,16 @@ end # `rho_mean` is the mean density of the fluid, which is used to determine correction values near the free surface. # Return a tuple `(viscosity_correction, pressure_correction, surface_tension_correction)` representing the correction terms. @inline function free_surface_correction(correction::AkinciFreeSurfaceCorrection, - particle_system, rho_mean) + particle_system, rho_a, rho_b) # Equation 4 in ref + rho_mean = (rho_a + rho_b) / 2 k = correction.rho0 / rho_mean # Viscosity, pressure, surface_tension return k, 1, k end -@inline function free_surface_correction(correction, particle_system, rho_mean) +@inline function free_surface_correction(correction, particle_system, rho_a, rho_b) return 1, 1, 1 end @@ -337,31 +338,36 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system, v_neighbor_system = wrap_v(v_ode, neighbor_system, semi) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + almostzero = sqrt(eps(compact_support(system, neighbor_system)^2)) foreach_point_neighbor(system, neighbor_system, coordinates, neighbor_coords, semi) do particle, neighbor, pos_diff, distance - volume = hydrodynamic_mass(neighbor_system, neighbor) / - current_density(v_neighbor_system, neighbor_system, neighbor) - smoothing_length_ = smoothing_length(system, particle) - - function compute_grad_kernel(correction, smoothing_kernel, pos_diff, distance, - smoothing_length_, system, particle) - return smoothing_kernel_grad(system, pos_diff, distance, particle) + function kernel_grad_local(correction, smoothing_kernel, pos_diff, distance, + smoothing_length_, system, particle) + return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle) end # Compute gradient of corrected kernel - function compute_grad_kernel(correction::MixedKernelGradientCorrection, - smoothing_kernel, pos_diff, distance, - smoothing_length_, system, particle) - return corrected_kernel_grad(smoothing_kernel, pos_diff, distance, - smoothing_length_, KernelCorrection(), system, - particle) + function kernel_grad_local(correction::MixedKernelGradientCorrection, + smoothing_kernel, pos_diff, distance, + smoothing_length_, system, particle) + return corrected_kernel_grad_unsafe(smoothing_kernel, pos_diff, distance, + smoothing_length_, KernelCorrection(), + system, particle) end - grad_kernel = compute_grad_kernel(correction, smoothing_kernel, pos_diff, - distance, smoothing_length_, system, particle) + # Skip neighbors with the same position if the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(correction, system, distance, almostzero) && return - iszero(grad_kernel) && return + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + smoothing_length_ = smoothing_length(system, particle) + grad_kernel = kernel_grad_local(correction, smoothing_kernel, pos_diff, + distance, smoothing_length_, system, particle) + + volume = hydrodynamic_mass(neighbor_system, neighbor) / + current_density(v_neighbor_system, neighbor_system, neighbor) L = volume * grad_kernel * pos_diff' diff --git a/src/general/interpolation.jl b/src/general/interpolation.jl index 00eff1581b..58b171d0f7 100644 --- a/src/general/interpolation.jl +++ b/src/general/interpolation.jl @@ -593,7 +593,9 @@ end other_density[point] += m_b * W_ab if include_wall_velocity - velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor) + velocity_neighbor_ = current_velocity(v, neighbor_system, neighbor) + velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor, + velocity_neighbor_) for i in axes(velocity_neighbor, 1) cache.velocity[i, point] += velocity_neighbor[i] * volume_b * W_ab end diff --git a/src/general/smoothing_kernels.jl b/src/general/smoothing_kernels.jl index 4a97564648..3c6be90212 100644 --- a/src/general/smoothing_kernels.jl +++ b/src/general/smoothing_kernels.jl @@ -9,44 +9,87 @@ abstract type AbstractSmoothingKernel{NDIMS} end # `distance^2` is in the order of `h^2`, hence the comparison `distance^2 < eps(h^2)`. # Note that this is faster than `distance < sqrt(eps(h^2))`. # Also note that `sqrt(eps(h^2)) != eps(h)`. - distance^2 < eps(h^2) && return zero(pos_diff) + compact_support_ = compact_support(kernel, h) + nonzero = distance < compact_support_ && distance^2 > eps(h^2) + nonzero || return zero(pos_diff) - return kernel_deriv(kernel, distance, h) / distance * pos_diff + # Now we can use `kernel_grad_unsafe` without worrying about division by zero + return kernel_grad_unsafe(kernel, pos_diff, distance, h) end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, correction, system, - particle) - return kernel_grad(kernel, pos_diff, distance, h) +# Skip the zero distance and compact support checks for maximum performance. +# Only call this when you are sure that `distance` is not zero and within the compact support. +@inline function kernel_grad_unsafe(kernel, pos_diff, distance, h) + # Call an optimized version of the kernel derivative, skipping redundant checks. + # Since this is used as `kernel_deriv / distance * pos_diff`, we already divide by `r` + # (or rather skip a multiplication by `r`) inside `kernel_deriv_div_r_unsafe` + # to save expensive divisions. + return kernel_deriv_div_r_unsafe(kernel, distance, h) * pos_diff end -@inline function corrected_kernel_grad(kernel_, pos_diff, distance, h, ::KernelCorrection, - system, particle) +@inline function kernel(kernel, r::Real, h) + # Zero out result if outside of compact support + compact_support_ = compact_support(kernel, h) + return ifelse(r < compact_support_, kernel_unsafe(kernel, r, h), zero(r)) +end + +@inline function kernel_deriv(kernel, r::Real, h) + # Zero out result if outside of compact support or if `r` is almost zero + # (to avoid division by zero in the unsafe version). + compact_support_ = compact_support(kernel, h) + if r < compact_support_ && r^2 > eps(h^2) + # The unsafe version returns the kernel derivative divided by `r`, + # so we multiply it by `r` to get the actual derivative. + return kernel_deriv_div_r_unsafe(kernel, r, h) * r + end + + return zero(r) +end + +@inline function skip_zero_distance(correction, system, distance, almostzero) + return distance < almostzero +end + +@inline function skip_zero_distance(::Union{KernelCorrection, + MixedKernelGradientCorrection}, + system, distance, almostzero) + # For these corrections, the kernel gradient between a particle and itself is not zero + return false +end + +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, correction, + system, particle) + return kernel_grad_unsafe(kernel, pos_diff, distance, h) +end + +@inline function corrected_kernel_grad_unsafe(kernel_, pos_diff, distance, h, + ::KernelCorrection, system, particle) return (kernel_grad(kernel_, pos_diff, distance, h) .- kernel(kernel_, distance, h) * dw_gamma(system, particle)) / kernel_correction_coefficient(system, particle) end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, - corr::BlendedGradientCorrection, system, - particle) +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, + corr::BlendedGradientCorrection, system, + particle) (; blending_factor) = corr - grad = kernel_grad(kernel, pos_diff, distance, h) + grad = kernel_grad_unsafe(kernel, pos_diff, distance, h) return (1 - blending_factor) * grad + blending_factor * correction_matrix(system, particle) * grad end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, - ::GradientCorrection, system, particle) - grad = kernel_grad(kernel, pos_diff, distance, h) +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, + ::GradientCorrection, system, particle) + grad = kernel_grad_unsafe(kernel, pos_diff, distance, h) return correction_matrix(system, particle) * grad end -@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, - ::MixedKernelGradientCorrection, system, - particle) - grad = corrected_kernel_grad(kernel, pos_diff, distance, h, KernelCorrection(), - system, particle) +@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, + ::MixedKernelGradientCorrection, system, + particle) + grad = corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, KernelCorrection(), + system, particle) return correction_matrix(system, particle) * grad end @@ -80,32 +123,40 @@ which is beneficial in regards to stability but makes it less accurate. """ struct GaussianKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@inline @fastmath function kernel(kernel::GaussianKernel, r::Real, h) - q = r / h +@inline function kernel_unsafe(kernel::GaussianKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) - # Zero out result if q >= 3 - result = ifelse(q < 3, normalization_factor(kernel, h) * exp(-q^2), zero(q)) - - return result + return normalization_factor(kernel, h_inv) * exp_q2 end -@inline @fastmath function kernel_deriv(kernel::GaussianKernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv - - # Zero out result if q >= 3 - result = ifelse(q < 3, - -2 * q * normalization_factor(kernel, h) * exp(-q^2) * inner_deriv, - zero(q)) +@inline function kernel_deriv_div_r_unsafe(kernel::GaussianKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) - return result + return -2 * normalization_factor(kernel, h_inv) * exp_q2 * h_inv^2 end @inline compact_support(::GaussianKernel, h) = 3 * h -@inline normalization_factor(::GaussianKernel{2}, h) = 1 / (pi * h^2) -# First convert `pi` to the type of `h` to preserve the type of `h` -@inline normalization_factor(::GaussianKernel{3}, h) = 1 / (oftype(h, pi)^(3 // 2) * h^3) +@inline function normalization_factor(::GaussianKernel{2}, h_inv) + return oftype(h_inv, 1 / pi) * h_inv^2 +end + +@inline function normalization_factor(::GaussianKernel{3}, h_inv) + # 1 / (pi^1.5 * h^3) + return oftype(h_inv, 0.17958712212516656) * h_inv^3 +end @doc raw""" SchoenbergCubicSplineKernel{NDIMS}() @@ -140,40 +191,40 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct SchoenbergCubicSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@muladd @inline function kernel(kernel::SchoenbergCubicSplineKernel, r::Real, h) - q = r / h +@inline function kernel_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl. result = 1 * (2 - q)^3 / 4 - result = result - (q < 1) * (1 - q)^3 - - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(result)) + @muladd result = result - (q < 1) * (1 - q)^3 - return result + return normalization_factor(kernel, h_inv) * result end -@muladd @inline function kernel_deriv(kernel::SchoenbergCubicSplineKernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = -3 * (2 - q)^2 / 4 - result = result + 3 * (q < 1) * (1 - q)^2 + result = -3 * (2 - q)^2 / 4 + 3 * (q < 1) * (1 - q)^2 + kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result * inner_deriv, - zero(result)) - - return result + return div_fast(kernel_deriv, r) end @inline compact_support(::SchoenbergCubicSplineKernel, h) = 2 * h -# Note that `2 // 3 / h` saves one instruction but is significantly slower on GPUs (for now) -@inline normalization_factor(::SchoenbergCubicSplineKernel{1}, h) = 2 / (3 * h) -@inline normalization_factor(::SchoenbergCubicSplineKernel{2}, h) = 10 / (pi * h^2 * 7) -@inline normalization_factor(::SchoenbergCubicSplineKernel{3}, h) = 1 / (pi * h^3) +@inline function normalization_factor(::SchoenbergCubicSplineKernel{1}, h_inv) + return oftype(h_inv, 2 / 3) * h_inv +end + +@inline function normalization_factor(::SchoenbergCubicSplineKernel{2}, h_inv) + return oftype(h_inv, 10 / (pi * 7)) * h_inv^2 +end + +@inline function normalization_factor(::SchoenbergCubicSplineKernel{3}, h_inv) + return oftype(h_inv, 1 / pi) * h_inv^3 +end @doc raw""" SchoenbergQuarticSplineKernel{NDIMS}() @@ -217,54 +268,55 @@ struct SchoenbergQuarticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} en # to a power of three, see # https://github.com/JuliaLang/julia/blob/34934736fa4dcb30697ac1b23d11d5ad394d6a4d/base/intfuncs.jl#L327-L339 # By using the `@fastpow` macro, we are consciously trading off some precision in the result -# for enhanced computational speed. This is especially useful in scenarios where performance -# is a higher priority than exact precision. -@fastpow @muladd @inline function kernel(kernel::SchoenbergQuarticSplineKernel, r::Real, h) - q = r / h +# for enhanced computational speed by using simple multiplications for higher powers. +@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuarticSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Preserve the type of `q` - q5_2 = (5 // 2 - q) - q3_2 = (3 // 2 - q) - q1_2 = (1 // 2 - q) + # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason + q5_2 = 2.5f0 - q + q3_2 = 1.5f0 - q + q1_2 = 0.5f0 - q - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = q5_2^4 - result = result - 5 * (q < 3 // 2) * q3_2^4 - result = result + 10 * (q < 1 // 2) * q1_2^4 + # Note: `q < 3 // 2` is not working on GPUs. + # See https://github.com/JuliaGPU/CUDA.jl/issues/2681. + result = q5_2^4 - 5 * (q < 1.5f0) * q3_2^4 + 10 * (q < 0.5f0) * q1_2^4 - # Zero out result if q >= 5/2 - result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result, zero(result)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuarticSplineKernel, - r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuarticSplineKernel, + r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Preserve the type of `q` - q5_2 = 5 // 2 - q - q3_2 = 3 // 2 - q - q1_2 = 1 // 2 - q + # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason + q5_2 = 2.5f0 - q + q3_2 = 1.5f0 - q + q1_2 = 0.5f0 - q - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = -4 * q5_2^3 - result = result + 20 * (q < 3 // 2) * q3_2^3 - result = result - 40 * (q < 1 // 2) * q1_2^3 + # Note: `q < 3 // 2` is not working on GPUs. + # See https://github.com/JuliaGPU/CUDA.jl/issues/2681. + result = -4 * q5_2^3 + 20 * (q < 1.5f0) * q3_2^3 - 40 * (q < 0.5f0) * q1_2^3 - # Zero out result if q >= 5/2 - result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result * inner_deriv, - zero(result)) + kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv - return result + return div_fast(kernel_deriv, r) end -@inline compact_support(::SchoenbergQuarticSplineKernel, h) = 5 * h / 2 +@inline compact_support(::SchoenbergQuarticSplineKernel, h) = oftype(h, 5 / 2) * h -@inline normalization_factor(::SchoenbergQuarticSplineKernel{1}, h) = 1 / (24 * h) -@inline normalization_factor(::SchoenbergQuarticSplineKernel{2}, h) = 96 / (pi * h^2 * 1199) -@inline normalization_factor(::SchoenbergQuarticSplineKernel{3}, h) = 1 / (pi * h^3 * 20) +@inline function normalization_factor(::SchoenbergQuarticSplineKernel{1}, h_inv) + return oftype(h_inv, 1 / 24) * h_inv +end + +@inline function normalization_factor(::SchoenbergQuarticSplineKernel{2}, h_inv) + return oftype(h_inv, 96 / (pi * 1199)) * h_inv^2 +end + +@inline function normalization_factor(::SchoenbergQuarticSplineKernel{3}, h_inv) + return oftype(h_inv, 1 / (pi * 20)) * h_inv^3 +end @doc raw""" SchoenbergQuinticSplineKernel{NDIMS}() @@ -301,48 +353,40 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct SchoenbergQuinticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@fastpow @muladd @inline function kernel(kernel::SchoenbergQuinticSplineKernel, r::Real, h) - q = r / h - q3 = (3 - q) - q2 = (2 - q) - q1 = (1 - q) +@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuinticSplineKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = q3^5 - result = result - 6 * (q < 2) * q2^5 - result = result + 15 * (q < 1) * q1^5 + result = (3 - q)^5 - 6 * (q < 2) * (2 - q)^5 + 15 * (q < 1) * (1 - q)^5 - # Zero out result if q >= 3 - result = ifelse(q < 3, normalization_factor(kernel, h) * result, zero(result)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuinticSplineKernel, - r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv - q3 = (3 - q) - q2 = (2 - q) - q1 = (1 - q) +@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuinticSplineKernel, + r::Real, h) + h_inv = 1 / h + q = r * h_inv - # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl - result = -5 * q3^4 - result = result + 30 * (q < 2) * q2^4 - result = result - 75 * (q < 1) * q1^4 + result = -5 * (3 - q)^4 + 30 * (q < 2) * (2 - q)^4 - 75 * (q < 1) * (1 - q)^4 - # Zero out result if q >= 3 - result = ifelse(q < 3, normalization_factor(kernel, h) * result * inner_deriv, - zero(result)) + kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv - return result + return div_fast(kernel_deriv, r) end @inline compact_support(::SchoenbergQuinticSplineKernel, h) = 3 * h -@inline normalization_factor(::SchoenbergQuinticSplineKernel{1}, h) = 1 / (120 * h) -@inline normalization_factor(::SchoenbergQuinticSplineKernel{2}, h) = 7 / (pi * h^2 * 478) -@inline normalization_factor(::SchoenbergQuinticSplineKernel{3}, h) = 1 / (pi * h^3 * 120) +@inline function normalization_factor(::SchoenbergQuinticSplineKernel{1}, h_inv) + return oftype(h_inv, 1 / (120)) * h_inv +end + +@inline function normalization_factor(::SchoenbergQuinticSplineKernel{2}, h_inv) + return oftype(h_inv, 7 / (pi * 478)) * h_inv^2 +end + +@inline function normalization_factor(::SchoenbergQuinticSplineKernel{3}, h_inv) + return oftype(h_inv, 1 / (pi * 120)) * h_inv^3 +end abstract type AbstractWendlandKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end @@ -383,33 +427,26 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct WendlandC2Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end -@fastpow @inline function kernel(kernel::WendlandC2Kernel, r::Real, h) - q = r / h - - result = (1 - q / 2)^4 * (2 * q + 1) +@fastpow @inline function kernel_unsafe(kernel::WendlandC2Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * (1 - q / 2)^4 * (2 * q + 1) end -@inline function kernel_deriv(kernel::WendlandC2Kernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv - - result = -5 * (1 - q / 2)^3 * q +@inline function kernel_deriv_div_r_unsafe(kernel::WendlandC2Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 2 - result = ifelse(q < 2, - normalization_factor(kernel, h) * result * inner_deriv, zero(q)) - - return result + return -5 * normalization_factor(kernel, h_inv) * (1 - q / 2)^3 * h_inv^2 end -# Note that `7 // 4` saves one instruction but is significantly slower on GPUs (for now) -@inline normalization_factor(::WendlandC2Kernel{2}, h) = 7 / (pi * h^2 * 4) -@inline normalization_factor(::WendlandC2Kernel{3}, h) = 21 / (pi * h^3 * 16) +@inline function normalization_factor(::WendlandC2Kernel{2}, h_inv) + return oftype(h_inv, 7 / (4 * pi)) * h_inv^2 +end +@inline function normalization_factor(::WendlandC2Kernel{3}, h_inv) + return oftype(h_inv, 21 / (16 * pi)) * h_inv^3 +end @doc raw""" WendlandC4Kernel{NDIMS}() @@ -445,31 +482,30 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct WendlandC4Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end -@fastpow @inline function kernel(kernel::WendlandC4Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_unsafe(kernel::WendlandC4Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv result = (1 - q / 2)^6 * (35 * q^2 / 12 + 3 * q + 1) - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @inline function kernel_deriv(kernel::WendlandC4Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC4Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - derivative = -7 * q / 3 * (2 + 5 * q) * (1 - q / 2)^5 + result = oftype(r, -7 / 3) * (2 + 5 * q) * (1 - q / 2)^5 * h_inv^2 - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h, - zero(derivative)) - - return result + return normalization_factor(kernel, h_inv) * result end -@inline normalization_factor(::WendlandC4Kernel{2}, h) = 9 / (pi * h^2 * 4) -@inline normalization_factor(::WendlandC4Kernel{3}, h) = 495 / (pi * h^3 * 256) +@inline function normalization_factor(::WendlandC4Kernel{2}, h_inv) + return oftype(h_inv, 9 / (pi * 4)) * h_inv^2 +end +@inline function normalization_factor(::WendlandC4Kernel{3}, h_inv) + return oftype(h_inv, 495 / (pi * 256)) * h_inv^3 +end @doc raw""" WendlandC6Kernel{NDIMS}() @@ -505,31 +541,31 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct WendlandC6Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end -@fastpow @inline function kernel(kernel::WendlandC6Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_unsafe(kernel::WendlandC6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv result = (1 - q / 2)^8 * (4 * q^3 + 25 * q^2 / 4 + 4 * q + 1) - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * result end -@fastpow @muladd @inline function kernel_deriv(kernel::WendlandC6Kernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - derivative = -11 * q / 4 * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7 + result = oftype(r, -11 / 4) * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7 * h_inv^2 - # Zero out result if q >= 2 - result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h, - zero(derivative)) + return normalization_factor(kernel, h_inv) * result +end - return result +@inline function normalization_factor(::WendlandC6Kernel{2}, h_inv) + return oftype(h_inv, 39 / (pi * 14)) * h_inv^2 end -@inline normalization_factor(::WendlandC6Kernel{2}, h) = 39 / (pi * h^2 * 14) -@inline normalization_factor(::WendlandC6Kernel{3}, h) = 1365 / (pi * h^3 * 512) +@inline function normalization_factor(::WendlandC6Kernel{3}, h_inv) + return oftype(h_inv, 1365 / (pi * 512)) * h_inv^3 +end @doc raw""" Poly6Kernel{NDIMS}() @@ -541,7 +577,6 @@ especially in computer graphics contexts. It is defined as W(r, h) = \frac{1}{h^d} w(r/h) ``` - with ```math @@ -569,34 +604,31 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct Poly6Kernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@inline function kernel(kernel::Poly6Kernel, r::Real, h) - q = r / h - - result = (1 - q^2)^3 +@inline function kernel_unsafe(kernel::Poly6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # Zero out result if q >= 1 - result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q)) - - return result + return normalization_factor(kernel, h_inv) * (1 - q^2)^3 end -@inline function kernel_deriv(kernel::Poly6Kernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::Poly6Kernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - result = -6 * q * (1 - q^2)^2 + kernel_deriv = normalization_factor(kernel, h_inv) * -6 * q * (1 - q^2)^2 * h_inv - # Zero out result if q >= 1 - result = ifelse(q < 1, - result * normalization_factor(kernel, h) * inner_deriv, zero(q)) - return result + return div_fast(kernel_deriv, r) end @inline compact_support(::Poly6Kernel, h) = h -# Note that `315 // 64` saves one instruction but is significantly slower on GPUs (for now) -@inline normalization_factor(::Poly6Kernel{2}, h) = 4 / (pi * h^2) -@inline normalization_factor(::Poly6Kernel{3}, h) = 315 / (pi * h^3 * 64) +@inline function normalization_factor(::Poly6Kernel{2}, h_inv) + return oftype(h_inv, 4 / (pi)) * h_inv^2 +end + +@inline function normalization_factor(::Poly6Kernel{3}, h_inv) + return oftype(h_inv, 315 / (64 * pi)) * h_inv^3 +end @doc raw""" SpikyKernel{NDIMS}() @@ -634,38 +666,36 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct SpikyKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@inline function kernel(kernel::SpikyKernel, r::Real, h) - q = r / h - - result = (1 - q)^3 - - # Zero out result if q >= 1 - result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q)) +@inline function kernel_unsafe(kernel::SpikyKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - return result + return normalization_factor(kernel, h_inv) * (1 - q)^3 end -@inline function kernel_deriv(kernel::SpikyKernel, r::Real, h) - inner_deriv = 1 / h - q = r * inner_deriv +@inline function kernel_deriv_div_r_unsafe(kernel::SpikyKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - result = -3 * (1 - q)^2 + kernel_deriv = -3 * normalization_factor(kernel, h_inv) * (1 - q)^2 * h_inv - # Zero out result if q >= 1 - result = ifelse(q < 1, result * normalization_factor(kernel, h) * inner_deriv, zero(q)) - - return result + return div_fast(kernel_deriv, r) end @inline compact_support(::SpikyKernel, h) = h -@inline normalization_factor(::SpikyKernel{2}, h) = 10 / (pi * h^2) -@inline normalization_factor(::SpikyKernel{3}, h) = 15 / (pi * h^3) +@inline function normalization_factor(::SpikyKernel{2}, h_inv) + return oftype(h_inv, 10 / pi) * h_inv^2 +end + +@inline function normalization_factor(::SpikyKernel{3}, h_inv) + return oftype(h_inv, 15 / pi) * h_inv^3 +end @doc raw""" LaguerreGaussKernel{NDIMS}() -Truncated Laguerre–Gauss kernel (fourth-order smoothing) as defined in +Truncated Laguerre-Gauss kernel (fourth-order smoothing) as defined in [Wang2024](@cite). Its radial form uses ``q = r/h`` and is truncated at ``q = 2`` (compact support ``2h``): @@ -716,51 +746,65 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel) """ struct LaguerreGaussKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end -@fastpow @inline function kernel(kernel::LaguerreGaussKernel, r::Real, h) - q = r / h +@fastpow @inline function kernel_unsafe(kernel::LaguerreGaussKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv - # polynomial part: 1 - s^2/2 + s^4/6 + # Polynomial part poly = 1 - q^2 / 2 + q^4 / 6 - # zero out for s ≥ 2 - return ifelse(q < 2, normalization_factor(kernel, h) * poly * exp(-q^2), zero(q)) + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) + + return normalization_factor(kernel, h_inv) * poly * exp_q2 end -@inline function kernel_deriv(kernel::LaguerreGaussKernel, r::Real, h) - invh = 1 / h - q = r * invh +@inline function kernel_deriv_div_r_unsafe(kernel::LaguerreGaussKernel, r::Real, h) + h_inv = 1 / h + q = r * h_inv + + # dg/dq = (q / 3) * (-q^4 + 5q^2 - 9) * exp(-q^2), but we already divide by `r`` + poly = (h_inv / 3) * ((-q^2 + 5) * q^2 - 9) - # dg/dq = (q/3)*(-q^4 + 5q^2 - 9) * exp(-q^2) - poly = ((-q^2 + 5) * q^2 - 9) * (q / 3) + q2 = -q^2 + # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`, + # or it will translate to a generic power function instead of `q * q` (`literal_pow`) + # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640. + exp_q2 = @fastmath exp(q2) - return ifelse(q < 2, normalization_factor(kernel, h) * exp(-q^2) * poly * invh, zero(q)) + return normalization_factor(kernel, h_inv) * poly * exp_q2 * h_inv end @inline compact_support(::LaguerreGaussKernel, h) = 2 * h -# Original normalization factors as in Wang2024 -# @inline normalization_factor(::LaguerreGaussKernel{1}, h) = (8 / (5 * sqrt(pi))) / h -# @inline normalization_factor(::LaguerreGaussKernel{2}, h) = (3 / (pi)) / (h^2) -# @inline normalization_factor(::LaguerreGaussKernel{3}, h) = (8 / (pi^(3 // 2))) / (h^3) + +# Original normalization factors as in Wang2024: +# 1D: (8 / (5 * sqrt(pi))) / h +# 2D: (3 / (pi)) / (h^2) +# 3D: (8 / (pi^(3 / 2))) / (h^3) # Renormalized to the truncated integral over [0,2h] -@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h) - # C = 1/(2*(7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4)) - C = oftype(h, 0.65428780253539) - # C' = C/h - return C / h +@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h_inv) + # C = 1 / (2 * (7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4)) + C = oftype(h_inv, 0.6510370392044922) + # C' = C / h + return C * h_inv end -@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h) +@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h_inv) # C = 2 * pi * (5 - 17 * exp(-4)) / 12 - C = oftype(h, 2.454963094351984) + C_inv = oftype(h_inv, 0.4073380990128333) # C' = 1 / (h^2 * C) - return 1 / (h^2 * C) + return C_inv * h_inv^2 end -@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h) +@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h_inv) # C = (7 * sqrt(pi) / 32) * erf(2) - (77 / 24) * exp(-4) - C = oftype(h, 0.3271479336905373) + # C_ = 1 / (4 * pi * C) + C_ = oftype(h_inv, 0.24324613836999895) - # 4*pi cannot be pulled into C otherwise the test fails because of differences in rounding - return 1 / (C * 4 * pi * h^3) + # C' = 1 / (4 * pi * h^3 * C) = C_ / (h^3) + return C_ * h_inv^3 end diff --git a/src/io/write_vtk.jl b/src/io/write_vtk.jl index 56eb4ae12d..441508472b 100644 --- a/src/io/write_vtk.jl +++ b/src/io/write_vtk.jl @@ -350,18 +350,13 @@ function write2vtk!(vtk, v, u, t, system::AbstractFluidSystem) rho_b = current_density(v, system, neighbor) grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle) - surface_tension[1:ndims(system), - particle] .+= surface_tension_force(surface_tension_a, - surface_tension_b, - system, - system, - particle, - neighbor, - pos_diff, - distance, - rho_a, - rho_b, - grad_kernel) + dv_surface_tension = Ref(zero(pos_diff)) + surface_tension_force!(dv_surface_tension, + surface_tension_a, surface_tension_b, + system, system, particle, neighbor, + pos_diff, distance, rho_a, rho_b, grad_kernel) + + surface_tension[1:ndims(system), particle] .+= dv_surface_tension[] end vtk["surface_tension"] = surface_tension diff --git a/src/schemes/boundary/open_boundary/rhs.jl b/src/schemes/boundary/open_boundary/rhs.jl index 186a9e7b7d..4fd09f5f87 100644 --- a/src/schemes/boundary/open_boundary/rhs.jl +++ b/src/schemes/boundary/open_boundary/rhs.jl @@ -23,6 +23,9 @@ function interact!(dv, v_particle_system, u_particle_system, rho_a = @inbounds current_density(v_particle_system, particle_system, particle) rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) + v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) m_a = @inbounds hydrodynamic_mass(particle_system, particle) @@ -42,15 +45,17 @@ function interact!(dv, v_particle_system, u_particle_system, dv_pressure_boundary = 2 * p_boundary * (m_b / (rho_a * rho_b)) * grad_kernel # Propagate `@inbounds` to the viscosity function, which accesses particle data - dv_viscosity_ = @inbounds dv_viscosity(viscosity_model(fluid_system, - neighbor_system), - particle_system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, - grad_kernel) - - dv_particle = dv_pressure + dv_viscosity_ + dv_pressure_boundary + dv_viscosity_ = Ref(zero(pos_diff)) + @inbounds dv_viscosity(dv_viscosity_, + viscosity_model(fluid_system, + neighbor_system), + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + + dv_particle = dv_pressure + dv_viscosity_[] + dv_pressure_boundary for i in 1:ndims(particle_system) @inbounds dv[i, particle] += dv_particle[i] diff --git a/src/schemes/boundary/open_boundary/system.jl b/src/schemes/boundary/open_boundary/system.jl index 94aee1b048..dc5b58f7b7 100644 --- a/src/schemes/boundary/open_boundary/system.jl +++ b/src/schemes/boundary/open_boundary/system.jl @@ -644,8 +644,10 @@ function interpolate_velocity!(system::OpenBoundarySystem, boundary_zone, W_ab = kernel(smoothing_kernel, distance, smoothing_length) @inbounds shepard_coefficient[point] += volume_b * W_ab + velocity_neighbor_ = @inbounds current_velocity(v_neighbor, neighbor_system, + neighbor) velocity_neighbor = @inbounds viscous_velocity(v_neighbor, neighbor_system, - neighbor) + neighbor, velocity_neighbor_) for i in axes(velocity_neighbor, 1) @inbounds sample_velocity[i, point] += velocity_neighbor[i] * volume_b * W_ab diff --git a/src/schemes/boundary/wall_boundary/system.jl b/src/schemes/boundary/wall_boundary/system.jl index dbfa8613cd..6fc3f48d2c 100644 --- a/src/schemes/boundary/wall_boundary/system.jl +++ b/src/schemes/boundary/wall_boundary/system.jl @@ -145,15 +145,19 @@ end return zero(SVector{ndims(system), eltype(system)}) end -@inline function viscous_velocity(v, system::WallBoundarySystem, particle) - return viscous_velocity(v, system.boundary_model.viscosity, system, particle) +@inline function viscous_velocity(v, system::WallBoundarySystem, particle, v_particle) + return viscous_velocity(v, system.boundary_model.viscosity, system, + particle, v_particle) end -@inline function viscous_velocity(v, ::Nothing, system, particle) - return current_velocity(v, system, particle) +@inline function viscous_velocity(v, ::Nothing, system, particle, v_particle) + # Regular particle velocity is used for the viscosity calculation by default + return v_particle end -@inline function viscous_velocity(v, viscosity, system, particle) +@inline function viscous_velocity(v, viscosity, system, particle, v_particle) + # Wall velocity in the viscosity calculation contains the physical wall velocity + # and an interpolated velocity when a wall viscosity (no-slip BC) is used. return extract_svector(system.boundary_model.cache.wall_velocity, system, particle) end diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl index cea18484d4..17729ade7c 100644 --- a/src/schemes/fluid/entropically_damped_sph/rhs.jl +++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl @@ -24,6 +24,9 @@ function interact!(dv, v_particle_system, u_particle_system, rho_a = @inbounds current_density(v_particle_system, particle_system, particle) rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) + v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) p_b = @inbounds current_pressure(v_neighbor_system, neighbor_system, neighbor) @@ -46,32 +49,32 @@ function interact!(dv, v_particle_system, u_particle_system, rho_b, pos_diff, distance, grad_kernel, correction) - dv_viscosity_ = @inbounds dv_viscosity(particle_system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, - grad_kernel) + dv_particle = Ref(zero(pos_diff)) + @inbounds dv_viscosity(dv_particle, particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) # Extra terms in the momentum equation when using a shifting technique - dv_tvf = @inbounds dv_shifting(shifting_technique(particle_system), - particle_system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, m_a, m_b, rho_a, rho_b, - pos_diff, distance, grad_kernel, correction) - - dv_surface_tension = surface_tension_force(surface_tension_a, surface_tension_b, - particle_system, neighbor_system, - particle, neighbor, pos_diff, distance, - rho_a, rho_b, grad_kernel) - - dv_adhesion = adhesion_force(surface_tension_a, particle_system, neighbor_system, - particle, neighbor, pos_diff, distance) - - dv_particle = dv_pressure + dv_viscosity_ + dv_tvf + dv_surface_tension + - dv_adhesion + @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system), + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, m_a, m_b, rho_a, rho_b, + pos_diff, distance, grad_kernel, correction) + + @inbounds surface_tension_force!(dv_particle, surface_tension_a, + surface_tension_b, + particle_system, neighbor_system, + particle, neighbor, pos_diff, distance, + rho_a, rho_b, grad_kernel) + + @inbounds adhesion_force!(dv_particle, surface_tension_a, particle_system, + neighbor_system, + particle, neighbor, pos_diff, distance) for i in 1:ndims(particle_system) - @inbounds dv[i, particle] += dv_particle[i] + @inbounds dv[i, particle] += dv_particle[][i] end v_diff = current_velocity(v_particle_system, particle_system, particle) - diff --git a/src/schemes/fluid/fluid.jl b/src/schemes/fluid/fluid.jl index 8725190506..a937a5480a 100644 --- a/src/schemes/fluid/fluid.jl +++ b/src/schemes/fluid/fluid.jl @@ -136,31 +136,48 @@ end return dv end -# This formulation was chosen to be consistent with the used pressure_acceleration formulations -@propagate_inbounds function continuity_equation!(dv, density_calculator::ContinuityDensity, +# For compatibility with the old continuity equation function +@propagate_inbounds function continuity_equation!(dv::AbstractArray, + density_calculator::ContinuityDensity, particle_system::AbstractFluidSystem, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, m_b, rho_a, rho_b, grad_kernel) + drho_particle = Ref(zero(rho_a)) vdiff = current_velocity(v_particle_system, particle_system, particle) - current_velocity(v_neighbor_system, neighbor_system, neighbor) - vdiff += continuity_equation_shifting_term(shifting_technique(particle_system), - particle_system, neighbor_system, - particle, neighbor, rho_a, rho_b) + continuity_equation!(drho_particle, density_calculator, particle_system, + neighbor_system, v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) + dv[end, particle] += drho_particle[] +end - dv[end, particle] += rho_a / rho_b * m_b * dot(vdiff, grad_kernel) +# This formulation was chosen to be consistent with the used pressure_acceleration formulations +@propagate_inbounds function continuity_equation!(drho_particle, + density_calculator::ContinuityDensity, + particle_system::AbstractFluidSystem, + neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) + # vdiff += continuity_equation_shifting_term(shifting_technique(particle_system), + # particle_system, neighbor_system, + # particle, neighbor, rho_a, rho_b) + + drho_particle[] += div_fast(rho_a, rho_b) * m_b * dot(vdiff, grad_kernel) # Artificial density diffusion should only be applied to systems representing a fluid # with the same physical properties i.e. density and viscosity. # TODO: shouldn't be applied to particles on the interface (depends on PR #539) - if particle_system === neighbor_system - density_diffusion!(dv, density_diffusion(particle_system), - v_particle_system, particle, neighbor, - pos_diff, distance, m_b, rho_a, rho_b, particle_system, - grad_kernel) - end + # if particle_system === neighbor_system + # density_diffusion!(dv, density_diffusion(particle_system), + # v_particle_system, particle, neighbor, + # pos_diff, distance, m_b, rho_a, rho_b, particle_system, + # grad_kernel) + # end end function calculate_dt(v_ode, u_ode, cfl_number, system::AbstractFluidSystem, semi) diff --git a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl index ca7c4b2c35..25b3d09097 100644 --- a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl +++ b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl @@ -23,6 +23,9 @@ function interact!(dv, v_particle_system, u_particle_system, rho_a = @inbounds current_density(v_particle_system, particle_system, particle) rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + v_a = @inbounds current_velocity(v_particle_system, particle_system, particle) + v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) m_a = @inbounds hydrodynamic_mass(particle_system, particle) @@ -46,14 +49,15 @@ function interact!(dv, v_particle_system, u_particle_system, distance, grad_kernel, nothing) # Propagate `@inbounds` to the viscosity function, which accesses particle data - dv_viscosity_ = @inbounds dv_viscosity(particle_system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, - grad_kernel) + dv_viscosity_ = Ref(zero(pos_diff)) + @inbounds dv_viscosity(dv_viscosity_, particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) for i in 1:ndims(particle_system) - @inbounds dv[i, particle] += dv_pressure[i] + dv_viscosity_[i] + @inbounds dv[i, particle] += dv_pressure[i] + dv_viscosity_[][i] end end return dv diff --git a/src/schemes/fluid/implicit_incompressible_sph/system.jl b/src/schemes/fluid/implicit_incompressible_sph/system.jl index 8f02018bbf..a07b52786b 100644 --- a/src/schemes/fluid/implicit_incompressible_sph/system.jl +++ b/src/schemes/fluid/implicit_incompressible_sph/system.jl @@ -281,16 +281,21 @@ function calculate_predicted_velocity_and_d_ii_values!(system::ImplicitIncompres rho_a = @inbounds current_density(v_particle_system, system, particle) rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + v_a = @inbounds current_velocity(v_particle_system, system, particle) + v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle) - dv_viscosity_ = @inbounds dv_viscosity(system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, - grad_kernel) + dv_viscosity_ = Ref(zero(pos_diff)) + @inbounds dv_viscosity(dv_viscosity_, system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) # Add all other non-pressure forces for i in 1:ndims(system) - @inbounds advection_velocity[i, particle] += time_step * dv_viscosity_[i] + @inbounds advection_velocity[i, + particle] += time_step * dv_viscosity_[][i] end # Calculate d_ii with eq. 9 in Ihmsen et al. (2013) for i in 1:ndims(system) diff --git a/src/schemes/fluid/pressure_acceleration.jl b/src/schemes/fluid/pressure_acceleration.jl index 7ffa5850b3..30bb92be8c 100644 --- a/src/schemes/fluid/pressure_acceleration.jl +++ b/src/schemes/fluid/pressure_acceleration.jl @@ -7,14 +7,14 @@ # asymmetric version. @inline function pressure_acceleration_summation_density(m_a, m_b, rho_a, rho_b, p_a, p_b, W_a) - return -m_b * (p_a / rho_a^2 + p_b / rho_b^2) * W_a + return -m_b * (div_fast(p_a, rho_a^2) + div_fast(p_b, rho_b^2)) * W_a end # Same as above, but not assuming symmetry of the kernel gradient. To be used with # corrections that do not produce a symmetric kernel gradient. @inline function pressure_acceleration_summation_density(m_a, m_b, rho_a, rho_b, p_a, p_b, W_a, W_b) - return -m_b * (p_a / rho_a^2 * W_a - p_b / rho_b^2 * W_b) + return -m_b * (div_fast(p_a, rho_a^2) * W_a - div_fast(p_b, rho_b^2) * W_b) end # As shown in "Variational and momentum preservation aspects of Smooth Particle Hydrodynamic @@ -26,14 +26,16 @@ end # asymmetric version. @inline function pressure_acceleration_continuity_density(m_a, m_b, rho_a, rho_b, p_a, p_b, W_a) - return -m_b * (p_a + p_b) / (rho_a * rho_b) * W_a + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + return -m_b * div_fast(p_a + p_b, rho_a * rho_b) * W_a end # Same as above, but not assuming symmetry of the kernel gradient. To be used with # corrections that do not produce a symmetric kernel gradient. @inline function pressure_acceleration_continuity_density(m_a, m_b, rho_a, rho_b, p_a, p_b, W_a, W_b) - return -m_b / (rho_a * rho_b) * (p_a * W_a - p_b * W_b) + return -div_fast(m_b, rho_a * rho_b) * (p_a * W_a - p_b * W_b) end @doc raw""" diff --git a/src/schemes/fluid/shifting_techniques.jl b/src/schemes/fluid/shifting_techniques.jl index ceaff12a21..ed9644b471 100644 --- a/src/schemes/fluid/shifting_techniques.jl +++ b/src/schemes/fluid/shifting_techniques.jl @@ -41,11 +41,11 @@ function update_shifting!(system, shifting, v, u, v_ode, u_ode, semi) end # Additional term in the momentum equation due to the shifting technique -@inline function dv_shifting(shifting, system, neighbor_system, - v_system, v_neighbor_system, particle, neighbor, - m_a, m_b, rho_a, rho_b, pos_diff, distance, - grad_kernel, correction) - return zero(grad_kernel) +@inline function dv_shifting!(dv_particle, shifting, system, neighbor_system, + v_system, v_neighbor_system, particle, neighbor, + m_a, m_b, rho_a, rho_b, pos_diff, distance, + grad_kernel, correction) + return dv_particle end # Additional term(s) in the continuity equation due to the shifting technique @@ -324,22 +324,23 @@ See [`ParticleShiftingTechnique`](@ref). struct MomentumEquationTermSun2019 end # Additional term in the momentum equation due to the shifting technique -@propagate_inbounds function dv_shifting(shifting::ParticleShiftingTechnique, system, - neighbor_system, - v_system, v_neighbor_system, particle, neighbor, - m_a, m_b, rho_a, rho_b, pos_diff, distance, - grad_kernel, correction) - return dv_shifting(shifting.momentum_equation_term, system, neighbor_system, - v_system, v_neighbor_system, particle, neighbor, - m_a, m_b, rho_a, rho_b, pos_diff, distance, - grad_kernel, correction) +@propagate_inbounds function dv_shifting!(dv_particle, + shifting::ParticleShiftingTechnique, + system, neighbor_system, + v_system, v_neighbor_system, particle, neighbor, + m_a, m_b, rho_a, rho_b, pos_diff, distance, + grad_kernel, correction) + return dv_shifting!(dv_particle, shifting.momentum_equation_term, system, + neighbor_system, v_system, v_neighbor_system, + particle, neighbor, m_a, m_b, rho_a, rho_b, + pos_diff, distance, grad_kernel, correction) end -@propagate_inbounds function dv_shifting(::MomentumEquationTermSun2019, - system, neighbor_system, - v_system, v_neighbor_system, - particle, neighbor, m_a, m_b, rho_a, rho_b, - pos_diff, distance, grad_kernel, correction) +@propagate_inbounds function dv_shifting!(dv_particle, ::MomentumEquationTermSun2019, + system, neighbor_system, + v_system, v_neighbor_system, + particle, neighbor, m_a, m_b, rho_a, rho_b, + pos_diff, distance, grad_kernel, correction) delta_v_a = delta_v(system, particle) delta_v_b = delta_v(neighbor_system, neighbor) @@ -347,8 +348,11 @@ end v_b = current_velocity(v_neighbor_system, neighbor_system, neighbor) tensor_product = v_a * delta_v_a' + v_b * delta_v_b' - return m_b / rho_b * - (tensor_product * grad_kernel + v_a * dot(delta_v_a - delta_v_b, grad_kernel)) + dv_particle[] += m_b / rho_b * + (tensor_product * grad_kernel + + v_a * dot(delta_v_a - delta_v_b, grad_kernel)) + + return dv_particle end # `ParticleShiftingTechnique{<:Any, <:Any, true}` means `modify_continuity_equation=true` @@ -568,10 +572,11 @@ struct TransportVelocityAdami{modify_continuity_equation, T <: Real} <: end end -@propagate_inbounds function dv_shifting(::TransportVelocityAdami, system, neighbor_system, - v_system, v_neighbor_system, particle, neighbor, - m_a, m_b, rho_a, rho_b, pos_diff, distance, - grad_kernel, correction) +@propagate_inbounds function dv_shifting!(dv_particle, ::TransportVelocityAdami, + system, neighbor_system, + v_system, v_neighbor_system, particle, neighbor, + m_a, m_b, rho_a, rho_b, pos_diff, distance, + grad_kernel, correction) v_a = current_velocity(v_system, system, particle) delta_v_a = delta_v(system, particle) @@ -588,9 +593,11 @@ end # m_b * (A_a + A_b) / (ρ_a * ρ_b) * ∇W_ab. # In order to obtain this, we pass `p_a = A_a` and `p_b = A_b` to the # `pressure_acceleration` function. - return pressure_acceleration(system, neighbor_system, particle, neighbor, - m_a, m_b, A_a, A_b, rho_a, rho_b, pos_diff, - distance, grad_kernel, correction) + dv_particle[] += pressure_acceleration(system, neighbor_system, particle, neighbor, + m_a, m_b, A_a, A_b, rho_a, rho_b, + pos_diff, distance, grad_kernel, correction) + + return dv_particle end # The function above misuses the pressure acceleration function by passing a Matrix as `p_a`. diff --git a/src/schemes/fluid/surface_tension.jl b/src/schemes/fluid/surface_tension.jl index 7a0232c51c..48874fa0b0 100644 --- a/src/schemes/fluid/surface_tension.jl +++ b/src/schemes/fluid/surface_tension.jl @@ -162,66 +162,76 @@ end end # Skip -@inline function surface_tension_force(surface_tension_a, surface_tension_b, - particle_system, neighbor_system, particle, neighbor, - pos_diff, distance, rho_a, rho_b, grad_kernel) - return zero(pos_diff) +@inline function surface_tension_force!(dv_particle, surface_tension_a, surface_tension_b, + particle_system, neighbor_system, particle, + neighbor, pos_diff, distance, rho_a, rho_b, + grad_kernel) + return dv_particle end -@inline function surface_tension_force(surface_tension_a::CohesionForceAkinci, - surface_tension_b::CohesionForceAkinci, - particle_system::AbstractFluidSystem, - neighbor_system::AbstractFluidSystem, - particle, neighbor, pos_diff, distance, - rho_a, rho_b, grad_kernel) +@inline function surface_tension_force!(dv_particle, + surface_tension_a::CohesionForceAkinci, + surface_tension_b::CohesionForceAkinci, + particle_system::AbstractFluidSystem, + neighbor_system::AbstractFluidSystem, + particle, neighbor, pos_diff, distance, + rho_a, rho_b, grad_kernel) # No cohesion with oneself. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle m_b = hydrodynamic_mass(neighbor_system, neighbor) support_radius = compact_support(smoothing_kernel, smoothing_length(particle_system, particle)) - return cohesion_force_akinci(surface_tension_a, support_radius, m_b, pos_diff, distance) + dv_particle[] += cohesion_force_akinci(surface_tension_a, support_radius, m_b, + pos_diff, distance) + + return dv_particle end -@inline function surface_tension_force(surface_tension_a::SurfaceTensionAkinci, - surface_tension_b::SurfaceTensionAkinci, - particle_system::AbstractFluidSystem, - neighbor_system::AbstractFluidSystem, particle, - neighbor, - pos_diff, distance, rho_a, rho_b, grad_kernel) +@inline function surface_tension_force!(dv_particle, + surface_tension_a::SurfaceTensionAkinci, + surface_tension_b::SurfaceTensionAkinci, + particle_system::AbstractFluidSystem, + neighbor_system::AbstractFluidSystem, particle, + neighbor, + pos_diff, distance, rho_a, rho_b, grad_kernel) (; smoothing_kernel) = particle_system (; surface_tension_coefficient) = surface_tension_a smoothing_length_ = smoothing_length(particle_system, particle) # No surface tension with oneself. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle m_b = hydrodynamic_mass(neighbor_system, neighbor) n_a = surface_normal(particle_system, particle) n_b = surface_normal(neighbor_system, neighbor) support_radius = compact_support(smoothing_kernel, smoothing_length_) - return cohesion_force_akinci(surface_tension_a, support_radius, m_b, - pos_diff, distance) .- - (surface_tension_coefficient * (n_a - n_b) * smoothing_length_) -end + dv_particle[] += cohesion_force_akinci(surface_tension_a, support_radius, m_b, + pos_diff, distance) + dv_particle[] -= surface_tension_coefficient * (n_a - n_b) * smoothing_length_ -@inline function surface_tension_force(surface_tension_a::SurfaceTensionMorris, - surface_tension_b::SurfaceTensionMorris, - particle_system::AbstractFluidSystem, - neighbor_system::AbstractFluidSystem, - particle, neighbor, pos_diff, distance, - rho_a, rho_b, grad_kernel) + return dv_particle +end +@inline function surface_tension_force!(dv_particle, + surface_tension_a::SurfaceTensionMorris, + surface_tension_b::SurfaceTensionMorris, + particle_system::AbstractFluidSystem, + neighbor_system::AbstractFluidSystem, + particle, neighbor, pos_diff, distance, + rho_a, rho_b, grad_kernel) (; surface_tension_coefficient) = surface_tension_a # No surface tension with oneself. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle n_a = surface_normal(particle_system, particle) curvature_a = curvature(particle_system, particle) - return -surface_tension_coefficient / rho_a * curvature_a * n_a + dv_particle[] -= surface_tension_coefficient / rho_a * curvature_a * n_a + + return dv_particle end function compute_stress_tensors!(system, surface_tension, v, u, v_ode, u_ode, semi, t) @@ -277,46 +287,54 @@ function compute_surface_delta_function!(system, ::SurfaceTensionMomentumMorris, return system end -@inline function surface_tension_force(surface_tension_a::SurfaceTensionMomentumMorris, - surface_tension_b::SurfaceTensionMomentumMorris, - particle_system::AbstractFluidSystem, - neighbor_system::AbstractFluidSystem, - particle, neighbor, pos_diff, distance, - rho_a, rho_b, grad_kernel) +@inline function surface_tension_force!(dv_particle, + surface_tension_a::SurfaceTensionMomentumMorris, + surface_tension_b::SurfaceTensionMomentumMorris, + particle_system::AbstractFluidSystem, + neighbor_system::AbstractFluidSystem, + particle, neighbor, pos_diff, distance, + rho_a, rho_b, grad_kernel) (; surface_tension_coefficient) = surface_tension_a # No surface tension with oneself. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle S_a = stress_tensor(particle_system, particle) S_b = stress_tensor(neighbor_system, neighbor) m_b = hydrodynamic_mass(neighbor_system, neighbor) - return surface_tension_coefficient * m_b * (S_a + S_b) / (rho_a * rho_b) * grad_kernel + dv_particle[] += surface_tension_coefficient * m_b * (S_a + S_b) / (rho_a * rho_b) * + grad_kernel + + return dv_particle end -@inline function adhesion_force(surface_tension::AkinciTypeSurfaceTension, - particle_system::AbstractFluidSystem, - neighbor_system::AbstractBoundarySystem, particle, neighbor, - pos_diff, distance) +@inline function adhesion_force!(dv_particle, + surface_tension::AkinciTypeSurfaceTension, + particle_system::AbstractFluidSystem, + neighbor_system::AbstractBoundarySystem, particle, + neighbor, + pos_diff, distance) (; adhesion_coefficient) = neighbor_system # No adhesion with oneself. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle # No reason to calculate the adhesion force if adhesion coefficient is near zero - abs(adhesion_coefficient) < eps() && return zero(pos_diff) + abs(adhesion_coefficient) < eps() && return dv_particle m_b = hydrodynamic_mass(neighbor_system, neighbor) support_radius = compact_support(particle_system.smoothing_kernel, smoothing_length(particle_system, particle)) - return adhesion_force_akinci(surface_tension, support_radius, m_b, pos_diff, distance, - adhesion_coefficient) + dv_particle[] += adhesion_force_akinci(surface_tension, support_radius, m_b, pos_diff, + distance, adhesion_coefficient) + + return dv_particle end -@inline function adhesion_force(surface_tension, particle_system, neighbor_system, particle, - neighbor, pos_diff, distance) - return zero(pos_diff) +@inline function adhesion_force!(dv_particle, surface_tension, particle_system, + neighbor_system, particle, neighbor, pos_diff, distance) + return dv_particle end diff --git a/src/schemes/fluid/viscosity.jl b/src/schemes/fluid/viscosity.jl index 2a6c51ba71..03d9cc0f85 100644 --- a/src/schemes/fluid/viscosity.jl +++ b/src/schemes/fluid/viscosity.jl @@ -1,33 +1,43 @@ +@propagate_inbounds function viscous_velocity(v, system, particle, v_particle) + # Regular particle velocity is used for the viscosity calculation by default + return v_particle +end + # Unpack the neighboring systems viscosity to dispatch on the viscosity type. # This function is only necessary to allow `nothing` as viscosity. # Otherwise, we could just apply the viscosity as a function directly. -@propagate_inbounds function dv_viscosity(particle_system, neighbor_system, +@propagate_inbounds function dv_viscosity(dv_particle, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) viscosity = viscosity_model(particle_system, neighbor_system) - return dv_viscosity(viscosity, particle_system, neighbor_system, + return dv_viscosity(dv_particle, viscosity, particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) end -@propagate_inbounds function dv_viscosity(viscosity, particle_system, neighbor_system, +@propagate_inbounds function dv_viscosity(dv_particle, + viscosity, particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) - return viscosity(particle_system, neighbor_system, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + return viscosity(dv_particle, particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) end -@inline function dv_viscosity(viscosity::Nothing, particle_system, neighbor_system, +@inline function dv_viscosity(dv_particle, viscosity::Nothing, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) - return zero(pos_diff) + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) + return dv_particle end @doc raw""" @@ -76,13 +86,14 @@ struct ViscosityMorris{ELTYPE} end end -function kinematic_viscosity(system, viscosity::ViscosityMorris, smoothing_length, - sound_speed) +@inline function kinematic_viscosity(system, viscosity::ViscosityMorris, smoothing_length, + sound_speed) return viscosity.nu end @propagate_inbounds function (viscosity::Union{ArtificialViscosityMonaghan, - ViscosityMorris})(particle_system, + ViscosityMorris})(dv_particle, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, @@ -90,33 +101,34 @@ end pos_diff, distance, sound_speed, m_a, m_b, rho_a, rho_b, - grad_kernel) + v_a, v_b, grad_kernel) rho_mean = (rho_a + rho_b) / 2 - v_a = viscous_velocity(v_particle_system, particle_system, particle) - v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor) - v_diff = v_a - v_b + v_visc_a = viscous_velocity(v_particle_system, particle_system, particle, v_a) + v_visc_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b) + v_diff = v_visc_a - v_visc_b smoothing_length_particle = smoothing_length(particle_system, particle) smoothing_length_neighbor = smoothing_length(particle_system, neighbor) smoothing_length_average = (smoothing_length_particle + smoothing_length_neighbor) / 2 - nu_a = kinematic_viscosity(particle_system, - viscosity_model(neighbor_system, particle_system), - smoothing_length_particle, sound_speed) - nu_b = kinematic_viscosity(neighbor_system, - viscosity_model(particle_system, neighbor_system), - smoothing_length_neighbor, sound_speed) + nu_a = 0#kinematic_viscosity(particle_system, + # viscosity_model(neighbor_system, particle_system), + # smoothing_length_particle, sound_speed) + nu_b = 0#kinematic_viscosity(neighbor_system, + # viscosity_model(particle_system, neighbor_system), + # smoothing_length_neighbor, sound_speed) - pi_ab = viscosity(sound_speed, v_diff, pos_diff, distance, rho_mean, rho_a, rho_b, - smoothing_length_average, grad_kernel, nu_a, nu_b) + viscosity(dv_particle, sound_speed, v_diff, pos_diff, distance, rho_mean, rho_a, rho_b, + smoothing_length_average, grad_kernel, nu_a, nu_b, m_b) - return m_b * pi_ab + return dv_particle end -@inline function (viscosity::ArtificialViscosityMonaghan)(c, v_diff, pos_diff, distance, +@inline function (viscosity::ArtificialViscosityMonaghan)(dv_particle, c, v_diff, + pos_diff, distance, rho_mean, rho_a, rho_b, h, - grad_kernel, nu_a, nu_b) + grad_kernel, nu_a, nu_b, m_b) (; alpha, beta, epsilon) = viscosity # v_ab ⋅ r_ab @@ -127,31 +139,35 @@ end # approaching particles and turn it off for receding particles. In this way, the # viscosity is used for shocks and not rarefactions." if vr < 0 - mu = h * vr / (distance^2 + epsilon * h^2) - return (alpha * c * mu + beta * mu^2) / rho_mean * grad_kernel + # Since this is one of the most performance critical functions, using fast divisions + # here gives a significant speedup on GPUs. + mu = div_fast(h * vr, distance^2 + epsilon * h^2) + dv_particle[] += div_fast(m_b * alpha * c * mu + beta * mu^2, rho_mean) * grad_kernel end - return zero(v_diff) + return dv_particle end -@inline function (viscosity::ViscosityMorris)(c, v_diff, pos_diff, distance, rho_mean, - rho_a, rho_b, h, grad_kernel, nu_a, - nu_b) +@inline function (viscosity::ViscosityMorris)(dv_particle, c, v_diff, pos_diff, + distance, rho_mean, rho_a, rho_b, h, + grad_kernel, nu_a, nu_b, m_b) epsilon = viscosity.epsilon mu_a = nu_a * rho_a mu_b = nu_b * rho_b - return (mu_a + mu_b) / (rho_a * rho_b) * dot(pos_diff, grad_kernel) / - (distance^2 + epsilon * h^2) * v_diff + dv_particle[] += m_b * (mu_a + mu_b) / (rho_a * rho_b) * dot(pos_diff, grad_kernel) / + (distance^2 + epsilon * h^2) * v_diff + + return dv_particle end # See, e.g., # Joseph J. Monaghan. "Smoothed Particle Hydrodynamics". # In: Reports on Progress in Physics (2005), pages 1703-1759. # [doi: 10.1088/0034-4885/68/8/r01](http://dx.doi.org/10.1088/0034-4885/68/8/R01) -function kinematic_viscosity(system, viscosity::ArtificialViscosityMonaghan, - smoothing_length, sound_speed) +@inline function kinematic_viscosity(system, viscosity::ArtificialViscosityMonaghan, + smoothing_length, sound_speed) (; alpha) = viscosity return alpha * smoothing_length * sound_speed / (2 * ndims(system) + 4) @@ -177,8 +193,9 @@ struct ViscosityAdami{ELTYPE} end end -function adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel, - m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon) +@inline function adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff, + distance, grad_kernel, m_a, m_b, rho_a, rho_b, + v_diff, nu_a, nu_b, epsilon) eta_a = nu_a * rho_a eta_b = nu_b * rho_b @@ -200,14 +217,17 @@ function adami_viscosity_force(smoothing_length_average, pos_diff, distance, gra # See issue: https://github.com/trixi-framework/TrixiParticles.jl/issues/394 visc = (volume_a^2 + volume_b^2) * dot(grad_kernel, pos_diff) * tmp / m_a - return visc .* v_diff + dv_particle[] += visc .* v_diff + + return dv_particle end -@inline function (viscosity::ViscosityAdami)(particle_system, neighbor_system, +@inline function (viscosity::ViscosityAdami)(dv_particle, particle_system, + neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, sound_speed, m_a, m_b, - rho_a, rho_b, grad_kernel) + rho_a, rho_b, v_a, v_b, grad_kernel) epsilon = viscosity.epsilon smoothing_length_particle = smoothing_length(particle_system, particle) @@ -221,23 +241,20 @@ end viscosity_model(particle_system, neighbor_system), smoothing_length_neighbor, sound_speed) - v_a = viscous_velocity(v_particle_system, particle_system, particle) - v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor) + v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a) + v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b) v_diff = v_a - v_b - return adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel, - m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon) + return adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff, + distance, grad_kernel, m_a, m_b, rho_a, rho_b, + v_diff, nu_a, nu_b, epsilon) end -function kinematic_viscosity(system, viscosity::ViscosityAdami, smoothing_length, - sound_speed) +@inline function kinematic_viscosity(system, viscosity::ViscosityAdami, smoothing_length, + sound_speed) return viscosity.nu end -@propagate_inbounds function viscous_velocity(v, system, particle) - return current_velocity(v, system, particle) -end - @doc raw""" ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.01) @@ -289,13 +306,15 @@ end ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityAdamiSGS(nu, C_S, epsilon) -@propagate_inbounds function (viscosity::ViscosityAdamiSGS)(particle_system, +@propagate_inbounds function (viscosity::ViscosityAdamiSGS)(dv_particle, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, sound_speed, m_a, m_b, - rho_a, rho_b, grad_kernel) + rho_a, rho_b, + v_a, v_b, grad_kernel) epsilon = viscosity.epsilon smoothing_length_particle = smoothing_length(particle_system, particle) @@ -309,8 +328,8 @@ ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityAdamiSGS(nu, C_S, eps viscosity_model(particle_system, neighbor_system), smoothing_length_neighbor, sound_speed) - v_a = viscous_velocity(v_particle_system, particle_system, particle) - v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor) + v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a) + v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b) v_diff = v_a - v_b # ------------------------------------------------------------------------------ @@ -341,8 +360,9 @@ ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityAdamiSGS(nu, C_S, eps nu_a = nu_a + nu_SGS nu_b = nu_b + nu_SGS - return adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel, - m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon) + return adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff, + distance, grad_kernel, m_a, m_b, rho_a, rho_b, + v_diff, nu_a, nu_b, epsilon) end function kinematic_viscosity(system, viscosity::ViscosityAdamiSGS, smoothing_length, @@ -401,13 +421,15 @@ end ViscosityMorrisSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityMorrisSGS(nu, C_S, epsilon) -@propagate_inbounds function (viscosity::ViscosityMorrisSGS)(particle_system, +@propagate_inbounds function (viscosity::ViscosityMorrisSGS)(dv_particle, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, sound_speed, m_a, - m_b, rho_a, rho_b, grad_kernel) + m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) epsilon = viscosity.epsilon smoothing_length_particle = smoothing_length(particle_system, particle) @@ -421,8 +443,8 @@ ViscosityMorrisSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityMorrisSGS(nu, C_S, e viscosity_model(particle_system, neighbor_system), smoothing_length_neighbor, sound_speed) - v_a = viscous_velocity(v_particle_system, particle_system, particle) - v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor) + v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a) + v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b) v_diff = v_a - v_b # SGS part: Compute the subgrid-scale eddy viscosity. @@ -438,9 +460,10 @@ ViscosityMorrisSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityMorrisSGS(nu, C_S, e mu_a = nu_a_eff * rho_a mu_b = nu_b_eff * rho_b - force_Morris = (mu_a + mu_b) / (rho_a * rho_b) * (dot(pos_diff, grad_kernel)) / - (distance^2 + epsilon * smoothing_length_average^2) * v_diff - return m_b * force_Morris + dv_particle[] += m_b * (mu_a + mu_b) / (rho_a * rho_b) * (dot(pos_diff, grad_kernel)) / + (distance^2 + epsilon * smoothing_length_average^2) * v_diff + + return dv_particle end function kinematic_viscosity(system, viscosity::ViscosityMorrisSGS, smoothing_length, @@ -476,24 +499,24 @@ function ViscosityCarreauYasuda(; nu0, nu_inf, lambda, a, n, epsilon=0.01) ViscosityCarreauYasuda(nu0, nu_inf, lambda, a, n, epsilon) end -@propagate_inbounds function (viscosity::ViscosityCarreauYasuda)(particle_system, +@propagate_inbounds function (viscosity::ViscosityCarreauYasuda)(dv_particle, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, sound_speed, - m_a, m_b, - rho_a, rho_b, - grad_kernel) + m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) epsilon = viscosity.epsilon smoothing_length_particle = smoothing_length(particle_system, particle) smoothing_length_neighbor = smoothing_length(particle_system, neighbor) smoothing_length_average = (smoothing_length_particle + smoothing_length_neighbor) / 2 - v_a = viscous_velocity(v_particle_system, particle_system, particle) - v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor) + v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a) + v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b) v_diff = v_a - v_b gamma_dot = norm(v_diff) / (distance + epsilon) @@ -504,11 +527,12 @@ end nu_a = nu_eff nu_b = nu_eff - return adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel, - m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon) + return adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff, + distance, grad_kernel, m_a, m_b, rho_a, rho_b, + v_diff, nu_a, nu_b, epsilon) end -function kinematic_viscosity(system, viscosity::ViscosityCarreauYasuda, - smoothing_length, sound_speed) +@inline function kinematic_viscosity(system, viscosity::ViscosityCarreauYasuda, + smoothing_length, sound_speed) return viscosity.nu0 end diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index acb0f06894..dc8364cd79 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -14,119 +14,1161 @@ function interact!(dv, v_particle_system, u_particle_system, system_coords = current_coordinates(u_particle_system, particle_system) neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system) + neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi) - # In order to visualize quantities like pressure forces or viscosity forces, uncomment - # the following code and the two other lines below that are marked as "debug example". - # debug_array = zeros(ndims(particle_system), nparticles(particle_system)) - - # Loop over all pairs of particles and neighbors within the kernel cutoff - foreach_point_neighbor(particle_system, neighbor_system, - system_coords, neighbor_system_coords, semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance - # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are - # in bounds of the respective system. For performance reasons, we use `@inbounds` - # in this hot loop to avoid bounds checking when extracting particle quantities. - rho_a = @inbounds current_density(v_particle_system, particle_system, particle) - rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) - rho_mean = (rho_a + rho_b) / 2 - - # Determine correction factors. - # This can be ignored, as these are all 1 when no correction is used. - (viscosity_correction, pressure_correction, - surface_tension_correction) = free_surface_correction(correction, particle_system, - rho_mean) - - grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle) + # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient divides + # by zero. To account for rounding errors, we check if `distance` is almost zero. + # Since the coordinates are in the order of the compact support `c`, `distance^2` is in + # the order of `c^2`, so we need to check `distance < sqrt(eps(c^2))`. + # Note that `sqrt(eps(c^2)) != eps(c)`. + compact_support_ = compact_support(particle_system, neighbor_system) + almostzero = sqrt(eps(compact_support_^2)) + @threaded semi for particle in each_integrated_particle(particle_system) + # We are looping over the particles of `particle_system`, so it is guaranteed + # that `particle` is in bounds of `particle_system`. m_a = @inbounds hydrodynamic_mass(particle_system, particle) - m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) - - # The following call is equivalent to - # `p_a = current_pressure(v_particle_system, particle_system, particle)` - # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` - # Only when the neighbor system is a `WallBoundarySystem` or a `TotalLagrangianSPHSystem` - # with the boundary model `PressureMirroring`, this will return `p_b = p_a`, which is - # the pressure of the fluid particle. - p_a, - p_b = @inbounds particle_neighbor_pressure(v_particle_system, - v_neighbor_system, - particle_system, neighbor_system, - particle, neighbor) - - dv_pressure = pressure_correction * - pressure_acceleration(particle_system, neighbor_system, - particle, neighbor, - m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff, - distance, grad_kernel, correction) - - # Propagate `@inbounds` to the viscosity function, which accesses particle data - dv_viscosity_ = viscosity_correction * - @inbounds dv_viscosity(particle_system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, - grad_kernel) - - # Extra terms in the momentum equation when using a shifting technique - dv_tvf = @inbounds dv_shifting(shifting_technique(particle_system), - particle_system, neighbor_system, - v_particle_system, v_neighbor_system, - particle, neighbor, m_a, m_b, rho_a, rho_b, - pos_diff, distance, grad_kernel, correction) - - dv_surface_tension = surface_tension_correction * - surface_tension_force(surface_tension_a, surface_tension_b, - particle_system, neighbor_system, - particle, neighbor, pos_diff, distance, - rho_a, rho_b, grad_kernel) - - dv_adhesion = adhesion_force(surface_tension_a, particle_system, neighbor_system, - particle, neighbor, pos_diff, distance) - - dv_particle = dv_pressure + dv_viscosity_ + dv_tvf + dv_surface_tension + - dv_adhesion - - for i in 1:ndims(particle_system) - @inbounds dv[i, particle] += dv_particle[i] - # Debug example - # debug_array[i, particle] += dv_pressure[i] + p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) + + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + particle) + + # Accumulate the RHS contributions over all neighbors before writing to `dv`, + # to reduce the number of memory writes. + # Note that we need a `Ref` in order to be able to update these variables + # inside the closure in the `foreach_neighbor` loop. + dv_particle = Ref(zero(v_a)) + drho_particle = Ref(zero(rho_a)) + + # Loop over all neighbors within the kernel cutoff + @inbounds PointNeighbors.foreach_neighbor(system_coords, neighbor_system_coords, + neighborhood_search, + particle) do particle, neighbor, + pos_diff, distance + # Skip neighbors with the same position because the kernel gradient is zero. + # Note that `return` only exits the closure, i.e., skips the current neighbor. + skip_zero_distance(particle_system, distance, almostzero) && return + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + + # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` + m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + (v_b, + rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + neighbor) + rho_mean = (rho_a + rho_b) / 2 + vdiff = v_a - v_b + + # The following call is equivalent to + # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` + # Only when the neighbor system is a `WallBoundarySystem` + # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`, + # this will return `p_b = p_a`, which is the pressure of the fluid particle. + p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system, + neighbor, p_a) + + # For `ContinuityDensity` without correction, this is equivalent to + # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel + dv_pressure = pressure_acceleration(particle_system, neighbor_system, + particle, neighbor, + m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff, + distance, grad_kernel, correction) + + # Determine correction factors. + # This can usually be ignored, as these are all 1 when no correction is used. + (viscosity_correction, pressure_correction, + surface_tension_correction) = free_surface_correction(correction, + particle_system, + rho_a, rho_b) + + # Accumulate contributions over all neighbors + dv_particle[] += dv_pressure * pressure_correction + + # Propagate `@inbounds` to the viscosity function, which accesses particle data + @inbounds dv_viscosity(dv_particle, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + + # Extra terms in the momentum equation when using a shifting technique + @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system), + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, m_a, m_b, rho_a, rho_b, + pos_diff, distance, grad_kernel, correction) + + # TODO surface_tension_correction + @inbounds surface_tension_force!(dv_particle, surface_tension_a, + surface_tension_b, + particle_system, neighbor_system, + particle, neighbor, pos_diff, distance, + rho_a, rho_b, grad_kernel) + + @inbounds adhesion_force!(dv_particle, surface_tension_a, particle_system, + neighbor_system, + particle, neighbor, pos_diff, distance) + + # TODO If variable smoothing_length is used, this should use the neighbor smoothing length + # Propagate `@inbounds` to the continuity equation, which accesses particle data + @inbounds continuity_equation!(drho_particle, density_calculator, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) end - # TODO If variable smoothing_length is used, this should use the neighbor smoothing length - # Propagate `@inbounds` to the continuity equation, which accesses particle data - @inbounds continuity_equation!(dv, density_calculator, particle_system, - neighbor_system, v_particle_system, - v_neighbor_system, particle, neighbor, - pos_diff, distance, m_b, rho_a, rho_b, grad_kernel) + for i in eachindex(dv_particle[]) + @inbounds dv[i, particle] += dv_particle[][i] + end + @inbounds dv[end, particle] += drho_particle[] end - # Debug example - # periodic_box = neighborhood_search.periodic_box - # Note: this saves a file in every stage of the integrator - # if !@isdefined iter; iter = 0; end - # TODO: This call should use public API. This requires some additional changes to simplify the calls. - # trixi2vtk(v_particle_system, u_particle_system, -1.0, particle_system, periodic_box, debug=debug_array, prefix="debug", iter=iter += 1) return dv end -@propagate_inbounds function particle_neighbor_pressure(v_particle_system, - v_neighbor_system, - particle_system, neighbor_system, - particle, neighbor) - p_a = current_pressure(v_particle_system, particle_system, particle) - p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor) - return p_a, p_b +# for cell in cells +# for point in cell +# for neighbor_cell in neighbor_cells + +# for neighbor in neighbor_cell +# +# 3.84 ms vs 2.92 ms for the interact! above +function interact_localmem!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi) + (; density_calculator, correction) = particle_system + + sound_speed = system_sound_speed(particle_system) + + surface_tension_a = surface_tension_model(particle_system) + surface_tension_b = surface_tension_model(neighbor_system) + + system_coords = current_coordinates(u_particle_system, particle_system) + neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi) + search_radius = PointNeighbors.search_radius(neighborhood_search) + + almostzero = eps(search_radius^2) + + backend = semi.parallelization_backend + # max_particles_per_cell = 64 + # nhs_size = size(neighborhood_search.cell_list.linear_indices) + # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + linear_indices = neighborhood_search.cell_list.linear_indices + cartesian_indices = CartesianIndices(size(linear_indices)) + lengths = Array(neighborhood_search.cell_list.cells.lengths) + max_particles_per_cell = maximum(lengths) + # max_particles_per_cell = 64 + nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices)) + ndrange = max_particles_per_cell * length(nonempty_cells) + kernel = foreach_neighbor_localmem(backend, Int64(max_particles_per_cell)) + kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, + Val(max_particles_per_cell), search_radius, almostzero, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, + sound_speed, density_calculator; ndrange) + + KernelAbstractions.synchronize(backend) + + return nothing +end + +@inline function copy_to_localmem!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx) + points_view = @inbounds PointNeighbors.points_in_cell(neighbor_cell, neighborhood_search) + n_particles_in_neighbor_cell = length(points_view) + + # First use all threads to load the neighbors into local memory in parallel + if particleidx <= n_particles_in_neighbor_cell + @inbounds p = local_points[particleidx] = points_view[particleidx] + for d in 1:ndims(neighborhood_search) + @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p] + end + v, rho = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, p) + for d in 1:ndims(neighborhood_search) + @inbounds local_neighbor_vrho[d, particleidx] = v[d] + end + @inbounds local_neighbor_vrho[ndims(neighborhood_search) + 1, particleidx] = rho + + m = @inbounds hydrodynamic_mass(neighbor_system, p) + @inbounds local_neighbor_mass[particleidx] = m + pressure = @inbounds current_pressure(v_neighbor_system, neighbor_system, p) + @inbounds local_neighbor_pressure[particleidx] = pressure + end + return n_particles_in_neighbor_cell +end + +# @parallel(block) for cell in cells +# for neighbor_cell in neighboring_cells +# @parallel(thread) for neighbor in neighbor_cell +# copy_coordinates_to_localmem(neighbor) +# +# # Make sure all threads finished the copying +# @synchronize +# +# @parallel(thread) for particle in cell +# for neighbor in neighbor_cell +# # This uses the neighbor coordinates from the local memory +# compute(point, neighbor) +# +# # Make sure all threads finished computing before we continue with copying +# @synchronize +@kernel cpu=false function foreach_neighbor_localmem(dv, system_coords, neighbor_system_coords, + neighborhood_search, cells, ::Val{MAX}, search_radius, + almostzero, particle_system, neighbor_system, + v_particle_system, v_neighbor_system, sound_speed, + density_calculator) where {MAX} + cell_ = @index(Group) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + # Coordinate buffer in local memory + local_points = @localmem Int32 MAX + local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX) + local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, MAX) + local_neighbor_mass = @localmem eltype(system_coords) MAX + local_neighbor_pressure = @localmem eltype(system_coords) MAX + + points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(points) + + # Extract point coordinates if a point lies on this thread + if particleidx <= n_particles_in_current_cell + particle = @inbounds points[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + particle) + + m_a = @inbounds hydrodynamic_mass(particle_system, particle) + p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) + + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + particle) + else + particle = zero(Int32) + point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + + m_a = zero(eltype(system_coords)) + p_a = zero(eltype(system_coords)) + v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + rho_a = zero(eltype(system_coords)) + end + + dv_particle = zero(v_a) + drho_particle = zero(rho_a) + + for neighbor_cell_ in PointNeighbors.neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + + n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx) + + # Make sure all threads finished the copying + @synchronize + + # Now each thread works on one particle again + if particleidx <= n_particles_in_current_cell + for local_neighbor in 1:n_particles_in_neighbor_cell + @inbounds neighbor = local_points[local_neighbor] + @inbounds neighbor_coords = extract_svector(local_neighbor_coords, + Val(ndims(neighborhood_search)), + local_neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if almostzero < distance2 <= search_radius^2 + distance = sqrt(distance2) + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + + m_b = @inbounds local_neighbor_mass[local_neighbor] + p_b = @inbounds local_neighbor_pressure[local_neighbor] + # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1), + # local_neighbor) + # v_b = v_rho_b[1:ndims(neighborhood_search)] + # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3]) + # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1] + (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system, + local_neighbor) + + # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` + # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + # (v_b, + # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + # neighbor) + # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + vdiff = v_a - v_b + + # The following call is equivalent to + # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` + # Only when the neighbor system is a `WallBoundarySystem` + # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`, + # this will return `p_b = p_a`, which is the pressure of the fluid particle. + # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system, + # neighbor, p_a) + + # For `ContinuityDensity` without correction, this is equivalent to + # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel + dv_pressure = pressure_acceleration(particle_system, neighbor_system, + particle, neighbor, + m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff, + distance, grad_kernel, nothing) + + # Accumulate contributions over all neighbors + dv_particle += dv_pressure + + # Propagate `@inbounds` to the viscosity function, which accesses particle data + dv_viscosity_ = Ref(zero(dv_particle)) + @inbounds dv_viscosity(dv_viscosity_, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + dv_particle += dv_viscosity_[] + + # TODO If variable smoothing_length is used, this should use the neighbor smoothing length + # Propagate `@inbounds` to the continuity equation, which accesses particle data + drho_ = Ref(zero(drho_particle)) + @inbounds continuity_equation!(drho_, density_calculator, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) + drho_particle += drho_[] + end + end + end + + # Make sure all threads finished computing before we continue with copying + @synchronize() + end + + if particleidx <= n_particles_in_current_cell + for i in eachindex(dv_particle) + @inbounds dv[i, particle] += dv_particle[i] + end + @inbounds dv[end, particle] += drho_particle + end +end + +# Same as above, but local memory is used for 3 cells at once to reduce the number of synchronizations. +# 3.64 ms +function interact_localmem2!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi) + (; density_calculator, correction) = particle_system + + sound_speed = system_sound_speed(particle_system) + + surface_tension_a = surface_tension_model(particle_system) + surface_tension_b = surface_tension_model(neighbor_system) + + system_coords = current_coordinates(u_particle_system, particle_system) + neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi) + search_radius = PointNeighbors.search_radius(neighborhood_search) + + almostzero = eps(search_radius^2) + + backend = semi.parallelization_backend + # max_particles_per_cell = 64 + # nhs_size = size(neighborhood_search.cell_list.linear_indices) + # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + linear_indices = neighborhood_search.cell_list.linear_indices + cartesian_indices = CartesianIndices(size(linear_indices)) + lengths = Array(neighborhood_search.cell_list.cells.lengths) + max_particles_per_cell = maximum(lengths) + # max_particles_per_cell = 64 + nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices)) + ndrange = max_particles_per_cell * length(nonempty_cells) + kernel = foreach_neighbor_localmem2(backend, Int64(max_particles_per_cell)) + kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, + Val(max_particles_per_cell), search_radius, almostzero, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, + sound_speed, density_calculator; ndrange) + + KernelAbstractions.synchronize(backend) + + return nothing +end + +@inline function copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, offset) + points_view = @inbounds PointNeighbors.points_in_cell(neighbor_cell, neighborhood_search) + n_particles_in_neighbor_cell = length(points_view) + + # First use all threads to load the neighbors into local memory in parallel + if particleidx <= n_particles_in_neighbor_cell + write_idx = particleidx + offset + @inbounds p = local_points[write_idx] = points_view[particleidx] + for d in 1:ndims(neighborhood_search) + @inbounds local_neighbor_coords[d, write_idx] = neighbor_system_coords[d, p] + end + v, rho = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, p) + for d in 1:ndims(neighborhood_search) + @inbounds local_neighbor_vrho[d, write_idx] = v[d] + end + @inbounds local_neighbor_vrho[ndims(neighborhood_search) + 1, write_idx] = rho + + m = @inbounds hydrodynamic_mass(neighbor_system, p) + @inbounds local_neighbor_mass[write_idx] = m + pressure = @inbounds current_pressure(v_neighbor_system, neighbor_system, p) + @inbounds local_neighbor_pressure[write_idx] = pressure + end + return n_particles_in_neighbor_cell +end + +# @parallel(block) for cell in cells +# for neighbor_cell in neighboring_cells +# @parallel(thread) for neighbor in neighbor_cell +# copy_coordinates_to_localmem(neighbor) +# +# # Make sure all threads finished the copying +# @synchronize +# +# @parallel(thread) for particle in cell +# for neighbor in neighbor_cell +# # This uses the neighbor coordinates from the local memory +# compute(point, neighbor) +# +# # Make sure all threads finished computing before we continue with copying +# @synchronize +@kernel cpu=false function foreach_neighbor_localmem2(dv, system_coords, neighbor_system_coords, + neighborhood_search, cells, ::Val{MAX}, search_radius, + almostzero, particle_system, neighbor_system, + v_particle_system, v_neighbor_system, sound_speed, + density_calculator) where {MAX} + cell_ = @index(Group) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + # Coordinate buffer in local memory + local_points = @localmem Int32 3 * MAX + local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 3 * MAX) + local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 3 * MAX) + local_neighbor_mass = @localmem eltype(system_coords) 3 * MAX + local_neighbor_pressure = @localmem eltype(system_coords) 3 * MAX + + points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(points) + + # Extract point coordinates if a point lies on this thread + if particleidx <= n_particles_in_current_cell + particle = @inbounds points[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + particle) + + m_a = @inbounds hydrodynamic_mass(particle_system, particle) + p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) + + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + particle) + else + particle = zero(Int32) + point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + + m_a = zero(eltype(system_coords)) + p_a = zero(eltype(system_coords)) + v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + rho_a = zero(eltype(system_coords)) + end + + dv_particle = zero(v_a) + drho_particle = zero(rho_a) + + for neighbor_cell_block in ((cell[2] - 1, cell[3] - 1), (cell[2], cell[3] - 1), (cell[2] + 1, cell[3] - 1), + (cell[2] - 1, cell[3]), (cell[2], cell[3]), (cell[2] + 1, cell[3]), + (cell[2] - 1, cell[3] + 1), (cell[2], cell[3] + 1), (cell[2] + 1, cell[3] + 1)) + # for neighbor_cell_ in PointNeighbors.neighboring_cells(cell, neighborhood_search) + # neighbor_cell_block = Tuple(neighbor_cell_block_) + neighbor_cells = ((cell[1] - 1, neighbor_cell_block...), (cell[1], neighbor_cell_block...), (cell[1] + 1, neighbor_cell_block...)) + + neighbor_cell = @inbounds neighbor_cells[1] + n_particles_in_neighbor_cell1 = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, 0) + + neighbor_cell = @inbounds neighbor_cells[2] + n_particles_in_neighbor_cell2 = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, MAX) + + neighbor_cell = @inbounds neighbor_cells[3] + n_particles_in_neighbor_cell3 = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, 2 * MAX) + + # Make sure all threads finished the copying + @synchronize + + # Now each thread works on one particle again + if particleidx <= n_particles_in_current_cell + n_particles_in_cells = (n_particles_in_neighbor_cell1, n_particles_in_neighbor_cell2, n_particles_in_neighbor_cell3) + for neighbor_cell_idx in eachindex(neighbor_cells), local_neighbor in 1:n_particles_in_cells[neighbor_cell_idx] + local_neighbor = local_neighbor + (neighbor_cell_idx - 1) * MAX + @inbounds neighbor = local_points[local_neighbor] + @inbounds neighbor_coords = extract_svector(local_neighbor_coords, + Val(ndims(neighborhood_search)), + local_neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if almostzero < distance2 <= search_radius^2 + distance = sqrt(distance2) + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + + m_b = @inbounds local_neighbor_mass[local_neighbor] + p_b = @inbounds local_neighbor_pressure[local_neighbor] + # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1), + # local_neighbor) + # v_b = v_rho_b[1:ndims(neighborhood_search)] + # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3]) + # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1] + (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system, + local_neighbor) + + # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` + # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + # (v_b, + # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + # neighbor) + # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + vdiff = v_a - v_b + + # The following call is equivalent to + # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` + # Only when the neighbor system is a `WallBoundarySystem` + # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`, + # this will return `p_b = p_a`, which is the pressure of the fluid particle. + # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system, + # neighbor, p_a) + + # For `ContinuityDensity` without correction, this is equivalent to + # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel + dv_pressure = pressure_acceleration(particle_system, neighbor_system, + particle, neighbor, + m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff, + distance, grad_kernel, nothing) + + # Accumulate contributions over all neighbors + dv_particle += dv_pressure + + # Propagate `@inbounds` to the viscosity function, which accesses particle data + dv_viscosity_ = Ref(zero(dv_particle)) + @inbounds dv_viscosity(dv_viscosity_, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + dv_particle += dv_viscosity_[] + + # TODO If variable smoothing_length is used, this should use the neighbor smoothing length + # Propagate `@inbounds` to the continuity equation, which accesses particle data + drho_ = Ref(zero(drho_particle)) + @inbounds continuity_equation!(drho_, density_calculator, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) + drho_particle += drho_[] + end + end + end + + # Make sure all threads finished computing before we continue with copying + @synchronize() + end + + if particleidx <= n_particles_in_current_cell + for i in eachindex(dv_particle) + @inbounds dv[i, particle] += dv_particle[i] + end + @inbounds dv[end, particle] += drho_particle + end +end + +# Double-buffered version of localmem2 +# 7.89 ms +function interact_localmem3!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi) + (; density_calculator, correction) = particle_system + + sound_speed = system_sound_speed(particle_system) + + surface_tension_a = surface_tension_model(particle_system) + surface_tension_b = surface_tension_model(neighbor_system) + + system_coords = current_coordinates(u_particle_system, particle_system) + neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi) + search_radius = PointNeighbors.search_radius(neighborhood_search) + + almostzero = eps(search_radius^2) + + backend = semi.parallelization_backend + # max_particles_per_cell = 64 + # nhs_size = size(neighborhood_search.cell_list.linear_indices) + # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + linear_indices = neighborhood_search.cell_list.linear_indices + cartesian_indices = CartesianIndices(size(linear_indices)) + lengths = Array(neighborhood_search.cell_list.cells.lengths) + max_particles_per_cell = maximum(lengths) + # max_particles_per_cell = 64 + nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices)) + ndrange = max_particles_per_cell * length(nonempty_cells) + kernel = foreach_neighbor_localmem3(backend, Int64(max_particles_per_cell)) + kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, + Val(max_particles_per_cell), search_radius, almostzero, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, + sound_speed, density_calculator; ndrange) + + KernelAbstractions.synchronize(backend) + + return nothing +end + +# @parallel(block) for cell in cells +# for neighbor_cell in neighboring_cells +# @parallel(thread) for neighbor in neighbor_cell +# copy_coordinates_to_localmem(neighbor) +# +# # Make sure all threads finished the copying +# @synchronize +# +# @parallel(thread) for particle in cell +# for neighbor in neighbor_cell +# # This uses the neighbor coordinates from the local memory +# compute(point, neighbor) +# +# # Make sure all threads finished computing before we continue with copying +# @synchronize +@kernel cpu=false function foreach_neighbor_localmem3(dv, system_coords, neighbor_system_coords, + neighborhood_search, cells, ::Val{MAX}, search_radius, + almostzero, particle_system, neighbor_system, + v_particle_system, v_neighbor_system, sound_speed, + density_calculator) where {MAX} + cell_ = @index(Group) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + # Coordinate buffer in local memory + local_points = @localmem Int32 3 * MAX + local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 3 * MAX) + local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 3 * MAX) + local_neighbor_mass = @localmem eltype(system_coords) 3 * MAX + local_neighbor_pressure = @localmem eltype(system_coords) 3 * MAX + + next_local_points = @localmem Int32 3 * MAX + next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 3 * MAX) + next_local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 3 * MAX) + next_local_neighbor_mass = @localmem eltype(system_coords) 3 * MAX + next_local_neighbor_pressure = @localmem eltype(system_coords) 3 * MAX + + points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(points) + + # Extract point coordinates if a point lies on this thread + if particleidx <= n_particles_in_current_cell + particle = @inbounds points[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + particle) + + m_a = @inbounds hydrodynamic_mass(particle_system, particle) + p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) + + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + particle) + else + particle = zero(Int32) + point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + + m_a = zero(eltype(system_coords)) + p_a = zero(eltype(system_coords)) + v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + rho_a = zero(eltype(system_coords)) + end + + dv_particle = zero(v_a) + drho_particle = zero(rho_a) + + # Load first cell block into local memory + neighbor_cell_block = @inbounds (cell[2] - 1, cell[3] - 1) + neighbor_cells = @inbounds ((cell[1] - 1, neighbor_cell_block...), (cell[1], neighbor_cell_block...), (cell[1] + 1, neighbor_cell_block...)) + + neighbor_cell = @inbounds neighbor_cells[1] + n_particles_in_neighbor_cell1 = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, 0) + + neighbor_cell = @inbounds neighbor_cells[2] + n_particles_in_neighbor_cell2 = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, MAX) + + neighbor_cell = @inbounds neighbor_cells[3] + n_particles_in_neighbor_cell3 = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, 2 * MAX) + + n_particles_in_cells = (n_particles_in_neighbor_cell1, n_particles_in_neighbor_cell2, n_particles_in_neighbor_cell3) + + @synchronize + + neighbor_cell_blocks = @inbounds ((cell[2] - 1, cell[3] - 1), (cell[2], cell[3] - 1), (cell[2] + 1, cell[3] - 1), + (cell[2] - 1, cell[3]), (cell[2], cell[3]), (cell[2] + 1, cell[3]), + (cell[2] - 1, cell[3] + 1), (cell[2], cell[3] + 1), (cell[2] + 1, cell[3] + 1)) + for neighbor_cell_block_idx in eachindex(neighbor_cell_blocks) + neighbor_cell_block = @inbounds neighbor_cell_blocks[neighbor_cell_block_idx] + neighbor_cells = @inbounds ((cell[1] - 1, neighbor_cell_block...), (cell[1], neighbor_cell_block...), (cell[1] + 1, neighbor_cell_block...)) + + if neighbor_cell_block_idx < length(neighbor_cell_blocks) + neighbor_cell = @inbounds neighbor_cells[1] + n_particles_in_neighbor_cell1 = copy_to_localmem2!(local_points, next_local_neighbor_coords, + next_local_neighbor_vrho, next_local_neighbor_mass, + next_local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, 0) + + neighbor_cell = @inbounds neighbor_cells[2] + n_particles_in_neighbor_cell2 = copy_to_localmem2!(next_local_points, next_local_neighbor_coords, + next_local_neighbor_vrho, next_local_neighbor_mass, + next_local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, MAX) + + neighbor_cell = @inbounds neighbor_cells[3] + n_particles_in_neighbor_cell3 = copy_to_localmem2!(next_local_points, next_local_neighbor_coords, + next_local_neighbor_vrho, next_local_neighbor_mass, + next_local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, 2 * MAX) + + next_n_particles_in_cells = (n_particles_in_neighbor_cell1, n_particles_in_neighbor_cell2, n_particles_in_neighbor_cell3) + end + + # Now each thread works on one particle again + if particleidx <= n_particles_in_current_cell + for neighbor_cell_idx in eachindex(neighbor_cells), local_neighbor in 1:n_particles_in_cells[neighbor_cell_idx] + local_neighbor = local_neighbor + (neighbor_cell_idx - 1) * MAX + @inbounds neighbor = local_points[local_neighbor] + @inbounds neighbor_coords = extract_svector(local_neighbor_coords, + Val(ndims(neighborhood_search)), + local_neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if almostzero < distance2 <= search_radius^2 + distance = sqrt(distance2) + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + + m_b = @inbounds local_neighbor_mass[local_neighbor] + p_b = @inbounds local_neighbor_pressure[local_neighbor] + # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1), + # local_neighbor) + # v_b = v_rho_b[1:ndims(neighborhood_search)] + # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3]) + # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1] + (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system, + local_neighbor) + + # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` + # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + # (v_b, + # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + # neighbor) + # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + vdiff = v_a - v_b + + # The following call is equivalent to + # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` + # Only when the neighbor system is a `WallBoundarySystem` + # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`, + # this will return `p_b = p_a`, which is the pressure of the fluid particle. + # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system, + # neighbor, p_a) + + # For `ContinuityDensity` without correction, this is equivalent to + # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel + dv_pressure = pressure_acceleration(particle_system, neighbor_system, + particle, neighbor, + m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff, + distance, grad_kernel, nothing) + + # Accumulate contributions over all neighbors + dv_particle += dv_pressure + + # Propagate `@inbounds` to the viscosity function, which accesses particle data + dv_viscosity_ = Ref(zero(dv_particle)) + @inbounds dv_viscosity(dv_viscosity_, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + dv_particle += dv_viscosity_[] + + # TODO If variable smoothing_length is used, this should use the neighbor smoothing length + # Propagate `@inbounds` to the continuity equation, which accesses particle data + drho_ = Ref(zero(drho_particle)) + @inbounds continuity_equation!(drho_, density_calculator, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) + drho_particle += drho_[] + end + end + end + + # Make sure all threads finished computing before we continue with copying + @synchronize() + + neighbor_cell_block_idx >= length(neighbor_cell_blocks) && break + + # Swap local memory buffers + n_particles_in_cells = next_n_particles_in_cells + local_points, next_local_points = next_local_points, local_points + local_neighbor_coords, next_local_neighbor_coords = next_local_neighbor_coords, local_neighbor_coords + local_neighbor_vrho, next_local_neighbor_vrho = next_local_neighbor_vrho, local_neighbor_vrho + local_neighbor_mass, next_local_neighbor_mass = next_local_neighbor_mass, local_neighbor_mass + local_neighbor_pressure, next_local_neighbor_pressure = next_local_neighbor_pressure, local_neighbor_pressure + end + + if particleidx <= n_particles_in_current_cell + for i in eachindex(dv_particle) + @inbounds dv[i, particle] += dv_particle[i] + end + @inbounds dv[end, particle] += drho_particle + end +end + + +# Same as localmem2, but local memory is used for 27 cells at once to reduce the number of synchronizations. +# 6.73 ms +function interact_localmem4!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi) + (; density_calculator, correction) = particle_system + + sound_speed = system_sound_speed(particle_system) + + surface_tension_a = surface_tension_model(particle_system) + surface_tension_b = surface_tension_model(neighbor_system) + + system_coords = current_coordinates(u_particle_system, particle_system) + neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) + neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi) + search_radius = PointNeighbors.search_radius(neighborhood_search) + + almostzero = eps(search_radius^2) + + backend = semi.parallelization_backend + # max_particles_per_cell = 64 + # nhs_size = size(neighborhood_search.cell_list.linear_indices) + # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search))) + linear_indices = neighborhood_search.cell_list.linear_indices + cartesian_indices = CartesianIndices(size(linear_indices)) + lengths = Array(neighborhood_search.cell_list.cells.lengths) + max_particles_per_cell = maximum(lengths) + # max_particles_per_cell = 64 + nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices)) + ndrange = max_particles_per_cell * length(nonempty_cells) + kernel = foreach_neighbor_localmem4(backend, Int64(max_particles_per_cell)) + kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, + Val(max_particles_per_cell), search_radius, almostzero, + particle_system, neighbor_system, v_particle_system, v_neighbor_system, + sound_speed, density_calculator; ndrange) + + KernelAbstractions.synchronize(backend) + + return nothing +end + +# @parallel(block) for cell in cells +# for neighbor_cell in neighboring_cells +# @parallel(thread) for neighbor in neighbor_cell +# copy_coordinates_to_localmem(neighbor) +# +# # Make sure all threads finished the copying +# @synchronize +# +# @parallel(thread) for particle in cell +# for neighbor in neighbor_cell +# # This uses the neighbor coordinates from the local memory +# compute(point, neighbor) +# +# # Make sure all threads finished computing before we continue with copying +# @synchronize +@kernel cpu=false function foreach_neighbor_localmem4(dv, system_coords, neighbor_system_coords, + neighborhood_search, cells, ::Val{MAX}, search_radius, + almostzero, particle_system, neighbor_system, + v_particle_system, v_neighbor_system, sound_speed, + density_calculator) where {MAX} + cell_ = @index(Group) + cell = @inbounds Tuple(cells[cell_]) + particleidx = @index(Local) + @assert 1 <= particleidx <= MAX + + # Coordinate buffer in local memory + local_points = @localmem Int32 27 * MAX + local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 27 * MAX) + local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 27 * MAX) + local_neighbor_mass = @localmem eltype(system_coords) 27 * MAX + local_neighbor_pressure = @localmem eltype(system_coords) 27 * MAX + local_n_particles_in_neighbor_cell = @localmem Int32 27 + + points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search) + n_particles_in_current_cell = length(points) + + # Extract point coordinates if a point lies on this thread + if particleidx <= n_particles_in_current_cell + particle = @inbounds points[particleidx] + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + particle) + + m_a = @inbounds hydrodynamic_mass(particle_system, particle) + p_a = @inbounds current_pressure(v_particle_system, particle_system, particle) + + # In 3D, this function can combine velocity and density load into one wide load, + # which gives a significant speedup on GPUs. + (v_a, + rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system, + particle) + else + particle = zero(Int32) + point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + + m_a = zero(eltype(system_coords)) + p_a = zero(eltype(system_coords)) + v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)}) + rho_a = zero(eltype(system_coords)) + end + + dv_particle = zero(v_a) + drho_particle = zero(rho_a) + + neighboring_cells = PointNeighbors.neighboring_cells(cell, neighborhood_search) + for neighbor_cell_idx in 1:length(neighboring_cells) + neighbor_cell = @inbounds Tuple(neighboring_cells[neighbor_cell_idx]) + n_particles_in_neighbor_cell = copy_to_localmem2!(local_points, local_neighbor_coords, + local_neighbor_vrho, local_neighbor_mass, + local_neighbor_pressure, + v_neighbor_system, neighbor_system, + neighbor_cell, neighbor_system_coords, + neighborhood_search, particleidx, (neighbor_cell_idx - 1) * MAX) + @inbounds local_n_particles_in_neighbor_cell[neighbor_cell_idx] = n_particles_in_neighbor_cell + end + + # Make sure all threads finished the copying + @synchronize + + # Now each thread works on one particle again + if particleidx <= n_particles_in_current_cell + for neighbor_cell_idx in 1:length(neighboring_cells) + n_cells = @inbounds local_n_particles_in_neighbor_cell[neighbor_cell_idx] + for local_neighbor in 1:n_cells + local_neighbor = local_neighbor + (neighbor_cell_idx - 1) * MAX + @inbounds neighbor = local_points[local_neighbor] + @inbounds neighbor_coords = extract_svector(local_neighbor_coords, + Val(ndims(neighborhood_search)), + local_neighbor) + + pos_diff = point_coords - neighbor_coords + distance2 = dot(pos_diff, pos_diff) + + # TODO periodic + + if almostzero < distance2 <= search_radius^2 + distance = sqrt(distance2) + + # Now that we know that `distance` is not zero, we can safely call the unsafe + # version of the kernel gradient to avoid redundant zero checks. + grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff, + distance, particle) + + m_b = @inbounds local_neighbor_mass[local_neighbor] + p_b = @inbounds local_neighbor_pressure[local_neighbor] + # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1), + # local_neighbor) + # v_b = v_rho_b[1:ndims(neighborhood_search)] + # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3]) + # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1] + (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system, + local_neighbor) + + # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system` + # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor) + # (v_b, + # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, + # neighbor) + # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor) + # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor) + vdiff = v_a - v_b + + # The following call is equivalent to + # `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)` + # Only when the neighbor system is a `WallBoundarySystem` + # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`, + # this will return `p_b = p_a`, which is the pressure of the fluid particle. + # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system, + # neighbor, p_a) + + # For `ContinuityDensity` without correction, this is equivalent to + # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel + dv_pressure = pressure_acceleration(particle_system, neighbor_system, + particle, neighbor, + m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff, + distance, grad_kernel, nothing) + + # Accumulate contributions over all neighbors + dv_particle += dv_pressure + + # Propagate `@inbounds` to the viscosity function, which accesses particle data + dv_viscosity_ = Ref(zero(dv_particle)) + @inbounds dv_viscosity(dv_viscosity_, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, + v_a, v_b, grad_kernel) + dv_particle += dv_viscosity_[] + + # TODO If variable smoothing_length is used, this should use the neighbor smoothing length + # Propagate `@inbounds` to the continuity equation, which accesses particle data + drho_ = Ref(zero(drho_particle)) + @inbounds continuity_equation!(drho_, density_calculator, + particle_system, neighbor_system, + v_particle_system, v_neighbor_system, + particle, neighbor, pos_diff, distance, + m_b, rho_a, rho_b, vdiff, grad_kernel) + drho_particle += drho_[] + end + end + end + end + + if particleidx <= n_particles_in_current_cell + for i in eachindex(dv_particle) + @inbounds dv[i, particle] += dv_particle[i] + end + @inbounds dv[end, particle] += drho_particle + end +end + +@propagate_inbounds function neighbor_pressure(v_neighbor_system, neighbor_system, + neighbor, p_a) + return current_pressure(v_neighbor_system, neighbor_system, neighbor) +end + +@inline function neighbor_pressure(v_neighbor_system, + neighbor_system::WallBoundarySystem{<:BoundaryModelDummyParticles{PressureMirroring}}, + neighbor, p_a) + return p_a +end + +@propagate_inbounds function velocity_and_density(v, system, particle) + # For other systems, fall back to the default implementation + return velocity_and_density(v, nothing, system, particle) +end + +@propagate_inbounds function velocity_and_density(v, system::WeaklyCompressibleSPHSystem, + particle) + (; density_calculator) = system + + return velocity_and_density(v, density_calculator, system, particle) +end + +@propagate_inbounds function velocity_and_density(v, _, system, particle) + v_particle = current_velocity(v, system, particle) + rho_particle = current_density(v, system, particle) + + return v_particle, rho_particle end -@inline function particle_neighbor_pressure(v_particle_system, v_neighbor_system, - particle_system, - neighbor_system::WallBoundarySystem{<:BoundaryModelDummyParticles{PressureMirroring}}, - particle, neighbor) - p_a = current_pressure(v_particle_system, particle_system, particle) +@inline function velocity_and_density(v, ::ContinuityDensity, + ::WeaklyCompressibleSPHSystem{3}, particle) + vrho_a = vloada(Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1)) + a, b, c, d = Tuple(vrho_a) + v_particle = SVector(a, b, c) + rho_particle = d - return p_a, p_a + return v_particle, rho_particle end diff --git a/src/schemes/structure/rigid_body/system.jl b/src/schemes/structure/rigid_body/system.jl index c9281aa5f9..f3c8943ec6 100644 --- a/src/schemes/structure/rigid_body/system.jl +++ b/src/schemes/structure/rigid_body/system.jl @@ -282,23 +282,26 @@ function calc_normal!(system::AbstractFluidSystem, semi, surface_normal_method) end -@inline function adhesion_force(surface_tension::AkinciTypeSurfaceTension, - particle_system::AbstractFluidSystem, - neighbor_system::RigidBodySystem, - particle, neighbor, pos_diff, distance) +@inline function adhesion_force!(dv_particle, + surface_tension::AkinciTypeSurfaceTension, + particle_system::AbstractFluidSystem, + neighbor_system::RigidBodySystem, + particle, neighbor, pos_diff, distance) (; adhesion_coefficient) = neighbor_system # No adhesion with oneself. See `src/general/smoothing_kernels.jl` for more details. - distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff) + distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle - abs(adhesion_coefficient) < eps() && return zero(pos_diff) + abs(adhesion_coefficient) < eps() && return dv_particle m_b = hydrodynamic_mass(neighbor_system, neighbor) support_radius = compact_support(system_smoothing_kernel(particle_system), smoothing_length(particle_system, particle)) - return adhesion_force_akinci(surface_tension, support_radius, m_b, pos_diff, distance, - adhesion_coefficient) + dv_particle[] += adhesion_force_akinci(surface_tension, support_radius, m_b, + pos_diff, distance, adhesion_coefficient) + + return dv_particle end function write_u0!(u0, system::RigidBodySystem) @@ -506,9 +509,11 @@ end return neighbor_system.boundary_model.viscosity end -@inline function viscous_velocity(v, system::RigidBodySystem, particle) - # This function is only used in fluid-structure interaction, so it is never called when `boundary_model` is `nothing` - return viscous_velocity(v, system.boundary_model.viscosity, system, particle) +@inline function viscous_velocity(v, system::RigidBodySystem, particle, v_particle) + # This function is only used in fluid-structure interaction, + # so it is never called when `boundary_model` is `nothing` + return viscous_velocity(v, system.boundary_model.viscosity, system, + particle, v_particle) end @inline acceleration_source(system::RigidBodySystem) = system.acceleration diff --git a/src/schemes/structure/structure.jl b/src/schemes/structure/structure.jl index 1ac304d79f..0224760ad0 100644 --- a/src/schemes/structure/structure.jl +++ b/src/schemes/structure/structure.jl @@ -46,6 +46,9 @@ function interact_structure_fluid!(dv, v_particle_system, rho_a = current_density(v_particle_system, particle_system, particle) rho_b = current_density(v_neighbor_system, neighbor_system, neighbor) + v_a = current_velocity(v_particle_system, particle_system, particle) + v_b = current_velocity(v_neighbor_system, neighbor_system, neighbor) + surface_tension = surface_tension_model(neighbor_system) # In fluid-structure interaction, use the "hydrodynamic mass" of the structure particles @@ -66,17 +69,17 @@ function interact_structure_fluid!(dv, v_particle_system, pos_diff, distance, grad_kernel, system_correction(neighbor_system)) - dv_viscosity_ = dv_viscosity(neighbor_system, particle_system, - v_neighbor_system, v_particle_system, - neighbor, particle, pos_diff, distance, - sound_speed, m_b, m_a, rho_b, rho_a, grad_kernel) - - dv_adhesion = adhesion_force(surface_tension, neighbor_system, particle_system, - neighbor, particle, pos_diff, distance) + dv_particle = Ref(zero(pos_diff)) + dv_viscosity(dv_particle, neighbor_system, particle_system, + v_neighbor_system, v_particle_system, + neighbor, particle, pos_diff, distance, + sound_speed, m_b, m_a, rho_b, rho_a, + v_a, v_b, grad_kernel) - dv_fs = dv_boundary + dv_viscosity_ + dv_adhesion + adhesion_force!(dv_particle, surface_tension, neighbor_system, particle_system, + neighbor, particle, pos_diff, distance) - accumulate_structure_fluid_pair!(dv, dv_fs, particle_system, particle, m_b) + accumulate_structure_fluid_pair!(dv, dv_particle[], particle_system, particle, m_b) continuity_equation!(dv, v_particle_system, v_neighbor_system, particle, neighbor, pos_diff, distance, diff --git a/src/schemes/structure/total_lagrangian_sph/system.jl b/src/schemes/structure/total_lagrangian_sph/system.jl index cac4ed1f12..37b86ed85c 100644 --- a/src/schemes/structure/total_lagrangian_sph/system.jl +++ b/src/schemes/structure/total_lagrangian_sph/system.jl @@ -333,9 +333,11 @@ end return current_density(v, system.boundary_model, system) end -@inline function viscous_velocity(v, system::TotalLagrangianSPHSystem, particle) - # This function is only used in fluid-structure interaction, so it is never called when `boundary_model` is `nothing` - return viscous_velocity(v, system.boundary_model.viscosity, system, particle) +@inline function viscous_velocity(v, system::TotalLagrangianSPHSystem, particle, v_particle) + # This function is only used in fluid-structure interaction, + # so it is never called when `boundary_model` is `nothing` + return viscous_velocity(v, system.boundary_model.viscosity, system, + particle, v_particle) end # In fluid-structure interaction, use the "hydrodynamic pressure" of the structure particles diff --git a/src/util.jl b/src/util.jl index 855fe50765..682514563f 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,3 +1,9 @@ +# By default, use `div_fast` of `Base.FastMath`. +# In the `TrixiParticlesCUDAExt` extension, this is redefined for `Float64`. +@inline function div_fast(x, y) + return Base.FastMath.div_fast(x, y) +end + # Same as `foreach`, but it is unrolled by the compiler for small input tuples @inline function foreach_noalloc(func, collection) element = first(collection) diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl index 44772f8943..a08f207dd4 100644 --- a/test/examples/gpu.jl +++ b/test/examples/gpu.jl @@ -279,10 +279,8 @@ end fluid_density_calculator=SummationDensity(), maxiters=38, # 38 time steps on CPU clip_negative_pressure=true), - # Broken due to https://github.com/JuliaGPU/CUDA.jl/issues/2681 - # and https://github.com/JuliaGPU/Metal.jl/issues/550. - # "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1, - # smoothing_kernel=SchoenbergQuarticSplineKernel{2}()), + "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1, + smoothing_kernel=SchoenbergQuarticSplineKernel{2}()), "WCSPH with SchoenbergQuinticSplineKernel" => (smoothing_length=1.1, smoothing_kernel=SchoenbergQuinticSplineKernel{2}()), "WCSPH with WendlandC2Kernel" => (smoothing_length=1.5, diff --git a/test/general/density_calculator.jl b/test/general/density_calculator.jl index 2ded394766..6bf895f7be 100644 --- a/test/general/density_calculator.jl +++ b/test/general/density_calculator.jl @@ -1,5 +1,3 @@ -using OrdinaryDiffEq - # Setup a single particle and calculate its density @testset verbose=true "DensityCalculators" begin @testset verbose=true "SummationDensity" begin diff --git a/test/general/smoothing_kernels.jl b/test/general/smoothing_kernels.jl index fdd7f9a142..1acc72c554 100644 --- a/test/general/smoothing_kernels.jl +++ b/test/general/smoothing_kernels.jl @@ -1,11 +1,13 @@ @testset verbose=true "Smoothing Kernels" begin # Don't show all kernel tests in the final overview @testset verbose=false "Integral" begin - # All smoothing kernels should integrate to something close to 1 + # All smoothing kernels should integrate to something close to 1. + # We integrate slightly beyond the compact support to verify that the kernel is + # correctly evaluating to zero there. function integrate_kernel_2d(smk) integral_2d_radial, _ = quadgk(r -> r * TrixiParticles.kernel(smk, r, 1.0), 0, - TrixiParticles.compact_support(smk, 1.0), + TrixiParticles.compact_support(smk, 1.0) * 1.1, rtol=1e-15) return 2 * pi * integral_2d_radial end @@ -13,7 +15,7 @@ function integrate_kernel_3d(smk) integral_3d_radial, _ = quadgk(r -> r^2 * TrixiParticles.kernel(smk, r, 1.0), 0, - TrixiParticles.compact_support(smk, 1.0), + TrixiParticles.compact_support(smk, 1.0) * 1.1, rtol=1e-15) return 4 * pi * integral_3d_radial end @@ -70,13 +72,19 @@ # Test 4 different smoothing lengths smoothing_lengths = 0.25:0.25:1 - @testset "$kernel_type" for kernel_type in kernels - for ndims in 2:3 + for ndims in 2:3 + @testset "$kernel_type{$ndims}" for kernel_type in kernels kernel_ = kernel_type{ndims}() for h in smoothing_lengths + compact_support_ = TrixiParticles.compact_support(kernel_, h) # Test 11 different radii - radii = 0:(0.1h):(h + eps()) + radii = 0:(0.1 * compact_support_):(compact_support_ * 1.01) + + if kernel_ isa SpikyKernel + # The Spiky kernel is not differentiable at r=0 + radii = radii[2:end] + end for r in radii # Automatic differentation of `kernel` diff --git a/test/schemes/fluid/viscosity.jl b/test/schemes/fluid/viscosity.jl index 0e359ef5f1..ca50e6d5c1 100644 --- a/test/schemes/fluid/viscosity.jl +++ b/test/schemes/fluid/viscosity.jl @@ -25,12 +25,13 @@ grad_kernel = TrixiParticles.smoothing_kernel_grad(system_wcsph, pos_diff, distance, 1) - dv = viscosity(sound_speed, v_diff, pos_diff, distance, - rho_mean, rho_a, rho_b, smoothing_length, - grad_kernel, 0.0, 0.0) + dv = Ref(zero(v_diff)) + viscosity(dv, sound_speed, v_diff, pos_diff, distance, + rho_mean, rho_a, rho_b, smoothing_length, + grad_kernel, 0.0, 0.0, 1.0) - @test isapprox(dv[1], -0.02049217623299368, atol=6e-15) - @test isapprox(dv[2], 0.03073826434949052, atol=6e-15) + @test isapprox(dv[][1], -0.02049217623299368, atol=6e-15) + @test isapprox(dv[][2], 0.03073826434949052, atol=6e-15) end @testset verbose=true "`ViscosityMorris`" begin nu = 7e-3 @@ -51,12 +52,16 @@ v[1, 2] = 0.0 v[2, 2] = 0.0 - dv = viscosity(system_wcsph, system_wcsph, - v, v, 1, 2, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + v_a = v[:, 1] + v_b = v[:, 2] - @test isapprox(dv[1], -1.0895602048035404e-5, atol=6e-15) - @test isapprox(dv[2], 3.631867349345135e-5, atol=6e-15) + dv = Ref(zero(pos_diff)) + viscosity(dv, system_wcsph, system_wcsph, + v, v, 1, 2, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) + + @test isapprox(dv[][1], -1.0895602048035404e-5, atol=6e-15) + @test isapprox(dv[][2], 3.631867349345135e-5, atol=6e-15) end @testset verbose=true "`ViscosityAdami`" begin viscosity = ViscosityAdami(nu=7e-3) @@ -76,12 +81,16 @@ v[1, 2] = 0.0 v[2, 2] = 0.0 - dv = viscosity(system_wcsph, system_wcsph, - v, v, 1, 2, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + v_a = v[:, 1] + v_b = v[:, 2] + + dv = Ref(zero(pos_diff)) + viscosity(dv, system_wcsph, system_wcsph, + v, v, 1, 2, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) - @test isapprox(dv[1], -1.089560204803541e-5, atol=6e-15) - @test isapprox(dv[2], 3.6318673493451364e-5, atol=6e-15) + @test isapprox(dv[][1], -1.089560204803541e-5, atol=6e-15) + @test isapprox(dv[][2], 3.6318673493451364e-5, atol=6e-15) end @testset verbose=true "`ViscosityMorrisSGS`" begin nu = 7e-3 @@ -103,12 +112,16 @@ v[1, 2] = 0.0 v[2, 2] = 0.0 - dv = viscosity(system_wcsph, system_wcsph, - v, v, 1, 2, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + v_a = v[:, 1] + v_b = v[:, 2] + + dv = Ref(zero(pos_diff)) + viscosity(dv, system_wcsph, system_wcsph, + v, v, 1, 2, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) - @test isapprox(dv[1], -2.032835697804103e-5, atol=6e-15) - @test isapprox(dv[2], 6.776118992680343e-5, atol=6e-15) + @test isapprox(dv[][1], -2.032835697804103e-5, atol=6e-15) + @test isapprox(dv[][2], 6.776118992680343e-5, atol=6e-15) end @testset verbose=true "`ViscosityAdamiSGS`" begin viscosity = ViscosityAdamiSGS(nu=7e-3) @@ -128,12 +141,16 @@ v[1, 2] = 0.0 v[2, 2] = 0.0 - dv = viscosity(system_wcsph, system_wcsph, - v, v, 1, 2, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + v_a = v[:, 1] + v_b = v[:, 2] - @test isapprox(dv[1], -2.0328356978041036e-5, atol=6e-15) - @test isapprox(dv[2], 6.776118992680346e-5, atol=6e-15) + dv = Ref(zero(pos_diff)) + viscosity(dv, system_wcsph, system_wcsph, + v, v, 1, 2, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) + + @test isapprox(dv[][1], -2.0328356978041036e-5, atol=6e-15) + @test isapprox(dv[][2], 6.776118992680346e-5, atol=6e-15) end @testset verbose=true "`ViscosityCarreauYasuda`" begin @@ -161,12 +178,16 @@ grad_kernel = TrixiParticles.smoothing_kernel_grad(system_wcsph, pos_diff, distance, 1) - dv = viscosity(system_wcsph, system_wcsph, - v, v, 1, 2, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + v_a = v[:, 1] + v_b = v[:, 2] + + dv = Ref(zero(pos_diff)) + viscosity(dv, system_wcsph, system_wcsph, + v, v, 1, 2, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) - @test isapprox(dv[1], -1.0895602048035410e-5, atol=6e-15) - @test isapprox(dv[2], 3.6318673493451364e-5, atol=6e-15) + @test isapprox(dv[][1], -1.0895602048035410e-5, atol=6e-15) + @test isapprox(dv[][2], 3.6318673493451364e-5, atol=6e-15) # ------------------------------------------------------------------ # 2) Shear-thinning case: fixed (precomputed) values @@ -183,11 +204,15 @@ grad_kernel = TrixiParticles.smoothing_kernel_grad(system_wcsph, pos_diff, distance, 1) - dv = viscosity(system_wcsph, system_wcsph, - v, v, 1, 2, pos_diff, distance, - sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel) + v_a = v[:, 1] + v_b = v[:, 2] + + dv = Ref(zero(pos_diff)) + viscosity(dv, system_wcsph, system_wcsph, + v, v, 1, 2, pos_diff, distance, + sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel) - @test isapprox(dv[1], -5.33743497379846e-9, atol=6e-15) - @test isapprox(dv[2], 1.7791449912661534e-8, atol=6e-15) + @test isapprox(dv[][1], -5.33743497379846e-9, atol=6e-15) + @test isapprox(dv[][2], 1.7791449912661534e-8, atol=6e-15) end end diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl index 69f4a920c4..ed677d57bf 100644 --- a/test/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl @@ -96,7 +96,10 @@ function TrixiParticles.add_acceleration!(_, _, ::MockSystem) return nothing end - TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) = kernel_deriv + function TrixiParticles.smoothing_kernel_grad(::MockSystem, + pos_diff, distance, particle) + return kernel_deriv * pos_diff / distance + end TrixiParticles.compact_support(::MockSystem, ::MockSystem) = 1000.0 function TrixiParticles.current_coords(system::MockSystem, particle) return TrixiParticles.current_coords(initial_coordinates, system, particle) diff --git a/test/systems/tlsph_system.jl b/test/systems/tlsph_system.jl index 1e7b1edff5..3d5c2ff570 100644 --- a/test/systems/tlsph_system.jl +++ b/test/systems/tlsph_system.jl @@ -184,10 +184,10 @@ Base.getindex(::Val{:mock_material_density}, ::Int64) = density - function TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) - return kernel_derivative + function TrixiParticles.smoothing_kernel_grad(::Val{:mock_system_tensor}, + pos_diff, distance, particle) + return kernel_derivative * pos_diff / distance end - Base.eps(::Type{Val{:mock_smoothing_length}}) = eps() semi = DummySemidiscretization() # Compute deformation gradient