trixi-framework · efaulhaber · Apr 1, 2026 · Mar 23, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/Project.toml b/Project.toml
@@ -26,6 +26,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -37,14 +38,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 [weakdeps]
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
+TrixiParticlesCUDAExt = "CUDA"
 
 [compat]
 Accessors = "0.1.43"
 Adapt = "4"
 CSV = "0.10"
+CUDA = "5.9.1"
 DataFrames = "1.6"
 DelimitedFiles = "1"
 DiffEqCallbacks = "4"
@@ -62,6 +66,7 @@ Polyester = "0.7.10"
 ReadVTK = "0.2"
 RecipesBase = "1"
 Reexport = "1"
+SIMD = "3.7.2"
 SciMLBase = "2"
 StaticArrays = "1"
 Statistics = "1"

diff --git a/docs/literate/src/tut_custom_kernel.jl b/docs/literate/src/tut_custom_kernel.jl
@@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end
 
 # By looking at the implementation of existing kernels in TrixiParticles.jl,
 # we can see that a kernel implementation requires three functions.
-# `TrixiParticles.kernel`, which is the kernel function itself,
-# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function,
-# and `TrixiParticles.compact_support`, which defines the compact support of the
-# kernel in relation to the smoothing length.
+# `TrixiParticles.kernel_unsafe`, which is the kernel function itself,
+# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the
+# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines
+# the compact support of the kernel in relation to the smoothing length.
 # The latter is relevant for determining the search radius of the neighborhood search.
-function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h)
+#
+# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since
+# this avoids an extra division in the hot loop and is robust near ``r=0``.
+# The public function `TrixiParticles.kernel_deriv` is defined automatically by
+# TrixiParticles from this method (and multiplies by ``r`` again when needed).
+# In the unsafe functions, we do not check the compact support; this is handled in the
+# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the
+# compact support defined in `TrixiParticles.compact_support`.
+function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * exp(-q^2)
-    end
-
-    return 0.0
+    return 1 / (pi * h^2) * exp(-q^2)
 end
 
-function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h)
+function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
-    end
-
-    return 0.0
+    kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
+    return kernel_deriv / r
 end
 
 TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h

diff --git a/ext/TrixiParticlesCUDAExt.jl b/ext/TrixiParticlesCUDAExt.jl
@@ -0,0 +1,32 @@
+module TrixiParticlesCUDAExt
+
+using CUDA: CUDA
+using TrixiParticles: TrixiParticles
+
+# Use faster version of `div_fast` for `Float64` on CUDA.
+# By default, `div_fast` translates to `Base.FastMath.div_fast`, but there is
+# no fast division for `Float64` on CUDA, so we need to redefine it here to use the
+# improved fast reciprocal defined below.
+CUDA.@device_override TrixiParticles.div_fast(x, y::Float64) = x * fast_inv_cuda(y)
+
+# Improved fast reciprocal for `Float64` by @Mikolaj-A-Kowalski, which is significantly
+# more accurate than just calling "llvm.nvvm.rcp.approx.ftz.d" without the cubic iteration,
+# while still being much faster than a full division.
+# This is copied from Oceananigans.jl, see https://github.com/CliMA/Oceananigans.jl/pull/5140.
+@inline function fast_inv_cuda(a::Float64)
+    # Get the approximate reciprocal
+    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
+    # This instruction chops off last 32bits of mantissa and computes inverse
+    # while treating all subnormal numbers as 0.0
+    # If reciprocal would be subnormal, underflows to 0.0
+    # 32 least significant bits of the result are filled with 0s
+    inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a)
+
+    # Approximate the missing 32bits of mantissa with a single cubic iteration
+    e = fma(inv_a, -a, 1.0)
+    e = fma(e, e, e)
+    inv_a = fma(e, inv_a, inv_a)
+    return inv_a
+end
+
+end # module
diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
@@ -15,7 +15,7 @@ using FileIO: FileIO
 using ForwardDiff: ForwardDiff
 using GPUArraysCore: AbstractGPUArray
 using JSON: JSON
-using KernelAbstractions: KernelAbstractions, @kernel, @index
+using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize
 using LinearAlgebra: norm, normalize, cross, dot, I, tr, inv, pinv, det
 using MuladdMacro: @muladd
 using Polyester: Polyester, @batch
@@ -26,6 +26,7 @@ using Random: seed!
 using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
                  get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
                  add_tstop!
+using SIMD: vloada, Vec
 @reexport using StaticArrays: SVector
 using StaticArrays: @SMatrix, SMatrix, setindex
 using Statistics: Statistics

diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
@@ -130,10 +130,30 @@ end
     return kernel(smoothing_kernel, distance, smoothing_length(system, particle))
 end
 
+@inline function skip_zero_distance(system, distance, almostzero)
+    return skip_zero_distance(system_correction(system), system, distance, almostzero)
+end
+
+# Robust/safe version of the function below. In performance-critical code, manually check
+# the kernel support, call `skip_zero_distance` and then `smoothing_kernel_grad_unsafe`.
 @inline function smoothing_kernel_grad(system, pos_diff, distance, particle)
-    return corrected_kernel_grad(system_smoothing_kernel(system), pos_diff,
-                                 distance, smoothing_length(system, particle),
-                                 system_correction(system), system, particle)
+    h = smoothing_length(system, particle)
+    compact_support_ = compact_support(system_smoothing_kernel(system), h)
+
+    # Note that `sqrt(eps(h^2)) != eps(h)`
+    if distance > compact_support_ || skip_zero_distance(system, distance, sqrt(eps(h^2)))
+        return zero(pos_diff)
+    end
+
+    return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+end
+
+# Skip the zero distance and compact support checks for maximum performance.
+# Only call this when you are sure that `0 < distance < compact_support`.
+@inline function smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+    return corrected_kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff,
+                                        distance, smoothing_length(system, particle),
+                                        system_correction(system), system, particle)
 end
 
 # System updates do nothing by default, but can be dispatched if needed

diff --git a/src/general/corrections.jl b/src/general/corrections.jl
@@ -35,15 +35,16 @@ end
 # `rho_mean` is the mean density of the fluid, which is used to determine correction values near the free surface.
 #  Return a tuple `(viscosity_correction, pressure_correction, surface_tension_correction)` representing the correction terms.
 @inline function free_surface_correction(correction::AkinciFreeSurfaceCorrection,
-                                         particle_system, rho_mean)
+                                         particle_system, rho_a, rho_b)
     # Equation 4 in ref
+    rho_mean = (rho_a + rho_b) / 2
     k = correction.rho0 / rho_mean
 
     # Viscosity, pressure, surface_tension
     return k, 1, k
 end
 
-@inline function free_surface_correction(correction, particle_system, rho_mean)
+@inline function free_surface_correction(correction, particle_system, rho_a, rho_b)
     return 1, 1, 1
 end
 
@@ -337,31 +338,36 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system,
         v_neighbor_system = wrap_v(v_ode, neighbor_system, semi)
 
         neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+        almostzero = sqrt(eps(compact_support(system, neighbor_system)^2))
 
         foreach_point_neighbor(system, neighbor_system, coordinates, neighbor_coords,
                                semi) do particle, neighbor, pos_diff, distance
-            volume = hydrodynamic_mass(neighbor_system, neighbor) /
-                     current_density(v_neighbor_system, neighbor_system, neighbor)
-            smoothing_length_ = smoothing_length(system, particle)
-
-            function compute_grad_kernel(correction, smoothing_kernel, pos_diff, distance,
-                                         smoothing_length_, system, particle)
-                return smoothing_kernel_grad(system, pos_diff, distance, particle)
+            function kernel_grad_local(correction, smoothing_kernel, pos_diff, distance,
+                                       smoothing_length_, system, particle)
+                return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
             end
 
             # Compute gradient of corrected kernel
-            function compute_grad_kernel(correction::MixedKernelGradientCorrection,
-                                         smoothing_kernel, pos_diff, distance,
-                                         smoothing_length_, system, particle)
-                return corrected_kernel_grad(smoothing_kernel, pos_diff, distance,
-                                             smoothing_length_, KernelCorrection(), system,
-                                             particle)
+            function kernel_grad_local(correction::MixedKernelGradientCorrection,
+                                       smoothing_kernel, pos_diff, distance,
+                                       smoothing_length_, system, particle)
+                return corrected_kernel_grad_unsafe(smoothing_kernel, pos_diff, distance,
+                                                    smoothing_length_, KernelCorrection(),
+                                                    system, particle)
             end
 
-            grad_kernel = compute_grad_kernel(correction, smoothing_kernel, pos_diff,
-                                              distance, smoothing_length_, system, particle)
+            # Skip neighbors with the same position if the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(correction, system, distance, almostzero) && return
 
-            iszero(grad_kernel) && return
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            smoothing_length_ = smoothing_length(system, particle)
+            grad_kernel = kernel_grad_local(correction, smoothing_kernel, pos_diff,
+                                            distance, smoothing_length_, system, particle)
+
+            volume = hydrodynamic_mass(neighbor_system, neighbor) /
+                     current_density(v_neighbor_system, neighbor_system, neighbor)
 
             L = volume * grad_kernel * pos_diff'
 

diff --git a/src/general/interpolation.jl b/src/general/interpolation.jl
@@ -593,7 +593,9 @@ end
                 other_density[point] += m_b * W_ab
 
                 if include_wall_velocity
-                    velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor)
+                    velocity_neighbor_ = current_velocity(v, neighbor_system, neighbor)
+                    velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor,
+                                                         velocity_neighbor_)
                     for i in axes(velocity_neighbor, 1)
                         cache.velocity[i, point] += velocity_neighbor[i] * volume_b * W_ab
                     end