diff --git a/Project.toml b/Project.toml
index 2b0ea58c92..b2bbe0b3b8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,6 +26,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -37,14 +38,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 [weakdeps]
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
+TrixiParticlesCUDAExt = "CUDA"
 
 [compat]
 Accessors = "0.1.43"
 Adapt = "4"
 CSV = "0.10"
+CUDA = "5.9.1"
 DataFrames = "1.6"
 DelimitedFiles = "1"
 DiffEqCallbacks = "4"
@@ -62,6 +66,7 @@ Polyester = "0.7.10"
 ReadVTK = "0.2"
 RecipesBase = "1"
 Reexport = "1"
+SIMD = "3.7.2"
 SciMLBase = "2"
 StaticArrays = "1"
 Statistics = "1"
diff --git a/docs/literate/src/tut_custom_kernel.jl b/docs/literate/src/tut_custom_kernel.jl
index 25a3bed683..053f0c74a0 100644
--- a/docs/literate/src/tut_custom_kernel.jl
+++ b/docs/literate/src/tut_custom_kernel.jl
@@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end
 
 # By looking at the implementation of existing kernels in TrixiParticles.jl,
 # we can see that a kernel implementation requires three functions.
-# `TrixiParticles.kernel`, which is the kernel function itself,
-# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function,
-# and `TrixiParticles.compact_support`, which defines the compact support of the
-# kernel in relation to the smoothing length.
+# `TrixiParticles.kernel_unsafe`, which is the kernel function itself,
+# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the
+# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines
+# the compact support of the kernel in relation to the smoothing length.
 # The latter is relevant for determining the search radius of the neighborhood search.
-function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h)
+#
+# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since
+# this avoids an extra division in the hot loop and is robust near ``r=0``.
+# The public function `TrixiParticles.kernel_deriv` is defined automatically by
+# TrixiParticles from this method (and multiplies by ``r`` again when needed).
+# In the unsafe functions, we do not check the compact support; this is handled in the
+# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the
+# compact support defined in `TrixiParticles.compact_support`.
+function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * exp(-q^2)
-    end
-
-    return 0.0
+    return 1 / (pi * h^2) * exp(-q^2)
 end
 
-function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h)
+function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
-    end
-
-    return 0.0
+    kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
+    return kernel_deriv / r
 end
 
 TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h
diff --git a/ext/TrixiParticlesCUDAExt.jl b/ext/TrixiParticlesCUDAExt.jl
new file mode 100644
index 0000000000..c747bde3ef
--- /dev/null
+++ b/ext/TrixiParticlesCUDAExt.jl
@@ -0,0 +1,32 @@
+module TrixiParticlesCUDAExt
+
+using CUDA: CUDA
+using TrixiParticles: TrixiParticles
+
+# Use faster version of `div_fast` for `Float64` on CUDA.
+# By default, `div_fast` translates to `Base.FastMath.div_fast`, but there is
+# no fast division for `Float64` on CUDA, so we need to redefine it here to use the
+# improved fast reciprocal defined below.
+CUDA.@device_override TrixiParticles.div_fast(x, y::Float64) = x * fast_inv_cuda(y)
+
+# Improved fast reciprocal for `Float64` by @Mikolaj-A-Kowalski, which is significantly
+# more accurate than just calling "llvm.nvvm.rcp.approx.ftz.d" without the cubic iteration,
+# while still being much faster than a full division.
+# This is copied from Oceananigans.jl, see https://github.com/CliMA/Oceananigans.jl/pull/5140.
+@inline function fast_inv_cuda(a::Float64)
+    # Get the approximate reciprocal
+    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
+    # This instruction chops off last 32bits of mantissa and computes inverse
+    # while treating all subnormal numbers as 0.0
+    # If reciprocal would be subnormal, underflows to 0.0
+    # 32 least significant bits of the result are filled with 0s
+    inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a)
+
+    # Approximate the missing 32bits of mantissa with a single cubic iteration
+    e = fma(inv_a, -a, 1.0)
+    e = fma(e, e, e)
+    inv_a = fma(e, inv_a, inv_a)
+    return inv_a
+end
+
+end # module
diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
index faa1fd2956..ea3ee48bd3 100644
--- a/src/TrixiParticles.jl
+++ b/src/TrixiParticles.jl
@@ -15,7 +15,7 @@ using FileIO: FileIO
 using ForwardDiff: ForwardDiff
 using GPUArraysCore: AbstractGPUArray
 using JSON: JSON
-using KernelAbstractions: KernelAbstractions, @kernel, @index
+using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize
 using LinearAlgebra: norm, normalize, cross, dot, I, tr, inv, pinv, det
 using MuladdMacro: @muladd
 using Polyester: Polyester, @batch
@@ -26,6 +26,7 @@ using Random: seed!
 using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
                  get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
                  add_tstop!
+using SIMD: vloada, Vec
 @reexport using StaticArrays: SVector
 using StaticArrays: @SMatrix, SMatrix, setindex
 using Statistics: Statistics
diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
index 10ccb6261f..eeceaaac63 100644
--- a/src/general/abstract_system.jl
+++ b/src/general/abstract_system.jl
@@ -130,10 +130,30 @@ end
     return kernel(smoothing_kernel, distance, smoothing_length(system, particle))
 end
 
+@inline function skip_zero_distance(system, distance, almostzero)
+    return skip_zero_distance(system_correction(system), system, distance, almostzero)
+end
+
+# Robust/safe version of the function below. In performance-critical code, manually check
+# the kernel support, call `skip_zero_distance` and then `smoothing_kernel_grad_unsafe`.
 @inline function smoothing_kernel_grad(system, pos_diff, distance, particle)
-    return corrected_kernel_grad(system_smoothing_kernel(system), pos_diff,
-                                 distance, smoothing_length(system, particle),
-                                 system_correction(system), system, particle)
+    h = smoothing_length(system, particle)
+    compact_support_ = compact_support(system_smoothing_kernel(system), h)
+
+    # Note that `sqrt(eps(h^2)) != eps(h)`
+    if distance > compact_support_ || skip_zero_distance(system, distance, sqrt(eps(h^2)))
+        return zero(pos_diff)
+    end
+
+    return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+end
+
+# Skip the zero distance and compact support checks for maximum performance.
+# Only call this when you are sure that `0 < distance < compact_support`.
+@inline function smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
+    return corrected_kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff,
+                                        distance, smoothing_length(system, particle),
+                                        system_correction(system), system, particle)
 end
 
 # System updates do nothing by default, but can be dispatched if needed
diff --git a/src/general/corrections.jl b/src/general/corrections.jl
index 292523feb8..e7422916b1 100644
--- a/src/general/corrections.jl
+++ b/src/general/corrections.jl
@@ -35,15 +35,16 @@ end
 # `rho_mean` is the mean density of the fluid, which is used to determine correction values near the free surface.
 #  Return a tuple `(viscosity_correction, pressure_correction, surface_tension_correction)` representing the correction terms.
 @inline function free_surface_correction(correction::AkinciFreeSurfaceCorrection,
-                                         particle_system, rho_mean)
+                                         particle_system, rho_a, rho_b)
     # Equation 4 in ref
+    rho_mean = (rho_a + rho_b) / 2
     k = correction.rho0 / rho_mean
 
     # Viscosity, pressure, surface_tension
     return k, 1, k
 end
 
-@inline function free_surface_correction(correction, particle_system, rho_mean)
+@inline function free_surface_correction(correction, particle_system, rho_a, rho_b)
     return 1, 1, 1
 end
 
@@ -337,31 +338,36 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system,
         v_neighbor_system = wrap_v(v_ode, neighbor_system, semi)
 
         neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+        almostzero = sqrt(eps(compact_support(system, neighbor_system)^2))
 
         foreach_point_neighbor(system, neighbor_system, coordinates, neighbor_coords,
                                semi) do particle, neighbor, pos_diff, distance
-            volume = hydrodynamic_mass(neighbor_system, neighbor) /
-                     current_density(v_neighbor_system, neighbor_system, neighbor)
-            smoothing_length_ = smoothing_length(system, particle)
-
-            function compute_grad_kernel(correction, smoothing_kernel, pos_diff, distance,
-                                         smoothing_length_, system, particle)
-                return smoothing_kernel_grad(system, pos_diff, distance, particle)
+            function kernel_grad_local(correction, smoothing_kernel, pos_diff, distance,
+                                       smoothing_length_, system, particle)
+                return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
             end
 
             # Compute gradient of corrected kernel
-            function compute_grad_kernel(correction::MixedKernelGradientCorrection,
-                                         smoothing_kernel, pos_diff, distance,
-                                         smoothing_length_, system, particle)
-                return corrected_kernel_grad(smoothing_kernel, pos_diff, distance,
-                                             smoothing_length_, KernelCorrection(), system,
-                                             particle)
+            function kernel_grad_local(correction::MixedKernelGradientCorrection,
+                                       smoothing_kernel, pos_diff, distance,
+                                       smoothing_length_, system, particle)
+                return corrected_kernel_grad_unsafe(smoothing_kernel, pos_diff, distance,
+                                                    smoothing_length_, KernelCorrection(),
+                                                    system, particle)
             end
 
-            grad_kernel = compute_grad_kernel(correction, smoothing_kernel, pos_diff,
-                                              distance, smoothing_length_, system, particle)
+            # Skip neighbors with the same position if the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(correction, system, distance, almostzero) && return
 
-            iszero(grad_kernel) && return
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            smoothing_length_ = smoothing_length(system, particle)
+            grad_kernel = kernel_grad_local(correction, smoothing_kernel, pos_diff,
+                                            distance, smoothing_length_, system, particle)
+
+            volume = hydrodynamic_mass(neighbor_system, neighbor) /
+                     current_density(v_neighbor_system, neighbor_system, neighbor)
 
             L = volume * grad_kernel * pos_diff'
 
diff --git a/src/general/interpolation.jl b/src/general/interpolation.jl
index 00eff1581b..58b171d0f7 100644
--- a/src/general/interpolation.jl
+++ b/src/general/interpolation.jl
@@ -593,7 +593,9 @@ end
                 other_density[point] += m_b * W_ab
 
                 if include_wall_velocity
-                    velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor)
+                    velocity_neighbor_ = current_velocity(v, neighbor_system, neighbor)
+                    velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor,
+                                                         velocity_neighbor_)
                     for i in axes(velocity_neighbor, 1)
                         cache.velocity[i, point] += velocity_neighbor[i] * volume_b * W_ab
                     end
diff --git a/src/general/smoothing_kernels.jl b/src/general/smoothing_kernels.jl
index 4a97564648..3c6be90212 100644
--- a/src/general/smoothing_kernels.jl
+++ b/src/general/smoothing_kernels.jl
@@ -9,44 +9,87 @@ abstract type AbstractSmoothingKernel{NDIMS} end
     # `distance^2` is in the order of `h^2`, hence the comparison `distance^2 < eps(h^2)`.
     # Note that this is faster than `distance < sqrt(eps(h^2))`.
     # Also note that `sqrt(eps(h^2)) != eps(h)`.
-    distance^2 < eps(h^2) && return zero(pos_diff)
+    compact_support_ = compact_support(kernel, h)
+    nonzero = distance < compact_support_ && distance^2 > eps(h^2)
+    nonzero || return zero(pos_diff)
 
-    return kernel_deriv(kernel, distance, h) / distance * pos_diff
+    # Now we can use `kernel_grad_unsafe` without worrying about division by zero
+    return kernel_grad_unsafe(kernel, pos_diff, distance, h)
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h, correction, system,
-                                       particle)
-    return kernel_grad(kernel, pos_diff, distance, h)
+# Skip the zero distance and compact support checks for maximum performance.
+# Only call this when you are sure that `distance` is not zero and within the compact support.
+@inline function kernel_grad_unsafe(kernel, pos_diff, distance, h)
+    # Call an optimized version of the kernel derivative, skipping redundant checks.
+    # Since this is used as `kernel_deriv / distance * pos_diff`, we already divide by `r`
+    # (or rather skip a multiplication by `r`) inside `kernel_deriv_div_r_unsafe`
+    # to save expensive divisions.
+    return kernel_deriv_div_r_unsafe(kernel, distance, h) * pos_diff
 end
 
-@inline function corrected_kernel_grad(kernel_, pos_diff, distance, h, ::KernelCorrection,
-                                       system, particle)
+@inline function kernel(kernel, r::Real, h)
+    # Zero out result if outside of compact support
+    compact_support_ = compact_support(kernel, h)
+    return ifelse(r < compact_support_, kernel_unsafe(kernel, r, h), zero(r))
+end
+
+@inline function kernel_deriv(kernel, r::Real, h)
+    # Zero out result if outside of compact support or if `r` is almost zero
+    # (to avoid division by zero in the unsafe version).
+    compact_support_ = compact_support(kernel, h)
+    if r < compact_support_ && r^2 > eps(h^2)
+        # The unsafe version returns the kernel derivative divided by `r`,
+        # so we multiply it by `r` to get the actual derivative.
+        return kernel_deriv_div_r_unsafe(kernel, r, h) * r
+    end
+
+    return zero(r)
+end
+
+@inline function skip_zero_distance(correction, system, distance, almostzero)
+    return distance < almostzero
+end
+
+@inline function skip_zero_distance(::Union{KernelCorrection,
+                                            MixedKernelGradientCorrection},
+                                    system, distance, almostzero)
+    # For these corrections, the kernel gradient between a particle and itself is not zero
+    return false
+end
+
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, correction,
+                                              system, particle)
+    return kernel_grad_unsafe(kernel, pos_diff, distance, h)
+end
+
+@inline function corrected_kernel_grad_unsafe(kernel_, pos_diff, distance, h,
+                                              ::KernelCorrection, system, particle)
     return (kernel_grad(kernel_, pos_diff, distance, h) .-
             kernel(kernel_, distance, h) * dw_gamma(system, particle)) /
            kernel_correction_coefficient(system, particle)
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h,
-                                       corr::BlendedGradientCorrection, system,
-                                       particle)
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h,
+                                              corr::BlendedGradientCorrection, system,
+                                              particle)
     (; blending_factor) = corr
 
-    grad = kernel_grad(kernel, pos_diff, distance, h)
+    grad = kernel_grad_unsafe(kernel, pos_diff, distance, h)
     return (1 - blending_factor) * grad +
            blending_factor * correction_matrix(system, particle) * grad
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h,
-                                       ::GradientCorrection, system, particle)
-    grad = kernel_grad(kernel, pos_diff, distance, h)
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h,
+                                              ::GradientCorrection, system, particle)
+    grad = kernel_grad_unsafe(kernel, pos_diff, distance, h)
     return correction_matrix(system, particle) * grad
 end
 
-@inline function corrected_kernel_grad(kernel, pos_diff, distance, h,
-                                       ::MixedKernelGradientCorrection, system,
-                                       particle)
-    grad = corrected_kernel_grad(kernel, pos_diff, distance, h, KernelCorrection(),
-                                 system, particle)
+@inline function corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h,
+                                              ::MixedKernelGradientCorrection, system,
+                                              particle)
+    grad = corrected_kernel_grad_unsafe(kernel, pos_diff, distance, h, KernelCorrection(),
+                                        system, particle)
     return correction_matrix(system, particle) * grad
 end
 
@@ -80,32 +123,40 @@ which is beneficial in regards to stability but makes it less accurate.
 """
 struct GaussianKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@inline @fastmath function kernel(kernel::GaussianKernel, r::Real, h)
-    q = r / h
+@inline function kernel_unsafe(kernel::GaussianKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
 
-    # Zero out result if q >= 3
-    result = ifelse(q < 3, normalization_factor(kernel, h) * exp(-q^2), zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * exp_q2
 end
 
-@inline @fastmath function kernel_deriv(kernel::GaussianKernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
-
-    # Zero out result if q >= 3
-    result = ifelse(q < 3,
-                    -2 * q * normalization_factor(kernel, h) * exp(-q^2) * inner_deriv,
-                    zero(q))
+@inline function kernel_deriv_div_r_unsafe(kernel::GaussianKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
 
-    return result
+    return -2 * normalization_factor(kernel, h_inv) * exp_q2 * h_inv^2
 end
 
 @inline compact_support(::GaussianKernel, h) = 3 * h
 
-@inline normalization_factor(::GaussianKernel{2}, h) = 1 / (pi * h^2)
-# First convert `pi` to the type of `h` to preserve the type of `h`
-@inline normalization_factor(::GaussianKernel{3}, h) = 1 / (oftype(h, pi)^(3 // 2) * h^3)
+@inline function normalization_factor(::GaussianKernel{2}, h_inv)
+    return oftype(h_inv, 1 / pi) * h_inv^2
+end
+
+@inline function normalization_factor(::GaussianKernel{3}, h_inv)
+    # 1 / (pi^1.5 * h^3)
+    return oftype(h_inv, 0.17958712212516656) * h_inv^3
+end
 
 @doc raw"""
     SchoenbergCubicSplineKernel{NDIMS}()
@@ -140,40 +191,40 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct SchoenbergCubicSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@muladd @inline function kernel(kernel::SchoenbergCubicSplineKernel, r::Real, h)
-    q = r / h
+@inline function kernel_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
     # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl.
     result = 1 * (2 - q)^3 / 4
-    result = result - (q < 1) * (1 - q)^3
-
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(result))
+    @muladd result = result - (q < 1) * (1 - q)^3
 
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@muladd @inline function kernel_deriv(kernel::SchoenbergCubicSplineKernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergCubicSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = -3 * (2 - q)^2 / 4
-    result = result + 3 * (q < 1) * (1 - q)^2
+    result = -3 * (2 - q)^2 / 4 + 3 * (q < 1) * (1 - q)^2
+    kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result * inner_deriv,
-                    zero(result))
-
-    return result
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::SchoenbergCubicSplineKernel, h) = 2 * h
 
-# Note that `2 // 3 / h` saves one instruction but is significantly slower on GPUs (for now)
-@inline normalization_factor(::SchoenbergCubicSplineKernel{1}, h) = 2 / (3 * h)
-@inline normalization_factor(::SchoenbergCubicSplineKernel{2}, h) = 10 / (pi * h^2 * 7)
-@inline normalization_factor(::SchoenbergCubicSplineKernel{3}, h) = 1 / (pi * h^3)
+@inline function normalization_factor(::SchoenbergCubicSplineKernel{1}, h_inv)
+    return oftype(h_inv, 2 / 3) * h_inv
+end
+
+@inline function normalization_factor(::SchoenbergCubicSplineKernel{2}, h_inv)
+    return oftype(h_inv, 10 / (pi * 7)) * h_inv^2
+end
+
+@inline function normalization_factor(::SchoenbergCubicSplineKernel{3}, h_inv)
+    return oftype(h_inv, 1 / pi) * h_inv^3
+end
 
 @doc raw"""
     SchoenbergQuarticSplineKernel{NDIMS}()
@@ -217,54 +268,55 @@ struct SchoenbergQuarticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} en
 # to a power of three, see
 # https://github.com/JuliaLang/julia/blob/34934736fa4dcb30697ac1b23d11d5ad394d6a4d/base/intfuncs.jl#L327-L339
 # By using the `@fastpow` macro, we are consciously trading off some precision in the result
-# for enhanced computational speed. This is especially useful in scenarios where performance
-# is a higher priority than exact precision.
-@fastpow @muladd @inline function kernel(kernel::SchoenbergQuarticSplineKernel, r::Real, h)
-    q = r / h
+# for enhanced computational speed by using simple multiplications for higher powers.
+@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuarticSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Preserve the type of `q`
-    q5_2 = (5 // 2 - q)
-    q3_2 = (3 // 2 - q)
-    q1_2 = (1 // 2 - q)
+    # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason
+    q5_2 = 2.5f0 - q
+    q3_2 = 1.5f0 - q
+    q1_2 = 0.5f0 - q
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = q5_2^4
-    result = result - 5 * (q < 3 // 2) * q3_2^4
-    result = result + 10 * (q < 1 // 2) * q1_2^4
+    # Note: `q < 3 // 2` is not working on GPUs.
+    # See https://github.com/JuliaGPU/CUDA.jl/issues/2681.
+    result = q5_2^4 - 5 * (q < 1.5f0) * q3_2^4 + 10 * (q < 0.5f0) * q1_2^4
 
-    # Zero out result if q >= 5/2
-    result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result, zero(result))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuarticSplineKernel,
-                                               r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuarticSplineKernel,
+                                           r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Preserve the type of `q`
-    q5_2 = 5 // 2 - q
-    q3_2 = 3 // 2 - q
-    q1_2 = 1 // 2 - q
+    # Note: using `5 // 2` here makes it 8x slower on GPUs for some reason
+    q5_2 = 2.5f0 - q
+    q3_2 = 1.5f0 - q
+    q1_2 = 0.5f0 - q
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = -4 * q5_2^3
-    result = result + 20 * (q < 3 // 2) * q3_2^3
-    result = result - 40 * (q < 1 // 2) * q1_2^3
+    # Note: `q < 3 // 2` is not working on GPUs.
+    # See https://github.com/JuliaGPU/CUDA.jl/issues/2681.
+    result = -4 * q5_2^3 + 20 * (q < 1.5f0) * q3_2^3 - 40 * (q < 0.5f0) * q1_2^3
 
-    # Zero out result if q >= 5/2
-    result = ifelse(q < 5 // 2, normalization_factor(kernel, h) * result * inner_deriv,
-                    zero(result))
+    kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv
 
-    return result
+    return div_fast(kernel_deriv, r)
 end
 
-@inline compact_support(::SchoenbergQuarticSplineKernel, h) = 5 * h / 2
+@inline compact_support(::SchoenbergQuarticSplineKernel, h) = oftype(h, 5 / 2) * h
 
-@inline normalization_factor(::SchoenbergQuarticSplineKernel{1}, h) = 1 / (24 * h)
-@inline normalization_factor(::SchoenbergQuarticSplineKernel{2}, h) = 96 / (pi * h^2 * 1199)
-@inline normalization_factor(::SchoenbergQuarticSplineKernel{3}, h) = 1 / (pi * h^3 * 20)
+@inline function normalization_factor(::SchoenbergQuarticSplineKernel{1}, h_inv)
+    return oftype(h_inv, 1 / 24) * h_inv
+end
+
+@inline function normalization_factor(::SchoenbergQuarticSplineKernel{2}, h_inv)
+    return oftype(h_inv, 96 / (pi * 1199)) * h_inv^2
+end
+
+@inline function normalization_factor(::SchoenbergQuarticSplineKernel{3}, h_inv)
+    return oftype(h_inv, 1 / (pi * 20)) * h_inv^3
+end
 
 @doc raw"""
     SchoenbergQuinticSplineKernel{NDIMS}()
@@ -301,48 +353,40 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct SchoenbergQuinticSplineKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@fastpow @muladd @inline function kernel(kernel::SchoenbergQuinticSplineKernel, r::Real, h)
-    q = r / h
-    q3 = (3 - q)
-    q2 = (2 - q)
-    q1 = (1 - q)
+@fastpow @inline function kernel_unsafe(kernel::SchoenbergQuinticSplineKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = q3^5
-    result = result - 6 * (q < 2) * q2^5
-    result = result + 15 * (q < 1) * q1^5
+    result = (3 - q)^5 - 6 * (q < 2) * (2 - q)^5 + 15 * (q < 1) * (1 - q)^5
 
-    # Zero out result if q >= 3
-    result = ifelse(q < 3, normalization_factor(kernel, h) * result, zero(result))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @muladd @inline function kernel_deriv(kernel::SchoenbergQuinticSplineKernel,
-                                               r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
-    q3 = (3 - q)
-    q2 = (2 - q)
-    q1 = (1 - q)
+@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::SchoenbergQuinticSplineKernel,
+                                                    r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # We do not use `+=` or `-=` since these are not recognized by MuladdMacro.jl
-    result = -5 * q3^4
-    result = result + 30 * (q < 2) * q2^4
-    result = result - 75 * (q < 1) * q1^4
+    result = -5 * (3 - q)^4 + 30 * (q < 2) * (2 - q)^4 - 75 * (q < 1) * (1 - q)^4
 
-    # Zero out result if q >= 3
-    result = ifelse(q < 3, normalization_factor(kernel, h) * result * inner_deriv,
-                    zero(result))
+    kernel_deriv = normalization_factor(kernel, h_inv) * result * h_inv
 
-    return result
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::SchoenbergQuinticSplineKernel, h) = 3 * h
 
-@inline normalization_factor(::SchoenbergQuinticSplineKernel{1}, h) = 1 / (120 * h)
-@inline normalization_factor(::SchoenbergQuinticSplineKernel{2}, h) = 7 / (pi * h^2 * 478)
-@inline normalization_factor(::SchoenbergQuinticSplineKernel{3}, h) = 1 / (pi * h^3 * 120)
+@inline function normalization_factor(::SchoenbergQuinticSplineKernel{1}, h_inv)
+    return oftype(h_inv, 1 / (120)) * h_inv
+end
+
+@inline function normalization_factor(::SchoenbergQuinticSplineKernel{2}, h_inv)
+    return oftype(h_inv, 7 / (pi * 478)) * h_inv^2
+end
+
+@inline function normalization_factor(::SchoenbergQuinticSplineKernel{3}, h_inv)
+    return oftype(h_inv, 1 / (pi * 120)) * h_inv^3
+end
 
 abstract type AbstractWendlandKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
@@ -383,33 +427,26 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct WendlandC2Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::WendlandC2Kernel, r::Real, h)
-    q = r / h
-
-    result = (1 - q / 2)^4 * (2 * q + 1)
+@fastpow @inline function kernel_unsafe(kernel::WendlandC2Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * (1 - q / 2)^4 * (2 * q + 1)
 end
 
-@inline function kernel_deriv(kernel::WendlandC2Kernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
-
-    result = -5 * (1 - q / 2)^3 * q
+@inline function kernel_deriv_div_r_unsafe(kernel::WendlandC2Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2,
-                    normalization_factor(kernel, h) * result * inner_deriv, zero(q))
-
-    return result
+    return -5 * normalization_factor(kernel, h_inv) * (1 - q / 2)^3 * h_inv^2
 end
 
-# Note that `7 // 4` saves one instruction but is significantly slower on GPUs (for now)
-@inline normalization_factor(::WendlandC2Kernel{2}, h) = 7 / (pi * h^2 * 4)
-@inline normalization_factor(::WendlandC2Kernel{3}, h) = 21 / (pi * h^3 * 16)
+@inline function normalization_factor(::WendlandC2Kernel{2}, h_inv)
+    return oftype(h_inv, 7 / (4 * pi)) * h_inv^2
+end
+@inline function normalization_factor(::WendlandC2Kernel{3}, h_inv)
+    return oftype(h_inv, 21 / (16 * pi)) * h_inv^3
+end
 
 @doc raw"""
     WendlandC4Kernel{NDIMS}()
@@ -445,31 +482,30 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct WendlandC4Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::WendlandC4Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_unsafe(kernel::WendlandC4Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
     result = (1 - q / 2)^6 * (35 * q^2 / 12 + 3 * q + 1)
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @inline function kernel_deriv(kernel::WendlandC4Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC4Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    derivative = -7 * q / 3 * (2 + 5 * q) * (1 - q / 2)^5
+    result = oftype(r, -7 / 3) * (2 + 5 * q) * (1 - q / 2)^5 * h_inv^2
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h,
-                    zero(derivative))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@inline normalization_factor(::WendlandC4Kernel{2}, h) = 9 / (pi * h^2 * 4)
-@inline normalization_factor(::WendlandC4Kernel{3}, h) = 495 / (pi * h^3 * 256)
+@inline function normalization_factor(::WendlandC4Kernel{2}, h_inv)
+    return oftype(h_inv, 9 / (pi * 4)) * h_inv^2
+end
+@inline function normalization_factor(::WendlandC4Kernel{3}, h_inv)
+    return oftype(h_inv, 495 / (pi * 256)) * h_inv^3
+end
 
 @doc raw"""
     WendlandC6Kernel{NDIMS}()
@@ -505,31 +541,31 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct WendlandC6Kernel{NDIMS} <: AbstractWendlandKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::WendlandC6Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_unsafe(kernel::WendlandC6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
     result = (1 - q / 2)^8 * (4 * q^3 + 25 * q^2 / 4 + 4 * q + 1)
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * result
 end
 
-@fastpow @muladd @inline function kernel_deriv(kernel::WendlandC6Kernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_deriv_div_r_unsafe(kernel::WendlandC6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    derivative = -11 * q / 4 * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7
+    result = oftype(r, -11 / 4) * (8 * q^2 + 7 * q + 2) * (1 - q / 2)^7 * h_inv^2
 
-    # Zero out result if q >= 2
-    result = ifelse(q < 2, normalization_factor(kernel, h) * derivative / h,
-                    zero(derivative))
+    return normalization_factor(kernel, h_inv) * result
+end
 
-    return result
+@inline function normalization_factor(::WendlandC6Kernel{2}, h_inv)
+    return oftype(h_inv, 39 / (pi * 14)) * h_inv^2
 end
 
-@inline normalization_factor(::WendlandC6Kernel{2}, h) = 39 / (pi * h^2 * 14)
-@inline normalization_factor(::WendlandC6Kernel{3}, h) = 1365 / (pi * h^3 * 512)
+@inline function normalization_factor(::WendlandC6Kernel{3}, h_inv)
+    return oftype(h_inv, 1365 / (pi * 512)) * h_inv^3
+end
 
 @doc raw"""
     Poly6Kernel{NDIMS}()
@@ -541,7 +577,6 @@ especially in computer graphics contexts. It is defined as
 W(r, h) = \frac{1}{h^d} w(r/h)
 ```
 
-
 with
 
 ```math
@@ -569,34 +604,31 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct Poly6Kernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@inline function kernel(kernel::Poly6Kernel, r::Real, h)
-    q = r / h
-
-    result = (1 - q^2)^3
+@inline function kernel_unsafe(kernel::Poly6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # Zero out result if q >= 1
-    result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q))
-
-    return result
+    return normalization_factor(kernel, h_inv) * (1 - q^2)^3
 end
 
-@inline function kernel_deriv(kernel::Poly6Kernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::Poly6Kernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    result = -6 * q * (1 - q^2)^2
+    kernel_deriv = normalization_factor(kernel, h_inv) * -6 * q * (1 - q^2)^2 * h_inv
 
-    # Zero out result if q >= 1
-    result = ifelse(q < 1,
-                    result * normalization_factor(kernel, h) * inner_deriv, zero(q))
-    return result
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::Poly6Kernel, h) = h
 
-# Note that `315 // 64` saves one instruction but is significantly slower on GPUs (for now)
-@inline normalization_factor(::Poly6Kernel{2}, h) = 4 / (pi * h^2)
-@inline normalization_factor(::Poly6Kernel{3}, h) = 315 / (pi * h^3 * 64)
+@inline function normalization_factor(::Poly6Kernel{2}, h_inv)
+    return oftype(h_inv, 4 / (pi)) * h_inv^2
+end
+
+@inline function normalization_factor(::Poly6Kernel{3}, h_inv)
+    return oftype(h_inv, 315 / (64 * pi)) * h_inv^3
+end
 
 @doc raw"""
     SpikyKernel{NDIMS}()
@@ -634,38 +666,36 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct SpikyKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@inline function kernel(kernel::SpikyKernel, r::Real, h)
-    q = r / h
-
-    result = (1 - q)^3
-
-    # Zero out result if q >= 1
-    result = ifelse(q < 1, normalization_factor(kernel, h) * result, zero(q))
+@inline function kernel_unsafe(kernel::SpikyKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    return result
+    return normalization_factor(kernel, h_inv) * (1 - q)^3
 end
 
-@inline function kernel_deriv(kernel::SpikyKernel, r::Real, h)
-    inner_deriv = 1 / h
-    q = r * inner_deriv
+@inline function kernel_deriv_div_r_unsafe(kernel::SpikyKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    result = -3 * (1 - q)^2
+    kernel_deriv = -3 * normalization_factor(kernel, h_inv) * (1 - q)^2 * h_inv
 
-    # Zero out result if q >= 1
-    result = ifelse(q < 1, result * normalization_factor(kernel, h) * inner_deriv, zero(q))
-
-    return result
+    return div_fast(kernel_deriv, r)
 end
 
 @inline compact_support(::SpikyKernel, h) = h
 
-@inline normalization_factor(::SpikyKernel{2}, h) = 10 / (pi * h^2)
-@inline normalization_factor(::SpikyKernel{3}, h) = 15 / (pi * h^3)
+@inline function normalization_factor(::SpikyKernel{2}, h_inv)
+    return oftype(h_inv, 10 / pi) * h_inv^2
+end
+
+@inline function normalization_factor(::SpikyKernel{3}, h_inv)
+    return oftype(h_inv, 15 / pi) * h_inv^3
+end
 
 @doc raw"""
     LaguerreGaussKernel{NDIMS}()
 
-Truncated Laguerre–Gauss kernel (fourth-order smoothing) as defined in
+Truncated Laguerre-Gauss kernel (fourth-order smoothing) as defined in
 [Wang2024](@cite). Its radial form uses ``q = r/h`` and is truncated at
 ``q = 2`` (compact support ``2h``):
 
@@ -716,51 +746,65 @@ For general information and usage see [Smoothing Kernels](@ref smoothing_kernel)
 """
 struct LaguerreGaussKernel{NDIMS} <: AbstractSmoothingKernel{NDIMS} end
 
-@fastpow @inline function kernel(kernel::LaguerreGaussKernel, r::Real, h)
-    q = r / h
+@fastpow @inline function kernel_unsafe(kernel::LaguerreGaussKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
 
-    # polynomial part: 1 - s^2/2 + s^4/6
+    # Polynomial part
     poly = 1 - q^2 / 2 + q^4 / 6
 
-    # zero out for s ≥ 2
-    return ifelse(q < 2, normalization_factor(kernel, h) * poly * exp(-q^2), zero(q))
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
+
+    return normalization_factor(kernel, h_inv) * poly * exp_q2
 end
 
-@inline function kernel_deriv(kernel::LaguerreGaussKernel, r::Real, h)
-    invh = 1 / h
-    q = r * invh
+@inline function kernel_deriv_div_r_unsafe(kernel::LaguerreGaussKernel, r::Real, h)
+    h_inv = 1 / h
+    q = r * h_inv
+
+    # dg/dq = (q / 3) * (-q^4 + 5q^2 - 9) * exp(-q^2), but we already divide by `r``
+    poly = (h_inv / 3) * ((-q^2 + 5) * q^2 - 9)
 
-    # dg/dq = (q/3)*(-q^4 + 5q^2 - 9) * exp(-q^2)
-    poly = ((-q^2 + 5) * q^2 - 9) * (q / 3)
+    q2 = -q^2
+    # Only use `@fastmath` for the exponential. Make sure not to apply `@fastmath` to `q^2`,
+    # or it will translate to a generic power function instead of `q * q` (`literal_pow`)
+    # due to a bug in Julia 1.12, see https://github.com/JuliaLang/julia/pull/60640.
+    exp_q2 = @fastmath exp(q2)
 
-    return ifelse(q < 2, normalization_factor(kernel, h) * exp(-q^2) * poly * invh, zero(q))
+    return normalization_factor(kernel, h_inv) * poly * exp_q2 * h_inv
 end
 
 @inline compact_support(::LaguerreGaussKernel, h) = 2 * h
-# Original normalization factors as in Wang2024
-# @inline normalization_factor(::LaguerreGaussKernel{1}, h) = (8 / (5 * sqrt(pi))) / h
-# @inline normalization_factor(::LaguerreGaussKernel{2}, h) = (3 / (pi)) / (h^2)
-# @inline normalization_factor(::LaguerreGaussKernel{3}, h) = (8 / (pi^(3 // 2))) / (h^3)
+
+# Original normalization factors as in Wang2024:
+# 1D: (8 / (5 * sqrt(pi))) / h
+# 2D: (3 / (pi)) / (h^2)
+# 3D: (8 / (pi^(3 / 2))) / (h^3)
 
 # Renormalized to the truncated integral over [0,2h]
-@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h)
-    # C = 1/(2*(7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4))
-    C = oftype(h, 0.65428780253539)
-    # C' = C/h
-    return C / h
+@inline function normalization_factor(kernel::LaguerreGaussKernel{1}, h_inv)
+    # C = 1 / (2 * (7 * sqrt(pi) / 16) * erf(2) - (5 / 12) * exp(-4))
+    C = oftype(h_inv, 0.6510370392044922)
+    # C' = C / h
+    return C * h_inv
 end
 
-@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h)
+@inline function normalization_factor(kernel::LaguerreGaussKernel{2}, h_inv)
     # C = 2 * pi * (5 - 17 * exp(-4)) / 12
-    C = oftype(h, 2.454963094351984)
+    C_inv = oftype(h_inv, 0.4073380990128333)
     # C' = 1 / (h^2 * C)
-    return 1 / (h^2 * C)
+    return C_inv * h_inv^2
 end
 
-@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h)
+@inline function normalization_factor(kernel::LaguerreGaussKernel{3}, h_inv)
     # C = (7 * sqrt(pi) / 32) * erf(2) - (77 / 24) * exp(-4)
-    C = oftype(h, 0.3271479336905373)
+    # C_ = 1 / (4 * pi * C)
+    C_ = oftype(h_inv, 0.24324613836999895)
 
-    # 4*pi cannot be pulled into C otherwise the test fails because of differences in rounding
-    return 1 / (C * 4 * pi * h^3)
+    # C' = 1 / (4 * pi * h^3 * C) = C_ / (h^3)
+    return C_ * h_inv^3
 end
diff --git a/src/io/write_vtk.jl b/src/io/write_vtk.jl
index 56eb4ae12d..441508472b 100644
--- a/src/io/write_vtk.jl
+++ b/src/io/write_vtk.jl
@@ -350,18 +350,13 @@ function write2vtk!(vtk, v, u, t, system::AbstractFluidSystem)
             rho_b = current_density(v, system, neighbor)
             grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle)
 
-            surface_tension[1:ndims(system),
-                            particle] .+= surface_tension_force(surface_tension_a,
-                                                                surface_tension_b,
-                                                                system,
-                                                                system,
-                                                                particle,
-                                                                neighbor,
-                                                                pos_diff,
-                                                                distance,
-                                                                rho_a,
-                                                                rho_b,
-                                                                grad_kernel)
+            dv_surface_tension = Ref(zero(pos_diff))
+            surface_tension_force!(dv_surface_tension,
+                                   surface_tension_a, surface_tension_b,
+                                   system, system, particle, neighbor,
+                                   pos_diff, distance, rho_a, rho_b, grad_kernel)
+
+            surface_tension[1:ndims(system), particle] .+= dv_surface_tension[]
         end
         vtk["surface_tension"] = surface_tension
 
diff --git a/src/schemes/boundary/open_boundary/rhs.jl b/src/schemes/boundary/open_boundary/rhs.jl
index 186a9e7b7d..4fd09f5f87 100644
--- a/src/schemes/boundary/open_boundary/rhs.jl
+++ b/src/schemes/boundary/open_boundary/rhs.jl
@@ -23,6 +23,9 @@ function interact!(dv, v_particle_system, u_particle_system,
         rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
         rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
 
+        v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
+        v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+
         grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
 
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
@@ -42,15 +45,17 @@ function interact!(dv, v_particle_system, u_particle_system,
         dv_pressure_boundary = 2 * p_boundary * (m_b / (rho_a * rho_b)) * grad_kernel
 
         # Propagate `@inbounds` to the viscosity function, which accesses particle data
-        dv_viscosity_ = @inbounds dv_viscosity(viscosity_model(fluid_system,
-                                                               neighbor_system),
-                                               particle_system, neighbor_system,
-                                               v_particle_system, v_neighbor_system,
-                                               particle, neighbor, pos_diff, distance,
-                                               sound_speed, m_a, m_b, rho_a, rho_b,
-                                               grad_kernel)
-
-        dv_particle = dv_pressure + dv_viscosity_ + dv_pressure_boundary
+        dv_viscosity_ = Ref(zero(pos_diff))
+        @inbounds dv_viscosity(dv_viscosity_,
+                               viscosity_model(fluid_system,
+                                               neighbor_system),
+                               particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system,
+                               particle, neighbor, pos_diff, distance,
+                               sound_speed, m_a, m_b, rho_a, rho_b,
+                               v_a, v_b, grad_kernel)
+
+        dv_particle = dv_pressure + dv_viscosity_[] + dv_pressure_boundary
 
         for i in 1:ndims(particle_system)
             @inbounds dv[i, particle] += dv_particle[i]
diff --git a/src/schemes/boundary/open_boundary/system.jl b/src/schemes/boundary/open_boundary/system.jl
index 94aee1b048..dc5b58f7b7 100644
--- a/src/schemes/boundary/open_boundary/system.jl
+++ b/src/schemes/boundary/open_boundary/system.jl
@@ -644,8 +644,10 @@ function interpolate_velocity!(system::OpenBoundarySystem, boundary_zone,
             W_ab = kernel(smoothing_kernel, distance, smoothing_length)
             @inbounds shepard_coefficient[point] += volume_b * W_ab
 
+            velocity_neighbor_ = @inbounds current_velocity(v_neighbor, neighbor_system,
+                                                            neighbor)
             velocity_neighbor = @inbounds viscous_velocity(v_neighbor, neighbor_system,
-                                                           neighbor)
+                                                           neighbor, velocity_neighbor_)
             for i in axes(velocity_neighbor, 1)
                 @inbounds sample_velocity[i,
                                           point] += velocity_neighbor[i] * volume_b * W_ab
diff --git a/src/schemes/boundary/wall_boundary/system.jl b/src/schemes/boundary/wall_boundary/system.jl
index dbfa8613cd..6fc3f48d2c 100644
--- a/src/schemes/boundary/wall_boundary/system.jl
+++ b/src/schemes/boundary/wall_boundary/system.jl
@@ -145,15 +145,19 @@ end
     return zero(SVector{ndims(system), eltype(system)})
 end
 
-@inline function viscous_velocity(v, system::WallBoundarySystem, particle)
-    return viscous_velocity(v, system.boundary_model.viscosity, system, particle)
+@inline function viscous_velocity(v, system::WallBoundarySystem, particle, v_particle)
+    return viscous_velocity(v, system.boundary_model.viscosity, system,
+                            particle, v_particle)
 end
 
-@inline function viscous_velocity(v, ::Nothing, system, particle)
-    return current_velocity(v, system, particle)
+@inline function viscous_velocity(v, ::Nothing, system, particle, v_particle)
+    # Regular particle velocity is used for the viscosity calculation by default
+    return v_particle
 end
 
-@inline function viscous_velocity(v, viscosity, system, particle)
+@inline function viscous_velocity(v, viscosity, system, particle, v_particle)
+    # Wall velocity in the viscosity calculation contains the physical wall velocity
+    # and an interpolated velocity when a wall viscosity (no-slip BC) is used.
     return extract_svector(system.boundary_model.cache.wall_velocity, system, particle)
 end
 
diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl
index cea18484d4..17729ade7c 100644
--- a/src/schemes/fluid/entropically_damped_sph/rhs.jl
+++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl
@@ -24,6 +24,9 @@ function interact!(dv, v_particle_system, u_particle_system,
         rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
         rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
 
+        v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
+        v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+
         p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
         p_b = @inbounds current_pressure(v_neighbor_system, neighbor_system, neighbor)
 
@@ -46,32 +49,32 @@ function interact!(dv, v_particle_system, u_particle_system,
                                             rho_b, pos_diff, distance, grad_kernel,
                                             correction)
 
-        dv_viscosity_ = @inbounds dv_viscosity(particle_system, neighbor_system,
-                                               v_particle_system, v_neighbor_system,
-                                               particle, neighbor, pos_diff, distance,
-                                               sound_speed, m_a, m_b, rho_a, rho_b,
-                                               grad_kernel)
+        dv_particle = Ref(zero(pos_diff))
+        @inbounds dv_viscosity(dv_particle, particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system,
+                               particle, neighbor, pos_diff, distance,
+                               sound_speed, m_a, m_b, rho_a, rho_b,
+                               v_a, v_b, grad_kernel)
 
         # Extra terms in the momentum equation when using a shifting technique
-        dv_tvf = @inbounds dv_shifting(shifting_technique(particle_system),
-                                       particle_system, neighbor_system,
-                                       v_particle_system, v_neighbor_system,
-                                       particle, neighbor, m_a, m_b, rho_a, rho_b,
-                                       pos_diff, distance, grad_kernel, correction)
-
-        dv_surface_tension = surface_tension_force(surface_tension_a, surface_tension_b,
-                                                   particle_system, neighbor_system,
-                                                   particle, neighbor, pos_diff, distance,
-                                                   rho_a, rho_b, grad_kernel)
-
-        dv_adhesion = adhesion_force(surface_tension_a, particle_system, neighbor_system,
-                                     particle, neighbor, pos_diff, distance)
-
-        dv_particle = dv_pressure + dv_viscosity_ + dv_tvf + dv_surface_tension +
-                      dv_adhesion
+        @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system),
+                               particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system,
+                               particle, neighbor, m_a, m_b, rho_a, rho_b,
+                               pos_diff, distance, grad_kernel, correction)
+
+        @inbounds surface_tension_force!(dv_particle, surface_tension_a,
+                                         surface_tension_b,
+                                         particle_system, neighbor_system,
+                                         particle, neighbor, pos_diff, distance,
+                                         rho_a, rho_b, grad_kernel)
+
+        @inbounds adhesion_force!(dv_particle, surface_tension_a, particle_system,
+                                  neighbor_system,
+                                  particle, neighbor, pos_diff, distance)
 
         for i in 1:ndims(particle_system)
-            @inbounds dv[i, particle] += dv_particle[i]
+            @inbounds dv[i, particle] += dv_particle[][i]
         end
 
         v_diff = current_velocity(v_particle_system, particle_system, particle) -
diff --git a/src/schemes/fluid/fluid.jl b/src/schemes/fluid/fluid.jl
index 8725190506..a937a5480a 100644
--- a/src/schemes/fluid/fluid.jl
+++ b/src/schemes/fluid/fluid.jl
@@ -136,31 +136,48 @@ end
     return dv
 end
 
-# This formulation was chosen to be consistent with the used pressure_acceleration formulations
-@propagate_inbounds function continuity_equation!(dv, density_calculator::ContinuityDensity,
+# For compatibility with the old continuity equation function
+@propagate_inbounds function continuity_equation!(dv::AbstractArray,
+                                                  density_calculator::ContinuityDensity,
                                                   particle_system::AbstractFluidSystem,
                                                   neighbor_system,
                                                   v_particle_system, v_neighbor_system,
                                                   particle, neighbor, pos_diff, distance,
                                                   m_b, rho_a, rho_b, grad_kernel)
+    drho_particle = Ref(zero(rho_a))
     vdiff = current_velocity(v_particle_system, particle_system, particle) -
             current_velocity(v_neighbor_system, neighbor_system, neighbor)
 
-    vdiff += continuity_equation_shifting_term(shifting_technique(particle_system),
-                                               particle_system, neighbor_system,
-                                               particle, neighbor, rho_a, rho_b)
+    continuity_equation!(drho_particle, density_calculator, particle_system,
+                         neighbor_system, v_particle_system, v_neighbor_system,
+                         particle, neighbor, pos_diff, distance,
+                         m_b, rho_a, rho_b, vdiff, grad_kernel)
+    dv[end, particle] += drho_particle[]
+end
 
-    dv[end, particle] += rho_a / rho_b * m_b * dot(vdiff, grad_kernel)
+# This formulation was chosen to be consistent with the used pressure_acceleration formulations
+@propagate_inbounds function continuity_equation!(drho_particle,
+                                                  density_calculator::ContinuityDensity,
+                                                  particle_system::AbstractFluidSystem,
+                                                  neighbor_system,
+                                                  v_particle_system, v_neighbor_system,
+                                                  particle, neighbor, pos_diff, distance,
+                                                  m_b, rho_a, rho_b, vdiff, grad_kernel)
+    # vdiff += continuity_equation_shifting_term(shifting_technique(particle_system),
+    #                                            particle_system, neighbor_system,
+    #                                            particle, neighbor, rho_a, rho_b)
+
+    drho_particle[] += div_fast(rho_a, rho_b) * m_b * dot(vdiff, grad_kernel)
 
     # Artificial density diffusion should only be applied to systems representing a fluid
     # with the same physical properties i.e. density and viscosity.
     # TODO: shouldn't be applied to particles on the interface (depends on PR #539)
-    if particle_system === neighbor_system
-        density_diffusion!(dv, density_diffusion(particle_system),
-                           v_particle_system, particle, neighbor,
-                           pos_diff, distance, m_b, rho_a, rho_b, particle_system,
-                           grad_kernel)
-    end
+    # if particle_system === neighbor_system
+    #     density_diffusion!(dv, density_diffusion(particle_system),
+    #                        v_particle_system, particle, neighbor,
+    #                        pos_diff, distance, m_b, rho_a, rho_b, particle_system,
+    #                        grad_kernel)
+    # end
 end
 
 function calculate_dt(v_ode, u_ode, cfl_number, system::AbstractFluidSystem, semi)
diff --git a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
index ca7c4b2c35..25b3d09097 100644
--- a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
+++ b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
@@ -23,6 +23,9 @@ function interact!(dv, v_particle_system, u_particle_system,
         rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
         rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
 
+        v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
+        v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+
         grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
 
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
@@ -46,14 +49,15 @@ function interact!(dv, v_particle_system, u_particle_system,
                                             distance, grad_kernel, nothing)
 
         # Propagate `@inbounds` to the viscosity function, which accesses particle data
-        dv_viscosity_ = @inbounds dv_viscosity(particle_system, neighbor_system,
-                                               v_particle_system, v_neighbor_system,
-                                               particle, neighbor, pos_diff, distance,
-                                               sound_speed, m_a, m_b, rho_a, rho_b,
-                                               grad_kernel)
+        dv_viscosity_ = Ref(zero(pos_diff))
+        @inbounds dv_viscosity(dv_viscosity_, particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system,
+                               particle, neighbor, pos_diff, distance,
+                               sound_speed, m_a, m_b, rho_a, rho_b,
+                               v_a, v_b, grad_kernel)
 
         for i in 1:ndims(particle_system)
-            @inbounds dv[i, particle] += dv_pressure[i] + dv_viscosity_[i]
+            @inbounds dv[i, particle] += dv_pressure[i] + dv_viscosity_[][i]
         end
     end
     return dv
diff --git a/src/schemes/fluid/implicit_incompressible_sph/system.jl b/src/schemes/fluid/implicit_incompressible_sph/system.jl
index 8f02018bbf..a07b52786b 100644
--- a/src/schemes/fluid/implicit_incompressible_sph/system.jl
+++ b/src/schemes/fluid/implicit_incompressible_sph/system.jl
@@ -281,16 +281,21 @@ function calculate_predicted_velocity_and_d_ii_values!(system::ImplicitIncompres
             rho_a = @inbounds current_density(v_particle_system, system, particle)
             rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
 
+            v_a = @inbounds current_velocity(v_particle_system, system, particle)
+            v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+
             grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle)
 
-            dv_viscosity_ = @inbounds dv_viscosity(system, neighbor_system,
-                                                   v_particle_system, v_neighbor_system,
-                                                   particle, neighbor, pos_diff, distance,
-                                                   sound_speed, m_a, m_b, rho_a, rho_b,
-                                                   grad_kernel)
+            dv_viscosity_ = Ref(zero(pos_diff))
+            @inbounds dv_viscosity(dv_viscosity_, system, neighbor_system,
+                                   v_particle_system, v_neighbor_system,
+                                   particle, neighbor, pos_diff, distance,
+                                   sound_speed, m_a, m_b, rho_a, rho_b,
+                                   v_a, v_b, grad_kernel)
             # Add all other non-pressure forces
             for i in 1:ndims(system)
-                @inbounds advection_velocity[i, particle] += time_step * dv_viscosity_[i]
+                @inbounds advection_velocity[i,
+                                             particle] += time_step * dv_viscosity_[][i]
             end
             # Calculate d_ii with eq. 9 in Ihmsen et al. (2013)
             for i in 1:ndims(system)
diff --git a/src/schemes/fluid/pressure_acceleration.jl b/src/schemes/fluid/pressure_acceleration.jl
index 7ffa5850b3..30bb92be8c 100644
--- a/src/schemes/fluid/pressure_acceleration.jl
+++ b/src/schemes/fluid/pressure_acceleration.jl
@@ -7,14 +7,14 @@
 # asymmetric version.
 @inline function pressure_acceleration_summation_density(m_a, m_b, rho_a, rho_b, p_a, p_b,
                                                          W_a)
-    return -m_b * (p_a / rho_a^2 + p_b / rho_b^2) * W_a
+    return -m_b * (div_fast(p_a, rho_a^2) + div_fast(p_b, rho_b^2)) * W_a
 end
 
 # Same as above, but not assuming symmetry of the kernel gradient. To be used with
 # corrections that do not produce a symmetric kernel gradient.
 @inline function pressure_acceleration_summation_density(m_a, m_b, rho_a, rho_b, p_a, p_b,
                                                          W_a, W_b)
-    return -m_b * (p_a / rho_a^2 * W_a - p_b / rho_b^2 * W_b)
+    return -m_b * (div_fast(p_a, rho_a^2) * W_a - div_fast(p_b, rho_b^2) * W_b)
 end
 
 # As shown in "Variational and momentum preservation aspects of Smooth Particle Hydrodynamic
@@ -26,14 +26,16 @@ end
 # asymmetric version.
 @inline function pressure_acceleration_continuity_density(m_a, m_b, rho_a, rho_b, p_a, p_b,
                                                           W_a)
-    return -m_b * (p_a + p_b) / (rho_a * rho_b) * W_a
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    return -m_b * div_fast(p_a + p_b, rho_a * rho_b) * W_a
 end
 
 # Same as above, but not assuming symmetry of the kernel gradient. To be used with
 # corrections that do not produce a symmetric kernel gradient.
 @inline function pressure_acceleration_continuity_density(m_a, m_b, rho_a, rho_b, p_a, p_b,
                                                           W_a, W_b)
-    return -m_b / (rho_a * rho_b) * (p_a * W_a - p_b * W_b)
+    return -div_fast(m_b, rho_a * rho_b) * (p_a * W_a - p_b * W_b)
 end
 
 @doc raw"""
diff --git a/src/schemes/fluid/shifting_techniques.jl b/src/schemes/fluid/shifting_techniques.jl
index ceaff12a21..ed9644b471 100644
--- a/src/schemes/fluid/shifting_techniques.jl
+++ b/src/schemes/fluid/shifting_techniques.jl
@@ -41,11 +41,11 @@ function update_shifting!(system, shifting, v, u, v_ode, u_ode, semi)
 end
 
 # Additional term in the momentum equation due to the shifting technique
-@inline function dv_shifting(shifting, system, neighbor_system,
-                             v_system, v_neighbor_system, particle, neighbor,
-                             m_a, m_b, rho_a, rho_b, pos_diff, distance,
-                             grad_kernel, correction)
-    return zero(grad_kernel)
+@inline function dv_shifting!(dv_particle, shifting, system, neighbor_system,
+                              v_system, v_neighbor_system, particle, neighbor,
+                              m_a, m_b, rho_a, rho_b, pos_diff, distance,
+                              grad_kernel, correction)
+    return dv_particle
 end
 
 # Additional term(s) in the continuity equation due to the shifting technique
@@ -324,22 +324,23 @@ See [`ParticleShiftingTechnique`](@ref).
 struct MomentumEquationTermSun2019 end
 
 # Additional term in the momentum equation due to the shifting technique
-@propagate_inbounds function dv_shifting(shifting::ParticleShiftingTechnique, system,
-                                         neighbor_system,
-                                         v_system, v_neighbor_system, particle, neighbor,
-                                         m_a, m_b, rho_a, rho_b, pos_diff, distance,
-                                         grad_kernel, correction)
-    return dv_shifting(shifting.momentum_equation_term, system, neighbor_system,
-                       v_system, v_neighbor_system, particle, neighbor,
-                       m_a, m_b, rho_a, rho_b, pos_diff, distance,
-                       grad_kernel, correction)
+@propagate_inbounds function dv_shifting!(dv_particle,
+                                          shifting::ParticleShiftingTechnique,
+                                          system, neighbor_system,
+                                          v_system, v_neighbor_system, particle, neighbor,
+                                          m_a, m_b, rho_a, rho_b, pos_diff, distance,
+                                          grad_kernel, correction)
+    return dv_shifting!(dv_particle, shifting.momentum_equation_term, system,
+                        neighbor_system, v_system, v_neighbor_system,
+                        particle, neighbor, m_a, m_b, rho_a, rho_b,
+                        pos_diff, distance, grad_kernel, correction)
 end
 
-@propagate_inbounds function dv_shifting(::MomentumEquationTermSun2019,
-                                         system, neighbor_system,
-                                         v_system, v_neighbor_system,
-                                         particle, neighbor, m_a, m_b, rho_a, rho_b,
-                                         pos_diff, distance, grad_kernel, correction)
+@propagate_inbounds function dv_shifting!(dv_particle, ::MomentumEquationTermSun2019,
+                                          system, neighbor_system,
+                                          v_system, v_neighbor_system,
+                                          particle, neighbor, m_a, m_b, rho_a, rho_b,
+                                          pos_diff, distance, grad_kernel, correction)
     delta_v_a = delta_v(system, particle)
     delta_v_b = delta_v(neighbor_system, neighbor)
 
@@ -347,8 +348,11 @@ end
     v_b = current_velocity(v_neighbor_system, neighbor_system, neighbor)
 
     tensor_product = v_a * delta_v_a' + v_b * delta_v_b'
-    return m_b / rho_b *
-           (tensor_product * grad_kernel + v_a * dot(delta_v_a - delta_v_b, grad_kernel))
+    dv_particle[] += m_b / rho_b *
+                     (tensor_product * grad_kernel +
+                      v_a * dot(delta_v_a - delta_v_b, grad_kernel))
+
+    return dv_particle
 end
 
 # `ParticleShiftingTechnique{<:Any, <:Any, true}` means `modify_continuity_equation=true`
@@ -568,10 +572,11 @@ struct TransportVelocityAdami{modify_continuity_equation, T <: Real} <:
     end
 end
 
-@propagate_inbounds function dv_shifting(::TransportVelocityAdami, system, neighbor_system,
-                                         v_system, v_neighbor_system, particle, neighbor,
-                                         m_a, m_b, rho_a, rho_b, pos_diff, distance,
-                                         grad_kernel, correction)
+@propagate_inbounds function dv_shifting!(dv_particle, ::TransportVelocityAdami,
+                                          system, neighbor_system,
+                                          v_system, v_neighbor_system, particle, neighbor,
+                                          m_a, m_b, rho_a, rho_b, pos_diff, distance,
+                                          grad_kernel, correction)
     v_a = current_velocity(v_system, system, particle)
     delta_v_a = delta_v(system, particle)
 
@@ -588,9 +593,11 @@ end
     #   m_b * (A_a + A_b) / (ρ_a * ρ_b) * ∇W_ab.
     # In order to obtain this, we pass `p_a = A_a` and `p_b = A_b` to the
     # `pressure_acceleration` function.
-    return pressure_acceleration(system, neighbor_system, particle, neighbor,
-                                 m_a, m_b, A_a, A_b, rho_a, rho_b, pos_diff,
-                                 distance, grad_kernel, correction)
+    dv_particle[] += pressure_acceleration(system, neighbor_system, particle, neighbor,
+                                           m_a, m_b, A_a, A_b, rho_a, rho_b,
+                                           pos_diff, distance, grad_kernel, correction)
+
+    return dv_particle
 end
 
 # The function above misuses the pressure acceleration function by passing a Matrix as `p_a`.
diff --git a/src/schemes/fluid/surface_tension.jl b/src/schemes/fluid/surface_tension.jl
index 7a0232c51c..48874fa0b0 100644
--- a/src/schemes/fluid/surface_tension.jl
+++ b/src/schemes/fluid/surface_tension.jl
@@ -162,66 +162,76 @@ end
 end
 
 # Skip
-@inline function surface_tension_force(surface_tension_a, surface_tension_b,
-                                       particle_system, neighbor_system, particle, neighbor,
-                                       pos_diff, distance, rho_a, rho_b, grad_kernel)
-    return zero(pos_diff)
+@inline function surface_tension_force!(dv_particle, surface_tension_a, surface_tension_b,
+                                        particle_system, neighbor_system, particle,
+                                        neighbor, pos_diff, distance, rho_a, rho_b,
+                                        grad_kernel)
+    return dv_particle
 end
 
-@inline function surface_tension_force(surface_tension_a::CohesionForceAkinci,
-                                       surface_tension_b::CohesionForceAkinci,
-                                       particle_system::AbstractFluidSystem,
-                                       neighbor_system::AbstractFluidSystem,
-                                       particle, neighbor, pos_diff, distance,
-                                       rho_a, rho_b, grad_kernel)
+@inline function surface_tension_force!(dv_particle,
+                                        surface_tension_a::CohesionForceAkinci,
+                                        surface_tension_b::CohesionForceAkinci,
+                                        particle_system::AbstractFluidSystem,
+                                        neighbor_system::AbstractFluidSystem,
+                                        particle, neighbor, pos_diff, distance,
+                                        rho_a, rho_b, grad_kernel)
     # No cohesion with oneself. See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff)
+    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle
 
     m_b = hydrodynamic_mass(neighbor_system, neighbor)
     support_radius = compact_support(smoothing_kernel,
                                      smoothing_length(particle_system, particle))
 
-    return cohesion_force_akinci(surface_tension_a, support_radius, m_b, pos_diff, distance)
+    dv_particle[] += cohesion_force_akinci(surface_tension_a, support_radius, m_b,
+                                           pos_diff, distance)
+
+    return dv_particle
 end
 
-@inline function surface_tension_force(surface_tension_a::SurfaceTensionAkinci,
-                                       surface_tension_b::SurfaceTensionAkinci,
-                                       particle_system::AbstractFluidSystem,
-                                       neighbor_system::AbstractFluidSystem, particle,
-                                       neighbor,
-                                       pos_diff, distance, rho_a, rho_b, grad_kernel)
+@inline function surface_tension_force!(dv_particle,
+                                        surface_tension_a::SurfaceTensionAkinci,
+                                        surface_tension_b::SurfaceTensionAkinci,
+                                        particle_system::AbstractFluidSystem,
+                                        neighbor_system::AbstractFluidSystem, particle,
+                                        neighbor,
+                                        pos_diff, distance, rho_a, rho_b, grad_kernel)
     (; smoothing_kernel) = particle_system
     (; surface_tension_coefficient) = surface_tension_a
 
     smoothing_length_ = smoothing_length(particle_system, particle)
     # No surface tension with oneself. See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff)
+    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle
 
     m_b = hydrodynamic_mass(neighbor_system, neighbor)
     n_a = surface_normal(particle_system, particle)
     n_b = surface_normal(neighbor_system, neighbor)
     support_radius = compact_support(smoothing_kernel, smoothing_length_)
 
-    return cohesion_force_akinci(surface_tension_a, support_radius, m_b,
-                                 pos_diff, distance) .-
-           (surface_tension_coefficient * (n_a - n_b) * smoothing_length_)
-end
+    dv_particle[] += cohesion_force_akinci(surface_tension_a, support_radius, m_b,
+                                           pos_diff, distance)
+    dv_particle[] -= surface_tension_coefficient * (n_a - n_b) * smoothing_length_
 
-@inline function surface_tension_force(surface_tension_a::SurfaceTensionMorris,
-                                       surface_tension_b::SurfaceTensionMorris,
-                                       particle_system::AbstractFluidSystem,
-                                       neighbor_system::AbstractFluidSystem,
-                                       particle, neighbor, pos_diff, distance,
-                                       rho_a, rho_b, grad_kernel)
+    return dv_particle
+end
+@inline function surface_tension_force!(dv_particle,
+                                        surface_tension_a::SurfaceTensionMorris,
+                                        surface_tension_b::SurfaceTensionMorris,
+                                        particle_system::AbstractFluidSystem,
+                                        neighbor_system::AbstractFluidSystem,
+                                        particle, neighbor, pos_diff, distance,
+                                        rho_a, rho_b, grad_kernel)
     (; surface_tension_coefficient) = surface_tension_a
 
     # No surface tension with oneself. See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff)
+    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle
 
     n_a = surface_normal(particle_system, particle)
     curvature_a = curvature(particle_system, particle)
 
-    return -surface_tension_coefficient / rho_a * curvature_a * n_a
+    dv_particle[] -= surface_tension_coefficient / rho_a * curvature_a * n_a
+
+    return dv_particle
 end
 
 function compute_stress_tensors!(system, surface_tension, v, u, v_ode, u_ode, semi, t)
@@ -277,46 +287,54 @@ function compute_surface_delta_function!(system, ::SurfaceTensionMomentumMorris,
     return system
 end
 
-@inline function surface_tension_force(surface_tension_a::SurfaceTensionMomentumMorris,
-                                       surface_tension_b::SurfaceTensionMomentumMorris,
-                                       particle_system::AbstractFluidSystem,
-                                       neighbor_system::AbstractFluidSystem,
-                                       particle, neighbor, pos_diff, distance,
-                                       rho_a, rho_b, grad_kernel)
+@inline function surface_tension_force!(dv_particle,
+                                        surface_tension_a::SurfaceTensionMomentumMorris,
+                                        surface_tension_b::SurfaceTensionMomentumMorris,
+                                        particle_system::AbstractFluidSystem,
+                                        neighbor_system::AbstractFluidSystem,
+                                        particle, neighbor, pos_diff, distance,
+                                        rho_a, rho_b, grad_kernel)
     (; surface_tension_coefficient) = surface_tension_a
 
     # No surface tension with oneself. See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff)
+    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle
 
     S_a = stress_tensor(particle_system, particle)
     S_b = stress_tensor(neighbor_system, neighbor)
 
     m_b = hydrodynamic_mass(neighbor_system, neighbor)
 
-    return surface_tension_coefficient * m_b * (S_a + S_b) / (rho_a * rho_b) * grad_kernel
+    dv_particle[] += surface_tension_coefficient * m_b * (S_a + S_b) / (rho_a * rho_b) *
+                     grad_kernel
+
+    return dv_particle
 end
 
-@inline function adhesion_force(surface_tension::AkinciTypeSurfaceTension,
-                                particle_system::AbstractFluidSystem,
-                                neighbor_system::AbstractBoundarySystem, particle, neighbor,
-                                pos_diff, distance)
+@inline function adhesion_force!(dv_particle,
+                                 surface_tension::AkinciTypeSurfaceTension,
+                                 particle_system::AbstractFluidSystem,
+                                 neighbor_system::AbstractBoundarySystem, particle,
+                                 neighbor,
+                                 pos_diff, distance)
     (; adhesion_coefficient) = neighbor_system
 
     # No adhesion with oneself. See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff)
+    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle
 
     # No reason to calculate the adhesion force if adhesion coefficient is near zero
-    abs(adhesion_coefficient) < eps() && return zero(pos_diff)
+    abs(adhesion_coefficient) < eps() && return dv_particle
 
     m_b = hydrodynamic_mass(neighbor_system, neighbor)
 
     support_radius = compact_support(particle_system.smoothing_kernel,
                                      smoothing_length(particle_system, particle))
-    return adhesion_force_akinci(surface_tension, support_radius, m_b, pos_diff, distance,
-                                 adhesion_coefficient)
+    dv_particle[] += adhesion_force_akinci(surface_tension, support_radius, m_b, pos_diff,
+                                           distance, adhesion_coefficient)
+
+    return dv_particle
 end
 
-@inline function adhesion_force(surface_tension, particle_system, neighbor_system, particle,
-                                neighbor, pos_diff, distance)
-    return zero(pos_diff)
+@inline function adhesion_force!(dv_particle, surface_tension, particle_system,
+                                 neighbor_system, particle, neighbor, pos_diff, distance)
+    return dv_particle
 end
diff --git a/src/schemes/fluid/viscosity.jl b/src/schemes/fluid/viscosity.jl
index 2a6c51ba71..03d9cc0f85 100644
--- a/src/schemes/fluid/viscosity.jl
+++ b/src/schemes/fluid/viscosity.jl
@@ -1,33 +1,43 @@
+@propagate_inbounds function viscous_velocity(v, system, particle, v_particle)
+    # Regular particle velocity is used for the viscosity calculation by default
+    return v_particle
+end
+
 # Unpack the neighboring systems viscosity to dispatch on the viscosity type.
 # This function is only necessary to allow `nothing` as viscosity.
 # Otherwise, we could just apply the viscosity as a function directly.
-@propagate_inbounds function dv_viscosity(particle_system, neighbor_system,
+@propagate_inbounds function dv_viscosity(dv_particle,
+                                          particle_system, neighbor_system,
                                           v_particle_system, v_neighbor_system,
                                           particle, neighbor, pos_diff, distance,
-                                          sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+                                          sound_speed, m_a, m_b, rho_a, rho_b,
+                                          v_a, v_b, grad_kernel)
     viscosity = viscosity_model(particle_system, neighbor_system)
 
-    return dv_viscosity(viscosity, particle_system, neighbor_system,
+    return dv_viscosity(dv_particle, viscosity, particle_system, neighbor_system,
                         v_particle_system, v_neighbor_system,
                         particle, neighbor, pos_diff, distance,
-                        sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+                        sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
 end
 
-@propagate_inbounds function dv_viscosity(viscosity, particle_system, neighbor_system,
+@propagate_inbounds function dv_viscosity(dv_particle,
+                                          viscosity, particle_system, neighbor_system,
                                           v_particle_system, v_neighbor_system,
                                           particle, neighbor, pos_diff, distance,
-                                          sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
-    return viscosity(particle_system, neighbor_system,
+                                          sound_speed, m_a, m_b, rho_a, rho_b,
+                                          v_a, v_b, grad_kernel)
+    return viscosity(dv_particle, particle_system, neighbor_system,
                      v_particle_system, v_neighbor_system,
                      particle, neighbor, pos_diff, distance,
-                     sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+                     sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
 end
 
-@inline function dv_viscosity(viscosity::Nothing, particle_system, neighbor_system,
+@inline function dv_viscosity(dv_particle, viscosity::Nothing,
+                              particle_system, neighbor_system,
                               v_particle_system, v_neighbor_system,
                               particle, neighbor, pos_diff, distance,
-                              sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
-    return zero(pos_diff)
+                              sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
+    return dv_particle
 end
 
 @doc raw"""
@@ -76,13 +86,14 @@ struct ViscosityMorris{ELTYPE}
     end
 end
 
-function kinematic_viscosity(system, viscosity::ViscosityMorris, smoothing_length,
-                             sound_speed)
+@inline function kinematic_viscosity(system, viscosity::ViscosityMorris, smoothing_length,
+                                     sound_speed)
     return viscosity.nu
 end
 
 @propagate_inbounds function (viscosity::Union{ArtificialViscosityMonaghan,
-                                               ViscosityMorris})(particle_system,
+                                               ViscosityMorris})(dv_particle,
+                                                                 particle_system,
                                                                  neighbor_system,
                                                                  v_particle_system,
                                                                  v_neighbor_system,
@@ -90,33 +101,34 @@ end
                                                                  pos_diff, distance,
                                                                  sound_speed,
                                                                  m_a, m_b, rho_a, rho_b,
-                                                                 grad_kernel)
+                                                                 v_a, v_b, grad_kernel)
     rho_mean = (rho_a + rho_b) / 2
 
-    v_a = viscous_velocity(v_particle_system, particle_system, particle)
-    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
-    v_diff = v_a - v_b
+    v_visc_a = viscous_velocity(v_particle_system, particle_system, particle, v_a)
+    v_visc_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b)
+    v_diff = v_visc_a - v_visc_b
 
     smoothing_length_particle = smoothing_length(particle_system, particle)
     smoothing_length_neighbor = smoothing_length(particle_system, neighbor)
     smoothing_length_average = (smoothing_length_particle + smoothing_length_neighbor) / 2
 
-    nu_a = kinematic_viscosity(particle_system,
-                               viscosity_model(neighbor_system, particle_system),
-                               smoothing_length_particle, sound_speed)
-    nu_b = kinematic_viscosity(neighbor_system,
-                               viscosity_model(particle_system, neighbor_system),
-                               smoothing_length_neighbor, sound_speed)
+    nu_a = 0#kinematic_viscosity(particle_system,
+                            #    viscosity_model(neighbor_system, particle_system),
+                            #    smoothing_length_particle, sound_speed)
+    nu_b = 0#kinematic_viscosity(neighbor_system,
+                            #    viscosity_model(particle_system, neighbor_system),
+                            #    smoothing_length_neighbor, sound_speed)
 
-    pi_ab = viscosity(sound_speed, v_diff, pos_diff, distance, rho_mean, rho_a, rho_b,
-                      smoothing_length_average, grad_kernel, nu_a, nu_b)
+    viscosity(dv_particle, sound_speed, v_diff, pos_diff, distance, rho_mean, rho_a, rho_b,
+              smoothing_length_average, grad_kernel, nu_a, nu_b, m_b)
 
-    return m_b * pi_ab
+    return dv_particle
 end
 
-@inline function (viscosity::ArtificialViscosityMonaghan)(c, v_diff, pos_diff, distance,
+@inline function (viscosity::ArtificialViscosityMonaghan)(dv_particle, c, v_diff,
+                                                          pos_diff, distance,
                                                           rho_mean, rho_a, rho_b, h,
-                                                          grad_kernel, nu_a, nu_b)
+                                                          grad_kernel, nu_a, nu_b, m_b)
     (; alpha, beta, epsilon) = viscosity
 
     # v_ab ⋅ r_ab
@@ -127,31 +139,35 @@ end
     # approaching particles and turn it off for receding particles. In this way, the
     # viscosity is used for shocks and not rarefactions."
     if vr < 0
-        mu = h * vr / (distance^2 + epsilon * h^2)
-        return (alpha * c * mu + beta * mu^2) / rho_mean * grad_kernel
+        # Since this is one of the most performance critical functions, using fast divisions
+        # here gives a significant speedup on GPUs.
+        mu = div_fast(h * vr, distance^2 + epsilon * h^2)
+        dv_particle[] += div_fast(m_b * alpha * c * mu + beta * mu^2, rho_mean) * grad_kernel
     end
 
-    return zero(v_diff)
+    return dv_particle
 end
 
-@inline function (viscosity::ViscosityMorris)(c, v_diff, pos_diff, distance, rho_mean,
-                                              rho_a, rho_b, h, grad_kernel, nu_a,
-                                              nu_b)
+@inline function (viscosity::ViscosityMorris)(dv_particle, c, v_diff, pos_diff,
+                                              distance, rho_mean, rho_a, rho_b, h,
+                                              grad_kernel, nu_a, nu_b, m_b)
     epsilon = viscosity.epsilon
 
     mu_a = nu_a * rho_a
     mu_b = nu_b * rho_b
 
-    return (mu_a + mu_b) / (rho_a * rho_b) * dot(pos_diff, grad_kernel) /
-           (distance^2 + epsilon * h^2) * v_diff
+    dv_particle[] += m_b * (mu_a + mu_b) / (rho_a * rho_b) * dot(pos_diff, grad_kernel) /
+                     (distance^2 + epsilon * h^2) * v_diff
+
+    return dv_particle
 end
 
 # See, e.g.,
 # Joseph J. Monaghan. "Smoothed Particle Hydrodynamics".
 # In: Reports on Progress in Physics (2005), pages 1703-1759.
 # [doi: 10.1088/0034-4885/68/8/r01](http://dx.doi.org/10.1088/0034-4885/68/8/R01)
-function kinematic_viscosity(system, viscosity::ArtificialViscosityMonaghan,
-                             smoothing_length, sound_speed)
+@inline function kinematic_viscosity(system, viscosity::ArtificialViscosityMonaghan,
+                                     smoothing_length, sound_speed)
     (; alpha) = viscosity
 
     return alpha * smoothing_length * sound_speed / (2 * ndims(system) + 4)
@@ -177,8 +193,9 @@ struct ViscosityAdami{ELTYPE}
     end
 end
 
-function adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel,
-                               m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon)
+@inline function adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff,
+                                        distance, grad_kernel, m_a, m_b, rho_a, rho_b,
+                                        v_diff, nu_a, nu_b, epsilon)
     eta_a = nu_a * rho_a
     eta_b = nu_b * rho_b
 
@@ -200,14 +217,17 @@ function adami_viscosity_force(smoothing_length_average, pos_diff, distance, gra
     # See issue: https://github.com/trixi-framework/TrixiParticles.jl/issues/394
     visc = (volume_a^2 + volume_b^2) * dot(grad_kernel, pos_diff) * tmp / m_a
 
-    return visc .* v_diff
+    dv_particle[] += visc .* v_diff
+
+    return dv_particle
 end
 
-@inline function (viscosity::ViscosityAdami)(particle_system, neighbor_system,
+@inline function (viscosity::ViscosityAdami)(dv_particle, particle_system,
+                                             neighbor_system,
                                              v_particle_system, v_neighbor_system,
                                              particle, neighbor, pos_diff,
                                              distance, sound_speed, m_a, m_b,
-                                             rho_a, rho_b, grad_kernel)
+                                             rho_a, rho_b, v_a, v_b, grad_kernel)
     epsilon = viscosity.epsilon
 
     smoothing_length_particle = smoothing_length(particle_system, particle)
@@ -221,23 +241,20 @@ end
                                viscosity_model(particle_system, neighbor_system),
                                smoothing_length_neighbor, sound_speed)
 
-    v_a = viscous_velocity(v_particle_system, particle_system, particle)
-    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
+    v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a)
+    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b)
     v_diff = v_a - v_b
 
-    return adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel,
-                                 m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon)
+    return adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff,
+                                  distance, grad_kernel, m_a, m_b, rho_a, rho_b,
+                                  v_diff, nu_a, nu_b, epsilon)
 end
 
-function kinematic_viscosity(system, viscosity::ViscosityAdami, smoothing_length,
-                             sound_speed)
+@inline function kinematic_viscosity(system, viscosity::ViscosityAdami, smoothing_length,
+                                     sound_speed)
     return viscosity.nu
 end
 
-@propagate_inbounds function viscous_velocity(v, system, particle)
-    return current_velocity(v, system, particle)
-end
-
 @doc raw"""
     ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.01)
 
@@ -289,13 +306,15 @@ end
 
 ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityAdamiSGS(nu, C_S, epsilon)
 
-@propagate_inbounds function (viscosity::ViscosityAdamiSGS)(particle_system,
+@propagate_inbounds function (viscosity::ViscosityAdamiSGS)(dv_particle,
+                                                            particle_system,
                                                             neighbor_system,
                                                             v_particle_system,
                                                             v_neighbor_system,
                                                             particle, neighbor, pos_diff,
                                                             distance, sound_speed, m_a, m_b,
-                                                            rho_a, rho_b, grad_kernel)
+                                                            rho_a, rho_b,
+                                                            v_a, v_b, grad_kernel)
     epsilon = viscosity.epsilon
 
     smoothing_length_particle = smoothing_length(particle_system, particle)
@@ -309,8 +328,8 @@ ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityAdamiSGS(nu, C_S, eps
                                viscosity_model(particle_system, neighbor_system),
                                smoothing_length_neighbor, sound_speed)
 
-    v_a = viscous_velocity(v_particle_system, particle_system, particle)
-    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
+    v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a)
+    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b)
     v_diff = v_a - v_b
 
     # ------------------------------------------------------------------------------
@@ -341,8 +360,9 @@ ViscosityAdamiSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityAdamiSGS(nu, C_S, eps
     nu_a = nu_a + nu_SGS
     nu_b = nu_b + nu_SGS
 
-    return adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel,
-                                 m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon)
+    return adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff,
+                                  distance, grad_kernel, m_a, m_b, rho_a, rho_b,
+                                  v_diff, nu_a, nu_b, epsilon)
 end
 
 function kinematic_viscosity(system, viscosity::ViscosityAdamiSGS, smoothing_length,
@@ -401,13 +421,15 @@ end
 
 ViscosityMorrisSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityMorrisSGS(nu, C_S, epsilon)
 
-@propagate_inbounds function (viscosity::ViscosityMorrisSGS)(particle_system,
+@propagate_inbounds function (viscosity::ViscosityMorrisSGS)(dv_particle,
+                                                             particle_system,
                                                              neighbor_system,
                                                              v_particle_system,
                                                              v_neighbor_system,
                                                              particle, neighbor, pos_diff,
                                                              distance, sound_speed, m_a,
-                                                             m_b, rho_a, rho_b, grad_kernel)
+                                                             m_b, rho_a, rho_b,
+                                                             v_a, v_b, grad_kernel)
     epsilon = viscosity.epsilon
 
     smoothing_length_particle = smoothing_length(particle_system, particle)
@@ -421,8 +443,8 @@ ViscosityMorrisSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityMorrisSGS(nu, C_S, e
                                viscosity_model(particle_system, neighbor_system),
                                smoothing_length_neighbor, sound_speed)
 
-    v_a = viscous_velocity(v_particle_system, particle_system, particle)
-    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
+    v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a)
+    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b)
     v_diff = v_a - v_b
 
     # SGS part: Compute the subgrid-scale eddy viscosity.
@@ -438,9 +460,10 @@ ViscosityMorrisSGS(; nu, C_S=0.1, epsilon=0.001) = ViscosityMorrisSGS(nu, C_S, e
     mu_a = nu_a_eff * rho_a
     mu_b = nu_b_eff * rho_b
 
-    force_Morris = (mu_a + mu_b) / (rho_a * rho_b) * (dot(pos_diff, grad_kernel)) /
-                   (distance^2 + epsilon * smoothing_length_average^2) * v_diff
-    return m_b * force_Morris
+    dv_particle[] += m_b * (mu_a + mu_b) / (rho_a * rho_b) * (dot(pos_diff, grad_kernel)) /
+                     (distance^2 + epsilon * smoothing_length_average^2) * v_diff
+
+    return dv_particle
 end
 
 function kinematic_viscosity(system, viscosity::ViscosityMorrisSGS, smoothing_length,
@@ -476,24 +499,24 @@ function ViscosityCarreauYasuda(; nu0, nu_inf, lambda, a, n, epsilon=0.01)
     ViscosityCarreauYasuda(nu0, nu_inf, lambda, a, n, epsilon)
 end
 
-@propagate_inbounds function (viscosity::ViscosityCarreauYasuda)(particle_system,
+@propagate_inbounds function (viscosity::ViscosityCarreauYasuda)(dv_particle,
+                                                                 particle_system,
                                                                  neighbor_system,
                                                                  v_particle_system,
                                                                  v_neighbor_system,
                                                                  particle, neighbor,
                                                                  pos_diff, distance,
                                                                  sound_speed,
-                                                                 m_a, m_b,
-                                                                 rho_a, rho_b,
-                                                                 grad_kernel)
+                                                                 m_a, m_b, rho_a, rho_b,
+                                                                 v_a, v_b, grad_kernel)
     epsilon = viscosity.epsilon
 
     smoothing_length_particle = smoothing_length(particle_system, particle)
     smoothing_length_neighbor = smoothing_length(particle_system, neighbor)
     smoothing_length_average = (smoothing_length_particle + smoothing_length_neighbor) / 2
 
-    v_a = viscous_velocity(v_particle_system, particle_system, particle)
-    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
+    v_a = viscous_velocity(v_particle_system, particle_system, particle, v_a)
+    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor, v_b)
     v_diff = v_a - v_b
 
     gamma_dot = norm(v_diff) / (distance + epsilon)
@@ -504,11 +527,12 @@ end
     nu_a = nu_eff
     nu_b = nu_eff
 
-    return adami_viscosity_force(smoothing_length_average, pos_diff, distance, grad_kernel,
-                                 m_a, m_b, rho_a, rho_b, v_diff, nu_a, nu_b, epsilon)
+    return adami_viscosity_force!(dv_particle, smoothing_length_average, pos_diff,
+                                  distance, grad_kernel, m_a, m_b, rho_a, rho_b,
+                                  v_diff, nu_a, nu_b, epsilon)
 end
 
-function kinematic_viscosity(system, viscosity::ViscosityCarreauYasuda,
-                             smoothing_length, sound_speed)
+@inline function kinematic_viscosity(system, viscosity::ViscosityCarreauYasuda,
+                                     smoothing_length, sound_speed)
     return viscosity.nu0
 end
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
index acb0f06894..dc8364cd79 100644
--- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -14,119 +14,1161 @@ function interact!(dv, v_particle_system, u_particle_system,
 
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system)
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
 
-    # In order to visualize quantities like pressure forces or viscosity forces, uncomment
-    # the following code and the two other lines below that are marked as "debug example".
-    # debug_array = zeros(ndims(particle_system), nparticles(particle_system))
-
-    # Loop over all pairs of particles and neighbors within the kernel cutoff
-    foreach_point_neighbor(particle_system, neighbor_system,
-                           system_coords, neighbor_system_coords, semi;
-                           points=each_integrated_particle(particle_system)) do particle,
-                                                                                neighbor,
-                                                                                pos_diff,
-                                                                                distance
-        # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
-        # in bounds of the respective system. For performance reasons, we use `@inbounds`
-        # in this hot loop to avoid bounds checking when extracting particle quantities.
-        rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
-        rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
-        rho_mean = (rho_a + rho_b) / 2
-
-        # Determine correction factors.
-        # This can be ignored, as these are all 1 when no correction is used.
-        (viscosity_correction, pressure_correction,
-         surface_tension_correction) = free_surface_correction(correction, particle_system,
-                                                               rho_mean)
-
-        grad_kernel = smoothing_kernel_grad(particle_system, pos_diff, distance, particle)
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient divides
+    # by zero. To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the compact support `c`, `distance^2` is in
+    # the order of `c^2`, so we need to check `distance < sqrt(eps(c^2))`.
+    # Note that `sqrt(eps(c^2)) != eps(c)`.
+    compact_support_ = compact_support(particle_system, neighbor_system)
+    almostzero = sqrt(eps(compact_support_^2))
 
+    @threaded semi for particle in each_integrated_particle(particle_system)
+        # We are looping over the particles of `particle_system`, so it is guaranteed
+        # that `particle` is in bounds of `particle_system`.
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
-        m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
-
-        # The following call is equivalent to
-        #     `p_a = current_pressure(v_particle_system, particle_system, particle)`
-        #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
-        # Only when the neighbor system is a `WallBoundarySystem` or a `TotalLagrangianSPHSystem`
-        # with the boundary model `PressureMirroring`, this will return `p_b = p_a`, which is
-        # the pressure of the fluid particle.
-        p_a,
-        p_b = @inbounds particle_neighbor_pressure(v_particle_system,
-                                                   v_neighbor_system,
-                                                   particle_system, neighbor_system,
-                                                   particle, neighbor)
-
-        dv_pressure = pressure_correction *
-                      pressure_acceleration(particle_system, neighbor_system,
-                                            particle, neighbor,
-                                            m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
-                                            distance, grad_kernel, correction)
-
-        # Propagate `@inbounds` to the viscosity function, which accesses particle data
-        dv_viscosity_ = viscosity_correction *
-                        @inbounds dv_viscosity(particle_system, neighbor_system,
-                                               v_particle_system, v_neighbor_system,
-                                               particle, neighbor, pos_diff, distance,
-                                               sound_speed, m_a, m_b, rho_a, rho_b,
-                                               grad_kernel)
-
-        # Extra terms in the momentum equation when using a shifting technique
-        dv_tvf = @inbounds dv_shifting(shifting_technique(particle_system),
-                                       particle_system, neighbor_system,
-                                       v_particle_system, v_neighbor_system,
-                                       particle, neighbor, m_a, m_b, rho_a, rho_b,
-                                       pos_diff, distance, grad_kernel, correction)
-
-        dv_surface_tension = surface_tension_correction *
-                             surface_tension_force(surface_tension_a, surface_tension_b,
-                                                   particle_system, neighbor_system,
-                                                   particle, neighbor, pos_diff, distance,
-                                                   rho_a, rho_b, grad_kernel)
-
-        dv_adhesion = adhesion_force(surface_tension_a, particle_system, neighbor_system,
-                                     particle, neighbor, pos_diff, distance)
-
-        dv_particle = dv_pressure + dv_viscosity_ + dv_tvf + dv_surface_tension +
-                      dv_adhesion
-
-        for i in 1:ndims(particle_system)
-            @inbounds dv[i, particle] += dv_particle[i]
-            # Debug example
-            # debug_array[i, particle] += dv_pressure[i]
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
+
+        # In 3D, this function can combine velocity and density load into one wide load,
+        # which gives a significant speedup on GPUs.
+        (v_a,
+         rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                 particle)
+
+        # Accumulate the RHS contributions over all neighbors before writing to `dv`,
+        # to reduce the number of memory writes.
+        # Note that we need a `Ref` in order to be able to update these variables
+        # inside the closure in the `foreach_neighbor` loop.
+        dv_particle = Ref(zero(v_a))
+        drho_particle = Ref(zero(rho_a))
+
+        # Loop over all neighbors within the kernel cutoff
+        @inbounds PointNeighbors.foreach_neighbor(system_coords, neighbor_system_coords,
+                                                  neighborhood_search,
+                                                  particle) do particle, neighbor,
+                                                               pos_diff, distance
+            # Skip neighbors with the same position because the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(particle_system, distance, almostzero) && return
+
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                       distance, particle)
+
+            # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
+            m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+            (v_b,
+             rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
+                                                     neighbor)
+            rho_mean = (rho_a + rho_b) / 2
+            vdiff = v_a - v_b
+
+            # The following call is equivalent to
+            #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+            # Only when the neighbor system is a `WallBoundarySystem`
+            # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+            # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+            p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                                              neighbor, p_a)
+
+            # For `ContinuityDensity` without correction, this is equivalent to
+            # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel
+            dv_pressure = pressure_acceleration(particle_system, neighbor_system,
+                                                particle, neighbor,
+                                                m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
+                                                distance, grad_kernel, correction)
+
+            # Determine correction factors.
+            # This can usually be ignored, as these are all 1 when no correction is used.
+            (viscosity_correction, pressure_correction,
+             surface_tension_correction) = free_surface_correction(correction,
+                                                                   particle_system,
+                                                                   rho_a, rho_b)
+
+            # Accumulate contributions over all neighbors
+            dv_particle[] += dv_pressure * pressure_correction
+
+            # Propagate `@inbounds` to the viscosity function, which accesses particle data
+            @inbounds dv_viscosity(dv_particle,
+                                   particle_system, neighbor_system,
+                                   v_particle_system, v_neighbor_system,
+                                   particle, neighbor, pos_diff, distance,
+                                   sound_speed, m_a, m_b, rho_a, rho_b,
+                                   v_a, v_b, grad_kernel)
+
+            # Extra terms in the momentum equation when using a shifting technique
+            @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system),
+                                   particle_system, neighbor_system,
+                                   v_particle_system, v_neighbor_system,
+                                   particle, neighbor, m_a, m_b, rho_a, rho_b,
+                                   pos_diff, distance, grad_kernel, correction)
+
+            # TODO surface_tension_correction
+            @inbounds surface_tension_force!(dv_particle, surface_tension_a,
+                                             surface_tension_b,
+                                             particle_system, neighbor_system,
+                                             particle, neighbor, pos_diff, distance,
+                                             rho_a, rho_b, grad_kernel)
+
+            @inbounds adhesion_force!(dv_particle, surface_tension_a, particle_system,
+                                      neighbor_system,
+                                      particle, neighbor, pos_diff, distance)
+
+            # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
+            # Propagate `@inbounds` to the continuity equation, which accesses particle data
+            @inbounds continuity_equation!(drho_particle, density_calculator,
+                                           particle_system, neighbor_system,
+                                           v_particle_system, v_neighbor_system,
+                                           particle, neighbor, pos_diff, distance,
+                                           m_b, rho_a, rho_b, vdiff, grad_kernel)
         end
 
-        # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
-        # Propagate `@inbounds` to the continuity equation, which accesses particle data
-        @inbounds continuity_equation!(dv, density_calculator, particle_system,
-                                       neighbor_system, v_particle_system,
-                                       v_neighbor_system, particle, neighbor,
-                                       pos_diff, distance, m_b, rho_a, rho_b, grad_kernel)
+        for i in eachindex(dv_particle[])
+            @inbounds dv[i, particle] += dv_particle[][i]
+        end
+        @inbounds dv[end, particle] += drho_particle[]
     end
-    # Debug example
-    # periodic_box = neighborhood_search.periodic_box
-    # Note: this saves a file in every stage of the integrator
-    # if !@isdefined iter; iter = 0; end
-    # TODO: This call should use public API. This requires some additional changes to simplify the calls.
-    # trixi2vtk(v_particle_system, u_particle_system, -1.0, particle_system, periodic_box, debug=debug_array, prefix="debug", iter=iter += 1)
 
     return dv
 end
 
-@propagate_inbounds function particle_neighbor_pressure(v_particle_system,
-                                                        v_neighbor_system,
-                                                        particle_system, neighbor_system,
-                                                        particle, neighbor)
-    p_a = current_pressure(v_particle_system, particle_system, particle)
-    p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)
 
-    return p_a, p_b
+# for cell in cells
+#     for point in cell
+#         for neighbor_cell in neighbor_cells
+
+#             for neighbor in neighbor_cell
+#
+# 3.84 ms vs 2.92 ms for the interact! above
+function interact_localmem!(dv, v_particle_system, u_particle_system,
+                            v_neighbor_system, u_neighbor_system,
+                            particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
+    (; density_calculator, correction) = particle_system
+
+    sound_speed = system_sound_speed(particle_system)
+
+    surface_tension_a = surface_tension_model(particle_system)
+    surface_tension_b = surface_tension_model(neighbor_system)
+
+    system_coords = current_coordinates(u_particle_system, particle_system)
+    neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
+    search_radius = PointNeighbors.search_radius(neighborhood_search)
+
+    almostzero = eps(search_radius^2)
+
+    backend = semi.parallelization_backend
+    # max_particles_per_cell = 64
+    # nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    linear_indices = neighborhood_search.cell_list.linear_indices
+    cartesian_indices = CartesianIndices(size(linear_indices))
+    lengths = Array(neighborhood_search.cell_list.cells.lengths)
+    max_particles_per_cell = maximum(lengths)
+    # max_particles_per_cell = 64
+    nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
+    ndrange = max_particles_per_cell * length(nonempty_cells)
+    kernel = foreach_neighbor_localmem(backend, Int64(max_particles_per_cell))
+    kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells,
+           Val(max_particles_per_cell), search_radius, almostzero,
+           particle_system, neighbor_system, v_particle_system, v_neighbor_system,
+           sound_speed, density_calculator; ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+@inline function copy_to_localmem!(local_points, local_neighbor_coords,
+                                    local_neighbor_vrho, local_neighbor_mass,
+                                    local_neighbor_pressure,
+                                    v_neighbor_system, neighbor_system,
+                                   neighbor_cell, neighbor_system_coords,
+                                   neighborhood_search, particleidx)
+    points_view = @inbounds PointNeighbors.points_in_cell(neighbor_cell, neighborhood_search)
+    n_particles_in_neighbor_cell = length(points_view)
+
+    # First use all threads to load the neighbors into local memory in parallel
+    if particleidx <= n_particles_in_neighbor_cell
+        @inbounds p = local_points[particleidx] = points_view[particleidx]
+        for d in 1:ndims(neighborhood_search)
+            @inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
+        end
+        v, rho = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, p)
+        for d in 1:ndims(neighborhood_search)
+            @inbounds local_neighbor_vrho[d, particleidx] = v[d]
+        end
+        @inbounds local_neighbor_vrho[ndims(neighborhood_search) + 1, particleidx] = rho
+
+        m = @inbounds hydrodynamic_mass(neighbor_system, p)
+        @inbounds local_neighbor_mass[particleidx] = m
+        pressure = @inbounds current_pressure(v_neighbor_system, neighbor_system, p)
+        @inbounds local_neighbor_pressure[particleidx] = pressure
+    end
+    return n_particles_in_neighbor_cell
+end
+
+# @parallel(block) for cell in cells
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell
+#             copy_coordinates_to_localmem(neighbor)
+#
+#         # Make sure all threads finished the copying
+#         @synchronize
+#
+#         @parallel(thread) for particle in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we continue with copying
+#         @synchronize
+@kernel cpu=false function foreach_neighbor_localmem(dv, system_coords, neighbor_system_coords,
+                               neighborhood_search, cells, ::Val{MAX}, search_radius,
+                               almostzero, particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system, sound_speed,
+                               density_calculator) where {MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)
+    local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, MAX)
+    local_neighbor_mass = @localmem eltype(system_coords) MAX
+    local_neighbor_pressure = @localmem eltype(system_coords) MAX
+
+    points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        particle = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 particle)
+
+        m_a = @inbounds hydrodynamic_mass(particle_system, particle)
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
+
+        # In 3D, this function can combine velocity and density load into one wide load,
+        # which gives a significant speedup on GPUs.
+        (v_a,
+         rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                 particle)
+    else
+        particle = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+
+        m_a = zero(eltype(system_coords))
+        p_a = zero(eltype(system_coords))
+        v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+        rho_a = zero(eltype(system_coords))
+    end
+
+    dv_particle = zero(v_a)
+    drho_particle = zero(rho_a)
+
+    for neighbor_cell_ in PointNeighbors.neighboring_cells(cell, neighborhood_search)
+        neighbor_cell = Tuple(neighbor_cell_)
+
+        n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords,
+                                                         local_neighbor_vrho, local_neighbor_mass,
+                                                         local_neighbor_pressure,
+                                                            v_neighbor_system, neighbor_system,
+                                                         neighbor_cell, neighbor_system_coords,
+                                                         neighborhood_search, particleidx)
+
+        # Make sure all threads finished the copying
+        @synchronize
+
+        # Now each thread works on one particle again
+        if particleidx <= n_particles_in_current_cell
+            for local_neighbor in 1:n_particles_in_neighbor_cell
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if almostzero < distance2 <= search_radius^2
+                    distance = sqrt(distance2)
+
+                    # Now that we know that `distance` is not zero, we can safely call the unsafe
+                    # version of the kernel gradient to avoid redundant zero checks.
+                    grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                               distance, particle)
+
+                    m_b = @inbounds local_neighbor_mass[local_neighbor]
+                    p_b = @inbounds local_neighbor_pressure[local_neighbor]
+                    # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1),
+                    #                                     local_neighbor)
+                    # v_b = v_rho_b[1:ndims(neighborhood_search)]
+                    # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3])
+                    # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1]
+                    (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system,
+                                                                  local_neighbor)
+
+                    # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
+                    # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+                    # (v_b,
+                    # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
+                    #                                         neighbor)
+                    # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+                    # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
+                    vdiff = v_a - v_b
+
+                    # The following call is equivalent to
+                    #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+                    # Only when the neighbor system is a `WallBoundarySystem`
+                    # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+                    # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+                    # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                    #                                 neighbor, p_a)
+
+                    # For `ContinuityDensity` without correction, this is equivalent to
+                    # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel
+                    dv_pressure = pressure_acceleration(particle_system, neighbor_system,
+                                                        particle, neighbor,
+                                                        m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
+                                                        distance, grad_kernel, nothing)
+
+                    # Accumulate contributions over all neighbors
+                    dv_particle += dv_pressure
+
+                    # Propagate `@inbounds` to the viscosity function, which accesses particle data
+                    dv_viscosity_ = Ref(zero(dv_particle))
+                    @inbounds dv_viscosity(dv_viscosity_,
+                                        particle_system, neighbor_system,
+                                        v_particle_system, v_neighbor_system,
+                                        particle, neighbor, pos_diff, distance,
+                                        sound_speed, m_a, m_b, rho_a, rho_b,
+                                        v_a, v_b, grad_kernel)
+                    dv_particle += dv_viscosity_[]
+
+                    # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
+                    # Propagate `@inbounds` to the continuity equation, which accesses particle data
+                    drho_ = Ref(zero(drho_particle))
+                    @inbounds continuity_equation!(drho_, density_calculator,
+                                                particle_system, neighbor_system,
+                                                v_particle_system, v_neighbor_system,
+                                                particle, neighbor, pos_diff, distance,
+                                                m_b, rho_a, rho_b, vdiff, grad_kernel)
+                    drho_particle += drho_[]
+                end
+            end
+        end
+
+        # Make sure all threads finished computing before we continue with copying
+        @synchronize()
+    end
+
+    if particleidx <= n_particles_in_current_cell
+        for i in eachindex(dv_particle)
+            @inbounds dv[i, particle] += dv_particle[i]
+        end
+        @inbounds dv[end, particle] += drho_particle
+    end
+end
+
+# Same as above, but local memory is used for 3 cells at once to reduce the number of synchronizations.
+# 3.64 ms
+function interact_localmem2!(dv, v_particle_system, u_particle_system,
+                            v_neighbor_system, u_neighbor_system,
+                            particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
+    (; density_calculator, correction) = particle_system
+
+    sound_speed = system_sound_speed(particle_system)
+
+    surface_tension_a = surface_tension_model(particle_system)
+    surface_tension_b = surface_tension_model(neighbor_system)
+
+    system_coords = current_coordinates(u_particle_system, particle_system)
+    neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
+    search_radius = PointNeighbors.search_radius(neighborhood_search)
+
+    almostzero = eps(search_radius^2)
+
+    backend = semi.parallelization_backend
+    # max_particles_per_cell = 64
+    # nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    linear_indices = neighborhood_search.cell_list.linear_indices
+    cartesian_indices = CartesianIndices(size(linear_indices))
+    lengths = Array(neighborhood_search.cell_list.cells.lengths)
+    max_particles_per_cell = maximum(lengths)
+    # max_particles_per_cell = 64
+    nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
+    ndrange = max_particles_per_cell * length(nonempty_cells)
+    kernel = foreach_neighbor_localmem2(backend, Int64(max_particles_per_cell))
+    kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells,
+           Val(max_particles_per_cell), search_radius, almostzero,
+           particle_system, neighbor_system, v_particle_system, v_neighbor_system,
+           sound_speed, density_calculator; ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+@inline function copy_to_localmem2!(local_points, local_neighbor_coords,
+                                    local_neighbor_vrho, local_neighbor_mass,
+                                    local_neighbor_pressure,
+                                    v_neighbor_system, neighbor_system,
+                                   neighbor_cell, neighbor_system_coords,
+                                   neighborhood_search, particleidx, offset)
+    points_view = @inbounds PointNeighbors.points_in_cell(neighbor_cell, neighborhood_search)
+    n_particles_in_neighbor_cell = length(points_view)
+
+    # First use all threads to load the neighbors into local memory in parallel
+    if particleidx <= n_particles_in_neighbor_cell
+        write_idx = particleidx + offset
+        @inbounds p = local_points[write_idx] = points_view[particleidx]
+        for d in 1:ndims(neighborhood_search)
+            @inbounds local_neighbor_coords[d, write_idx] = neighbor_system_coords[d, p]
+        end
+        v, rho = @inbounds velocity_and_density(v_neighbor_system, neighbor_system, p)
+        for d in 1:ndims(neighborhood_search)
+            @inbounds local_neighbor_vrho[d, write_idx] = v[d]
+        end
+        @inbounds local_neighbor_vrho[ndims(neighborhood_search) + 1, write_idx] = rho
+
+        m = @inbounds hydrodynamic_mass(neighbor_system, p)
+        @inbounds local_neighbor_mass[write_idx] = m
+        pressure = @inbounds current_pressure(v_neighbor_system, neighbor_system, p)
+        @inbounds local_neighbor_pressure[write_idx] = pressure
+    end
+    return n_particles_in_neighbor_cell
+end
+
+# @parallel(block) for cell in cells
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell
+#             copy_coordinates_to_localmem(neighbor)
+#
+#         # Make sure all threads finished the copying
+#         @synchronize
+#
+#         @parallel(thread) for particle in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we continue with copying
+#         @synchronize
+@kernel cpu=false function foreach_neighbor_localmem2(dv, system_coords, neighbor_system_coords,
+                               neighborhood_search, cells, ::Val{MAX}, search_radius,
+                               almostzero, particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system, sound_speed,
+                               density_calculator) where {MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 3 * MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 3 * MAX)
+    local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 3 * MAX)
+    local_neighbor_mass = @localmem eltype(system_coords) 3 * MAX
+    local_neighbor_pressure = @localmem eltype(system_coords) 3 * MAX
+
+    points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        particle = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 particle)
+
+        m_a = @inbounds hydrodynamic_mass(particle_system, particle)
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
+
+        # In 3D, this function can combine velocity and density load into one wide load,
+        # which gives a significant speedup on GPUs.
+        (v_a,
+         rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                 particle)
+    else
+        particle = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+
+        m_a = zero(eltype(system_coords))
+        p_a = zero(eltype(system_coords))
+        v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+        rho_a = zero(eltype(system_coords))
+    end
+
+    dv_particle = zero(v_a)
+    drho_particle = zero(rho_a)
+
+    for neighbor_cell_block in ((cell[2] - 1, cell[3] - 1), (cell[2], cell[3] - 1), (cell[2] + 1, cell[3] - 1),
+                                 (cell[2] - 1, cell[3]),     (cell[2], cell[3]),     (cell[2] + 1, cell[3]),
+                                 (cell[2] - 1, cell[3] + 1), (cell[2], cell[3] + 1), (cell[2] + 1, cell[3] + 1))
+    # for neighbor_cell_ in PointNeighbors.neighboring_cells(cell, neighborhood_search)
+        # neighbor_cell_block = Tuple(neighbor_cell_block_)
+        neighbor_cells = ((cell[1] - 1, neighbor_cell_block...), (cell[1], neighbor_cell_block...), (cell[1] + 1, neighbor_cell_block...))
+
+        neighbor_cell = @inbounds neighbor_cells[1]
+        n_particles_in_neighbor_cell1 = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                        local_neighbor_vrho, local_neighbor_mass,
+                                                        local_neighbor_pressure,
+                                                            v_neighbor_system, neighbor_system,
+                                                        neighbor_cell, neighbor_system_coords,
+                                                        neighborhood_search, particleidx, 0)
+
+        neighbor_cell = @inbounds neighbor_cells[2]
+        n_particles_in_neighbor_cell2 = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                        local_neighbor_vrho, local_neighbor_mass,
+                                                        local_neighbor_pressure,
+                                                            v_neighbor_system, neighbor_system,
+                                                        neighbor_cell, neighbor_system_coords,
+                                                        neighborhood_search, particleidx, MAX)
+
+        neighbor_cell = @inbounds neighbor_cells[3]
+        n_particles_in_neighbor_cell3 = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                        local_neighbor_vrho, local_neighbor_mass,
+                                                        local_neighbor_pressure,
+                                                            v_neighbor_system, neighbor_system,
+                                                        neighbor_cell, neighbor_system_coords,
+                                                        neighborhood_search, particleidx, 2 * MAX)
+
+        # Make sure all threads finished the copying
+        @synchronize
+
+        # Now each thread works on one particle again
+        if particleidx <= n_particles_in_current_cell
+            n_particles_in_cells = (n_particles_in_neighbor_cell1, n_particles_in_neighbor_cell2, n_particles_in_neighbor_cell3)
+            for neighbor_cell_idx in eachindex(neighbor_cells), local_neighbor in 1:n_particles_in_cells[neighbor_cell_idx]
+                local_neighbor = local_neighbor + (neighbor_cell_idx - 1) * MAX
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if almostzero < distance2 <= search_radius^2
+                    distance = sqrt(distance2)
+
+                    # Now that we know that `distance` is not zero, we can safely call the unsafe
+                    # version of the kernel gradient to avoid redundant zero checks.
+                    grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                               distance, particle)
+
+                    m_b = @inbounds local_neighbor_mass[local_neighbor]
+                    p_b = @inbounds local_neighbor_pressure[local_neighbor]
+                    # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1),
+                    #                                     local_neighbor)
+                    # v_b = v_rho_b[1:ndims(neighborhood_search)]
+                    # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3])
+                    # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1]
+                    (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system,
+                                                                  local_neighbor)
+
+                    # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
+                    # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+                    # (v_b,
+                    # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
+                    #                                         neighbor)
+                    # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+                    # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
+                    vdiff = v_a - v_b
+
+                    # The following call is equivalent to
+                    #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+                    # Only when the neighbor system is a `WallBoundarySystem`
+                    # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+                    # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+                    # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                    #                                 neighbor, p_a)
+
+                    # For `ContinuityDensity` without correction, this is equivalent to
+                    # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel
+                    dv_pressure = pressure_acceleration(particle_system, neighbor_system,
+                                                        particle, neighbor,
+                                                        m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
+                                                        distance, grad_kernel, nothing)
+
+                    # Accumulate contributions over all neighbors
+                    dv_particle += dv_pressure
+
+                    # Propagate `@inbounds` to the viscosity function, which accesses particle data
+                    dv_viscosity_ = Ref(zero(dv_particle))
+                    @inbounds dv_viscosity(dv_viscosity_,
+                                        particle_system, neighbor_system,
+                                        v_particle_system, v_neighbor_system,
+                                        particle, neighbor, pos_diff, distance,
+                                        sound_speed, m_a, m_b, rho_a, rho_b,
+                                        v_a, v_b, grad_kernel)
+                    dv_particle += dv_viscosity_[]
+
+                    # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
+                    # Propagate `@inbounds` to the continuity equation, which accesses particle data
+                    drho_ = Ref(zero(drho_particle))
+                    @inbounds continuity_equation!(drho_, density_calculator,
+                                                particle_system, neighbor_system,
+                                                v_particle_system, v_neighbor_system,
+                                                particle, neighbor, pos_diff, distance,
+                                                m_b, rho_a, rho_b, vdiff, grad_kernel)
+                    drho_particle += drho_[]
+                end
+            end
+        end
+
+        # Make sure all threads finished computing before we continue with copying
+        @synchronize()
+    end
+
+    if particleidx <= n_particles_in_current_cell
+        for i in eachindex(dv_particle)
+            @inbounds dv[i, particle] += dv_particle[i]
+        end
+        @inbounds dv[end, particle] += drho_particle
+    end
+end
+
+# Double-buffered version of localmem2
+# 7.89 ms
+function interact_localmem3!(dv, v_particle_system, u_particle_system,
+                            v_neighbor_system, u_neighbor_system,
+                            particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
+    (; density_calculator, correction) = particle_system
+
+    sound_speed = system_sound_speed(particle_system)
+
+    surface_tension_a = surface_tension_model(particle_system)
+    surface_tension_b = surface_tension_model(neighbor_system)
+
+    system_coords = current_coordinates(u_particle_system, particle_system)
+    neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
+    search_radius = PointNeighbors.search_radius(neighborhood_search)
+
+    almostzero = eps(search_radius^2)
+
+    backend = semi.parallelization_backend
+    # max_particles_per_cell = 64
+    # nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    linear_indices = neighborhood_search.cell_list.linear_indices
+    cartesian_indices = CartesianIndices(size(linear_indices))
+    lengths = Array(neighborhood_search.cell_list.cells.lengths)
+    max_particles_per_cell = maximum(lengths)
+    # max_particles_per_cell = 64
+    nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
+    ndrange = max_particles_per_cell * length(nonempty_cells)
+    kernel = foreach_neighbor_localmem3(backend, Int64(max_particles_per_cell))
+    kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells,
+           Val(max_particles_per_cell), search_radius, almostzero,
+           particle_system, neighbor_system, v_particle_system, v_neighbor_system,
+           sound_speed, density_calculator; ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+# @parallel(block) for cell in cells
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell
+#             copy_coordinates_to_localmem(neighbor)
+#
+#         # Make sure all threads finished the copying
+#         @synchronize
+#
+#         @parallel(thread) for particle in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we continue with copying
+#         @synchronize
+@kernel cpu=false function foreach_neighbor_localmem3(dv, system_coords, neighbor_system_coords,
+                               neighborhood_search, cells, ::Val{MAX}, search_radius,
+                               almostzero, particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system, sound_speed,
+                               density_calculator) where {MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 3 * MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 3 * MAX)
+    local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 3 * MAX)
+    local_neighbor_mass = @localmem eltype(system_coords) 3 * MAX
+    local_neighbor_pressure = @localmem eltype(system_coords) 3 * MAX
+
+    next_local_points = @localmem Int32 3 * MAX
+    next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 3 * MAX)
+    next_local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 3 * MAX)
+    next_local_neighbor_mass = @localmem eltype(system_coords) 3 * MAX
+    next_local_neighbor_pressure = @localmem eltype(system_coords) 3 * MAX
+
+    points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        particle = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 particle)
+
+        m_a = @inbounds hydrodynamic_mass(particle_system, particle)
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
+
+        # In 3D, this function can combine velocity and density load into one wide load,
+        # which gives a significant speedup on GPUs.
+        (v_a,
+         rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                 particle)
+    else
+        particle = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+
+        m_a = zero(eltype(system_coords))
+        p_a = zero(eltype(system_coords))
+        v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+        rho_a = zero(eltype(system_coords))
+    end
+
+    dv_particle = zero(v_a)
+    drho_particle = zero(rho_a)
+
+    # Load first cell block into local memory
+    neighbor_cell_block = @inbounds (cell[2] - 1, cell[3] - 1)
+    neighbor_cells = @inbounds ((cell[1] - 1, neighbor_cell_block...), (cell[1], neighbor_cell_block...), (cell[1] + 1, neighbor_cell_block...))
+
+    neighbor_cell = @inbounds neighbor_cells[1]
+    n_particles_in_neighbor_cell1 = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                    local_neighbor_vrho, local_neighbor_mass,
+                                                    local_neighbor_pressure,
+                                                        v_neighbor_system, neighbor_system,
+                                                    neighbor_cell, neighbor_system_coords,
+                                                    neighborhood_search, particleidx, 0)
+
+    neighbor_cell = @inbounds neighbor_cells[2]
+    n_particles_in_neighbor_cell2 = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                    local_neighbor_vrho, local_neighbor_mass,
+                                                    local_neighbor_pressure,
+                                                        v_neighbor_system, neighbor_system,
+                                                    neighbor_cell, neighbor_system_coords,
+                                                    neighborhood_search, particleidx, MAX)
+
+    neighbor_cell = @inbounds neighbor_cells[3]
+    n_particles_in_neighbor_cell3 = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                    local_neighbor_vrho, local_neighbor_mass,
+                                                    local_neighbor_pressure,
+                                                        v_neighbor_system, neighbor_system,
+                                                    neighbor_cell, neighbor_system_coords,
+                                                    neighborhood_search, particleidx, 2 * MAX)
+
+    n_particles_in_cells = (n_particles_in_neighbor_cell1, n_particles_in_neighbor_cell2, n_particles_in_neighbor_cell3)
+
+    @synchronize
+
+    neighbor_cell_blocks = @inbounds ((cell[2] - 1, cell[3] - 1), (cell[2], cell[3] - 1), (cell[2] + 1, cell[3] - 1),
+                                 (cell[2] - 1, cell[3]),     (cell[2], cell[3]),     (cell[2] + 1, cell[3]),
+                                 (cell[2] - 1, cell[3] + 1), (cell[2], cell[3] + 1), (cell[2] + 1, cell[3] + 1))
+    for neighbor_cell_block_idx in eachindex(neighbor_cell_blocks)
+        neighbor_cell_block = @inbounds neighbor_cell_blocks[neighbor_cell_block_idx]
+        neighbor_cells = @inbounds ((cell[1] - 1, neighbor_cell_block...), (cell[1], neighbor_cell_block...), (cell[1] + 1, neighbor_cell_block...))
+
+        if neighbor_cell_block_idx < length(neighbor_cell_blocks)
+            neighbor_cell = @inbounds neighbor_cells[1]
+            n_particles_in_neighbor_cell1 = copy_to_localmem2!(local_points, next_local_neighbor_coords,
+                                                            next_local_neighbor_vrho, next_local_neighbor_mass,
+                                                            next_local_neighbor_pressure,
+                                                                v_neighbor_system, neighbor_system,
+                                                            neighbor_cell, neighbor_system_coords,
+                                                            neighborhood_search, particleidx, 0)
+
+            neighbor_cell = @inbounds neighbor_cells[2]
+            n_particles_in_neighbor_cell2 = copy_to_localmem2!(next_local_points, next_local_neighbor_coords,
+                                                            next_local_neighbor_vrho, next_local_neighbor_mass,
+                                                            next_local_neighbor_pressure,
+                                                                v_neighbor_system, neighbor_system,
+                                                            neighbor_cell, neighbor_system_coords,
+                                                            neighborhood_search, particleidx, MAX)
+
+            neighbor_cell = @inbounds neighbor_cells[3]
+            n_particles_in_neighbor_cell3 = copy_to_localmem2!(next_local_points, next_local_neighbor_coords,
+                                                            next_local_neighbor_vrho, next_local_neighbor_mass,
+                                                            next_local_neighbor_pressure,
+                                                                v_neighbor_system, neighbor_system,
+                                                            neighbor_cell, neighbor_system_coords,
+                                                            neighborhood_search, particleidx, 2 * MAX)
+
+            next_n_particles_in_cells = (n_particles_in_neighbor_cell1, n_particles_in_neighbor_cell2, n_particles_in_neighbor_cell3)
+        end
+
+        # Now each thread works on one particle again
+        if particleidx <= n_particles_in_current_cell
+            for neighbor_cell_idx in eachindex(neighbor_cells), local_neighbor in 1:n_particles_in_cells[neighbor_cell_idx]
+                local_neighbor = local_neighbor + (neighbor_cell_idx - 1) * MAX
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if almostzero < distance2 <= search_radius^2
+                    distance = sqrt(distance2)
+
+                    # Now that we know that `distance` is not zero, we can safely call the unsafe
+                    # version of the kernel gradient to avoid redundant zero checks.
+                    grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                               distance, particle)
+
+                    m_b = @inbounds local_neighbor_mass[local_neighbor]
+                    p_b = @inbounds local_neighbor_pressure[local_neighbor]
+                    # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1),
+                    #                                     local_neighbor)
+                    # v_b = v_rho_b[1:ndims(neighborhood_search)]
+                    # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3])
+                    # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1]
+                    (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system,
+                                                                  local_neighbor)
+
+                    # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
+                    # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+                    # (v_b,
+                    # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
+                    #                                         neighbor)
+                    # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+                    # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
+                    vdiff = v_a - v_b
+
+                    # The following call is equivalent to
+                    #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+                    # Only when the neighbor system is a `WallBoundarySystem`
+                    # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+                    # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+                    # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                    #                                 neighbor, p_a)
+
+                    # For `ContinuityDensity` without correction, this is equivalent to
+                    # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel
+                    dv_pressure = pressure_acceleration(particle_system, neighbor_system,
+                                                        particle, neighbor,
+                                                        m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
+                                                        distance, grad_kernel, nothing)
+
+                    # Accumulate contributions over all neighbors
+                    dv_particle += dv_pressure
+
+                    # Propagate `@inbounds` to the viscosity function, which accesses particle data
+                    dv_viscosity_ = Ref(zero(dv_particle))
+                    @inbounds dv_viscosity(dv_viscosity_,
+                                        particle_system, neighbor_system,
+                                        v_particle_system, v_neighbor_system,
+                                        particle, neighbor, pos_diff, distance,
+                                        sound_speed, m_a, m_b, rho_a, rho_b,
+                                        v_a, v_b, grad_kernel)
+                    dv_particle += dv_viscosity_[]
+
+                    # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
+                    # Propagate `@inbounds` to the continuity equation, which accesses particle data
+                    drho_ = Ref(zero(drho_particle))
+                    @inbounds continuity_equation!(drho_, density_calculator,
+                                                particle_system, neighbor_system,
+                                                v_particle_system, v_neighbor_system,
+                                                particle, neighbor, pos_diff, distance,
+                                                m_b, rho_a, rho_b, vdiff, grad_kernel)
+                    drho_particle += drho_[]
+                end
+            end
+        end
+
+        # Make sure all threads finished computing before we continue with copying
+        @synchronize()
+
+        neighbor_cell_block_idx >= length(neighbor_cell_blocks) && break
+
+        # Swap local memory buffers
+        n_particles_in_cells = next_n_particles_in_cells
+        local_points, next_local_points = next_local_points, local_points
+        local_neighbor_coords, next_local_neighbor_coords = next_local_neighbor_coords, local_neighbor_coords
+        local_neighbor_vrho, next_local_neighbor_vrho = next_local_neighbor_vrho, local_neighbor_vrho
+        local_neighbor_mass, next_local_neighbor_mass = next_local_neighbor_mass, local_neighbor_mass
+        local_neighbor_pressure, next_local_neighbor_pressure = next_local_neighbor_pressure, local_neighbor_pressure
+    end
+
+    if particleidx <= n_particles_in_current_cell
+        for i in eachindex(dv_particle)
+            @inbounds dv[i, particle] += dv_particle[i]
+        end
+        @inbounds dv[end, particle] += drho_particle
+    end
+end
+
+
+# Same as localmem2, but local memory is used for 27 cells at once to reduce the number of synchronizations.
+# 6.73 ms
+function interact_localmem4!(dv, v_particle_system, u_particle_system,
+                            v_neighbor_system, u_neighbor_system,
+                            particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
+    (; density_calculator, correction) = particle_system
+
+    sound_speed = system_sound_speed(particle_system)
+
+    surface_tension_a = surface_tension_model(particle_system)
+    surface_tension_b = surface_tension_model(neighbor_system)
+
+    system_coords = current_coordinates(u_particle_system, particle_system)
+    neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
+    search_radius = PointNeighbors.search_radius(neighborhood_search)
+
+    almostzero = eps(search_radius^2)
+
+    backend = semi.parallelization_backend
+    # max_particles_per_cell = 64
+    # nhs_size = size(neighborhood_search.cell_list.linear_indices)
+    # cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
+    linear_indices = neighborhood_search.cell_list.linear_indices
+    cartesian_indices = CartesianIndices(size(linear_indices))
+    lengths = Array(neighborhood_search.cell_list.cells.lengths)
+    max_particles_per_cell = maximum(lengths)
+    # max_particles_per_cell = 64
+    nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
+    ndrange = max_particles_per_cell * length(nonempty_cells)
+    kernel = foreach_neighbor_localmem4(backend, Int64(max_particles_per_cell))
+    kernel(dv, system_coords, neighbor_coords, neighborhood_search, nonempty_cells,
+           Val(max_particles_per_cell), search_radius, almostzero,
+           particle_system, neighbor_system, v_particle_system, v_neighbor_system,
+           sound_speed, density_calculator; ndrange)
+
+    KernelAbstractions.synchronize(backend)
+
+    return nothing
+end
+
+# @parallel(block) for cell in cells
+#     for neighbor_cell in neighboring_cells
+#         @parallel(thread) for neighbor in neighbor_cell
+#             copy_coordinates_to_localmem(neighbor)
+#
+#         # Make sure all threads finished the copying
+#         @synchronize
+#
+#         @parallel(thread) for particle in cell
+#             for neighbor in neighbor_cell
+#                 # This uses the neighbor coordinates from the local memory
+#                 compute(point, neighbor)
+#
+#         # Make sure all threads finished computing before we continue with copying
+#         @synchronize
+@kernel cpu=false function foreach_neighbor_localmem4(dv, system_coords, neighbor_system_coords,
+                               neighborhood_search, cells, ::Val{MAX}, search_radius,
+                               almostzero, particle_system, neighbor_system,
+                               v_particle_system, v_neighbor_system, sound_speed,
+                               density_calculator) where {MAX}
+    cell_ = @index(Group)
+    cell = @inbounds Tuple(cells[cell_])
+    particleidx = @index(Local)
+    @assert 1 <= particleidx <= MAX
+
+    # Coordinate buffer in local memory
+    local_points = @localmem Int32 27 * MAX
+    local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), 27 * MAX)
+    local_neighbor_vrho = @localmem eltype(system_coords) (ndims(neighborhood_search) + 1, 27 * MAX)
+    local_neighbor_mass = @localmem eltype(system_coords) 27 * MAX
+    local_neighbor_pressure = @localmem eltype(system_coords) 27 * MAX
+    local_n_particles_in_neighbor_cell = @localmem Int32 27
+
+    points = @inbounds PointNeighbors.points_in_cell(cell, neighborhood_search)
+    n_particles_in_current_cell = length(points)
+
+    # Extract point coordinates if a point lies on this thread
+    if particleidx <= n_particles_in_current_cell
+        particle = @inbounds points[particleidx]
+        point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
+                                                 particle)
+
+        m_a = @inbounds hydrodynamic_mass(particle_system, particle)
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
+
+        # In 3D, this function can combine velocity and density load into one wide load,
+        # which gives a significant speedup on GPUs.
+        (v_a,
+         rho_a) = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                 particle)
+    else
+        particle = zero(Int32)
+        point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+
+        m_a = zero(eltype(system_coords))
+        p_a = zero(eltype(system_coords))
+        v_a = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
+        rho_a = zero(eltype(system_coords))
+    end
+
+    dv_particle = zero(v_a)
+    drho_particle = zero(rho_a)
+
+    neighboring_cells = PointNeighbors.neighboring_cells(cell, neighborhood_search)
+    for neighbor_cell_idx in 1:length(neighboring_cells)
+        neighbor_cell = @inbounds Tuple(neighboring_cells[neighbor_cell_idx])
+        n_particles_in_neighbor_cell = copy_to_localmem2!(local_points, local_neighbor_coords,
+                                                        local_neighbor_vrho, local_neighbor_mass,
+                                                        local_neighbor_pressure,
+                                                        v_neighbor_system, neighbor_system,
+                                                        neighbor_cell, neighbor_system_coords,
+                                                        neighborhood_search, particleidx, (neighbor_cell_idx - 1) * MAX)
+        @inbounds local_n_particles_in_neighbor_cell[neighbor_cell_idx] = n_particles_in_neighbor_cell
+    end
+
+    # Make sure all threads finished the copying
+    @synchronize
+
+    # Now each thread works on one particle again
+    if particleidx <= n_particles_in_current_cell
+        for neighbor_cell_idx in 1:length(neighboring_cells)
+            n_cells = @inbounds local_n_particles_in_neighbor_cell[neighbor_cell_idx]
+            for local_neighbor in 1:n_cells
+                local_neighbor = local_neighbor + (neighbor_cell_idx - 1) * MAX
+                @inbounds neighbor = local_points[local_neighbor]
+                @inbounds neighbor_coords = extract_svector(local_neighbor_coords,
+                                                            Val(ndims(neighborhood_search)),
+                                                            local_neighbor)
+
+                pos_diff = point_coords - neighbor_coords
+                distance2 = dot(pos_diff, pos_diff)
+
+                # TODO periodic
+
+                if almostzero < distance2 <= search_radius^2
+                    distance = sqrt(distance2)
+
+                    # Now that we know that `distance` is not zero, we can safely call the unsafe
+                    # version of the kernel gradient to avoid redundant zero checks.
+                    grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                               distance, particle)
+
+                    m_b = @inbounds local_neighbor_mass[local_neighbor]
+                    p_b = @inbounds local_neighbor_pressure[local_neighbor]
+                    # v_rho_b = @inbounds extract_svector(local_neighbor_vrho, Val(ndims(neighborhood_search) + 1),
+                    #                                     local_neighbor)
+                    # v_b = v_rho_b[1:ndims(neighborhood_search)]
+                    # v_b = @inbounds SVector(v_rho_b[1], v_rho_b[2], v_rho_b[3])
+                    # rho_b = @inbounds v_rho_b[ndims(neighborhood_search) + 1]
+                    (v_b, rho_b) = @inbounds velocity_and_density(local_neighbor_vrho, neighbor_system,
+                                                                  local_neighbor)
+
+                    # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
+                    # m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+                    # (v_b,
+                    # rho_b) = @inbounds velocity_and_density(v_neighbor_system, neighbor_system,
+                    #                                         neighbor)
+                    # v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+                    # rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
+                    vdiff = v_a - v_b
+
+                    # The following call is equivalent to
+                    #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+                    # Only when the neighbor system is a `WallBoundarySystem`
+                    # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+                    # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+                    # p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                    #                                 neighbor, p_a)
+
+                    # For `ContinuityDensity` without correction, this is equivalent to
+                    # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel
+                    dv_pressure = pressure_acceleration(particle_system, neighbor_system,
+                                                        particle, neighbor,
+                                                        m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
+                                                        distance, grad_kernel, nothing)
+
+                    # Accumulate contributions over all neighbors
+                    dv_particle += dv_pressure
+
+                    # Propagate `@inbounds` to the viscosity function, which accesses particle data
+                    dv_viscosity_ = Ref(zero(dv_particle))
+                    @inbounds dv_viscosity(dv_viscosity_,
+                                        particle_system, neighbor_system,
+                                        v_particle_system, v_neighbor_system,
+                                        particle, neighbor, pos_diff, distance,
+                                        sound_speed, m_a, m_b, rho_a, rho_b,
+                                        v_a, v_b, grad_kernel)
+                    dv_particle += dv_viscosity_[]
+
+                    # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
+                    # Propagate `@inbounds` to the continuity equation, which accesses particle data
+                    drho_ = Ref(zero(drho_particle))
+                    @inbounds continuity_equation!(drho_, density_calculator,
+                                                particle_system, neighbor_system,
+                                                v_particle_system, v_neighbor_system,
+                                                particle, neighbor, pos_diff, distance,
+                                                m_b, rho_a, rho_b, vdiff, grad_kernel)
+                    drho_particle += drho_[]
+                end
+            end
+        end
+    end
+
+    if particleidx <= n_particles_in_current_cell
+        for i in eachindex(dv_particle)
+            @inbounds dv[i, particle] += dv_particle[i]
+        end
+        @inbounds dv[end, particle] += drho_particle
+    end
+end
+
+@propagate_inbounds function neighbor_pressure(v_neighbor_system, neighbor_system,
+                                               neighbor, p_a)
+    return current_pressure(v_neighbor_system, neighbor_system, neighbor)
+end
+
+@inline function neighbor_pressure(v_neighbor_system,
+                                   neighbor_system::WallBoundarySystem{<:BoundaryModelDummyParticles{PressureMirroring}},
+                                   neighbor, p_a)
+    return p_a
+end
+
+@propagate_inbounds function velocity_and_density(v, system, particle)
+    # For other systems, fall back to the default implementation
+    return velocity_and_density(v, nothing, system, particle)
+end
+
+@propagate_inbounds function velocity_and_density(v, system::WeaklyCompressibleSPHSystem,
+                                                  particle)
+    (; density_calculator) = system
+
+    return velocity_and_density(v, density_calculator, system, particle)
+end
+
+@propagate_inbounds function velocity_and_density(v, _, system, particle)
+    v_particle = current_velocity(v, system, particle)
+    rho_particle = current_density(v, system, particle)
+
+    return v_particle, rho_particle
 end
 
-@inline function particle_neighbor_pressure(v_particle_system, v_neighbor_system,
-                                            particle_system,
-                                            neighbor_system::WallBoundarySystem{<:BoundaryModelDummyParticles{PressureMirroring}},
-                                            particle, neighbor)
-    p_a = current_pressure(v_particle_system, particle_system, particle)
+@inline function velocity_and_density(v, ::ContinuityDensity,
+                                      ::WeaklyCompressibleSPHSystem{3}, particle)
+    vrho_a = vloada(Vec{4, eltype(v)}, pointer(v, 4 * (particle - 1) + 1))
+    a, b, c, d = Tuple(vrho_a)
+    v_particle = SVector(a, b, c)
+    rho_particle = d
 
-    return p_a, p_a
+    return v_particle, rho_particle
 end
diff --git a/src/schemes/structure/rigid_body/system.jl b/src/schemes/structure/rigid_body/system.jl
index c9281aa5f9..f3c8943ec6 100644
--- a/src/schemes/structure/rigid_body/system.jl
+++ b/src/schemes/structure/rigid_body/system.jl
@@ -282,23 +282,26 @@ function calc_normal!(system::AbstractFluidSystem,
                                  semi, surface_normal_method)
 end
 
-@inline function adhesion_force(surface_tension::AkinciTypeSurfaceTension,
-                                particle_system::AbstractFluidSystem,
-                                neighbor_system::RigidBodySystem,
-                                particle, neighbor, pos_diff, distance)
+@inline function adhesion_force!(dv_particle,
+                                 surface_tension::AkinciTypeSurfaceTension,
+                                 particle_system::AbstractFluidSystem,
+                                 neighbor_system::RigidBodySystem,
+                                 particle, neighbor, pos_diff, distance)
     (; adhesion_coefficient) = neighbor_system
 
     # No adhesion with oneself. See `src/general/smoothing_kernels.jl` for more details.
-    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return zero(pos_diff)
+    distance^2 < eps(initial_smoothing_length(particle_system)^2) && return dv_particle
 
-    abs(adhesion_coefficient) < eps() && return zero(pos_diff)
+    abs(adhesion_coefficient) < eps() && return dv_particle
 
     m_b = hydrodynamic_mass(neighbor_system, neighbor)
     support_radius = compact_support(system_smoothing_kernel(particle_system),
                                      smoothing_length(particle_system, particle))
 
-    return adhesion_force_akinci(surface_tension, support_radius, m_b, pos_diff, distance,
-                                 adhesion_coefficient)
+    dv_particle[] += adhesion_force_akinci(surface_tension, support_radius, m_b,
+                                           pos_diff, distance, adhesion_coefficient)
+
+    return dv_particle
 end
 
 function write_u0!(u0, system::RigidBodySystem)
@@ -506,9 +509,11 @@ end
     return neighbor_system.boundary_model.viscosity
 end
 
-@inline function viscous_velocity(v, system::RigidBodySystem, particle)
-    # This function is only used in fluid-structure interaction, so it is never called when `boundary_model` is `nothing`
-    return viscous_velocity(v, system.boundary_model.viscosity, system, particle)
+@inline function viscous_velocity(v, system::RigidBodySystem, particle, v_particle)
+    # This function is only used in fluid-structure interaction,
+    # so it is never called when `boundary_model` is `nothing`
+    return viscous_velocity(v, system.boundary_model.viscosity, system,
+                            particle, v_particle)
 end
 
 @inline acceleration_source(system::RigidBodySystem) = system.acceleration
diff --git a/src/schemes/structure/structure.jl b/src/schemes/structure/structure.jl
index 1ac304d79f..0224760ad0 100644
--- a/src/schemes/structure/structure.jl
+++ b/src/schemes/structure/structure.jl
@@ -46,6 +46,9 @@ function interact_structure_fluid!(dv, v_particle_system,
         rho_a = current_density(v_particle_system, particle_system, particle)
         rho_b = current_density(v_neighbor_system, neighbor_system, neighbor)
 
+        v_a = current_velocity(v_particle_system, particle_system, particle)
+        v_b = current_velocity(v_neighbor_system, neighbor_system, neighbor)
+
         surface_tension = surface_tension_model(neighbor_system)
 
         # In fluid-structure interaction, use the "hydrodynamic mass" of the structure particles
@@ -66,17 +69,17 @@ function interact_structure_fluid!(dv, v_particle_system,
                                             pos_diff, distance, grad_kernel,
                                             system_correction(neighbor_system))
 
-        dv_viscosity_ = dv_viscosity(neighbor_system, particle_system,
-                                     v_neighbor_system, v_particle_system,
-                                     neighbor, particle, pos_diff, distance,
-                                     sound_speed, m_b, m_a, rho_b, rho_a, grad_kernel)
-
-        dv_adhesion = adhesion_force(surface_tension, neighbor_system, particle_system,
-                                     neighbor, particle, pos_diff, distance)
+        dv_particle = Ref(zero(pos_diff))
+        dv_viscosity(dv_particle, neighbor_system, particle_system,
+                     v_neighbor_system, v_particle_system,
+                     neighbor, particle, pos_diff, distance,
+                     sound_speed, m_b, m_a, rho_b, rho_a,
+                     v_a, v_b, grad_kernel)
 
-        dv_fs = dv_boundary + dv_viscosity_ + dv_adhesion
+        adhesion_force!(dv_particle, surface_tension, neighbor_system, particle_system,
+                        neighbor, particle, pos_diff, distance)
 
-        accumulate_structure_fluid_pair!(dv, dv_fs, particle_system, particle, m_b)
+        accumulate_structure_fluid_pair!(dv, dv_particle[], particle_system, particle, m_b)
 
         continuity_equation!(dv, v_particle_system, v_neighbor_system,
                              particle, neighbor, pos_diff, distance,
diff --git a/src/schemes/structure/total_lagrangian_sph/system.jl b/src/schemes/structure/total_lagrangian_sph/system.jl
index cac4ed1f12..37b86ed85c 100644
--- a/src/schemes/structure/total_lagrangian_sph/system.jl
+++ b/src/schemes/structure/total_lagrangian_sph/system.jl
@@ -333,9 +333,11 @@ end
     return current_density(v, system.boundary_model, system)
 end
 
-@inline function viscous_velocity(v, system::TotalLagrangianSPHSystem, particle)
-    # This function is only used in fluid-structure interaction, so it is never called when `boundary_model` is `nothing`
-    return viscous_velocity(v, system.boundary_model.viscosity, system, particle)
+@inline function viscous_velocity(v, system::TotalLagrangianSPHSystem, particle, v_particle)
+    # This function is only used in fluid-structure interaction,
+    # so it is never called when `boundary_model` is `nothing`
+    return viscous_velocity(v, system.boundary_model.viscosity, system,
+                            particle, v_particle)
 end
 
 # In fluid-structure interaction, use the "hydrodynamic pressure" of the structure particles
diff --git a/src/util.jl b/src/util.jl
index 855fe50765..682514563f 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -1,3 +1,9 @@
+# By default, use `div_fast` of `Base.FastMath`.
+# In the `TrixiParticlesCUDAExt` extension, this is redefined for `Float64`.
+@inline function div_fast(x, y)
+    return Base.FastMath.div_fast(x, y)
+end
+
 # Same as `foreach`, but it is unrolled by the compiler for small input tuples
 @inline function foreach_noalloc(func, collection)
     element = first(collection)
diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl
index 44772f8943..a08f207dd4 100644
--- a/test/examples/gpu.jl
+++ b/test/examples/gpu.jl
@@ -279,10 +279,8 @@ end
                                                                      fluid_density_calculator=SummationDensity(),
                                                                      maxiters=38, # 38 time steps on CPU
                                                                      clip_negative_pressure=true),
-                # Broken due to https://github.com/JuliaGPU/CUDA.jl/issues/2681
-                # and https://github.com/JuliaGPU/Metal.jl/issues/550.
-                # "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1,
-                #                                                smoothing_kernel=SchoenbergQuarticSplineKernel{2}()),
+                "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1,
+                                                               smoothing_kernel=SchoenbergQuarticSplineKernel{2}()),
                 "WCSPH with SchoenbergQuinticSplineKernel" => (smoothing_length=1.1,
                                                                smoothing_kernel=SchoenbergQuinticSplineKernel{2}()),
                 "WCSPH with WendlandC2Kernel" => (smoothing_length=1.5,
diff --git a/test/general/density_calculator.jl b/test/general/density_calculator.jl
index 2ded394766..6bf895f7be 100644
--- a/test/general/density_calculator.jl
+++ b/test/general/density_calculator.jl
@@ -1,5 +1,3 @@
-using OrdinaryDiffEq
-
 # Setup a single particle and calculate its density
 @testset verbose=true "DensityCalculators" begin
     @testset verbose=true "SummationDensity" begin
diff --git a/test/general/smoothing_kernels.jl b/test/general/smoothing_kernels.jl
index fdd7f9a142..1acc72c554 100644
--- a/test/general/smoothing_kernels.jl
+++ b/test/general/smoothing_kernels.jl
@@ -1,11 +1,13 @@
 @testset verbose=true "Smoothing Kernels" begin
     # Don't show all kernel tests in the final overview
     @testset verbose=false "Integral" begin
-        # All smoothing kernels should integrate to something close to 1
+        # All smoothing kernels should integrate to something close to 1.
+        # We integrate slightly beyond the compact support to verify that the kernel is
+        # correctly evaluating to zero there.
         function integrate_kernel_2d(smk)
             integral_2d_radial,
             _ = quadgk(r -> r * TrixiParticles.kernel(smk, r, 1.0), 0,
-                       TrixiParticles.compact_support(smk, 1.0),
+                       TrixiParticles.compact_support(smk, 1.0) * 1.1,
                        rtol=1e-15)
             return 2 * pi * integral_2d_radial
         end
@@ -13,7 +15,7 @@
         function integrate_kernel_3d(smk)
             integral_3d_radial,
             _ = quadgk(r -> r^2 * TrixiParticles.kernel(smk, r, 1.0), 0,
-                       TrixiParticles.compact_support(smk, 1.0),
+                       TrixiParticles.compact_support(smk, 1.0) * 1.1,
                        rtol=1e-15)
             return 4 * pi * integral_3d_radial
         end
@@ -70,13 +72,19 @@
         # Test 4 different smoothing lengths
         smoothing_lengths = 0.25:0.25:1
 
-        @testset "$kernel_type" for kernel_type in kernels
-            for ndims in 2:3
+        for ndims in 2:3
+            @testset "$kernel_type{$ndims}" for kernel_type in kernels
                 kernel_ = kernel_type{ndims}()
 
                 for h in smoothing_lengths
+                    compact_support_ = TrixiParticles.compact_support(kernel_, h)
                     # Test 11 different radii
-                    radii = 0:(0.1h):(h + eps())
+                    radii = 0:(0.1 * compact_support_):(compact_support_ * 1.01)
+
+                    if kernel_ isa SpikyKernel
+                        # The Spiky kernel is not differentiable at r=0
+                        radii = radii[2:end]
+                    end
 
                     for r in radii
                         # Automatic differentation of `kernel`
diff --git a/test/schemes/fluid/viscosity.jl b/test/schemes/fluid/viscosity.jl
index 0e359ef5f1..ca50e6d5c1 100644
--- a/test/schemes/fluid/viscosity.jl
+++ b/test/schemes/fluid/viscosity.jl
@@ -25,12 +25,13 @@
         grad_kernel = TrixiParticles.smoothing_kernel_grad(system_wcsph, pos_diff,
                                                            distance, 1)
 
-        dv = viscosity(sound_speed, v_diff, pos_diff, distance,
-                       rho_mean, rho_a, rho_b, smoothing_length,
-                       grad_kernel, 0.0, 0.0)
+        dv = Ref(zero(v_diff))
+        viscosity(dv, sound_speed, v_diff, pos_diff, distance,
+                  rho_mean, rho_a, rho_b, smoothing_length,
+                  grad_kernel, 0.0, 0.0, 1.0)
 
-        @test isapprox(dv[1], -0.02049217623299368, atol=6e-15)
-        @test isapprox(dv[2], 0.03073826434949052, atol=6e-15)
+        @test isapprox(dv[][1], -0.02049217623299368, atol=6e-15)
+        @test isapprox(dv[][2], 0.03073826434949052, atol=6e-15)
     end
     @testset verbose=true "`ViscosityMorris`" begin
         nu = 7e-3
@@ -51,12 +52,16 @@
         v[1, 2] = 0.0
         v[2, 2] = 0.0
 
-        dv = viscosity(system_wcsph, system_wcsph,
-                       v, v, 1, 2, pos_diff, distance,
-                       sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+        v_a = v[:, 1]
+        v_b = v[:, 2]
 
-        @test isapprox(dv[1], -1.0895602048035404e-5, atol=6e-15)
-        @test isapprox(dv[2], 3.631867349345135e-5, atol=6e-15)
+        dv = Ref(zero(pos_diff))
+        viscosity(dv, system_wcsph, system_wcsph,
+                  v, v, 1, 2, pos_diff, distance,
+                  sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
+
+        @test isapprox(dv[][1], -1.0895602048035404e-5, atol=6e-15)
+        @test isapprox(dv[][2], 3.631867349345135e-5, atol=6e-15)
     end
     @testset verbose=true "`ViscosityAdami`" begin
         viscosity = ViscosityAdami(nu=7e-3)
@@ -76,12 +81,16 @@
         v[1, 2] = 0.0
         v[2, 2] = 0.0
 
-        dv = viscosity(system_wcsph, system_wcsph,
-                       v, v, 1, 2, pos_diff, distance,
-                       sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+        v_a = v[:, 1]
+        v_b = v[:, 2]
+
+        dv = Ref(zero(pos_diff))
+        viscosity(dv, system_wcsph, system_wcsph,
+                  v, v, 1, 2, pos_diff, distance,
+                  sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
 
-        @test isapprox(dv[1], -1.089560204803541e-5, atol=6e-15)
-        @test isapprox(dv[2], 3.6318673493451364e-5, atol=6e-15)
+        @test isapprox(dv[][1], -1.089560204803541e-5, atol=6e-15)
+        @test isapprox(dv[][2], 3.6318673493451364e-5, atol=6e-15)
     end
     @testset verbose=true "`ViscosityMorrisSGS`" begin
         nu = 7e-3
@@ -103,12 +112,16 @@
         v[1, 2] = 0.0
         v[2, 2] = 0.0
 
-        dv = viscosity(system_wcsph, system_wcsph,
-                       v, v, 1, 2, pos_diff, distance,
-                       sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+        v_a = v[:, 1]
+        v_b = v[:, 2]
+
+        dv = Ref(zero(pos_diff))
+        viscosity(dv, system_wcsph, system_wcsph,
+                  v, v, 1, 2, pos_diff, distance,
+                  sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
 
-        @test isapprox(dv[1], -2.032835697804103e-5, atol=6e-15)
-        @test isapprox(dv[2], 6.776118992680343e-5, atol=6e-15)
+        @test isapprox(dv[][1], -2.032835697804103e-5, atol=6e-15)
+        @test isapprox(dv[][2], 6.776118992680343e-5, atol=6e-15)
     end
     @testset verbose=true "`ViscosityAdamiSGS`" begin
         viscosity = ViscosityAdamiSGS(nu=7e-3)
@@ -128,12 +141,16 @@
         v[1, 2] = 0.0
         v[2, 2] = 0.0
 
-        dv = viscosity(system_wcsph, system_wcsph,
-                       v, v, 1, 2, pos_diff, distance,
-                       sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+        v_a = v[:, 1]
+        v_b = v[:, 2]
 
-        @test isapprox(dv[1], -2.0328356978041036e-5, atol=6e-15)
-        @test isapprox(dv[2], 6.776118992680346e-5, atol=6e-15)
+        dv = Ref(zero(pos_diff))
+        viscosity(dv, system_wcsph, system_wcsph,
+                  v, v, 1, 2, pos_diff, distance,
+                  sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
+
+        @test isapprox(dv[][1], -2.0328356978041036e-5, atol=6e-15)
+        @test isapprox(dv[][2], 6.776118992680346e-5, atol=6e-15)
     end
 
     @testset verbose=true "`ViscosityCarreauYasuda`" begin
@@ -161,12 +178,16 @@
 
         grad_kernel = TrixiParticles.smoothing_kernel_grad(system_wcsph, pos_diff,
                                                            distance, 1)
-        dv = viscosity(system_wcsph, system_wcsph,
-                       v, v, 1, 2, pos_diff, distance,
-                       sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+        v_a = v[:, 1]
+        v_b = v[:, 2]
+
+        dv = Ref(zero(pos_diff))
+        viscosity(dv, system_wcsph, system_wcsph,
+                  v, v, 1, 2, pos_diff, distance,
+                  sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
 
-        @test isapprox(dv[1], -1.0895602048035410e-5, atol=6e-15)
-        @test isapprox(dv[2], 3.6318673493451364e-5, atol=6e-15)
+        @test isapprox(dv[][1], -1.0895602048035410e-5, atol=6e-15)
+        @test isapprox(dv[][2], 3.6318673493451364e-5, atol=6e-15)
 
         # ------------------------------------------------------------------
         # 2) Shear-thinning case: fixed (precomputed) values
@@ -183,11 +204,15 @@
 
         grad_kernel = TrixiParticles.smoothing_kernel_grad(system_wcsph, pos_diff,
                                                            distance, 1)
-        dv = viscosity(system_wcsph, system_wcsph,
-                       v, v, 1, 2, pos_diff, distance,
-                       sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+        v_a = v[:, 1]
+        v_b = v[:, 2]
+
+        dv = Ref(zero(pos_diff))
+        viscosity(dv, system_wcsph, system_wcsph,
+                  v, v, 1, 2, pos_diff, distance,
+                  sound_speed, m_a, m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
 
-        @test isapprox(dv[1], -5.33743497379846e-9, atol=6e-15)
-        @test isapprox(dv[2], 1.7791449912661534e-8, atol=6e-15)
+        @test isapprox(dv[][1], -5.33743497379846e-9, atol=6e-15)
+        @test isapprox(dv[][2], 1.7791449912661534e-8, atol=6e-15)
     end
 end
diff --git a/test/schemes/structure/total_lagrangian_sph/rhs.jl b/test/schemes/structure/total_lagrangian_sph/rhs.jl
index 69f4a920c4..ed677d57bf 100644
--- a/test/schemes/structure/total_lagrangian_sph/rhs.jl
+++ b/test/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -96,7 +96,10 @@
             function TrixiParticles.add_acceleration!(_, _, ::MockSystem)
                 return nothing
             end
-            TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) = kernel_deriv
+            function TrixiParticles.smoothing_kernel_grad(::MockSystem,
+                                                          pos_diff, distance, particle)
+                return kernel_deriv * pos_diff / distance
+            end
             TrixiParticles.compact_support(::MockSystem, ::MockSystem) = 1000.0
             function TrixiParticles.current_coords(system::MockSystem, particle)
                 return TrixiParticles.current_coords(initial_coordinates, system, particle)
diff --git a/test/systems/tlsph_system.jl b/test/systems/tlsph_system.jl
index 1e7b1edff5..3d5c2ff570 100644
--- a/test/systems/tlsph_system.jl
+++ b/test/systems/tlsph_system.jl
@@ -184,10 +184,10 @@
 
                 Base.getindex(::Val{:mock_material_density}, ::Int64) = density
 
-                function TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _)
-                    return kernel_derivative
+                function TrixiParticles.smoothing_kernel_grad(::Val{:mock_system_tensor},
+                                                              pos_diff, distance, particle)
+                    return kernel_derivative * pos_diff / distance
                 end
-                Base.eps(::Type{Val{:mock_smoothing_length}}) = eps()
                 semi = DummySemidiscretization()
 
                 # Compute deformation gradient