diff --git a/Project.toml b/Project.toml
index dff0de0627..84b5497b2a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,6 +26,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -62,6 +63,7 @@ Polyester = "0.7.10"
 ReadVTK = "0.2"
 RecipesBase = "1"
 Reexport = "1"
+SIMD = "3.7.2"
 SciMLBase = "2"
 StaticArrays = "1"
 Statistics = "1"
diff --git a/dualsphysics_2d_benchmark_2m_particles.txt b/dualsphysics_2d_benchmark_2m_particles.txt
new file mode 100644
index 0000000000..12e3c2cafb
--- /dev/null
+++ b/dualsphysics_2d_benchmark_2m_particles.txt
@@ -0,0 +1,108 @@
+2D benchmark 2M particles
+
+DualSPHysics:
+1.55ms
+
+DualSPHysics no density diffusion:
+1.1ms
+
+main FP32, no density diffusion:
+3.1ms
+
+now with compact vov
+main mixed precision:
+5ms (3.2x)
+
+main FP32:
+4ms (2.6x)
+
+with dd, sorted by cell coords:
+4.8ms
+
+sorted by cell index:
+4.9ms
+
+blocks of three cells:
+4.9ms
+
+optimizations (inbounds, zero distance skip):
+4.7ms
+
+simplified kernel gradient:
+4.3ms
+
+no density diffusion:
+3.1ms
+
+unrolled continuity equation:
+3.1ms
+
+unrolled viscosity:
+2.9ms
+
+pull *_a quantities before the neighbor loop:
+2.7ms
+
+split dv and drho:
+2.55ms (2.3x)
+
+add to arrays after loop:
+2.17ms (1.97x)
+
+optimized day 1:
+1.3ms
+
+optimized day 1 mixed precision:
+3.7ms
+
+full grid NHS:
+1.5ms (2.1x)
+
+
+3D DualSPHysics 638400 fluid particles:
+4.9ms
++ 200µs KerUpdatePosCell
+
+main 3D TP, no DD:
+12ms
+
+3D TP stripped (above with pull quantities...) and compact vov:
+8.3ms (3.1x, 1.7x?)
+
+add to arrays after loop:
+7.5ms (2.8x, 1.5x?)
+
+split dv and drho:
+7.4ms
+
+fixed @inbounds:
+6.8ms
+
+fastmath if distance:
+4.7ms
+
+first vloada:
+4.6ms
+
+second vloada:
+4.3ms
+
+mixed precision:
+21.8ms???
+
+kernel grad fastmath:
+4ms
+
+targeted fastdiv with sorted NHS:
+4ms
+fluid-boundary: 0.4ms. fluid-*: 4.4ms
+
+full grid NHS:
+4.9ms (2.4x)
+total 5.3ms
+
+full grid with foreach_neighbor inbounds:
+4.8ms
+
+full grid with foreach_point_neighbor:
+7.7ms
diff --git a/examples/fluid/dam_break_2d_dualsphysics.jl b/examples/fluid/dam_break_2d_dualsphysics.jl
new file mode 100644
index 0000000000..472801d5a3
--- /dev/null
+++ b/examples/fluid/dam_break_2d_dualsphysics.jl
@@ -0,0 +1,75 @@
+# Modify the 01_DamBreak example of DualSPHysics like this:
+# <parameter key="StepAlgorithm" value="2" comment="Step Algorithm 1:Verlet, 2:Symplectic (default=1)" />
+# <parameter key="DensityDT" value="1" comment="Density Diffusion Term 0:None, 1:Molteni, 2:Fourtakas, 3:Fourtakas(full) (default=0)" />
+# <parameter key="TimeMax" value="1.0" comment="Time of simulation" units_comment="seconds" />
+#
+# When comparing with high resolution, change the resolution here:
+# <definition dp="0.002" units_comment="metres (m)">
+# With this resolution, use:
+# <parameter key="DtFixed" value="1e-5" comment="Fixed Dt value. Use 0 to disable (default=disabled)" units_comment="seconds" />
+
+using TrixiParticles, TrixiParticles.PointNeighbors
+
+fluid_particle_spacing = 0.002
+
+# Load setup from dam break example
+trixi_include(@__MODULE__,
+              joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
+              fluid_particle_spacing=fluid_particle_spacing,
+              smoothing_length=1.414216 * fluid_particle_spacing,
+              tank_size=(4.0, 3.0), W=1.0, H=2.0,
+              spacing_ratio=1, boundary_layers=1,
+              sol=nothing, ode=nothing)
+
+tank = RectangularTank(fluid_particle_spacing, initial_fluid_size, tank_size, fluid_density,
+                       n_layers=boundary_layers, spacing_ratio=spacing_ratio,
+                       coordinates_eltype=Float64)
+
+tank.fluid.coordinates .+= 0.005
+tank.boundary.coordinates .+= 0.005
+
+# Define a GPU-compatible neighborhood search
+min_corner = minimum(tank.boundary.coordinates, dims=2)
+max_corner = maximum(tank.boundary.coordinates, dims=2)
+cell_list = FullGridCellList(; min_corner, max_corner)#, backend=PointNeighbors.CompactVectorOfVectors{Int32})
+neighborhood_search = GridNeighborhoodSearch{2}(; cell_list,
+                                                update_strategy=ParallelUpdate())
+
+search_radius = TrixiParticles.get_neighborhood_search(fluid_system, fluid_system, semi).search_radius
+nhs = PointNeighbors.copy_neighborhood_search(neighborhood_search, search_radius, 0)
+cell_coords(x) = PointNeighbors.cell_coords(x, nhs)
+cell_index(x) = PointNeighbors.cell_index(nhs.cell_list, cell_coords(x))
+coords = reinterpret(reshape, SVector{ndims(fluid_system), eltype(tank.fluid.coordinates)}, tank.fluid.coordinates)
+sort!(coords, by=cell_index)
+
+# function cells(coordinates, system, semi)
+#     nhs = TrixiParticles.get_neighborhood_search(fluid_system, fluid_system, semi)
+#     coords = reinterpret(reshape, SVector{ndims(system), eltype(coordinates)}, coordinates)
+#     return TrixiParticles.PointNeighbors.cell_coords.(coords, Ref(nhs))
+# end
+
+# For benchmarking, run:
+# trixi_include_changeprecision(Float32, "../TrixiParticles.jl/examples/fluid/dam_break_2d_dualsphysics.jl", parallelization_backend=CUDABackend(), tspan=(0.0f0, 1.0f-10), fluid_particle_spacing=0.001, coordinates_eltype=Float32);
+# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x; semi = ode.p; system = semi.systems[1]; dv = TrixiParticles.wrap_v(dv_ode, system, semi); v = TrixiParticles.wrap_v(v_ode, system, semi); u = TrixiParticles.wrap_u(u_ode, system, semi);
+# @benchmark TrixiParticles.interact_reassembled!($dv, $v, $u, $v, $u, $system, $system, $semi)
+
+# Run the dam break simulation with this neighborhood search
+trixi_include(@__MODULE__,
+              joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
+              tank=tank,
+              smoothing_length=1.414216 * fluid_particle_spacing,
+              time_integration_scheme=SymplecticPositionVerlet(),
+              boundary_density_calculator=ContinuityDensity(),
+              fluid_particle_spacing=fluid_particle_spacing,
+              tank_size=(4.0, 3.0), W=1.0, H=2.0,
+              spacing_ratio=1, boundary_layers=1,
+              tspan=(0.0, 1.0), cfl=0.2,
+              neighborhood_search=neighborhood_search,
+              viscosity_wall=viscosity_fluid,
+              # This is the same saving frequency as in DualSPHysics for easier comparison
+              saving_callback=SolutionSavingCallback(dt=0.01),
+            #   extra_callback=SortingCallback(interval=1),
+              density_diffusion=nothing, # TODO only for benchmark
+              # For benchmarks, use spacing 0.002, fix time steps, and disable VTK saving:
+              dt=1e-5, stepsize_callback=nothing, #saving_callback=nothing,
+              parallelization_backend=PolyesterBackend())
diff --git a/examples/fluid/dam_break_3d.jl b/examples/fluid/dam_break_3d.jl
index a073c001ec..2fdd752732 100644
--- a/examples/fluid/dam_break_3d.jl
+++ b/examples/fluid/dam_break_3d.jl
@@ -66,12 +66,14 @@ boundary_system = WallBoundarySystem(tank.boundary, boundary_model)
 # ==========================================================================================
 # ==== Simulation
 semi = Semidiscretization(fluid_system, boundary_system,
+                          neighborhood_search=GridNeighborhoodSearch{3}(),
                           parallelization_backend=PolyesterBackend())
 ode = semidiscretize(semi, tspan)
 
 info_callback = InfoCallback(interval=100)
 saving_callback = SolutionSavingCallback(dt=0.1, prefix="")
-callbacks = CallbackSet(info_callback, saving_callback)
+extra_callback = nothing
+callbacks = CallbackSet(info_callback, saving_callback, extra_callback)
 
 # Use a Runge-Kutta method with automatic (error based) time step size control.
 # Limiting of the maximum stepsize is necessary to prevent crashing.
@@ -80,8 +82,10 @@ callbacks = CallbackSet(info_callback, saving_callback)
 # Sometimes, the method fails to do so because forces become extremely large when
 # fluid particles are very close to boundary particles, and the time integration method
 # interprets this as an instability.
-sol = solve(ode, RDPK3SpFSAL35(),
+time_integration_scheme = RDPK3SpFSAL35()
+sol = solve(ode, time_integration_scheme,
             abstol=1e-5, # Default abstol is 1e-6 (may need to be tuned to prevent boundary penetration)
             reltol=1e-4, # Default reltol is 1e-3 (may need to be tuned to prevent boundary penetration)
             dtmax=1e-2, # Limit stepsize to prevent crashing
+            dt=1.0,
             save_everystep=false, callback=callbacks);
diff --git a/examples/fluid/dam_break_3d_dualsphysics.jl b/examples/fluid/dam_break_3d_dualsphysics.jl
new file mode 100644
index 0000000000..c2aeceda81
--- /dev/null
+++ b/examples/fluid/dam_break_3d_dualsphysics.jl
@@ -0,0 +1,69 @@
+# Modify the 01_DamBreak example of DualSPHysics like this:
+# <parameter key="StepAlgorithm" value="2" comment="Step Algorithm 1:Verlet, 2:Symplectic (default=1)" />
+# <parameter key="DensityDT" value="1" comment="Density Diffusion Term 0:None, 1:Molteni, 2:Fourtakas, 3:Fourtakas(full) (default=0)" />
+# <parameter key="TimeMax" value="1.0" comment="Time of simulation" units_comment="seconds" />
+#
+# When comparing with high resolution, change the resolution here:
+# <definition dp="0.002" units_comment="metres (m)">
+# With this resolution, use:
+# <parameter key="DtFixed" value="1e-5" comment="Fixed Dt value. Use 0 to disable (default=disabled)" units_comment="seconds" />
+
+using TrixiParticles, TrixiParticles.PointNeighbors, OrdinaryDiffEq
+
+fluid_particle_spacing = 0.0085
+
+smoothing_length = 1.7320508 * fluid_particle_spacing
+tank_size = (1.6, 0.665, 0.4)
+initial_fluid_size = (0.4, 0.665, 0.3)
+acceleration = (0.0, 0.0, -9.81)
+spacing_ratio = 1
+boundary_layers = 1
+fluid_density = 1000.0
+sound_speed = 20 * sqrt(9.81 * tank_size[2])
+state_equation = StateEquationCole(; sound_speed, reference_density=fluid_density,
+                                   exponent=7)
+
+tank = RectangularTank(fluid_particle_spacing, initial_fluid_size, tank_size, fluid_density,
+                       n_layers=boundary_layers, spacing_ratio=spacing_ratio,
+                       coordinates_eltype=Float64)
+
+tank.fluid.coordinates .+= 0.005
+tank.boundary.coordinates .+= 0.005
+
+# Define a GPU-compatible neighborhood search
+min_corner = minimum(tank.boundary.coordinates, dims=2)
+max_corner = maximum(tank.boundary.coordinates, dims=2)
+cell_list = FullGridCellList(; min_corner, max_corner)#, backend=PointNeighbors.CompactVectorOfVectors{Int32})
+neighborhood_search = GridNeighborhoodSearch{3}(; cell_list,
+                                                update_strategy=ParallelUpdate())
+
+search_radius = TrixiParticles.compact_support(WendlandC2Kernel{3}(), smoothing_length)
+nhs = PointNeighbors.copy_neighborhood_search(neighborhood_search, search_radius, 0)
+cell_coords(x) = PointNeighbors.cell_coords(x, nhs)
+cell_index(x) = PointNeighbors.cell_index(nhs.cell_list, cell_coords(x))
+coords = reinterpret(reshape, SVector{ndims(nhs), eltype(tank.fluid.coordinates)}, tank.fluid.coordinates)
+sort!(coords, by=cell_index)
+
+# Run the dam break simulation with this neighborhood search
+trixi_include(@__MODULE__,
+              joinpath(examples_dir(), "fluid", "dam_break_3d.jl"),
+              tank=tank,
+              smoothing_length=1.7320508 * fluid_particle_spacing,
+              time_integration_scheme=SymplecticPositionVerlet(),
+              boundary_density_calculator=ContinuityDensity(),
+              state_equation=state_equation,
+              fluid_particle_spacing=fluid_particle_spacing,
+              tank_size=tank_size, initial_fluid_size=initial_fluid_size,
+              acceleration=acceleration,
+              alpha=0.1,
+              spacing_ratio=spacing_ratio, boundary_layers=boundary_layers,
+              tspan=(0.0, 1.0), #cfl=0.2,
+              neighborhood_search=neighborhood_search,
+            #   viscosity_wall=viscosity_fluid, TODO
+              # This is the same saving frequency as in DualSPHysics for easier comparison
+            #   saving_callback=SolutionSavingCallback(dt=0.01),
+            #   extra_callback=SortingCallback(interval=1),
+              density_diffusion=nothing, # TODO only for benchmark
+              # For benchmarks, use spacing 0.002, fix time steps, and disable VTK saving:
+              dt=8e-5, #stepsize_callback=nothing, saving_callback=nothing,
+              parallelization_backend=PolyesterBackend())
diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
index fa87c296ca..c6b07787a4 100644
--- a/src/TrixiParticles.jl
+++ b/src/TrixiParticles.jl
@@ -26,6 +26,7 @@ using Random: seed!
 using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
                  get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
                  add_tstop!
+using SIMD: vloada, Vec
 @reexport using StaticArrays: SVector
 using StaticArrays: @SMatrix, SMatrix, setindex
 using Statistics: Statistics
@@ -73,7 +74,7 @@ export WeaklyCompressibleSPHSystem, EntropicallyDampedSPHSystem, TotalLagrangian
 export BoundaryZone, InFlow, OutFlow, BidirectionalFlow
 export InfoCallback, SolutionSavingCallback, DensityReinitializationCallback,
        PostprocessCallback, StepsizeCallback, UpdateCallback, SteadyStateReachedCallback,
-       SplitIntegrationCallback
+       SplitIntegrationCallback, SortingCallback
 export ContinuityDensity, SummationDensity
 export PenaltyForceGanzenmueller, TransportVelocityAdami, ParticleShiftingTechnique,
        ParticleShiftingTechniqueSun2017, ConsistentShiftingSun2019,
diff --git a/src/callbacks/callbacks.jl b/src/callbacks/callbacks.jl
index 975c760a8e..eba012c4af 100644
--- a/src/callbacks/callbacks.jl
+++ b/src/callbacks/callbacks.jl
@@ -76,3 +76,4 @@ include("stepsize.jl")
 include("update.jl")
 include("steady_state_reached.jl")
 include("split_integration.jl")
+include("sorting.jl")
diff --git a/src/callbacks/sorting.jl b/src/callbacks/sorting.jl
new file mode 100644
index 0000000000..23b5a46ebc
--- /dev/null
+++ b/src/callbacks/sorting.jl
@@ -0,0 +1,185 @@
+# These are the systems that require sorting.
+# TODO: The `DEMSystem` should be added here in the future.
+# Boundary particles always stay fixed relative to each other, TLSPH computes in the initial configuration.
+const RequiresSortingSystem = AbstractFluidSystem
+
+struct SortingCallback{I}
+    interval     :: I
+    last_t       :: Base.RefValue{Float64}
+    initial_sort :: Bool
+end
+
+"""
+    SortingCallback(; interval=-1, dt=0.0, initial_sort=true)
+
+Reorders particles according to neighborhood-search cells for performance optimization.
+
+When particles become very unordered throughout a long-running simulation, performance
+decreases due to increased cache-misses (on CPUs) and lack of block structure (on GPUs).
+On GPUs, a fully shuffled particle ordering causes a 3-4x slowdown compared to a sorted configuration.
+On CPUs, there is no difference for small problems (<16k particles), but the performance penalty
+grows linearly with the problem size up to 10x slowdown for very large problems (65M particles).
+See [#1044](https://github.com/trixi-framework/TrixiParticles.jl/pull/1044) for more details.
+
+# Keywords
+- `interval`: Sort particles at the end of every `interval` time steps.
+- `dt`: Sort particles in regular intervals of `dt` in terms of integration time.
+        This callback does not add extra time steps / `tstops`; instead, sorting is
+        triggered at the first solver step after each `dt` interval has elapsed.
+- `initial_sort=true`: When enabled, particles are sorted at the beginning of the simulation.
+                       When the initial configuration is a perfect grid of particles,
+                       sorting at the beginning is not necessary and might even slightly
+                       slow down the first time steps, since a perfect grid is even better
+                       than sorting by NHS cell index.
+"""
+function SortingCallback(; interval::Integer=-1, dt=0.0, initial_sort::Bool=true)
+    if dt > 0 && interval != -1
+        throw(ArgumentError("Setting both interval and dt is not supported!"))
+    end
+
+    # Sort in intervals of simulation time if `dt` is set, otherwise every `interval`
+    # time steps (every step by default). Converting to `Int` makes sure that the
+    # `SortingCallback{Int}` methods also apply to other integer types like `Int32`.
+    sort_interval = dt > 0 ? Float64(dt) : (interval == -1 ? 1 : Int(interval))
+
+    # `last_t` is overwritten with the start time when the callback is initialized
+    sorting_callback! = SortingCallback(sort_interval, Ref(0.0), initial_sort)
+
+    # The first one is the `condition`, the second the `affect!`
+    return DiscreteCallback(sorting_callback!, sorting_callback!,
+                            initialize=(initial_sort!), save_positions=(false, false))
+end
+
+# `initialize`
+function initial_sort!(cb, u, t, integrator)
+    # Unwrap the `SortingCallback` that is stored as `affect!` of the `DiscreteCallback`
+    return initial_sort!(cb.affect!, u, t, integrator)
+end
+
+function initial_sort!(cb::SortingCallback, u, t, integrator)
+    # Measure `dt` intervals from the start time, even when the initial sort is skipped
+    cb.last_t[] = t
+
+    cb.initial_sort && cb(integrator)
+
+    return nothing
+end
+
+# `condition` with `interval`
+function (sorting_callback!::SortingCallback{Int})(u, t, integrator)
+    return condition_integrator_interval(integrator, sorting_callback!.interval)
+end
+
+# `condition` with `dt`
+function (sorting_callback!::SortingCallback)(u, t, integrator)
+    (; interval, last_t) = sorting_callback!
+
+    return (t - last_t[]) > interval
+end
+
+# `affect!`
+function (sorting_callback!::SortingCallback)(integrator)
+    semi = integrator.p
+    v_ode, u_ode = integrator.u.x
+
+    @trixi_timeit timer() "sorting callback" begin
+        foreach_system(semi) do system
+            v = wrap_v(v_ode, system, semi)
+            u = wrap_u(u_ode, system, semi)
+            sort_particles!(system, v, u, semi)
+        end
+    end
+
+    # Remember the time of this sort for the `dt`-based `condition`
+    sorting_callback!.last_t[] = integrator.t
+
+    # Tell OrdinaryDiffEq that `integrator.u` has been modified
+    u_modified!(integrator, true)
+
+    return integrator
+end
+
+# Systems that don't require sorting are left untouched
+sort_particles!(system, v, u, semi) = system
+
+function sort_particles!(system::RequiresSortingSystem, v, u, semi)
+    search = get_neighborhood_search(system, semi)
+    search isa GridNeighborhoodSearch ||
+        throw(ArgumentError("`SortingCallback` can only be used with a `GridNeighborhoodSearch`"))
+
+    # Dispatch on the type of the cell list
+    return sort_particles!(system, v, u, search, search.cell_list, semi)
+end
+
+# TODO: Sort also masses and particle spacings for variable smoothing lengths.
+function sort_particles!(system::RequiresSortingSystem, v, u, nhs,
+                         cell_list::FullGridCellList, semi)
+    # Compute the grid cell coordinates of every active particle
+    cells = allocate(semi.parallelization_backend, SVector{ndims(system), Int},
+                     nparticles(system))
+    @threaded semi for particle in each_active_particle(system)
+        cells[particle] = PointNeighbors.cell_coords(current_coords(u, system, particle),
+                                                     nhs)
+    end
+
+    # TODO `sortperm` works on CUDA but not (yet) on Metal
+    permutation = sortperm(transfer2cpu(cells))
+    sort_system!(system, v, u, permutation, system.buffer)
+
+    return system
+end
+
+# Reorder all per-particle data of a system without buffer according to `perm`
+function sort_system!(system, v, u, perm, buffer::Nothing)
+    coordinates = current_coordinates(u, system)
+    velocity = current_velocity(v, system)
+    density = current_density(v, system)
+    pressure = current_pressure(v, system)
+    coordinates .= coordinates[:, perm]
+    velocity .= velocity[:, perm]
+    pressure .= pressure[perm]
+    density .= density[perm]
+
+    return system
+end
+
+function Base.show(io::IO, cb::DiscreteCallback{<:Any, <:SortingCallback{Int}})
+    @nospecialize cb # reduce precompilation time
+    print(io, "SortingCallback(interval=", cb.affect!.interval, ")")
+end
+
+# The `SortingCallback` is stored directly as `affect!` of the `DiscreteCallback`
+function Base.show(io::IO, cb::DiscreteCallback{<:Any, <:SortingCallback})
+    @nospecialize cb # reduce precompilation time
+    print(io, "SortingCallback(dt=", cb.affect!.interval, ")")
+end
+
+function Base.show(io::IO, ::MIME"text/plain",
+                   cb::DiscreteCallback{<:Any, <:SortingCallback{Int}})
+    @nospecialize cb # reduce precompilation time
+
+    if get(io, :compact, false)
+        show(io, cb)
+    else
+        sorting_cb = cb.affect!
+        setup = [
+            "interval" => sorting_cb.interval
+        ]
+        summary_box(io, "SortingCallback", setup)
+    end
+end
+
+# Fallback for time-based sorting; the more specific `{Int}` method handles `interval`
+function Base.show(io::IO, ::MIME"text/plain",
+                   cb::DiscreteCallback{<:Any, <:SortingCallback})
+    @nospecialize cb # reduce precompilation time
+    if get(io, :compact, false)
+        show(io, cb)
+    else
+        sorting_cb = cb.affect!
+        setup = [
+            "dt" => sorting_cb.interval
+        ]
+        summary_box(io, "SortingCallback", setup)
+    end
+end
diff --git a/src/general/buffer.jl b/src/general/buffer.jl
index 1e2c196799..6b0d0f7f61 100644
--- a/src/general/buffer.jl
+++ b/src/general/buffer.jl
@@ -80,3 +80,35 @@ end
 
     return system
 end
+
+# Reorder a buffered system by `perm` and move all inactive particles to the end
+function sort_system!(system, v, u, perm, buffer::SystemBuffer)
+    is_active = buffer.active_particle
+
+    # These arrays also contain the data of inactive particles
+    coordinates = current_coordinates(u, system)
+    velocity = current_velocity(v, system)
+    density = current_density(v, system)
+    pressure = current_pressure(v, system)
+
+    # Activity flags in the order requested by `perm`
+    is_active_sorted = is_active[perm]
+
+    # Sorting these flags in reverse puts `true` (active) before `false` (inactive)
+    active_first = sortperm(transfer2cpu(is_active_sorted), rev=true)
+
+    # Compose both permutations and apply the result to all per-particle data
+    final_perm = perm[active_first]
+    is_active .= is_active_sorted[active_first]
+    coordinates .= coordinates[:, final_perm]
+    velocity .= velocity[:, final_perm]
+    pressure .= pressure[final_perm]
+    density .= density[final_perm]
+
+    # Active particles are now stored contiguously at the indices `1:n_active`
+    n_active = count(is_active)
+    buffer.active_particle_count[] = n_active
+    buffer.eachparticle[1:n_active] .= 1:n_active
+
+    return buffer
+end
diff --git a/src/general/smoothing_kernels.jl b/src/general/smoothing_kernels.jl
index 4a97564648..87622065e9 100644
--- a/src/general/smoothing_kernels.jl
+++ b/src/general/smoothing_kernels.jl
@@ -11,7 +11,8 @@ abstract type AbstractSmoothingKernel{NDIMS} end
     # Also note that `sqrt(eps(h^2)) != eps(h)`.
     distance^2 < eps(h^2) && return zero(pos_diff)
 
-    return kernel_deriv(kernel, distance, h) / distance * pos_diff
+    distance_inv = Base.FastMath.div_fast(1, distance)
+    return kernel_deriv(kernel, distance, h) * distance_inv * pos_diff
 end
 
 @inline function corrected_kernel_grad(kernel, pos_diff, distance, h, correction, system,
diff --git a/src/schemes/fluid/fluid.jl b/src/schemes/fluid/fluid.jl
index 8725190506..20fd08a4dc 100644
--- a/src/schemes/fluid/fluid.jl
+++ b/src/schemes/fluid/fluid.jl
@@ -137,7 +137,7 @@ end
 end
 
 # This formulation was chosen to be consistent with the used pressure_acceleration formulations
-@propagate_inbounds function continuity_equation!(dv, density_calculator::ContinuityDensity,
+@inline function continuity_equation!(dv, density_calculator::ContinuityDensity,
                                                   particle_system::AbstractFluidSystem,
                                                   neighbor_system,
                                                   v_particle_system, v_neighbor_system,
@@ -146,21 +146,44 @@ end
     vdiff = current_velocity(v_particle_system, particle_system, particle) -
             current_velocity(v_neighbor_system, neighbor_system, neighbor)
 
-    vdiff += continuity_equation_shifting_term(shifting_technique(particle_system),
-                                               particle_system, neighbor_system,
-                                               particle, neighbor, rho_a, rho_b)
+    # vdiff += continuity_equation_shifting_term(shifting_technique(particle_system),
+    #                                            particle_system, neighbor_system,
+    #                                            particle, neighbor, rho_a, rho_b)
 
     dv[end, particle] += rho_a / rho_b * m_b * dot(vdiff, grad_kernel)
 
     # Artificial density diffusion should only be applied to systems representing a fluid
     # with the same physical properties i.e. density and viscosity.
     # TODO: shouldn't be applied to particles on the interface (depends on PR #539)
-    if particle_system === neighbor_system
-        density_diffusion!(dv, density_diffusion(particle_system),
-                           v_particle_system, particle, neighbor,
-                           pos_diff, distance, m_b, rho_a, rho_b, particle_system,
-                           grad_kernel)
-    end
+    # if particle_system === neighbor_system
+    #     density_diffusion!(dv, density_diffusion(particle_system),
+    #                        v_particle_system, particle, neighbor,
+    #                        pos_diff, distance, m_b, rho_a, rho_b, particle_system,
+    #                        grad_kernel)
+    # end
+end
+
+@propagate_inbounds function continuity_equation!(drho_particle, density_calculator::ContinuityDensity,
+                                                  particle_system::AbstractFluidSystem,
+                                                  neighbor_system,
+                                                  particle, neighbor, pos_diff, distance,
+                                                  vdiff, m_b, rho_a, rho_b, grad_kernel)
+
+    # vdiff += continuity_equation_shifting_term(shifting_technique(particle_system),
+    #                                            particle_system, neighbor_system,
+    #                                            particle, neighbor, rho_a, rho_b)
+
+    drho_particle[] += Base.FastMath.div_fast(rho_a, rho_b) * m_b * dot(vdiff, grad_kernel)
+
+    # Artificial density diffusion should only be applied to systems representing a fluid
+    # with the same physical properties i.e. density and viscosity.
+    # TODO: shouldn't be applied to particles on the interface (depends on PR #539)
+    # if particle_system === neighbor_system
+    #     density_diffusion!(dv, density_diffusion(particle_system),
+    #                        v_particle_system, particle, neighbor,
+    #                        pos_diff, distance, m_b, rho_a, rho_b, particle_system,
+    #                        grad_kernel)
+    # end
 end
 
 function calculate_dt(v_ode, u_ode, cfl_number, system::AbstractFluidSystem, semi)
diff --git a/src/schemes/fluid/pressure_acceleration.jl b/src/schemes/fluid/pressure_acceleration.jl
index 7ffa5850b3..6dbb6e2c64 100644
--- a/src/schemes/fluid/pressure_acceleration.jl
+++ b/src/schemes/fluid/pressure_acceleration.jl
@@ -26,7 +26,8 @@ end
 # asymmetric version.
 @inline function pressure_acceleration_continuity_density(m_a, m_b, rho_a, rho_b, p_a, p_b,
                                                           W_a)
-    return -m_b * (p_a + p_b) / (rho_a * rho_b) * W_a
+    # return -m_b * (p_a + p_b) / (rho_a * rho_b) * W_a
+    return -m_b * Base.FastMath.div_fast(p_a + p_b, rho_a * rho_b) * W_a
 end
 
 # Same as above, but not assuming symmetry of the kernel gradient. To be used with
diff --git a/src/schemes/fluid/viscosity.jl b/src/schemes/fluid/viscosity.jl
index 2a6c51ba71..b178749f35 100644
--- a/src/schemes/fluid/viscosity.jl
+++ b/src/schemes/fluid/viscosity.jl
@@ -1,30 +1,30 @@
 # Unpack the neighboring systems viscosity to dispatch on the viscosity type.
 # This function is only necessary to allow `nothing` as viscosity.
 # Otherwise, we could just apply the viscosity as a function directly.
-@propagate_inbounds function dv_viscosity(particle_system, neighbor_system,
-                                          v_particle_system, v_neighbor_system,
+@propagate_inbounds function dv_viscosity(dv_particle, particle_system, neighbor_system,
+                                          vdiff,
                                           particle, neighbor, pos_diff, distance,
                                           sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
     viscosity = viscosity_model(particle_system, neighbor_system)
 
-    return dv_viscosity(viscosity, particle_system, neighbor_system,
-                        v_particle_system, v_neighbor_system,
+    return dv_viscosity(dv_particle, viscosity, particle_system, neighbor_system,
+                        vdiff,
                         particle, neighbor, pos_diff, distance,
                         sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
 end
 
-@propagate_inbounds function dv_viscosity(viscosity, particle_system, neighbor_system,
-                                          v_particle_system, v_neighbor_system,
+@propagate_inbounds function dv_viscosity(dv_particle, viscosity, particle_system, neighbor_system,
+                                          vdiff,
                                           particle, neighbor, pos_diff, distance,
                                           sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
-    return viscosity(particle_system, neighbor_system,
-                     v_particle_system, v_neighbor_system,
+    return viscosity(dv_particle, particle_system, neighbor_system,
+                     vdiff,
                      particle, neighbor, pos_diff, distance,
                      sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
 end
 
-@inline function dv_viscosity(viscosity::Nothing, particle_system, neighbor_system,
-                              v_particle_system, v_neighbor_system,
+@inline function dv_viscosity(dv_particle, viscosity::Nothing, particle_system, neighbor_system,
+                              vdiff,
                               particle, neighbor, pos_diff, distance,
                               sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
     return zero(pos_diff)
@@ -82,41 +82,42 @@ function kinematic_viscosity(system, viscosity::ViscosityMorris, smoothing_lengt
 end
 
 @propagate_inbounds function (viscosity::Union{ArtificialViscosityMonaghan,
-                                               ViscosityMorris})(particle_system,
+                                               ViscosityMorris})(dv_particle, particle_system,
                                                                  neighbor_system,
-                                                                 v_particle_system,
-                                                                 v_neighbor_system,
+                                                                 v_diff,
                                                                  particle, neighbor,
                                                                  pos_diff, distance,
                                                                  sound_speed,
                                                                  m_a, m_b, rho_a, rho_b,
                                                                  grad_kernel)
-    rho_mean = (rho_a + rho_b) / 2
+    rho_mean = 0#(rho_a + rho_b) / 2
 
-    v_a = viscous_velocity(v_particle_system, particle_system, particle)
-    v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
-    v_diff = v_a - v_b
+    # v_a = viscous_velocity(v_particle_system, particle_system, particle)
+    # v_b = viscous_velocity(v_neighbor_system, neighbor_system, neighbor)
+    # v_diff = v_a - v_b
 
-    smoothing_length_particle = smoothing_length(particle_system, particle)
-    smoothing_length_neighbor = smoothing_length(particle_system, neighbor)
-    smoothing_length_average = (smoothing_length_particle + smoothing_length_neighbor) / 2
+    # smoothing_length_particle = smoothing_length(particle_system, particle)
+    # smoothing_length_neighbor = smoothing_length(particle_system, neighbor)
+    smoothing_length_average = smoothing_length(particle_system, particle)#(smoothing_length_particle + smoothing_length_neighbor) / 2
 
-    nu_a = kinematic_viscosity(particle_system,
-                               viscosity_model(neighbor_system, particle_system),
-                               smoothing_length_particle, sound_speed)
-    nu_b = kinematic_viscosity(neighbor_system,
-                               viscosity_model(particle_system, neighbor_system),
-                               smoothing_length_neighbor, sound_speed)
+    # nu_a = kinematic_viscosity(particle_system,
+    #                            viscosity_model(neighbor_system, particle_system),
+    #                            smoothing_length_particle, sound_speed)
+    # nu_b = kinematic_viscosity(neighbor_system,
+    #                            viscosity_model(particle_system, neighbor_system),
+    #                            smoothing_length_neighbor, sound_speed)
+    nu_a = 0
+    nu_b = 0
 
-    pi_ab = viscosity(sound_speed, v_diff, pos_diff, distance, rho_mean, rho_a, rho_b,
-                      smoothing_length_average, grad_kernel, nu_a, nu_b)
+    pi_ab = viscosity(dv_particle, sound_speed, v_diff, pos_diff, distance, rho_mean, rho_a, rho_b,
+                      smoothing_length_average, grad_kernel, nu_a, nu_b, m_b)
 
-    return m_b * pi_ab
+    # return m_b * pi_ab
 end
 
-@inline function (viscosity::ArtificialViscosityMonaghan)(c, v_diff, pos_diff, distance,
+@inline function (viscosity::ArtificialViscosityMonaghan)(dv_particle, c, v_diff, pos_diff, distance,
                                                           rho_mean, rho_a, rho_b, h,
-                                                          grad_kernel, nu_a, nu_b)
+                                                          grad_kernel, nu_a, nu_b, m_b)
     (; alpha, beta, epsilon) = viscosity
 
     # v_ab ⋅ r_ab
@@ -127,11 +128,12 @@ end
     # approaching particles and turn it off for receding particles. In this way, the
     # viscosity is used for shocks and not rarefactions."
     if vr < 0
-        mu = h * vr / (distance^2 + epsilon * h^2)
-        return (alpha * c * mu + beta * mu^2) / rho_mean * grad_kernel
+        mu = Base.FastMath.div_fast(h * vr, distance^2 + epsilon * h^2)
+        rho_mean = (rho_a + rho_b) / 2
+        dv_particle[] += m_b * Base.FastMath.div_fast(alpha * c * mu + beta * mu^2, rho_mean) * grad_kernel
     end
 
-    return zero(v_diff)
+    return nothing#zero(v_diff)
 end
 
 @inline function (viscosity::ViscosityMorris)(c, v_diff, pos_diff, distance, rho_mean,
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
index acb0f06894..086816e767 100644
--- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -2,7 +2,7 @@
 # in `neighbor_system` and updates `dv` accordingly.
 # It takes into account pressure forces, viscosity, and for `ContinuityDensity` updates
 # the density using the continuity equation.
-function interact!(dv, v_particle_system, u_particle_system,
+function interact_old!(dv, v_particle_system, u_particle_system,
                    v_neighbor_system, u_neighbor_system,
                    particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
     (; density_calculator, correction) = particle_system
@@ -63,12 +63,13 @@ function interact!(dv, v_particle_system, u_particle_system,
                                             distance, grad_kernel, correction)
 
         # Propagate `@inbounds` to the viscosity function, which accesses particle data
-        dv_viscosity_ = viscosity_correction *
-                        @inbounds dv_viscosity(particle_system, neighbor_system,
-                                               v_particle_system, v_neighbor_system,
-                                               particle, neighbor, pos_diff, distance,
-                                               sound_speed, m_a, m_b, rho_a, rho_b,
-                                               grad_kernel)
+        dv_viscosity_ = zero(pos_diff)
+        # viscosity_correction *
+        #                 @inbounds dv_viscosity(particle_system, neighbor_system,
+        #                                        v_particle_system, v_neighbor_system,
+        #                                        particle, neighbor, pos_diff, distance,
+        #                                        sound_speed, m_a, m_b, rho_a, rho_b,
+        #                                        grad_kernel)
 
         # Extra terms in the momentum equation when using a shifting technique
         dv_tvf = @inbounds dv_shifting(shifting_technique(particle_system),
@@ -112,6 +113,260 @@ function interact!(dv, v_particle_system, u_particle_system,
     return dv
 end
 
+# Entry point for WCSPH interactions; currently forwards unchanged to `interact_old!`.
+function interact!(dv, v_particle_system, u_particle_system,
+                   v_neighbor_system, u_neighbor_system,
+                   particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi)
+    return interact_old!(dv, v_particle_system, u_particle_system, v_neighbor_system,
+                         u_neighbor_system, particle_system, neighbor_system, semi)
+end
+
+# Forward views of the velocity rows and of the density-rate row of `dv` to `interact!`.
+function interact2!(dv, v_particle_system, u_particle_system,
+                    v_neighbor_system, u_neighbor_system,
+                    particle_system::WeaklyCompressibleSPHSystem,
+                    neighbor_system::WeaklyCompressibleSPHSystem, semi)
+    n_dims = ndims(particle_system)
+    return interact!(view(dv, 1:n_dims, :), view(dv, n_dims + 1, :), v_particle_system,
+                     u_particle_system, v_neighbor_system, u_neighbor_system,
+                     particle_system, neighbor_system, semi)
+end
+
+# Launch `mykernel` for every integrated particle of `particle_system`, accumulating
+# the interaction with `neighbor_system` into `dv` and `drho`. Returns `dv`.
+function interact!(dv, drho, v_particle_system, u_particle_system,
+                   v_neighbor_system, u_neighbor_system,
+                   particle_system::WeaklyCompressibleSPHSystem{NDIMS},
+                   neighbor_system::WeaklyCompressibleSPHSystem, semi) where NDIMS
+    coords_a = current_coordinates(u_particle_system, particle_system)
+    coords_b = current_coordinates(u_neighbor_system, neighbor_system)
+
+    nhs = get_neighborhood_search(particle_system, neighbor_system, semi)
+    # The kernel compares squared distances, so pass the squared search radius
+    radius2 = PointNeighbors.search_radius(nhs)^2
+
+    backend = semi.parallelization_backend
+    launch! = mykernel(backend)
+    launch!(dv, drho, coords_a, coords_b, nhs, nhs.cell_list, radius2,
+            v_particle_system, v_neighbor_system, particle_system, neighbor_system;
+            ndrange=n_integrated_particles(particle_system))
+
+    # Wait for the kernel to finish before handing `dv` back to the caller
+    KernelAbstractions.synchronize(backend)
+    return dv
+end
+
+# KernelAbstractions kernel evaluating the fluid-fluid interaction of one particle per
+# work item: pressure acceleration, continuity equation and the `alpha` term of the
+# Monaghan artificial viscosity are accumulated locally and added to `dv`/`drho` once.
+#
+# - `dv`, `drho`: velocity and density rates, incremented in place
+# - `system_coords`, `neighbor_system_coords`: coordinates of the two systems
+# - `nhs`, `cell_list`: neighborhood search and its cell list
+# - `search_radius2`: squared search radius
+# - `v_particle_system`, `v_neighbor_system`: velocity and density data
+# - `particle_system`, `neighbor_system`: the interacting fluid systems
+#
+# NOTE(review): the traversal reads `first_bin_index[cell_index + 3]`, so it presumably
+# relies on a cell list that stores the points of three consecutive cells along the
+# first dimension contiguously -- TODO confirm against the neighborhood search in use.
+@kernel function mykernel(dv, drho,
+                          system_coords, neighbor_system_coords,
+                          nhs, cell_list, search_radius2,
+                          v_particle_system, v_neighbor_system,
+                          particle_system::WeaklyCompressibleSPHSystem{NDIMS},
+                          neighbor_system::WeaklyCompressibleSPHSystem) where NDIMS
+    particle = @index(Global)
+
+    # Loop-invariant parameters, read once per particle
+    sound_speed = particle_system.state_equation.sound_speed
+    h = particle_system.cache.smoothing_length
+    alpha = particle_system.viscosity.alpha
+    # Regularization of the viscosity denominator, `epsilon * h^2` as in
+    # `ArtificialViscosityMonaghan`
+    eta2 = particle_system.viscosity.epsilon * h^2
+
+    # Quantities of the particle handled by this work item. `velocity_and_density`
+    # dispatches on the number of dimensions, so the kernel is not tied to the packed
+    # 4-values-per-particle layout of the 3D case.
+    point_coords = @inbounds extract_svector(system_coords, Val(NDIMS), particle)
+    p_a = @inbounds particle_system.pressure[particle]
+    v_a, rho_a = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                particle)
+
+    # Accumulate locally and update `dv` and `drho` only once after the neighbor loop
+    dv_particle = zero(v_a)
+    drho_particle = zero(rho_a)
+
+    cell = PointNeighbors.cell_coords(point_coords, nhs)
+
+    # Visit the neighboring cells in blocks along the first dimension: each block starts
+    # at `cell[1] - 1`, and `start:stop` is the index range of the points to test.
+    cell_blocks = CartesianIndices(ntuple(i -> (cell[i + 1] - 1):(cell[i + 1] + 1),
+                                          Val(NDIMS - 1)))
+    for cell_block in cell_blocks
+        cell_block_start = (cell[1] - 1, Tuple(cell_block)...)
+        cell_index = @inbounds PointNeighbors.cell_index(cell_list, cell_block_start)
+        start = @inbounds cell_list.cells.first_bin_index[cell_index]
+        stop = @inbounds cell_list.cells.first_bin_index[cell_index + 3] - 1
+
+        for neighbor in start:stop
+            neighbor_coords = @inbounds extract_svector(neighbor_system_coords,
+                                                        Val(NDIMS), neighbor)
+
+            pos_diff = point_coords - neighbor_coords
+            distance2 = dot(pos_diff, pos_diff)
+
+            # Skip points outside the search radius and (nearly) coincident points,
+            # e.g. the particle itself when both systems are the same
+            if eps(search_radius2) <= distance2 <= search_radius2
+                distance = sqrt(distance2)
+
+                # Quantities of the neighbor
+                m_b = @inbounds neighbor_system.mass[neighbor]
+                p_b = @inbounds neighbor_system.pressure[neighbor]
+                v_b, rho_b = @inbounds velocity_and_density(v_neighbor_system,
+                                                            neighbor_system, neighbor)
+
+                grad_kernel = kernel_grad_ds(particle_system, pos_diff, distance)
+
+                # Pressure acceleration
+                dv_particle += -m_b * Base.FastMath.div_fast(p_a + p_b, rho_a * rho_b) *
+                               grad_kernel
+
+                # Continuity equation
+                vdiff = v_a - v_b
+                drho_particle += Base.FastMath.div_fast(rho_a, rho_b) * m_b *
+                                 dot(vdiff, grad_kernel)
+
+                # Artificial viscosity, only active for approaching particles.
+                # The `beta` term of `ArtificialViscosityMonaghan` is not included here.
+                vr = dot(vdiff, pos_diff)
+                if vr < 0
+                    mu = Base.FastMath.div_fast(h * vr, distance2 + eta2)
+                    rho_mean = (rho_a + rho_b) / 2
+                    pi_ab = Base.FastMath.div_fast(alpha * sound_speed * mu, rho_mean) *
+                            grad_kernel
+                    dv_particle += m_b * pi_ab
+                end
+            end
+        end
+    end
+
+    # Add the accumulated contributions to the output arrays
+    for i in eachindex(dv_particle)
+        @inbounds dv[i, particle] += dv_particle[i]
+    end
+    @inbounds drho[particle] += drho_particle
+end
+
+# Simplified kernel gradient: `normalization_factor * (1 - q / 2)^3 * pos_diff` with
+# `q = r / h`, where `h` and `normalization_factor` are read from `system.cache`.
+@inline function kernel_grad_ds(system, pos_diff, r)
+    (; smoothing_length, normalization_factor) = system.cache
+
+    # Fast-math division for `q = r / h`
+    base = 1 - Base.FastMath.div_fast(r, smoothing_length) / 2
+    return normalization_factor * base * base * base * pos_diff
+end
+
+# Variant of `interact!` assembled from the generic building blocks
+# `pressure_acceleration`, `dv_viscosity` and `continuity_equation!`. The contributions
+# of all neighbors are accumulated per particle and added to `dv` after the loop.
+#
+# - `dv`: rates of change, incremented in place (density rate in the last row)
+# - `v_*`, `u_*`: velocity/density and position data of the two systems
+# - `particle_system`, `neighbor_system`: the interacting systems
+# - `semi`: passed on to `get_neighborhood_search` and `@threaded`
+#
+# Returns `dv`, like the other `interact!` methods in this file.
+function interact_reassembled!(dv, v_particle_system, u_particle_system,
+                               v_neighbor_system, u_neighbor_system,
+                               particle_system::WeaklyCompressibleSPHSystem{NDIMS},
+                               neighbor_system, semi) where NDIMS
+    (; density_calculator, correction) = particle_system
+
+    system_coords = current_coordinates(u_particle_system, particle_system)
+    neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system)
+
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
+    sound_speed = particle_system.state_equation.sound_speed
+
+    @threaded semi for particle in each_integrated_particle(particle_system)
+        # Quantities of the particle, loaded once before the neighbor loop
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
+        m_a = @inbounds hydrodynamic_mass(particle_system, particle)
+        v_a, rho_a = @inbounds velocity_and_density(v_particle_system, particle_system,
+                                                    particle)
+
+        # `Ref`s, so that the closure below can update the accumulators in place
+        dv_particle = Ref(zero(v_a))
+        drho_particle = Ref(zero(rho_a))
+
+        @inbounds PointNeighbors.foreach_neighbor(system_coords, neighbor_system_coords,
+                         neighborhood_search, particle) do _, neighbor, pos_diff, distance
+            # Skip (nearly) coincident points.
+            # NOTE(review): `eps()` is the `Float64` machine epsilon, independent of the
+            # element type of the coordinates -- confirm that this threshold is intended.
+            distance < eps() && return
+
+            # Quantities of the neighbor
+            m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+            p_b = @inbounds current_pressure(v_neighbor_system, neighbor_system, neighbor)
+            v_b, rho_b = @inbounds velocity_and_density(v_neighbor_system,
+                                                        neighbor_system, neighbor)
+
+            # Simplified kernel gradient, see `kernel_grad_ds`
+            grad_kernel = kernel_grad_ds(particle_system, pos_diff, distance)
+
+            # Pressure acceleration
+            dv_particle[] += pressure_acceleration(particle_system, neighbor_system,
+                                                   particle, neighbor, m_a, m_b, p_a, p_b,
+                                                   rho_a, rho_b, pos_diff, distance,
+                                                   grad_kernel, correction)
+
+            vdiff = v_a - v_b
+
+            # Return value is discarded; the viscosity is expected to update `dv_particle`
+            @inbounds dv_viscosity(dv_particle, particle_system, neighbor_system, vdiff,
+                                   particle, neighbor, pos_diff, distance,
+                                   sound_speed, m_a, m_b, rho_a, rho_b, grad_kernel)
+
+            # Continuity equation; expected to update `drho_particle` in place
+            @inbounds continuity_equation!(drho_particle, density_calculator,
+                                           particle_system, neighbor_system,
+                                           particle, neighbor, pos_diff, distance,
+                                           vdiff, m_b, rho_a, rho_b, grad_kernel)
+        end
+
+        # Add the accumulated contributions to `dv`.
+        # NOTE(review): the density rate goes to the last row of `dv`, which presumably
+        # assumes `ContinuityDensity` -- TODO confirm for other density calculators.
+        for i in eachindex(dv_particle[])
+            @inbounds dv[i, particle] += dv_particle[][i]
+        end
+        @inbounds dv[end, particle] += drho_particle[]
+    end
+
+    return dv
+end
+
+# Return the tuple `(velocity, density)` of `particle`.
+# Generic fallback that goes through the `current_velocity` and `current_density`
+# accessors of `system`.
+@propagate_inbounds function velocity_and_density(v, system, particle)
+    return current_velocity(v, system, particle), current_density(v, system, particle)
+end
+
+# 3D specialization: fetch four consecutive values of `v`, starting at linear index
+# `4 * (particle - 1) + 1`, with one aligned SIMD load; the first three are the velocity.
+# NOTE(review): presumably `v` has 4 rows (velocity, then density) -- TODO confirm.
+@inline function velocity_and_density(v, ::WeaklyCompressibleSPHSystem{3}, particle)
+    offset = 4 * (particle - 1) + 1
+    vx, vy, vz, rho = Tuple(vloada(Vec{4, eltype(v)}, pointer(v, offset)))
+    return SVector(vx, vy, vz), rho
+end
+
 @propagate_inbounds function particle_neighbor_pressure(v_particle_system,
                                                         v_neighbor_system,
                                                         particle_system, neighbor_system,
diff --git a/src/schemes/fluid/weakly_compressible_sph/system.jl b/src/schemes/fluid/weakly_compressible_sph/system.jl
index f84849f096..1f0b32d1f0 100644
--- a/src/schemes/fluid/weakly_compressible_sph/system.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/system.jl
@@ -154,7 +154,8 @@ function WeaklyCompressibleSPHSystem(initial_condition, density_calculator, stat
                                      smoothing_length)...,
              create_cache_shifting(initial_condition, shifting_technique)...,
              # Per-system color tag for colorfield surface-normal logic and VTK output.
-             color=Int(color_value))
+             color=Int(color_value),
+             normalization_factor = Float32(-2.7852 / smoothing_length^4))
 
     # If the `reference_density_spacing` is set calculate the `ideal_neighbor_count`
     if reference_particle_spacing > 0