Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand All @@ -37,14 +38,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
[weakdeps]
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
TrixiParticlesCUDAExt = "CUDA"

[compat]
Accessors = "0.1.43"
Adapt = "4"
CSV = "0.10"
CUDA = "5.9.1"
DataFrames = "1.6"
DelimitedFiles = "1"
DiffEqCallbacks = "4"
Expand All @@ -62,6 +66,7 @@ Polyester = "0.7.10"
ReadVTK = "0.2"
RecipesBase = "1"
Reexport = "1"
SIMD = "3.7.2"
SciMLBase = "2"
StaticArrays = "1"
Statistics = "1"
Expand Down
33 changes: 17 additions & 16 deletions docs/literate/src/tut_custom_kernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end

# By looking at the implementation of existing kernels in TrixiParticles.jl,
# we can see that a kernel implementation requires three functions.
# `TrixiParticles.kernel`, which is the kernel function itself,
# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function,
# and `TrixiParticles.compact_support`, which defines the compact support of the
# kernel in relation to the smoothing length.
# `TrixiParticles.kernel_unsafe`, which is the kernel function itself,
# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the
# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines
# the compact support of the kernel in relation to the smoothing length.
# The latter is relevant for determining the search radius of the neighborhood search.
function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h)
#
# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since
# this avoids an extra division in the hot loop and is robust near ``r=0``.
# The public function `TrixiParticles.kernel_deriv` is defined automatically by
# TrixiParticles from this method (and multiplies by ``r`` again when needed).
# In the unsafe functions, we do not check the compact support; this is handled in the
# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the
# compact support defined in `TrixiParticles.compact_support`.
function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h)
q = r / h

if q < 2
return 1 / (pi * h^2) * exp(-q^2)
end

return 0.0
return 1 / (pi * h^2) * exp(-q^2)
end

function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h)
function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h)
q = r / h

if q < 2
return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
end

return 0.0
kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
return kernel_deriv / r
end

TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h
Expand Down
32 changes: 32 additions & 0 deletions ext/TrixiParticlesCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module TrixiParticlesCUDAExt

using CUDA: CUDA
using TrixiParticles: TrixiParticles

# Use faster version of `div_fast` for `Float64` on CUDA.
# By default, `div_fast` translates to `Base.FastMath.div_fast`, but there is
# no fast division for `Float64` on CUDA, so we need to redefine it here to use the
# improved fast reciprocal defined below.
CUDA.@device_override TrixiParticles.div_fast(x, y::Float64) = x * fast_inv_cuda(y)

# Improved fast reciprocal for `Float64` by @Mikolaj-A-Kowalski, which is significantly
# more accurate than just calling "llvm.nvvm.rcp.approx.ftz.d" without the cubic iteration,
# while still being much faster than a full division.
# This is copied from Oceananigans.jl, see https://github.com/CliMA/Oceananigans.jl/pull/5140.
@inline function fast_inv_cuda(a::Float64)
# Get the approximate reciprocal
# https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
# This instruction chops off last 32bits of mantissa and computes inverse
# while treating all subnormal numbers as 0.0
# If reciprocal would be subnormal, underflows to 0.0
# 32 least significant bits of the result are filled with 0s
inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a)

# Approximate the missing 32bits of mantissa with a single cubic iteration
e = fma(inv_a, -a, 1.0)
e = fma(e, e, e)
inv_a = fma(e, inv_a, inv_a)
return inv_a
end

end # module
3 changes: 2 additions & 1 deletion src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ using FileIO: FileIO
using ForwardDiff: ForwardDiff
using GPUArraysCore: AbstractGPUArray
using JSON: JSON
using KernelAbstractions: KernelAbstractions, @kernel, @index
using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize
using LinearAlgebra: norm, normalize, cross, dot, I, tr, inv, pinv, det
using MuladdMacro: @muladd
using Polyester: Polyester, @batch
Expand All @@ -26,6 +26,7 @@ using Random: seed!
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
add_tstop!
using SIMD: vloada, Vec
@reexport using StaticArrays: SVector
using StaticArrays: @SMatrix, SMatrix, setindex
using Statistics: Statistics
Expand Down
26 changes: 23 additions & 3 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,30 @@ end
return kernel(smoothing_kernel, distance, smoothing_length(system, particle))
end

@inline function skip_zero_distance(system, distance, almostzero)
return skip_zero_distance(system_correction(system), system, distance, almostzero)
end

# Robust/safe version of the function below. In performance-critical code, manually check
# the kernel support, call `skip_zero_distance` and then `smoothing_kernel_grad_unsafe`.
@inline function smoothing_kernel_grad(system, pos_diff, distance, particle)
return corrected_kernel_grad(system_smoothing_kernel(system), pos_diff,
distance, smoothing_length(system, particle),
system_correction(system), system, particle)
h = smoothing_length(system, particle)
compact_support_ = compact_support(system_smoothing_kernel(system), h)

# Note that `sqrt(eps(h^2)) != eps(h)`
if distance > compact_support_ || skip_zero_distance(system, distance, sqrt(eps(h^2)))
return zero(pos_diff)
end

return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
end

# Skip the zero distance and compact support checks for maximum performance.
# Only call this when you are sure that `0 < distance < compact_support`.
@inline function smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
return corrected_kernel_grad_unsafe(system_smoothing_kernel(system), pos_diff,
distance, smoothing_length(system, particle),
system_correction(system), system, particle)
end

# System updates do nothing by default, but can be dispatched if needed
Expand Down
42 changes: 24 additions & 18 deletions src/general/corrections.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,16 @@ end
# `rho_mean` is the mean density of the fluid, which is used to determine correction values near the free surface.
# Return a tuple `(viscosity_correction, pressure_correction, surface_tension_correction)` representing the correction terms.
@inline function free_surface_correction(correction::AkinciFreeSurfaceCorrection,
particle_system, rho_mean)
particle_system, rho_a, rho_b)
# Equation 4 in ref
rho_mean = (rho_a + rho_b) / 2
k = correction.rho0 / rho_mean

# Viscosity, pressure, surface_tension
return k, 1, k
end

@inline function free_surface_correction(correction, particle_system, rho_mean)
@inline function free_surface_correction(correction, particle_system, rho_a, rho_b)
return 1, 1, 1
end

Expand Down Expand Up @@ -337,31 +338,36 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system,
v_neighbor_system = wrap_v(v_ode, neighbor_system, semi)

neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
almostzero = sqrt(eps(compact_support(system, neighbor_system)^2))

foreach_point_neighbor(system, neighbor_system, coordinates, neighbor_coords,
semi) do particle, neighbor, pos_diff, distance
volume = hydrodynamic_mass(neighbor_system, neighbor) /
current_density(v_neighbor_system, neighbor_system, neighbor)
smoothing_length_ = smoothing_length(system, particle)

function compute_grad_kernel(correction, smoothing_kernel, pos_diff, distance,
smoothing_length_, system, particle)
return smoothing_kernel_grad(system, pos_diff, distance, particle)
function kernel_grad_local(correction, smoothing_kernel, pos_diff, distance,
smoothing_length_, system, particle)
return smoothing_kernel_grad_unsafe(system, pos_diff, distance, particle)
end

# Compute gradient of corrected kernel
function compute_grad_kernel(correction::MixedKernelGradientCorrection,
smoothing_kernel, pos_diff, distance,
smoothing_length_, system, particle)
return corrected_kernel_grad(smoothing_kernel, pos_diff, distance,
smoothing_length_, KernelCorrection(), system,
particle)
function kernel_grad_local(correction::MixedKernelGradientCorrection,
smoothing_kernel, pos_diff, distance,
smoothing_length_, system, particle)
return corrected_kernel_grad_unsafe(smoothing_kernel, pos_diff, distance,
smoothing_length_, KernelCorrection(),
system, particle)
end

grad_kernel = compute_grad_kernel(correction, smoothing_kernel, pos_diff,
distance, smoothing_length_, system, particle)
# Skip neighbors with the same position if the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(correction, system, distance, almostzero) && return

iszero(grad_kernel) && return
# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
smoothing_length_ = smoothing_length(system, particle)
grad_kernel = kernel_grad_local(correction, smoothing_kernel, pos_diff,
distance, smoothing_length_, system, particle)

volume = hydrodynamic_mass(neighbor_system, neighbor) /
current_density(v_neighbor_system, neighbor_system, neighbor)

L = volume * grad_kernel * pos_diff'

Expand Down
4 changes: 3 additions & 1 deletion src/general/interpolation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,9 @@ end
other_density[point] += m_b * W_ab

if include_wall_velocity
velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor)
velocity_neighbor_ = current_velocity(v, neighbor_system, neighbor)
velocity_neighbor = viscous_velocity(v, neighbor_system, neighbor,
velocity_neighbor_)
for i in axes(velocity_neighbor, 1)
cache.velocity[i, point] += velocity_neighbor[i] * volume_b * W_ab
end
Expand Down
Loading
Loading