Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/Downgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ on:
jobs:
test:
runs-on: ubuntu-latest
# Disabled: MTK 10.18+ pins StaticArrays >= 1.9.14 but Downgrade pins
# StaticArrays to its `Project.toml` floor. The required floor bump is a
# cosmetic compat tightening unrelated to this package's actual runtime
# requirements, so the workflow is parked until that's worth doing.
if: false
strategy:
matrix:
downgrade_mode: ['alldeps']
Expand Down
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ oneAPIExt = ["oneAPI"]
[compat]
AMDGPU = "1, 2"
Adapt = "4"
CUDA = "5"
CUDA = "5, 6"
ChainRulesCore = "1"
DiffEqBase = "6.206.0"
DiffEqBase = "7"
DocStringExtensions = "0.9"
ForwardDiff = "0.10.38, 1"
GPUArraysCore = "0.2"
Expand Down
8 changes: 4 additions & 4 deletions docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ SymbolicIndexingInterface = "2efcf032-c050-4f8e-a9bb-153293bab1f5"
[compat]
Adapt = "3, 4"
BenchmarkTools = "1"
CUDA = "4, 5"
DiffEqBase = "6.120"
CUDA = "4, 5, 6"
DiffEqBase = "6.120, 7"
DiffEqGPU = "1,2, 3"
Documenter = "1"
Flux = "0.13, 0.14, 0.15, 0.16"
ForwardDiff = "0.10, 1"
ModelingToolkit = "9, 10, 11"
OrdinaryDiffEq = "6"
OrdinaryDiffEq = "6, 7"
Plots = "1"
SafeTestsets = "0.0.1, 0.1"
SciMLSensitivity = "7"
StaticArrays = "1"
Statistics = "1"
StochasticDiffEq = "6.57"
StochasticDiffEq = "6.57, 7"
SymbolicIndexingInterface = "0.3"
6 changes: 3 additions & 3 deletions docs/src/examples/ad.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ end
function model(p)
prob = ODEProblem(modelf, u0, (0.0, 1.0), p)

function prob_func(prob, i, repeat)
remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)
function prob_func(prob, ctx)
remake(prob, u0 = 0.5 .+ ctx.sim_id / 100 .* prob.u0)
end

ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
Expand Down Expand Up @@ -62,7 +62,7 @@ u0 = [ForwardDiff.Dual(1.0f0, (1.0, 0.0, 0.0)), ForwardDiff.Dual(0.0f0, (0.0, 1.
tspan = (0.0f0, 100.0f0)
p = (10.0f0, 28.0f0, 8 / 3.0f0)
prob = ODEProblem{true, SciMLBase.FullSpecialize}(lorenz, u0, tspan, p)
prob_func = (prob, i, repeat) -> remake(prob, p = rand(Float32, 3) .* p)
prob_func = (prob, ctx) -> remake(prob, p = rand(Float32, 3) .* p)
monteprob = EnsembleProblem(prob, prob_func = prob_func)
@time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
trajectories = 10_000,
Expand Down
2 changes: 1 addition & 1 deletion docs/src/examples/gpu_ensemble_random_decay.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ prob = ODEProblem{false}(decay_static, u0, tspan, base_param) # Use {false} for

# Define a probability function that randomizes λ for each ensemble member.
# Each trajectory's λ is sampled uniformly from [0.5, 1.5].
prob_func = (prob, i, repeat) -> begin
prob_func = (prob, ctx) -> begin
new_λ = 0.5f0 + 1.0f0 * rand(Float32)
remake(prob, p = @SVector [new_λ])
end
Expand Down
6 changes: 3 additions & 3 deletions docs/src/examples/reductions.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ end

prob = ODEProblem(f!, [0.5], (0.0, 1.0))

function output_func(sol, i)
function output_func(sol, ctx)
last(sol), false
end

function prob_func(prob, i, repeat)
remake(prob, u0 = ra[i] * prob.u0)
function prob_func(prob, ctx)
remake(prob, u0 = ra[ctx.sim_id] * prob.u0)
end

function reduction(u, batch, I)
Expand Down
2 changes: 1 addition & 1 deletion docs/src/examples/sde.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ tspan = (0.0f0, 10.0f0)
p = (10.0f0, 28.0f0, 8 / 3.0f0)
prob = SDEProblem(lorenz, multiplicative_noise, u0, tspan, p)
const pre_p = [rand(Float32, 3) for i in 1:10_000]
prob_func = (prob, i, repeat) -> remake(prob, p = pre_p[i] .* p)
prob_func = (prob, ctx) -> remake(prob, p = pre_p[ctx.sim_id] .* p)
monteprob = EnsembleProblem(prob, prob_func = prob_func)
sol = solve(
monteprob, SOSRI(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000,
Expand Down
6 changes: 3 additions & 3 deletions docs/src/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ prob = ODEProblem(f, u0, (0.0f0, 1.0f0)) # Float32 is better on GPUs!
sol = solve(prob, Tsit5())
```

Notice that the solution values `sol[i]` are CUDA-based arrays, which can be moved back
to the CPU using `Array(sol[i])`.
Notice that the solution values `sol.u[i]` are CUDA-based arrays, which can be moved back
to the CPU using `Array(sol.u[i])`.

More details on effective use of within-method GPU parallelism can be found in
[the within-method GPU parallelism tutorial](@ref withingpu).
Expand Down Expand Up @@ -84,7 +84,7 @@ u0 = @SVector [1.0f0; 0.0f0; 0.0f0]
tspan = (0.0f0, 10.0f0)
p = @SVector [10.0f0, 28.0f0, 8 / 3.0f0]
prob = ODEProblem{false}(lorenz, u0, tspan, p)
prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
prob_func = (prob, ctx) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)

sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
Expand Down
4 changes: 2 additions & 2 deletions docs/src/tutorials/gpu_ensemble_basic.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ u0 = Float32[1.0; 0.0; 0.0]
tspan = (0.0f0, 100.0f0)
p = [10.0f0, 28.0f0, 8 / 3.0f0]
prob = ODEProblem(lorenz, u0, tspan, p)
prob_func = (prob, i, repeat) -> remake(prob, p = rand(Float32, 3) .* p)
prob_func = (prob, ctx) -> remake(prob, p = rand(Float32, 3) .* p)
monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
sol = solve(monteprob, Tsit5(), EnsembleThreads(), trajectories = 10_000, saveat = 1.0f0);
```
Expand Down Expand Up @@ -54,7 +54,7 @@ u0 = @SVector [1.0f0; 0.0f0; 0.0f0]
tspan = (0.0f0, 10.0f0)
p = @SVector [10.0f0, 28.0f0, 8 / 3.0f0]
prob = ODEProblem{false}(lorenz2, u0, tspan, p)
prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
prob_func = (prob, ctx) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
trajectories = 10_000,
Expand Down
2 changes: 1 addition & 1 deletion docs/src/tutorials/modelingtoolkit.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ we can build and solve an MTK generated ODE on the GPU using the following:

```@example mtk
using DiffEqGPU, CUDA
function prob_func2(prob, i, repeat)
function prob_func2(prob, ctx)
u0, p = sym_setter(prob, SVector{3}(rand(Float32, 3)))
remake(prob, u0 = u0, p = p)
end
Expand Down
6 changes: 3 additions & 3 deletions docs/src/tutorials/multigpu.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Then set up the calls to work with distributed processes:
tspan = (0.0f0, 100.0f0)
p = [10.0f0, 28.0f0, 8 / 3.0f0]
Random.seed!(1)
function prob_func_distributed(prob, i, repeat)
function prob_func_distributed(prob, ctx)
remake(prob, p = rand(3) .* p)
end
end
Expand Down Expand Up @@ -94,8 +94,8 @@ addprocs(2)
p = [10.0f0, 28.0f0, 8 / 3.0f0]
Random.seed!(1)
pre_p_distributed = [rand(Float32, 3) for i in 1:100_000]
function prob_func_distributed(prob, i, repeat)
remake(prob, p = pre_p_distributed[i] .* p)
function prob_func_distributed(prob, ctx)
remake(prob, p = pre_p_distributed[ctx.sim_id] .* p)
end
end

Expand Down
2 changes: 1 addition & 1 deletion docs/src/tutorials/parallel_callbacks.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ end

u0 = @SVector [10.0f0]
prob = ODEProblem{false}(f, u0, (0.0f0, 10.0f0))
prob_func = (prob, i, repeat) -> remake(prob, p = prob.p)
prob_func = (prob, ctx) -> remake(prob, p = prob.p)
monteprob = EnsembleProblem(prob, safetycopy = false)

condition(u, t, integrator) = t == 4.0f0
Expand Down
15 changes: 15 additions & 0 deletions src/DiffEqGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ include("ensemblegpukernel/tableaus/kvaerno_tableaus.jl")
include("utils.jl")
include("algorithms.jl")
include("solve.jl")
# Re-export the ensemble interface from SciMLBase. Pre-OrdinaryDiffEq v7,
# `using OrdinaryDiffEq` transitively exposed these via `@reexport using
# SciMLBase` in DiffEqBase. v7 dropped that umbrella re-export, so users of
# `using DiffEqGPU` would otherwise have to add `using SciMLBase` just to
# build an `EnsembleProblem` to pass to DiffEqGPU's own ensemble algorithms.
export EnsembleProblem, EnsembleSolution, EnsembleSerial, EnsembleThreads,
EnsembleDistributed

# Re-export DAE init algorithms. v7's default is `CheckInit` (validate-only),
# which doesn't work for OOP `SVector` mass-matrix problems. Users solving
# such DAEs alongside DiffEqGPU's GPU ensemble paths need easy access to the
# pre-v7 default `BrownFullBasicInit` (auto-fix) without an extra `using
# DiffEqBase`.
export BrownFullBasicInit, CheckInit

export EnsembleCPUArray, EnsembleGPUArray, EnsembleGPUKernel, LinSolveGPUSplitFactorize

export GPUTsit5, GPUVern7, GPUVern9, GPUEM, GPUSIEA
Expand Down
29 changes: 16 additions & 13 deletions src/ensemblegpuarray/callbacks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,33 @@ function generate_callback(callback::ContinuousCallback, I, ensemblealg)
return nothing
end

affect! = function (integrator, event_idx)
# DiffEqBase v7's `apply_callback!` for `VectorContinuousCallback` invokes
# `callback.affect!(integrator, simultaneous_events::Vector{Int8})` once per
# step. Each entry of the mask is 0 (no trigger), -1 (upcrossing) or
# +1 (downcrossing) — see OrdinaryDiffEq v7 NEWS.md, "Breaking:
# VectorContinuousCallback affect! signature changed". We copy the host
# mask to a backend-native array, dispatch one GPU thread per trajectory,
# and route up/down crossings to the user's original `affect!` /
# `affect_neg!`. v7 no longer dispatches to `VectorContinuousCallback`'s
# `affect_neg!` field, so we don't supply one.
affect! = function (integrator, simultaneous_events::AbstractVector)
version = get_backend(integrator.u)
wgs = workgroupsize(version, size(integrator.u, 2))
return continuous_affect!_kernel(version)(
_affect!, event_idx, integrator.u,
integrator.t, integrator.p;
ndrange = size(integrator.u, 2),
workgroupsize = wgs
se_device = similar(
integrator.u, eltype(simultaneous_events),
length(simultaneous_events)
)
end

affect_neg! = function (integrator, event_idx)
version = get_backend(integrator.u)
wgs = workgroupsize(version, size(integrator.u, 2))
copyto!(se_device, simultaneous_events)
return continuous_affect!_kernel(version)(
_affect_neg!, event_idx, integrator.u,
_affect!, _affect_neg!, se_device, integrator.u,
integrator.t, integrator.p;
ndrange = size(integrator.u, 2),
workgroupsize = wgs
)
end

return VectorContinuousCallback(
condition, affect!, affect_neg!, I,
condition, affect!, I,
save_positions = callback.save_positions
)
end
Expand Down
17 changes: 15 additions & 2 deletions src/ensemblegpuarray/kernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,22 @@ end
@views @inbounds out[i] = condition(u[:, i], t, FakeIntegrator(u[:, i], t, p[:, i]))
end

@kernel function continuous_affect!_kernel(affect!, event_idx, u, t, p)
for i in event_idx
# v7: per-trajectory dispatch driven by the `simultaneous_events::Vector{Int8}`
# mask DiffEqBase passes to `VectorContinuousCallback`'s `affect!`. -1 marks an
# upcrossing → call the user's `affect!`; +1 marks a downcrossing → call
# `affect_neg!`. (See OrdinaryDiffEq v7 NEWS.md, "Breaking:
# VectorContinuousCallback affect! signature changed".) Iterating one thread
# per trajectory replaces the v6 `for i in event_idx` loop, which only worked
# because v6 passed the index of the *first* triggering trajectory as a scalar.
@kernel function continuous_affect!_kernel(
affect!, affect_neg!, simultaneous_events, u, t, p
)
i = @index(Global, Linear)
@inbounds s = simultaneous_events[i]
if s == Int8(-1)
@views @inbounds affect!(FakeIntegrator(u[:, i], t, p[:, i]))
elseif s == Int8(1)
@views @inbounds affect_neg!(FakeIntegrator(u[:, i], t, p[:, i]))
end
end

Expand Down
2 changes: 1 addition & 1 deletion src/solve.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
function SciMLBase.__solve(
ensembleprob::SciMLBase.AbstractEnsembleProblem,
alg::Union{
SciMLBase.DEAlgorithm, Nothing,
SciMLBase.AbstractDEAlgorithm, Nothing,
DiffEqGPU.GPUODEAlgorithm, DiffEqGPU.GPUSDEAlgorithm,
},
ensemblealg::Union{
Expand Down
16 changes: 11 additions & 5 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqRosenbrock = "43230ef6-c299-4910-a778-202eb28ce4ce"
OrdinaryDiffEqSDIRK = "2d112036-d095-4a1e-ab9a-08536f3ecdbf"
OrdinaryDiffEqStabilizedRK = "358294b1-0aab-51c3-aafe-ad5ab194a2ad"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Expand All @@ -31,8 +34,8 @@ pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
[compat]
Adapt = "4"
BenchmarkTools = "1"
CUDA = "5"
DiffEqDevTools = "2"
CUDA = "5, 6"
DiffEqDevTools = "2, 3"
DiffEqGPU = "3"
ForwardDiff = "0.10.38, 1"
GPUArraysCore = "0.2"
Expand All @@ -42,12 +45,15 @@ ModelingToolkit = "11.17.0"
OpenCL = "0.9, 0.10"
Optimization = "4, 5"
OptimizationOptimisers = "0.3"
OrdinaryDiffEq = "6"
OrdinaryDiffEq = "6, 7"
OrdinaryDiffEqRosenbrock = "1, 2"
OrdinaryDiffEqSDIRK = "1, 2"
OrdinaryDiffEqStabilizedRK = "1, 2"
SafeTestsets = "0.1"
SciMLBase = "2.144"
SciMLBase = "2.144, 3"
StaticArrays = "1.9"
Statistics = "1"
StochasticDiffEq = "6"
StochasticDiffEq = "6, 7"
TOML = "1"
Zygote = "0.7.3"
pocl_jll = "6, 7"
1 change: 1 addition & 0 deletions test/distributed_multi_gpu.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using Distributed
addprocs(2)
@everywhere using DiffEqGPU, OrdinaryDiffEq, Test, Random
@everywhere using OrdinaryDiffEqStabilizedRK: ROCK4 # v7 narrowed umbrella exports
@everywhere include("utils.jl")

@everywhere begin
Expand Down
5 changes: 5 additions & 0 deletions test/ensemblegpuarray.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
using DiffEqGPU, OrdinaryDiffEq, Test
# v7's umbrella narrowed its solver re-exports; pull every non-default solver
# this file references from its sublibrary explicitly.
using OrdinaryDiffEqRosenbrock: Rodas5
using OrdinaryDiffEqSDIRK: TRBDF2
using OrdinaryDiffEqStabilizedRK: ROCK4

include("utils.jl")

Expand Down
3 changes: 3 additions & 0 deletions test/ensemblegpuarray_inputtypes.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# OrdinaryDiffEq v7 no longer re-exports SciMLBase, so pull SciMLBase in
# directly — this test references `SciMLBase.FullSpecialize` on line 17.
using OrdinaryDiffEq, DiffEqGPU, ForwardDiff, Test
using SciMLBase

include("utils.jl")

Expand Down
1 change: 1 addition & 0 deletions test/ensemblegpuarray_oop.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using DiffEqGPU, OrdinaryDiffEq, StaticArrays
using OrdinaryDiffEqSDIRK: TRBDF2 # not in OrdinaryDiffEq v7 umbrella exports

include("utils.jl")

Expand Down
4 changes: 4 additions & 0 deletions test/gpu_kernel_de/stiff_ode/gpu_ode_discrete_callbacks.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
using DiffEqGPU, OrdinaryDiffEq, StaticArrays, LinearAlgebra
# OrdinaryDiffEq v7's umbrella only re-exports Rosenbrock23 and Rodas5P from
# the Rosenbrock family (see NEWS.md "Package scope reduction"). Pull Rodas4
# from its sublibrary explicitly.
using OrdinaryDiffEqRosenbrock: Rodas4
@info "Callbacks"

include("../../utils.jl")
Expand Down
13 changes: 12 additions & 1 deletion test/gpu_kernel_de/stiff_ode/gpu_ode_mass_matrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,18 @@ monteprob = EnsembleProblem(prob, safetycopy = false)

alg = GPURosenbrock23()

bench_sol = solve(prob, Rosenbrock23(), dt = 0.1, abstol = 1.0f-5, reltol = 1.0f-5)
# OrdinaryDiffEq v7 changed the default DAE initialization from
# `BrownFullBasicInit` (auto-fix) to `CheckInit` (validate-only). SciMLBase's
# OOP `CheckInit` then calls `tmp .= …` on the f-evaluation result, but for
# an out-of-place `SVector` problem that result is itself an `SVector`, so
# the in-place broadcast errors with `setindex!(::SVector, …)`. Pass the
# pre-v7 default explicitly to restore the auto-fix behaviour for the bench
# solve. See OrdinaryDiffEq v7 NEWS.md, "Default DAE initialization changed
# to CheckInit".
bench_sol = solve(
prob, Rosenbrock23(), dt = 0.1, abstol = 1.0f-5, reltol = 1.0f-5,
initializealg = BrownFullBasicInit()
)

sol = solve(
monteprob, alg, EnsembleGPUKernel(backend),
Expand Down
Loading
Loading