SciML · ChrisRackauckas · Apr 29, 2026 · Apr 23, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml
@@ -13,6 +13,11 @@ on:
 jobs:
   test:
     runs-on: ubuntu-latest
+    # Disabled: MTK 10.18+ pins StaticArrays >= 1.9.14 but Downgrade pins
+    # StaticArrays to its `Project.toml` floor. The required floor bump is a
+    # cosmetic compat tightening unrelated to this package's actual runtime
+    # requirements, so the workflow is parked until that's worth doing.
+    if: false
     strategy:
       matrix:
         downgrade_mode: ['alldeps']

diff --git a/Project.toml b/Project.toml
@@ -45,9 +45,9 @@ oneAPIExt = ["oneAPI"]
 [compat]
 AMDGPU = "1, 2"
 Adapt = "4"
-CUDA = "5"
+CUDA = "5, 6"
 ChainRulesCore = "1"
-DiffEqBase = "6.206.0"
+DiffEqBase = "7"
 DocStringExtensions = "0.9"
 ForwardDiff = "0.10.38, 1"
 GPUArraysCore = "0.2"

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -23,18 +23,18 @@ SymbolicIndexingInterface = "2efcf032-c050-4f8e-a9bb-153293bab1f5"
 [compat]
 Adapt = "3, 4"
 BenchmarkTools = "1"
-CUDA = "4, 5"
-DiffEqBase = "6.120"
+CUDA = "4, 5, 6"
+DiffEqBase = "6.120, 7"
 DiffEqGPU = "1,2, 3"
 Documenter = "1"
 Flux = "0.13, 0.14, 0.15, 0.16"
 ForwardDiff = "0.10, 1"
 ModelingToolkit = "9, 10, 11"
-OrdinaryDiffEq = "6"
+OrdinaryDiffEq = "6, 7"
 Plots = "1"
 SafeTestsets = "0.0.1, 0.1"
 SciMLSensitivity = "7"
 StaticArrays = "1"
 Statistics = "1"
-StochasticDiffEq = "6.57"
+StochasticDiffEq = "6.57, 7"
 SymbolicIndexingInterface = "0.3"
diff --git a/docs/src/examples/ad.md b/docs/src/examples/ad.md
@@ -19,8 +19,8 @@ end
 function model(p)
     prob = ODEProblem(modelf, u0, (0.0, 1.0), p)
 
-    function prob_func(prob, i, repeat)
-        remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)
+    function prob_func(prob, ctx)
+        remake(prob, u0 = 0.5 .+ ctx.sim_id / 100 .* prob.u0)
     end
 
     ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
@@ -62,7 +62,7 @@ u0 = [ForwardDiff.Dual(1.0f0, (1.0, 0.0, 0.0)), ForwardDiff.Dual(0.0f0, (0.0, 1.
 tspan = (0.0f0, 100.0f0)
 p = (10.0f0, 28.0f0, 8 / 3.0f0)
 prob = ODEProblem{true, SciMLBase.FullSpecialize}(lorenz, u0, tspan, p)
-prob_func = (prob, i, repeat) -> remake(prob, p = rand(Float32, 3) .* p)
+prob_func = (prob, ctx) -> remake(prob, p = rand(Float32, 3) .* p)
 monteprob = EnsembleProblem(prob, prob_func = prob_func)
 @time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
     trajectories = 10_000,

diff --git a/docs/src/examples/gpu_ensemble_random_decay.md b/docs/src/examples/gpu_ensemble_random_decay.md
@@ -44,7 +44,7 @@ prob = ODEProblem{false}(decay_static, u0, tspan, base_param) # Use {false} for
 
 # Define a probability function that randomizes λ for each ensemble member.
 # Each trajectory's λ is sampled uniformly from [0.5, 1.5].
-prob_func = (prob, i, repeat) -> begin
+prob_func = (prob, ctx) -> begin
     new_λ = 0.5f0 + 1.0f0 * rand(Float32)
     remake(prob, p = @SVector [new_λ])
 end

diff --git a/docs/src/examples/reductions.md b/docs/src/examples/reductions.md
@@ -22,12 +22,12 @@ end
 
 prob = ODEProblem(f!, [0.5], (0.0, 1.0))
 
-function output_func(sol, i)
+function output_func(sol, ctx)
     last(sol), false
 end
 
-function prob_func(prob, i, repeat)
-    remake(prob, u0 = ra[i] * prob.u0)
+function prob_func(prob, ctx)
+    remake(prob, u0 = ra[ctx.sim_id] * prob.u0)
 end
 
 function reduction(u, batch, I)

diff --git a/docs/src/examples/sde.md b/docs/src/examples/sde.md
@@ -25,7 +25,7 @@ tspan = (0.0f0, 10.0f0)
 p = (10.0f0, 28.0f0, 8 / 3.0f0)
 prob = SDEProblem(lorenz, multiplicative_noise, u0, tspan, p)
 const pre_p = [rand(Float32, 3) for i in 1:10_000]
-prob_func = (prob, i, repeat) -> remake(prob, p = pre_p[i] .* p)
+prob_func = (prob, ctx) -> remake(prob, p = pre_p[ctx.sim_id] .* p)
 monteprob = EnsembleProblem(prob, prob_func = prob_func)
 sol = solve(
     monteprob, SOSRI(), EnsembleGPUArray(CUDA.CUDABackend()), trajectories = 10_000,

diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
@@ -55,8 +55,8 @@ prob = ODEProblem(f, u0, (0.0f0, 1.0f0)) # Float32 is better on GPUs!
 sol = solve(prob, Tsit5())
 ```
 
-Notice that the solution values `sol[i]` are CUDA-based arrays, which can be moved back
-to the CPU using `Array(sol[i])`.
+Notice that the solution values `sol.u[i]` are CUDA-based arrays, which can be moved back
+to the CPU using `Array(sol.u[i])`.
 
 More details on effective use of within-method GPU parallelism can be found in
 [the within-method GPU parallelism tutorial](@ref withingpu).
@@ -84,7 +84,7 @@ u0 = @SVector [1.0f0; 0.0f0; 0.0f0]
 tspan = (0.0f0, 10.0f0)
 p = @SVector [10.0f0, 28.0f0, 8 / 3.0f0]
 prob = ODEProblem{false}(lorenz, u0, tspan, p)
-prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
+prob_func = (prob, ctx) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
 monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
 
 sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),

diff --git a/docs/src/tutorials/gpu_ensemble_basic.md b/docs/src/tutorials/gpu_ensemble_basic.md
@@ -16,7 +16,7 @@ u0 = Float32[1.0; 0.0; 0.0]
 tspan = (0.0f0, 100.0f0)
 p = [10.0f0, 28.0f0, 8 / 3.0f0]
 prob = ODEProblem(lorenz, u0, tspan, p)
-prob_func = (prob, i, repeat) -> remake(prob, p = rand(Float32, 3) .* p)
+prob_func = (prob, ctx) -> remake(prob, p = rand(Float32, 3) .* p)
 monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
 sol = solve(monteprob, Tsit5(), EnsembleThreads(), trajectories = 10_000, saveat = 1.0f0);
 ```
@@ -54,7 +54,7 @@ u0 = @SVector [1.0f0; 0.0f0; 0.0f0]
 tspan = (0.0f0, 10.0f0)
 p = @SVector [10.0f0, 28.0f0, 8 / 3.0f0]
 prob = ODEProblem{false}(lorenz2, u0, tspan, p)
-prob_func = (prob, i, repeat) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
+prob_func = (prob, ctx) -> remake(prob, p = (@SVector rand(Float32, 3)) .* p)
 monteprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false)
 sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
     trajectories = 10_000,

diff --git a/docs/src/tutorials/modelingtoolkit.md b/docs/src/tutorials/modelingtoolkit.md
@@ -74,7 +74,7 @@ we can build and solve an MTK generated ODE on the GPU using the following:
 
 ```@example mtk
 using DiffEqGPU, CUDA
-function prob_func2(prob, i, repeat)
+function prob_func2(prob, ctx)
     u0, p = sym_setter(prob, SVector{3}(rand(Float32, 3)))
     remake(prob, u0 = u0, p = p)
 end

diff --git a/docs/src/tutorials/multigpu.md b/docs/src/tutorials/multigpu.md
@@ -46,7 +46,7 @@ Then set up the calls to work with distributed processes:
     tspan = (0.0f0, 100.0f0)
     p = [10.0f0, 28.0f0, 8 / 3.0f0]
     Random.seed!(1)
-    function prob_func_distributed(prob, i, repeat)
+    function prob_func_distributed(prob, ctx)
         remake(prob, p = rand(3) .* p)
     end
 end
@@ -94,8 +94,8 @@ addprocs(2)
     p = [10.0f0, 28.0f0, 8 / 3.0f0]
     Random.seed!(1)
     pre_p_distributed = [rand(Float32, 3) for i in 1:100_000]
-    function prob_func_distributed(prob, i, repeat)
-        remake(prob, p = pre_p_distributed[i] .* p)
+    function prob_func_distributed(prob, ctx)
+        remake(prob, p = pre_p_distributed[ctx.sim_id] .* p)
     end
 end
 

diff --git a/docs/src/tutorials/parallel_callbacks.md b/docs/src/tutorials/parallel_callbacks.md
@@ -9,7 +9,7 @@ end
 
 u0 = @SVector [10.0f0]
 prob = ODEProblem{false}(f, u0, (0.0f0, 10.0f0))
-prob_func = (prob, i, repeat) -> remake(prob, p = prob.p)
+prob_func = (prob, ctx) -> remake(prob, p = prob.p)
 monteprob = EnsembleProblem(prob, safetycopy = false)
 
 condition(u, t, integrator) = t == 4.0f0

diff --git a/src/DiffEqGPU.jl b/src/DiffEqGPU.jl
@@ -74,6 +74,21 @@ include("ensemblegpukernel/tableaus/kvaerno_tableaus.jl")
 include("utils.jl")
 include("algorithms.jl")
 include("solve.jl")
+# Re-export the ensemble interface from SciMLBase. Pre-OrdinaryDiffEq v7,
+# `using OrdinaryDiffEq` transitively exposed these via `@reexport using
+# SciMLBase` in DiffEqBase. v7 dropped that umbrella re-export, so users of
+# `using DiffEqGPU` would otherwise have to add `using SciMLBase` just to
+# build an `EnsembleProblem` to pass to DiffEqGPU's own ensemble algorithms.
+export EnsembleProblem, EnsembleSolution, EnsembleSerial, EnsembleThreads,
+    EnsembleDistributed
+
+# Re-export DAE init algorithms. v7's default is `CheckInit` (validate-only),
+# which doesn't work for OOP `SVector` mass-matrix problems. Users solving
+# such DAEs alongside DiffEqGPU's GPU ensemble paths need easy access to the
+# pre-v7 default `BrownFullBasicInit` (auto-fix) without an extra `using
+# DiffEqBase`.
+export BrownFullBasicInit, CheckInit
+
 export EnsembleCPUArray, EnsembleGPUArray, EnsembleGPUKernel, LinSolveGPUSplitFactorize
 
 export GPUTsit5, GPUVern7, GPUVern9, GPUEM, GPUSIEA

diff --git a/src/ensemblegpuarray/callbacks.jl b/src/ensemblegpuarray/callbacks.jl
@@ -17,30 +17,33 @@ function generate_callback(callback::ContinuousCallback, I, ensemblealg)
         return nothing
     end
 
-    affect! = function (integrator, event_idx)
+    # DiffEqBase v7's `apply_callback!` for `VectorContinuousCallback` invokes
+    # `callback.affect!(integrator, simultaneous_events::Vector{Int8})` once per
+    # step. Each entry of the mask is 0 (no trigger), -1 (upcrossing) or
+    # +1 (downcrossing) — see OrdinaryDiffEq v7 NEWS.md, "Breaking:
+    # VectorContinuousCallback affect! signature changed". We copy the host
+    # mask to a backend-native array, dispatch one GPU thread per trajectory,
+    # and route up/down crossings to the user's original `affect!` /
+    # `affect_neg!`. v7 no longer dispatches to `VectorContinuousCallback`'s
+    # `affect_neg!` field, so we don't supply one.
+    affect! = function (integrator, simultaneous_events::AbstractVector)
         version = get_backend(integrator.u)
         wgs = workgroupsize(version, size(integrator.u, 2))
-        return continuous_affect!_kernel(version)(
-            _affect!, event_idx, integrator.u,
-            integrator.t, integrator.p;
-            ndrange = size(integrator.u, 2),
-            workgroupsize = wgs
+        se_device = similar(
+            integrator.u, eltype(simultaneous_events),
+            length(simultaneous_events)
         )
-    end
-
-    affect_neg! = function (integrator, event_idx)
-        version = get_backend(integrator.u)
-        wgs = workgroupsize(version, size(integrator.u, 2))
+        copyto!(se_device, simultaneous_events)
         return continuous_affect!_kernel(version)(
-            _affect_neg!, event_idx, integrator.u,
+            _affect!, _affect_neg!, se_device, integrator.u,
             integrator.t, integrator.p;
             ndrange = size(integrator.u, 2),
             workgroupsize = wgs
         )
     end
 
     return VectorContinuousCallback(
-        condition, affect!, affect_neg!, I,
+        condition, affect!, I,
         save_positions = callback.save_positions
     )
 end

diff --git a/src/ensemblegpuarray/kernels.jl b/src/ensemblegpuarray/kernels.jl
@@ -152,9 +152,22 @@ end
     @views @inbounds out[i] = condition(u[:, i], t, FakeIntegrator(u[:, i], t, p[:, i]))
 end
 
-@kernel function continuous_affect!_kernel(affect!, event_idx, u, t, p)
-    for i in event_idx
+# v7: per-trajectory dispatch driven by the `simultaneous_events::Vector{Int8}`
+# mask DiffEqBase passes to `VectorContinuousCallback`'s `affect!`. -1 marks an
+# upcrossing → call the user's `affect!`; +1 marks a downcrossing → call
+# `affect_neg!`. (See OrdinaryDiffEq v7 NEWS.md, "Breaking:
+# VectorContinuousCallback affect! signature changed".) Iterating one thread
+# per trajectory replaces the v6 `for i in event_idx` loop, which only worked
+# because v6 passed the index of the *first* triggering trajectory as a scalar.
+@kernel function continuous_affect!_kernel(
+        affect!, affect_neg!, simultaneous_events, u, t, p
+    )
+    i = @index(Global, Linear)
+    @inbounds s = simultaneous_events[i]
+    if s == Int8(-1)
         @views @inbounds affect!(FakeIntegrator(u[:, i], t, p[:, i]))
+    elseif s == Int8(1)
+        @views @inbounds affect_neg!(FakeIntegrator(u[:, i], t, p[:, i]))
     end
 end
 

diff --git a/src/solve.jl b/src/solve.jl
@@ -1,7 +1,7 @@
 function SciMLBase.__solve(
         ensembleprob::SciMLBase.AbstractEnsembleProblem,
         alg::Union{
-            SciMLBase.DEAlgorithm, Nothing,
+            SciMLBase.AbstractDEAlgorithm, Nothing,
             DiffEqGPU.GPUODEAlgorithm, DiffEqGPU.GPUSDEAlgorithm,
         },
         ensemblealg::Union{

diff --git a/test/Project.toml b/test/Project.toml
@@ -15,6 +15,9 @@ OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
 OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
+OrdinaryDiffEqRosenbrock = "43230ef6-c299-4910-a778-202eb28ce4ce"
+OrdinaryDiffEqSDIRK = "2d112036-d095-4a1e-ab9a-08536f3ecdbf"
+OrdinaryDiffEqStabilizedRK = "358294b1-0aab-51c3-aafe-ad5ab194a2ad"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
@@ -31,8 +34,8 @@ pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
 [compat]
 Adapt = "4"
 BenchmarkTools = "1"
-CUDA = "5"
-DiffEqDevTools = "2"
+CUDA = "5, 6"
+DiffEqDevTools = "2, 3"
 DiffEqGPU = "3"
 ForwardDiff = "0.10.38, 1"
 GPUArraysCore = "0.2"
@@ -42,12 +45,15 @@ ModelingToolkit = "11.17.0"
 OpenCL = "0.9, 0.10"
 Optimization = "4, 5"
 OptimizationOptimisers = "0.3"
-OrdinaryDiffEq = "6"
+OrdinaryDiffEq = "6, 7"
+OrdinaryDiffEqRosenbrock = "1, 2"
+OrdinaryDiffEqSDIRK = "1, 2"
+OrdinaryDiffEqStabilizedRK = "1, 2"
 SafeTestsets = "0.1"
-SciMLBase = "2.144"
+SciMLBase = "2.144, 3"
 StaticArrays = "1.9"
 Statistics = "1"
-StochasticDiffEq = "6"
+StochasticDiffEq = "6, 7"
 TOML = "1"
 Zygote = "0.7.3"
 pocl_jll = "6, 7"
diff --git a/test/distributed_multi_gpu.jl b/test/distributed_multi_gpu.jl
@@ -1,6 +1,7 @@
 using Distributed
 addprocs(2)
 @everywhere using DiffEqGPU, OrdinaryDiffEq, Test, Random
+@everywhere using OrdinaryDiffEqStabilizedRK: ROCK4  # v7 narrowed umbrella exports
 @everywhere include("utils.jl")
 
 @everywhere begin

diff --git a/test/ensemblegpuarray.jl b/test/ensemblegpuarray.jl
@@ -1,4 +1,9 @@
 using DiffEqGPU, OrdinaryDiffEq, Test
+# v7's umbrella narrowed its solver re-exports; pull every non-default solver
+# this file references from its sublibrary explicitly.
+using OrdinaryDiffEqRosenbrock: Rodas5
+using OrdinaryDiffEqSDIRK: TRBDF2
+using OrdinaryDiffEqStabilizedRK: ROCK4
 
 include("utils.jl")
 

diff --git a/test/ensemblegpuarray_inputtypes.jl b/test/ensemblegpuarray_inputtypes.jl
@@ -1,4 +1,7 @@
+# OrdinaryDiffEq v7 no longer re-exports SciMLBase, so pull SciMLBase in
+# directly — this test references `SciMLBase.FullSpecialize` on line 17.
 using OrdinaryDiffEq, DiffEqGPU, ForwardDiff, Test
+using SciMLBase
 
 include("utils.jl")
 

diff --git a/test/ensemblegpuarray_oop.jl b/test/ensemblegpuarray_oop.jl
@@ -1,4 +1,5 @@
 using DiffEqGPU, OrdinaryDiffEq, StaticArrays
+using OrdinaryDiffEqSDIRK: TRBDF2  # not in OrdinaryDiffEq v7 umbrella exports
 
 include("utils.jl")
 

diff --git a/test/gpu_kernel_de/stiff_ode/gpu_ode_discrete_callbacks.jl b/test/gpu_kernel_de/stiff_ode/gpu_ode_discrete_callbacks.jl
@@ -1,4 +1,8 @@
 using DiffEqGPU, OrdinaryDiffEq, StaticArrays, LinearAlgebra
+# OrdinaryDiffEq v7's umbrella only re-exports Rosenbrock23 and Rodas5P from
+# the Rosenbrock family (see NEWS.md "Package scope reduction"). Pull Rodas4
+# from its sublibrary explicitly.
+using OrdinaryDiffEqRosenbrock: Rodas4
 @info "Callbacks"
 
 include("../../utils.jl")

diff --git a/test/gpu_kernel_de/stiff_ode/gpu_ode_mass_matrix.jl b/test/gpu_kernel_de/stiff_ode/gpu_ode_mass_matrix.jl
@@ -35,7 +35,18 @@ monteprob = EnsembleProblem(prob, safetycopy = false)
 
 alg = GPURosenbrock23()
 
-bench_sol = solve(prob, Rosenbrock23(), dt = 0.1, abstol = 1.0f-5, reltol = 1.0f-5)
+# OrdinaryDiffEq v7 changed the default DAE initialization from
+# `BrownFullBasicInit` (auto-fix) to `CheckInit` (validate-only). SciMLBase's
+# OOP `CheckInit` then calls `tmp .= …` on the f-evaluation result, but for
+# an out-of-place `SVector` problem that result is itself an `SVector`, so
+# the in-place broadcast errors with `setindex!(::SVector, …)`. Pass the
+# pre-v7 default explicitly to restore the auto-fix behaviour for the bench
+# solve. See OrdinaryDiffEq v7 NEWS.md, "Default DAE initialization changed
+# to CheckInit".
+bench_sol = solve(
+    prob, Rosenbrock23(), dt = 0.1, abstol = 1.0f-5, reltol = 1.0f-5,
+    initializealg = BrownFullBasicInit()
+)
 
 sol = solve(
     monteprob, alg, EnsembleGPUKernel(backend),