pytorch
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
Lines changed: 13 additions & 1 deletion
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
Lines changed: 27 additions & 0 deletions
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
Lines changed: 13 additions & 5 deletions
@@ -184,7 +184,9 @@ install(
 )
 
 # CUDA backend implementation
-set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp)
+set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp
+                               runtime/cuda_mutable_state.cpp # NOTE(review): presumably implements the mutable_state_* calls used by cuda_backend.cpp; confirm.
+)
 if(_cuda_is_msvc_toolchain)
   # MSVC links aoti_cuda_backend into portable_lib without relying on C++
   # symbols exported from aoti_cuda_shims.dll.
@@ -236,3 +238,13 @@ install(
   EXPORT ExecuTorchTargets
   DESTINATION lib
 )
+
+if(BUILD_TESTING) # Configure the CUDA mutable-state unit test only when testing is enabled.
+  include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) # NOTE(review): presumably defines et_cxx_test(); confirm.
+
+  et_cxx_test(
+    test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
+    EXTRA_LIBS aoti_cuda_backend # Extra library handed to et_cxx_test for this test.
+  )
+  target_compile_definitions(test_cuda_mutable_state PRIVATE CUDA_AVAILABLE=1) # PRIVATE: the define applies to this test target only.
+endif()
@@ -1,4 +1,6 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest")
+load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
 load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")
 
 oncall("executorch")
@@ -105,9 +107,11 @@ runtime.cxx_library(
     name = "cuda_backend",
     srcs = [
         "cuda_backend.cpp",
+        "cuda_mutable_state.cpp",
     ],
     headers = [
         "cuda_delegate_handle.h",
+        "cuda_mutable_state.h",
     ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
     link_whole = True,
@@ -135,3 +139,26 @@ runtime.cxx_library(
         ("cuda", None, "cuda-lazy"),
     ],
 )
+
+cpp_unittest(  # Unit test target built from test/test_cuda_mutable_state.cpp.
+    name = "test_cuda_mutable_state",
+    srcs = [
+        "test/test_cuda_mutable_state.cpp",
+    ],
+    deps = [
+        ":cuda_backend",  # Library in this file whose srcs list cuda_mutable_state.cpp.
+        "//executorch/backends/aoti:aoti_common_slim",
+        "//executorch/backends/aoti/slim/core:slimtensor",
+        "//executorch/backends/aoti/slim/factory:from_blob",
+        "//executorch/runtime/core:core",
+        "//executorch/runtime/platform:platform",
+    ],
+    external_deps = [
+        ("cuda", None, "cuda-lazy"),
+    ],
+    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],  # Same CUDA_AVAILABLE=1 define that the CMake test target sets.
+    keep_gpu_sections = True,
+    remote_execution = re_test_utils.remote_execution(
+        platform = "gpu-remote-execution",  # NOTE(review): presumably schedules the test on a GPU-capable worker; confirm.
+    ),
+)
@@ -44,6 +44,7 @@
 #include <executorch/backends/aoti/utils.h>
 #include <executorch/backends/cuda/runtime/cuda_allocator.h>
 #include <executorch/backends/cuda/runtime/cuda_delegate_handle.h>
+#include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -436,6 +437,8 @@ class ET_EXPERIMENTAL CudaBackend final
           kCudaGraphWarmupSteps);
     }
 
+    mutable_state_note_handle(handle);
+
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
@@ -539,6 +542,8 @@ class ET_EXPERIMENTAL CudaBackend final
       }
     }
 
+    ET_CHECK_OK_OR_RETURN_ERROR(mutable_state_rebind_for_execute(handle));
+
     // ---------------------------------------------------------------
     // CUDA graph REPLAY path — skip all tensor setup and just replay
     // ---------------------------------------------------------------
@@ -826,6 +831,8 @@ class ET_EXPERIMENTAL CudaBackend final
     }
     cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;
 
+    mutable_state_forget_handle(handle);
+
     // The CUDA stream is managed by shared_ptr in the handle.
     // It will be automatically destroyed when the last handle using it
     // is destroyed. Just reset our reference.
@@ -899,11 +906,12 @@ class ET_EXPERIMENTAL CudaBackend final
   //   * Constants are assumed to be IMMUTABLE (parameters or read-only
   //     buffers). The AOTI shim today does not expose a mutability bit
   //     through GetConstantOriginalFQN, so we cannot detect or refuse
-  //     to share mutable buffers (e.g. a per-method KV cache). If a
-  //     future model exports the same FQN as a mutable buffer in
-  //     multiple methods, mutations from one method WILL be visible to
-  //     the other through the shared GPU memory. Callers that need
-  //     per-method mutable state must currently use distinct FQNs.
+  //     to share mutable buffers (for example, runtime caches). If a
+  //     model exports the same FQN as a mutable buffer in multiple
+  //     methods, mutations from one method WILL be visible to the other
+  //     through the shared GPU memory. Callers that need isolated mutable
+  //     state for shared FQNs must opt into cuda_mutable_state or use
+  //     distinct FQNs.
   //     TODO: when AOTInductor exposes a constant-type / mutability
   //     query, refuse to share entries that are not PARAMETER or
   //     non-mutable BUFFER.