diff --git a/src/compute/attention_cublas.cu b/src/compute/attention_cublas.cu index 22fa572..d37081c 100644 --- a/src/compute/attention_cublas.cu +++ b/src/compute/attention_cublas.cu @@ -34,6 +34,13 @@ static cublasHandle_t get_attn_cublas_handle() { return handle; } +void attention_cublas_prewarm() { + // Force lazy-init of the static cuBLAS handle. The first + // cublasGemmBatchedEx call after this is guaranteed not to trigger + // cublasCreate's internal cudaMalloc (illegal under stream capture). + (void)get_attn_cublas_handle(); +} + // --------------------------------------------------------------------------- // Fused causal softmax FP32 → FP16: reads FP32 S matrix, writes FP16 probs // to a separate output buffer. Replaces causal_softmax_fp32_inplace_kernel diff --git a/src/compute/attention_cublas.h b/src/compute/attention_cublas.h index 938c24e..7c5bf12 100644 --- a/src/compute/attention_cublas.h +++ b/src/compute/attention_cublas.h @@ -20,4 +20,10 @@ void attention_cublas_prefill(const Tensor& Q, const Tensor& K, const Tensor& V, int n_heads, int n_kv_heads, int head_dim, float scale, bool causal, float softcap = 0.0f, int q_offset = 0, cudaStream_t stream = nullptr); +// Force-create the static cuBLAS handle. Safe to call multiple times. +// Engine init calls this so the first attention_cublas_prefill invocation +// inside a captured stream can reuse the handle without cublasCreate +// (which does internal cudaMalloc for workspace and is illegal under capture). 
+void attention_cublas_prewarm(); + } // namespace imp diff --git a/src/runtime/engine.cpp b/src/runtime/engine.cpp index 53da479..42fc284 100644 --- a/src/runtime/engine.cpp +++ b/src/runtime/engine.cpp @@ -7,6 +7,7 @@ #include "model/gguf_loader.h" #include "model/chat_template.h" #include "compute/gemm.h" +#include "compute/attention_cublas.h" #include "compute/gemm_grouped.h" #include "compute/sampling.h" #include "compute/attention.h" @@ -963,6 +964,7 @@ bool Engine::init(std::shared_ptr<Model> model, const EngineConfig& config) { return false; } gemm_init(); + attention_cublas_prewarm(); scheduler_ = std::make_unique<Scheduler>(config_.max_batch_size); (void)stream_.create(cudaStreamNonBlocking); @@ -2233,7 +2235,32 @@ void Engine::step_prefill_one(std::shared_ptr<Request>& req, int effective_chunk executor_->use_workspace(0); } Tensor logits_out; - executor_->forward_logits(state, logits_out, pf_stream); + + // Prefill graph capture (opt-in, Phase 4 of MoE-prefill-graphs work). + // Conditions: env-gated, pool path (stable device buffers), and + // chunk shape stable (in practice all non-last chunks share chunk_len + // = prefill_chunk_size). H2D upload happened above on pf_stream + // *before* this wrapper — captured region is forward_logits only, + // analogous to the decode graph pattern. 
+ static const bool prefill_graph_enabled = (std::getenv("IMP_PREFILL_GRAPH") != nullptr); + const bool can_capture = prefill_graph_enabled && pf_pool_used && config_.use_cuda_graphs; + if (can_capture) { + const int block_count = static_cast<int>(block_table.size()); + if (chunk_len != last_prefill_chunk_len_ || block_count != last_prefill_block_count_) { + prefill_graph_runner_.invalidate_for_update(); + last_prefill_chunk_len_ = chunk_len; + last_prefill_block_count_ = block_count; + } + prefill_graph_runner_.set_decode_fn([this, &state, &logits_out](cudaStream_t s) { + executor_->forward_logits(state, logits_out, s); + }); + prefill_graph_runner_.execute(pf_stream); + if (logits_out.data == nullptr) { + logits_out = executor_->get_logits_view(/*n_sequences=*/1); + } + } else { + executor_->forward_logits(state, logits_out, pf_stream); + } if (!pf_pool_used) { free_prefill_buffers(d_token_ids, d_positions, d_block_tables, d_context_lens, pf_stream); diff --git a/src/runtime/engine.h b/src/runtime/engine.h index cba1538..971e594 100644 --- a/src/runtime/engine.h +++ b/src/runtime/engine.h @@ -223,6 +223,14 @@ class Engine { static constexpr int kMaxGraphPoolSize = 32; CudaGraphRunner decode_graph_pool_[kMaxGraphPoolSize]; // index = n_sequences - 1 int last_decode_max_blocks_per_graph_[kMaxGraphPoolSize] = {}; + + // Prefill graph runner — captures forward_logits for non-last chunks of + // chunked prefill. Single runner: in practice chunk_len == prefill_chunk_size + // for all non-last chunks, so per-shape variability collapses to one shape. + // Opt-in via IMP_PREFILL_GRAPH=1 (Phase 4 of MoE-prefill-graphs work). + CudaGraphRunner prefill_graph_runner_; + int last_prefill_chunk_len_ = -1; + int last_prefill_block_count_ = -1; int32_t* h_sample_pinned_ = nullptr; // Async conditional graph loop CudaGraphConditionalRunner async_graph_runner_;