Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions tokenspeed-scheduler/csrc/fsm/forward_events.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,22 @@ std::vector<std::tuple<std::int32_t, std::int32_t>> BuildWriteBackPairs(

namespace tokenspeed::fsm {

void InsertHybridCache(HybridPrefixCache* hybrid_cache,
void InsertPrefixCache(KVPrefixCache* kv_prefix_cache, HybridPrefixCache* hybrid_cache,
const std::vector<std::span<const std::int32_t>>& full_paged_tokens,
std::unique_ptr<DeviceNodeRef>& device_node_ref, LocalKVAllocator* local_kv_allocator,
LocalMambaAllocator* local_mamba_allocator) {
if (hybrid_cache == nullptr) return;
if (kv_prefix_cache == nullptr) return;

std::vector<std::int32_t> prefix_pages = DevicePagesFromRoot(device_node_ref->Node());
std::int32_t new_page_count =
static_cast<std::int32_t>(full_paged_tokens.size()) - static_cast<std::int32_t>(prefix_pages.size());
if (new_page_count <= 0) return;

OwnedPages pages_to_insert = local_kv_allocator->TakeFirst(new_page_count);
auto insert_result = hybrid_cache->GetKVPrefixCache().Insert<ResourceType::Device>(full_paged_tokens, prefix_pages,
std::move(pages_to_insert));
auto insert_result =
kv_prefix_cache->Insert<ResourceType::Device>(full_paged_tokens, prefix_pages, std::move(pages_to_insert));

if (local_mamba_allocator != nullptr && local_mamba_allocator->HasCheckpoint()) {
if (hybrid_cache != nullptr && local_mamba_allocator != nullptr && local_mamba_allocator->HasCheckpoint()) {
hybrid_cache->InsertMamba(insert_result.last_node, local_mamba_allocator->DetachCheckpoint());
}
device_node_ref = std::make_unique<DeviceNodeRef>(insert_result.last_node);
Expand Down Expand Up @@ -157,7 +157,7 @@ std::variant<PrefillDone, Prefilling> SchedulePrefillEvent::operator()(Prefillin
if (end_of_window_pages < static_cast<std::int32_t>(paged_tokens.size())) {
paged_tokens.resize(end_of_window_pages);
}
InsertHybridCache(hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
InsertPrefixCache(kv_prefix_cache_, hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
local_mamba_allocator.get());
// Allocate KV pages for the new chunk
local_kv_allocator->Acquire(tokens_this_round_);
Expand Down Expand Up @@ -205,7 +205,7 @@ Decoding ScheduleDecodeEvent::operator()(PrefillDone&& state) {
if (end_of_window_pages < static_cast<std::int32_t>(paged_tokens.size())) {
paged_tokens.resize(end_of_window_pages);
}
InsertHybridCache(hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
InsertPrefixCache(kv_prefix_cache_, hybrid_prefix_cache_, paged_tokens, device_node_ref, local_kv_allocator.get(),
local_mamba_allocator.get());
// Allocate fresh checkpoint for decode-phase mamba state tracking
if (hybrid_prefix_cache_ != nullptr && local_mamba_allocator != nullptr) {
Expand Down
14 changes: 10 additions & 4 deletions tokenspeed-scheduler/csrc/fsm/forward_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ namespace tokenspeed::fsm {
struct PrefetchDone;
struct Prefetching;

void InsertHybridCache(HybridPrefixCache* hybrid_prefix_cache,
void InsertPrefixCache(KVPrefixCache* kv_prefix_cache, HybridPrefixCache* hybrid_prefix_cache,
const std::vector<std::span<const std::int32_t>>& full_paged_tokens,
std::unique_ptr<DeviceNodeRef>& device_node_ref, LocalKVAllocator* local_kv_allocator,
LocalMambaAllocator* local_mamba_allocator);
Expand Down Expand Up @@ -101,9 +101,10 @@ struct SchedulePrefillFirstChunkEvent : InvalidTransitionHandler<SchedulePrefill
struct SchedulePrefillEvent : InvalidTransitionHandler<SchedulePrefillEvent> {
using InvalidTransitionHandler<SchedulePrefillEvent>::operator();
SchedulePrefillEvent(std::int32_t tokens_this_round, std::int32_t reserve_num_tokens_in_next_schedule_event,
HybridPrefixCache* hybrid_prefix_cache = nullptr)
KVPrefixCache* kv_prefix_cache, HybridPrefixCache* hybrid_prefix_cache = nullptr)
: tokens_this_round_(tokens_this_round),
reserve_num_tokens_in_next_schedule_event_(reserve_num_tokens_in_next_schedule_event),
kv_prefix_cache_(kv_prefix_cache),
hybrid_prefix_cache_(hybrid_prefix_cache) {}

// Returns PrefillDone (last chunk) or Prefilling (more chunks remain).
Expand All @@ -112,20 +113,25 @@ struct SchedulePrefillEvent : InvalidTransitionHandler<SchedulePrefillEvent> {
private:
std::int32_t tokens_this_round_{};
std::int32_t reserve_num_tokens_in_next_schedule_event_{};
KVPrefixCache* kv_prefix_cache_{};
HybridPrefixCache* hybrid_prefix_cache_{};
};

struct ScheduleDecodeEvent : InvalidTransitionHandler<ScheduleDecodeEvent> {
using InvalidTransitionHandler<ScheduleDecodeEvent>::operator();

ScheduleDecodeEvent(std::int32_t decode_input_tokens, HybridPrefixCache* hybrid_prefix_cache = nullptr)
: decode_input_tokens_(decode_input_tokens), hybrid_prefix_cache_(hybrid_prefix_cache) {}
ScheduleDecodeEvent(std::int32_t decode_input_tokens, KVPrefixCache* kv_prefix_cache,
HybridPrefixCache* hybrid_prefix_cache = nullptr)
: decode_input_tokens_(decode_input_tokens),
kv_prefix_cache_(kv_prefix_cache),
hybrid_prefix_cache_(hybrid_prefix_cache) {}

Decoding operator()(PrefillDone&& state);
Decoding operator()(Decoding&& state);

private:
std::int32_t decode_input_tokens_;
KVPrefixCache* kv_prefix_cache_{};
HybridPrefixCache* hybrid_prefix_cache_{};
};

Expand Down
4 changes: 2 additions & 2 deletions tokenspeed-scheduler/csrc/scheduler/operations/forward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ std::optional<fsm::SchedulePrefillEvent> Scheduler::schedulePrefill(
return {};
}

return fsm::SchedulePrefillEvent{tokens_this_round, reserve_num_tokens_in_next_schedule_event,
return fsm::SchedulePrefillEvent{tokens_this_round, reserve_num_tokens_in_next_schedule_event, &kv_prefix_cache_,
hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr};
}

Expand All @@ -140,7 +140,7 @@ std::optional<fsm::ScheduleDecodeEvent> Scheduler::scheduleDecode(Request* reque
return {};
}

return fsm::ScheduleDecodeEvent{config_.decode_input_tokens,
return fsm::ScheduleDecodeEvent{config_.decode_input_tokens, &kv_prefix_cache_,
hybrid_prefix_cache_ ? &*hybrid_prefix_cache_ : nullptr};
}

Expand Down
15 changes: 15 additions & 0 deletions tokenspeed-scheduler/tests/cpp/test_chunked_prefill.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,21 @@ TEST_F(ChunkedPrefillTestSuite, PrefillFirst_ContinuesPrefillBeforeNewSubmitted)
EXPECT_EQ(fwd->request_ids[0], "r1");
}

TEST_F(ChunkedPrefillTestSuite, CompletedChunk_IsVisibleToPrefixCacheWithoutHybridCache) {
Submit(MakeRequestSpec("r1", 4)); // 8 tokens, needs 2 chunks
PlanOnce(); // r1 chunk 1

Submit(MakeRequestSpec("r2", 4)); // same prefix as r1
PlanOnce(); // r1 chunk 2; inserts chunk 1 into KV prefix cache

auto plan = PlanOnce();
auto* fwd = GetForwardOp(plan);
ASSERT_NE(fwd, nullptr);
ASSERT_EQ(fwd->request_ids.size(), 1u);
EXPECT_EQ(fwd->request_ids[0], "r2");
EXPECT_EQ(fwd->extend_prefix_lens[0], 4);
}

TEST_F(ChunkedPrefillTestSuite, InputIds_CorrectPerChunk) {
Submit(MakeRequestSpec("r1", 3)); // 6 tokens: [1,2,3,4,5,6]
auto plan1 = PlanOnce();
Expand Down
Loading