From 39fe25179f5ad1c7733d05f6fe92eb682d8efac1 Mon Sep 17 00:00:00 2001 From: davide221 Date: Tue, 26 May 2026 21:21:41 +0000 Subject: [PATCH] =?UTF-8?q?chore(repo):=20rename=20dflash=E2=86=92server,?= =?UTF-8?q?=20group=20pflash+megakernel=20under=20optimizations/?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/ci.yml | 6 +-- .gitmodules | 4 +- CONTRIBUTING.md | 8 +-- README.md | 50 +++++++++--------- docs/specs/model-cards.md | 6 +-- docs/specs/thinking-budget.md | 2 +- harness/README.md | 20 +++---- harness/benchmarks/run_lucebox_vs_llamacpp.sh | 16 +++--- harness/clients/README.md | 6 +-- harness/clients/common.sh | 16 +++--- .../run_claude_llamacpp_decode_check.sh | 2 +- harness/clients/run_claude_llamacpp_matrix.sh | 2 +- .../megakernel}/README.md | 2 +- .../megakernel}/README_PASCAL.md | 0 .../megakernel}/RESULTS.md | 8 +-- .../megakernel}/_phase2_variant.py | 0 .../megakernel}/bench.py | 0 .../megakernel}/bench_pp_tg.py | 0 .../megakernel}/bench_pp_tg_nvfp4.py | 0 .../megakernel}/build_corpus.py | 0 .../megakernel}/corpus/baseline.json | 0 .../megakernel}/corpus/wmma.json | 0 .../megakernel}/corpus/wmma_p3.json | 0 .../megakernel}/corpus/wmma_p4.json | 0 .../megakernel}/corpus/wmma_p6cleanup.json | 0 .../megakernel}/corpus/wmma_p7.json | 0 .../megakernel}/corpus/wmma_p8.json | 0 .../megakernel}/diag_phase2_metrics.py | 0 .../megakernel}/diag_prefill_kernels.py | 0 .../megakernel}/final_bench.py | 0 .../megakernel}/final_bench_nvfp4.py | 0 .../megakernel}/half_type.h | 0 .../megakernel}/hero.png | 0 .../megakernel}/hero.raw.png | 0 .../megakernel}/kernel.cu | 0 .../megakernel}/kernel_gb10_nvfp4.cu | 0 .../megakernel}/model.py | 0 .../megakernel}/model_nvfp4.py | 0 .../megakernel}/prefill.cu | 0 .../megakernel}/prefill_bw.cu | 0 .../megakernel}/prefill_megakernel.cu | 0 .../megakernel}/pyproject.toml | 0 .../megakernel}/setup.py | 0 .../megakernel}/torch_bindings.cpp | 0 {pflash => 
optimizations/pflash}/README.md | 40 +++++++------- {pflash => optimizations/pflash}/demo.gif | 0 {pflash => optimizations/pflash}/hero.png | 0 .../pflash}/pflash/__init__.py | 0 .../pflash}/pflash/config.py | 0 .../pflash}/pflash/dflash_client.py | 0 .../pflash}/pyproject.toml | 0 .../pflash}/tests/bench_niah_cpp.py | 0 .../pflash}/tests/niah_gen.py | 0 pyproject.toml | 2 +- scripts/check_uv_workspace.sh | 6 +-- {dflash => server}/.gitignore | 0 {dflash => server}/CMakeLists.txt | 0 {dflash => server}/CODEX.md | 0 {dflash => server}/DEVELOPER.md | 20 +++---- {dflash => server}/README.md | 4 +- {dflash => server}/RESULTS.md | 6 +-- {dflash => server}/demo.gif | 0 .../deps/Block-Sparse-Attention | 0 .../bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h | 0 .../bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh | 0 {dflash => server}/deps/bsa_stubs/README.md | 2 +- .../deps/bsa_stubs/c10/cuda/CUDAException.h | 0 {dflash => server}/deps/llama.cpp | 0 {dflash => server}/docs/API.md | 0 {dflash => server}/docs/ARCHITECTURE.md | 4 +- {dflash => server}/docs/HIP_PERF_PLAN.md | 0 {dflash => server}/docs/MIXED_BACKEND.md | 0 {dflash => server}/docs/PREFIX_CACHE.md | 0 {dflash => server}/docs/SPEC_PREFILL.md | 2 +- .../docs/laguna_integration_plan.md | 18 +++---- {dflash => server}/eval/README.md | 6 +-- .../eval/humaneval_plus/SOURCE.txt | 0 .../eval/humaneval_plus/humanevalplus.jsonl | 0 {dflash => server}/eval/mt_bench/SOURCE.txt | 0 .../eval/mt_bench/question.jsonl | 0 {dflash => server}/examples/chat.py | 0 {dflash => server}/hero.png | 0 {dflash => server}/hero.raw.png | 0 {dflash => server}/hip_compat/cuda_bf16.h | 0 {dflash => server}/hip_compat/cuda_fp16.h | 0 {dflash => server}/hip_compat/cuda_runtime.h | 0 {dflash => server}/hip_compat/mma.h | 0 {dflash => server}/include/dflash27b.h | 0 {dflash => server}/pyproject.toml | 0 {dflash => server}/scripts/_prefill_hook.py | 0 {dflash => server}/scripts/bench_agent.py | 0 .../scripts/bench_agent_loop.py | 0 {dflash => 
server}/scripts/bench_daemon.py | 0 {dflash => server}/scripts/bench_he.py | 0 {dflash => server}/scripts/bench_he_http.py | 0 {dflash => server}/scripts/bench_llm.py | 0 {dflash => server}/scripts/bench_server.py | 0 .../scripts/convert_dflash_to_gguf.py | 0 {dflash => server}/scripts/detokenize.py | 0 .../agent_prompts/codex_apply_patch.md | 0 .../fixtures/agent_prompts/codex_gpt52.md | 0 .../agent_prompts/codex_gpt52_codex.md | 0 .../agent_prompts/codex_gpt5_codex.md | 0 .../swe_bench/swe_bench_verified.parquet | Bin {dflash => server}/scripts/gen_oracle.py | 0 .../scripts/laguna_pflash_niah.py | 0 {dflash => server}/scripts/parity_laguna.py | 0 .../scripts/phase_split_dual_gpu.py | 0 .../scripts/placement/__init__.py | 0 .../scripts/placement/backend_device.py | 0 .../scripts/placement/server_resolver.py | 0 .../scripts/placement/test_dflash_args.py | 0 {dflash => server}/scripts/prefix_cache.py | 0 .../scripts/quality_ab_simple.py | 0 .../scripts/quality_humaneval_plus.py | 0 .../scripts/quantize_draft_q8.py | 0 .../scripts/quantize_gemma_dflash_q8.py | 0 {dflash => server}/scripts/run.py | 0 {dflash => server}/scripts/server.py | 0 {dflash => server}/scripts/setup_system.sh | 0 .../scripts/test_full_compress_cache.py | 0 .../scripts/test_multi_turn_prefix_cache.py | 0 .../scripts/test_prefix_cache.py | 0 {dflash => server}/scripts/test_server.py | 0 .../scripts/test_server_integration.py | 0 .../scripts/test_server_prefix_cache.py | 0 .../scripts/test_tool_memory.py | 0 {dflash => server}/scripts/tokenize_prompt.py | 0 {dflash => server}/scripts/tool_memory.py | 0 {dflash => server}/src/bsa_fwd_inst.cu | 0 {dflash => server}/src/bsa_launcher.cu | 0 {dflash => server}/src/bsa_launcher_hip.cu | 0 {dflash => server}/src/common/attn_masks.h | 0 .../src/common/backend_factory.cpp | 0 .../src/common/backend_factory.h | 0 {dflash => server}/src/common/backend_ipc.cpp | 0 {dflash => server}/src/common/backend_ipc.h | 0 {dflash => server}/src/common/daemon_loop.cpp | 
0 {dflash => server}/src/common/daemon_loop.h | 0 {dflash => server}/src/common/ddtree.cpp | 0 {dflash => server}/src/common/ddtree.h | 0 .../src/common/device_placement.h | 0 .../src/common/dflash_capture.cpp | 0 .../src/common/dflash_capture.h | 0 .../src/common/dflash_draft_graph.cpp | 0 .../src/common/dflash_draft_graph.h | 0 .../src/common/dflash_draft_ipc.cpp | 0 .../src/common/dflash_draft_ipc.h | 0 .../src/common/dflash_draft_ipc_daemon.cpp | 0 .../src/common/dflash_feature_ring.cpp | 0 .../src/common/dflash_feature_ring.h | 0 .../src/common/dflash_layer_split_runtime.h | 0 .../src/common/dflash_spec_decode.cpp | 0 .../src/common/dflash_spec_decode.h | 0 {dflash => server}/src/common/dflash_target.h | 0 .../src/common/gguf_inspect.cpp | 0 {dflash => server}/src/common/gguf_inspect.h | 0 {dflash => server}/src/common/gguf_mmap.h | 0 .../src/common/gpu_runtime_compat.h | 0 {dflash => server}/src/common/io_utils.h | 0 .../src/common/layer_split_utils.cpp | 0 .../src/common/layer_split_utils.h | 0 {dflash => server}/src/common/model_backend.h | 0 {dflash => server}/src/common/peer_access.cpp | 0 {dflash => server}/src/common/peer_access.h | 0 .../src/common/pflash_drafter_ipc.cpp | 0 .../src/common/pflash_drafter_ipc.h | 0 .../src/common/pflash_drafter_ipc_daemon.cpp | 0 {dflash => server}/src/common/restore_delta.h | 0 {dflash => server}/src/common/sampler.cpp | 0 {dflash => server}/src/common/sampler.h | 0 .../src/common/snapshot_backend.h | 0 {dflash => server}/src/common/step_graph.h | 0 .../src/cuda_cross_device_copy.cpp | 0 {dflash => server}/src/delta_net_chunked.cpp | 0 {dflash => server}/src/delta_net_chunked.h | 0 {dflash => server}/src/device_runtime.h | 0 .../src/draft/draft_gguf_loader.cpp | 0 {dflash => server}/src/draft/draft_graph.cpp | 0 {dflash => server}/src/draft/draft_graph.h | 0 .../src/draft/draft_safetensors_loader.cpp | 0 {dflash => server}/src/errors.cpp | 0 {dflash => server}/src/flashprefill.cpp | 0 {dflash => 
server}/src/flashprefill.h | 0 {dflash => server}/src/flashprefill_f16.cu | 0 .../src/flashprefill_kernels.cu | 0 .../src/flashprefill_kernels.hip.cu | 0 {dflash => server}/src/flashprefill_q8.cpp | 0 {dflash => server}/src/flashprefill_scalar.cu | 0 .../src/flashprefill_select.cpp | 0 .../src/gemma4/gemma4_backend.cpp | 0 .../src/gemma4/gemma4_backend.h | 0 .../src/gemma4/gemma4_daemon.cpp | 0 {dflash => server}/src/gemma4/gemma4_daemon.h | 0 .../src/gemma4/gemma4_dflash_target.cpp | 0 .../src/gemma4/gemma4_dflash_target.h | 0 .../src/gemma4/gemma4_graph.cpp | 0 .../src/gemma4/gemma4_internal.h | 0 .../src/gemma4/gemma4_loader.cpp | 0 {dflash => server}/src/hip_compat/cuda_bf16.h | 0 {dflash => server}/src/hip_compat/cuda_fp16.h | 0 {dflash => server}/src/internal.h | 0 .../src/ipc/backend_ipc_main.cpp | 0 {dflash => server}/src/kv_cache.cpp | 0 {dflash => server}/src/kv_quant.cpp | 0 {dflash => server}/src/kv_quant.h | 0 .../src/laguna/laguna_backend.cpp | 0 .../src/laguna/laguna_backend.h | 0 .../src/laguna/laguna_daemon.cpp | 0 {dflash => server}/src/laguna/laguna_daemon.h | 0 .../src/laguna/laguna_internal.h | 0 .../src/laguna/laguna_target_graph.cpp | 0 .../src/laguna/laguna_target_loader.cpp | 0 .../src/pflash_ggml_adapter.cpp | 0 {dflash => server}/src/pflash_ggml_adapter.h | 0 .../src/placement/pflash_placement.h | 0 .../src/placement/placement_backend.h | 0 .../src/placement/placement_config.h | 0 .../src/placement/remote_draft_config.h | 0 .../src/qwen3/qwen3_backend.cpp | 0 {dflash => server}/src/qwen3/qwen3_backend.h | 0 {dflash => server}/src/qwen3/qwen3_daemon.cpp | 0 {dflash => server}/src/qwen3/qwen3_daemon.h | 0 .../src/qwen3/qwen3_drafter.cpp | 0 {dflash => server}/src/qwen3/qwen3_drafter.h | 0 .../src/qwen3/qwen3_drafter_model.h | 0 {dflash => server}/src/qwen3/qwen3_graph.cpp | 0 {dflash => server}/src/qwen3/qwen3_loader.cpp | 0 .../src/qwen35/gguf_target_loader.cpp | 0 .../src/qwen35/graph_builders.cpp | 0 .../src/qwen35/graph_builders.h | 0 
.../src/qwen35/layer_split_daemon.cpp | 0 .../src/qwen35/layer_split_daemon.h | 0 .../src/qwen35/layer_split_daemon_loop.cpp | 0 .../src/qwen35/layer_split_daemon_loop.h | 0 .../src/qwen35/layer_split_forward.cpp | 0 .../src/qwen35/layer_split_forward.h | 0 .../src/qwen35/layer_split_types.h | 0 .../src/qwen35/qwen35_backend.cpp | 0 .../src/qwen35/qwen35_backend.h | 0 .../src/qwen35/qwen35_daemon.cpp | 0 {dflash => server}/src/qwen35/qwen35_daemon.h | 0 .../src/qwen35/qwen35_dflash_target.cpp | 0 .../src/qwen35/qwen35_dflash_target.h | 0 .../src/qwen35/qwen35_layer_split.h | 0 .../qwen35_layer_split_dflash_target.cpp | 0 .../qwen35/qwen35_layer_split_dflash_target.h | 0 {dflash => server}/src/qwen35/qwen35_ops.h | 0 .../src/qwen35/qwen35_target_graph.cpp | 0 .../src/qwen35moe/qwen35moe_backend.cpp | 0 .../src/qwen35moe/qwen35moe_backend.h | 0 .../src/qwen35moe/qwen35moe_daemon.cpp | 0 .../src/qwen35moe/qwen35moe_daemon.h | 0 .../qwen35moe/qwen35moe_expert_placement.cpp | 0 .../qwen35moe/qwen35moe_expert_placement.h | 0 .../src/qwen35moe/qwen35moe_ffn.cpp | 0 .../src/qwen35moe/qwen35moe_ffn.h | 0 .../qwen35moe/qwen35moe_hybrid_ffn_eval.cpp | 0 .../src/qwen35moe/qwen35moe_hybrid_ffn_eval.h | 0 .../qwen35moe/qwen35moe_hybrid_storage.cpp | 0 .../src/qwen35moe/qwen35moe_hybrid_storage.h | 0 .../src/qwen35moe/qwen35moe_routing_stats.cpp | 0 .../src/qwen35moe/qwen35moe_routing_stats.h | 0 .../src/qwen35moe/qwen35moe_swap_manager.cpp | 0 .../src/qwen35moe/qwen35moe_swap_manager.h | 0 {dflash => server}/src/rms_norm_hip.cu | 0 {dflash => server}/src/server/api_types.h | 0 .../src/server/chat_template.cpp | 0 {dflash => server}/src/server/chat_template.h | 0 .../src/server/disk_prefix_cache.cpp | 0 .../src/server/disk_prefix_cache.h | 0 {dflash => server}/src/server/http_server.cpp | 0 {dflash => server}/src/server/http_server.h | 0 {dflash => server}/src/server/model_card.cpp | 0 {dflash => server}/src/server/model_card.h | 0 .../src/server/prefix_cache.cpp | 0 {dflash => 
server}/src/server/prefix_cache.h | 0 {dflash => server}/src/server/rax.c | 0 {dflash => server}/src/server/rax.h | 0 {dflash => server}/src/server/reasoning.cpp | 0 {dflash => server}/src/server/reasoning.h | 0 {dflash => server}/src/server/server_main.cpp | 0 {dflash => server}/src/server/sse_emitter.cpp | 0 {dflash => server}/src/server/sse_emitter.h | 0 {dflash => server}/src/server/tokenizer.cpp | 0 {dflash => server}/src/server/tokenizer.h | 0 {dflash => server}/src/server/tool_hint.cpp | 0 {dflash => server}/src/server/tool_hint.h | 0 {dflash => server}/src/server/tool_memory.cpp | 0 {dflash => server}/src/server/tool_memory.h | 0 {dflash => server}/src/server/tool_parser.cpp | 0 {dflash => server}/src/server/tool_parser.h | 0 {dflash => server}/src/server/utf8_utils.h | 0 .../test/bench_laguna_generate.cpp | 0 .../test/bench_laguna_pflash.cpp | 0 {dflash => server}/test/bench_laguna_ttft.cpp | 0 {dflash => server}/test/pflash_daemon.cpp | 0 {dflash => server}/test/smoke_draft_graph.cpp | 0 .../test/smoke_laguna_forward.cpp | 0 {dflash => server}/test/smoke_load_draft.cpp | 0 {dflash => server}/test/smoke_load_target.cpp | 0 .../test/smoke_load_target_laguna.cpp | 0 .../test/smoke_qwen3_forward.cpp | 0 .../test/smoke_target_forward.cpp | 0 {dflash => server}/test/spike_thin_copy.cpp | 0 {dflash => server}/test/test_dflash.cpp | 0 .../test/test_flash_attn_sparse.cpp | 0 .../test/test_flashprefill_kernels.cpp | 0 {dflash => server}/test/test_generate.cpp | 0 {dflash => server}/test/test_gguf_mmap.cpp | 0 {dflash => server}/test/test_kv_quant.cpp | 0 .../test/test_laguna_daemon.cpp | 0 {dflash => server}/test/test_mtp_converter.sh | 0 {dflash => server}/test/test_mtp_e2e.sh | 0 .../test/test_qwen35moe_expert_placement.cpp | 0 .../test/test_qwen35moe_routing_stats.cpp | 0 .../test/test_qwen35moe_swap_manager.cpp | 0 .../test/test_restore_delta.cpp | 0 {dflash => server}/test/test_server_unit.cpp | 0 .../test/test_tokenizer_harness.cpp | 0 {dflash => 
server}/test/test_vs_oracle.cpp | 0 .../tests/test_server_comprehensive.py | 0 {dflash => server}/tests/test_server_smoke.py | 0 {dflash => server}/tests/test_tokenizer.py | 0 324 files changed, 129 insertions(+), 129 deletions(-) rename {megakernel => optimizations/megakernel}/README.md (99%) rename {megakernel => optimizations/megakernel}/README_PASCAL.md (100%) rename {megakernel => optimizations/megakernel}/RESULTS.md (92%) rename {megakernel => optimizations/megakernel}/_phase2_variant.py (100%) rename {megakernel => optimizations/megakernel}/bench.py (100%) rename {megakernel => optimizations/megakernel}/bench_pp_tg.py (100%) rename {megakernel => optimizations/megakernel}/bench_pp_tg_nvfp4.py (100%) rename {megakernel => optimizations/megakernel}/build_corpus.py (100%) rename {megakernel => optimizations/megakernel}/corpus/baseline.json (100%) rename {megakernel => optimizations/megakernel}/corpus/wmma.json (100%) rename {megakernel => optimizations/megakernel}/corpus/wmma_p3.json (100%) rename {megakernel => optimizations/megakernel}/corpus/wmma_p4.json (100%) rename {megakernel => optimizations/megakernel}/corpus/wmma_p6cleanup.json (100%) rename {megakernel => optimizations/megakernel}/corpus/wmma_p7.json (100%) rename {megakernel => optimizations/megakernel}/corpus/wmma_p8.json (100%) rename {megakernel => optimizations/megakernel}/diag_phase2_metrics.py (100%) rename {megakernel => optimizations/megakernel}/diag_prefill_kernels.py (100%) rename {megakernel => optimizations/megakernel}/final_bench.py (100%) rename {megakernel => optimizations/megakernel}/final_bench_nvfp4.py (100%) rename {megakernel => optimizations/megakernel}/half_type.h (100%) rename {megakernel => optimizations/megakernel}/hero.png (100%) rename {megakernel => optimizations/megakernel}/hero.raw.png (100%) rename {megakernel => optimizations/megakernel}/kernel.cu (100%) rename {megakernel => optimizations/megakernel}/kernel_gb10_nvfp4.cu (100%) rename {megakernel => 
optimizations/megakernel}/model.py (100%) rename {megakernel => optimizations/megakernel}/model_nvfp4.py (100%) rename {megakernel => optimizations/megakernel}/prefill.cu (100%) rename {megakernel => optimizations/megakernel}/prefill_bw.cu (100%) rename {megakernel => optimizations/megakernel}/prefill_megakernel.cu (100%) rename {megakernel => optimizations/megakernel}/pyproject.toml (100%) rename {megakernel => optimizations/megakernel}/setup.py (100%) rename {megakernel => optimizations/megakernel}/torch_bindings.cpp (100%) rename {pflash => optimizations/pflash}/README.md (92%) rename {pflash => optimizations/pflash}/demo.gif (100%) rename {pflash => optimizations/pflash}/hero.png (100%) rename {pflash => optimizations/pflash}/pflash/__init__.py (100%) rename {pflash => optimizations/pflash}/pflash/config.py (100%) rename {pflash => optimizations/pflash}/pflash/dflash_client.py (100%) rename {pflash => optimizations/pflash}/pyproject.toml (100%) rename {pflash => optimizations/pflash}/tests/bench_niah_cpp.py (100%) rename {pflash => optimizations/pflash}/tests/niah_gen.py (100%) rename {dflash => server}/.gitignore (100%) rename {dflash => server}/CMakeLists.txt (100%) rename {dflash => server}/CODEX.md (100%) rename {dflash => server}/DEVELOPER.md (96%) rename {dflash => server}/README.md (99%) rename {dflash => server}/RESULTS.md (99%) rename {dflash => server}/demo.gif (100%) rename {dflash => server}/deps/Block-Sparse-Attention (100%) rename {dflash => server}/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h (100%) rename {dflash => server}/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh (100%) rename {dflash => server}/deps/bsa_stubs/README.md (95%) rename {dflash => server}/deps/bsa_stubs/c10/cuda/CUDAException.h (100%) rename {dflash => server}/deps/llama.cpp (100%) rename {dflash => server}/docs/API.md (100%) rename {dflash => server}/docs/ARCHITECTURE.md (99%) rename {dflash => server}/docs/HIP_PERF_PLAN.md (100%) rename {dflash => 
server}/docs/MIXED_BACKEND.md (100%) rename {dflash => server}/docs/PREFIX_CACHE.md (100%) rename {dflash => server}/docs/SPEC_PREFILL.md (97%) rename {dflash => server}/docs/laguna_integration_plan.md (93%) rename {dflash => server}/eval/README.md (94%) rename {dflash => server}/eval/humaneval_plus/SOURCE.txt (100%) rename {dflash => server}/eval/humaneval_plus/humanevalplus.jsonl (100%) rename {dflash => server}/eval/mt_bench/SOURCE.txt (100%) rename {dflash => server}/eval/mt_bench/question.jsonl (100%) rename {dflash => server}/examples/chat.py (100%) rename {dflash => server}/hero.png (100%) rename {dflash => server}/hero.raw.png (100%) rename {dflash => server}/hip_compat/cuda_bf16.h (100%) rename {dflash => server}/hip_compat/cuda_fp16.h (100%) rename {dflash => server}/hip_compat/cuda_runtime.h (100%) rename {dflash => server}/hip_compat/mma.h (100%) rename {dflash => server}/include/dflash27b.h (100%) rename {dflash => server}/pyproject.toml (100%) rename {dflash => server}/scripts/_prefill_hook.py (100%) rename {dflash => server}/scripts/bench_agent.py (100%) rename {dflash => server}/scripts/bench_agent_loop.py (100%) rename {dflash => server}/scripts/bench_daemon.py (100%) rename {dflash => server}/scripts/bench_he.py (100%) rename {dflash => server}/scripts/bench_he_http.py (100%) rename {dflash => server}/scripts/bench_llm.py (100%) rename {dflash => server}/scripts/bench_server.py (100%) rename {dflash => server}/scripts/convert_dflash_to_gguf.py (100%) rename {dflash => server}/scripts/detokenize.py (100%) rename {dflash => server}/scripts/fixtures/agent_prompts/codex_apply_patch.md (100%) rename {dflash => server}/scripts/fixtures/agent_prompts/codex_gpt52.md (100%) rename {dflash => server}/scripts/fixtures/agent_prompts/codex_gpt52_codex.md (100%) rename {dflash => server}/scripts/fixtures/agent_prompts/codex_gpt5_codex.md (100%) rename {dflash => server}/scripts/fixtures/swe_bench/swe_bench_verified.parquet (100%) rename {dflash => 
server}/scripts/gen_oracle.py (100%) rename {dflash => server}/scripts/laguna_pflash_niah.py (100%) rename {dflash => server}/scripts/parity_laguna.py (100%) rename {dflash => server}/scripts/phase_split_dual_gpu.py (100%) rename {dflash => server}/scripts/placement/__init__.py (100%) rename {dflash => server}/scripts/placement/backend_device.py (100%) rename {dflash => server}/scripts/placement/server_resolver.py (100%) rename {dflash => server}/scripts/placement/test_dflash_args.py (100%) rename {dflash => server}/scripts/prefix_cache.py (100%) rename {dflash => server}/scripts/quality_ab_simple.py (100%) rename {dflash => server}/scripts/quality_humaneval_plus.py (100%) rename {dflash => server}/scripts/quantize_draft_q8.py (100%) rename {dflash => server}/scripts/quantize_gemma_dflash_q8.py (100%) rename {dflash => server}/scripts/run.py (100%) rename {dflash => server}/scripts/server.py (100%) rename {dflash => server}/scripts/setup_system.sh (100%) rename {dflash => server}/scripts/test_full_compress_cache.py (100%) rename {dflash => server}/scripts/test_multi_turn_prefix_cache.py (100%) rename {dflash => server}/scripts/test_prefix_cache.py (100%) rename {dflash => server}/scripts/test_server.py (100%) rename {dflash => server}/scripts/test_server_integration.py (100%) rename {dflash => server}/scripts/test_server_prefix_cache.py (100%) rename {dflash => server}/scripts/test_tool_memory.py (100%) rename {dflash => server}/scripts/tokenize_prompt.py (100%) rename {dflash => server}/scripts/tool_memory.py (100%) rename {dflash => server}/src/bsa_fwd_inst.cu (100%) rename {dflash => server}/src/bsa_launcher.cu (100%) rename {dflash => server}/src/bsa_launcher_hip.cu (100%) rename {dflash => server}/src/common/attn_masks.h (100%) rename {dflash => server}/src/common/backend_factory.cpp (100%) rename {dflash => server}/src/common/backend_factory.h (100%) rename {dflash => server}/src/common/backend_ipc.cpp (100%) rename {dflash => server}/src/common/backend_ipc.h 
(100%) rename {dflash => server}/src/common/daemon_loop.cpp (100%) rename {dflash => server}/src/common/daemon_loop.h (100%) rename {dflash => server}/src/common/ddtree.cpp (100%) rename {dflash => server}/src/common/ddtree.h (100%) rename {dflash => server}/src/common/device_placement.h (100%) rename {dflash => server}/src/common/dflash_capture.cpp (100%) rename {dflash => server}/src/common/dflash_capture.h (100%) rename {dflash => server}/src/common/dflash_draft_graph.cpp (100%) rename {dflash => server}/src/common/dflash_draft_graph.h (100%) rename {dflash => server}/src/common/dflash_draft_ipc.cpp (100%) rename {dflash => server}/src/common/dflash_draft_ipc.h (100%) rename {dflash => server}/src/common/dflash_draft_ipc_daemon.cpp (100%) rename {dflash => server}/src/common/dflash_feature_ring.cpp (100%) rename {dflash => server}/src/common/dflash_feature_ring.h (100%) rename {dflash => server}/src/common/dflash_layer_split_runtime.h (100%) rename {dflash => server}/src/common/dflash_spec_decode.cpp (100%) rename {dflash => server}/src/common/dflash_spec_decode.h (100%) rename {dflash => server}/src/common/dflash_target.h (100%) rename {dflash => server}/src/common/gguf_inspect.cpp (100%) rename {dflash => server}/src/common/gguf_inspect.h (100%) rename {dflash => server}/src/common/gguf_mmap.h (100%) rename {dflash => server}/src/common/gpu_runtime_compat.h (100%) rename {dflash => server}/src/common/io_utils.h (100%) rename {dflash => server}/src/common/layer_split_utils.cpp (100%) rename {dflash => server}/src/common/layer_split_utils.h (100%) rename {dflash => server}/src/common/model_backend.h (100%) rename {dflash => server}/src/common/peer_access.cpp (100%) rename {dflash => server}/src/common/peer_access.h (100%) rename {dflash => server}/src/common/pflash_drafter_ipc.cpp (100%) rename {dflash => server}/src/common/pflash_drafter_ipc.h (100%) rename {dflash => server}/src/common/pflash_drafter_ipc_daemon.cpp (100%) rename {dflash => 
server}/src/common/restore_delta.h (100%) rename {dflash => server}/src/common/sampler.cpp (100%) rename {dflash => server}/src/common/sampler.h (100%) rename {dflash => server}/src/common/snapshot_backend.h (100%) rename {dflash => server}/src/common/step_graph.h (100%) rename {dflash => server}/src/cuda_cross_device_copy.cpp (100%) rename {dflash => server}/src/delta_net_chunked.cpp (100%) rename {dflash => server}/src/delta_net_chunked.h (100%) rename {dflash => server}/src/device_runtime.h (100%) rename {dflash => server}/src/draft/draft_gguf_loader.cpp (100%) rename {dflash => server}/src/draft/draft_graph.cpp (100%) rename {dflash => server}/src/draft/draft_graph.h (100%) rename {dflash => server}/src/draft/draft_safetensors_loader.cpp (100%) rename {dflash => server}/src/errors.cpp (100%) rename {dflash => server}/src/flashprefill.cpp (100%) rename {dflash => server}/src/flashprefill.h (100%) rename {dflash => server}/src/flashprefill_f16.cu (100%) rename {dflash => server}/src/flashprefill_kernels.cu (100%) rename {dflash => server}/src/flashprefill_kernels.hip.cu (100%) rename {dflash => server}/src/flashprefill_q8.cpp (100%) rename {dflash => server}/src/flashprefill_scalar.cu (100%) rename {dflash => server}/src/flashprefill_select.cpp (100%) rename {dflash => server}/src/gemma4/gemma4_backend.cpp (100%) rename {dflash => server}/src/gemma4/gemma4_backend.h (100%) rename {dflash => server}/src/gemma4/gemma4_daemon.cpp (100%) rename {dflash => server}/src/gemma4/gemma4_daemon.h (100%) rename {dflash => server}/src/gemma4/gemma4_dflash_target.cpp (100%) rename {dflash => server}/src/gemma4/gemma4_dflash_target.h (100%) rename {dflash => server}/src/gemma4/gemma4_graph.cpp (100%) rename {dflash => server}/src/gemma4/gemma4_internal.h (100%) rename {dflash => server}/src/gemma4/gemma4_loader.cpp (100%) rename {dflash => server}/src/hip_compat/cuda_bf16.h (100%) rename {dflash => server}/src/hip_compat/cuda_fp16.h (100%) rename {dflash => 
server}/src/internal.h (100%) rename {dflash => server}/src/ipc/backend_ipc_main.cpp (100%) rename {dflash => server}/src/kv_cache.cpp (100%) rename {dflash => server}/src/kv_quant.cpp (100%) rename {dflash => server}/src/kv_quant.h (100%) rename {dflash => server}/src/laguna/laguna_backend.cpp (100%) rename {dflash => server}/src/laguna/laguna_backend.h (100%) rename {dflash => server}/src/laguna/laguna_daemon.cpp (100%) rename {dflash => server}/src/laguna/laguna_daemon.h (100%) rename {dflash => server}/src/laguna/laguna_internal.h (100%) rename {dflash => server}/src/laguna/laguna_target_graph.cpp (100%) rename {dflash => server}/src/laguna/laguna_target_loader.cpp (100%) rename {dflash => server}/src/pflash_ggml_adapter.cpp (100%) rename {dflash => server}/src/pflash_ggml_adapter.h (100%) rename {dflash => server}/src/placement/pflash_placement.h (100%) rename {dflash => server}/src/placement/placement_backend.h (100%) rename {dflash => server}/src/placement/placement_config.h (100%) rename {dflash => server}/src/placement/remote_draft_config.h (100%) rename {dflash => server}/src/qwen3/qwen3_backend.cpp (100%) rename {dflash => server}/src/qwen3/qwen3_backend.h (100%) rename {dflash => server}/src/qwen3/qwen3_daemon.cpp (100%) rename {dflash => server}/src/qwen3/qwen3_daemon.h (100%) rename {dflash => server}/src/qwen3/qwen3_drafter.cpp (100%) rename {dflash => server}/src/qwen3/qwen3_drafter.h (100%) rename {dflash => server}/src/qwen3/qwen3_drafter_model.h (100%) rename {dflash => server}/src/qwen3/qwen3_graph.cpp (100%) rename {dflash => server}/src/qwen3/qwen3_loader.cpp (100%) rename {dflash => server}/src/qwen35/gguf_target_loader.cpp (100%) rename {dflash => server}/src/qwen35/graph_builders.cpp (100%) rename {dflash => server}/src/qwen35/graph_builders.h (100%) rename {dflash => server}/src/qwen35/layer_split_daemon.cpp (100%) rename {dflash => server}/src/qwen35/layer_split_daemon.h (100%) rename {dflash => 
server}/src/qwen35/layer_split_daemon_loop.cpp (100%) rename {dflash => server}/src/qwen35/layer_split_daemon_loop.h (100%) rename {dflash => server}/src/qwen35/layer_split_forward.cpp (100%) rename {dflash => server}/src/qwen35/layer_split_forward.h (100%) rename {dflash => server}/src/qwen35/layer_split_types.h (100%) rename {dflash => server}/src/qwen35/qwen35_backend.cpp (100%) rename {dflash => server}/src/qwen35/qwen35_backend.h (100%) rename {dflash => server}/src/qwen35/qwen35_daemon.cpp (100%) rename {dflash => server}/src/qwen35/qwen35_daemon.h (100%) rename {dflash => server}/src/qwen35/qwen35_dflash_target.cpp (100%) rename {dflash => server}/src/qwen35/qwen35_dflash_target.h (100%) rename {dflash => server}/src/qwen35/qwen35_layer_split.h (100%) rename {dflash => server}/src/qwen35/qwen35_layer_split_dflash_target.cpp (100%) rename {dflash => server}/src/qwen35/qwen35_layer_split_dflash_target.h (100%) rename {dflash => server}/src/qwen35/qwen35_ops.h (100%) rename {dflash => server}/src/qwen35/qwen35_target_graph.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_backend.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_backend.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_daemon.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_daemon.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_expert_placement.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_expert_placement.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_ffn.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_ffn.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_hybrid_storage.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_hybrid_storage.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_routing_stats.cpp (100%) rename {dflash 
=> server}/src/qwen35moe/qwen35moe_routing_stats.h (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_swap_manager.cpp (100%) rename {dflash => server}/src/qwen35moe/qwen35moe_swap_manager.h (100%) rename {dflash => server}/src/rms_norm_hip.cu (100%) rename {dflash => server}/src/server/api_types.h (100%) rename {dflash => server}/src/server/chat_template.cpp (100%) rename {dflash => server}/src/server/chat_template.h (100%) rename {dflash => server}/src/server/disk_prefix_cache.cpp (100%) rename {dflash => server}/src/server/disk_prefix_cache.h (100%) rename {dflash => server}/src/server/http_server.cpp (100%) rename {dflash => server}/src/server/http_server.h (100%) rename {dflash => server}/src/server/model_card.cpp (100%) rename {dflash => server}/src/server/model_card.h (100%) rename {dflash => server}/src/server/prefix_cache.cpp (100%) rename {dflash => server}/src/server/prefix_cache.h (100%) rename {dflash => server}/src/server/rax.c (100%) rename {dflash => server}/src/server/rax.h (100%) rename {dflash => server}/src/server/reasoning.cpp (100%) rename {dflash => server}/src/server/reasoning.h (100%) rename {dflash => server}/src/server/server_main.cpp (100%) rename {dflash => server}/src/server/sse_emitter.cpp (100%) rename {dflash => server}/src/server/sse_emitter.h (100%) rename {dflash => server}/src/server/tokenizer.cpp (100%) rename {dflash => server}/src/server/tokenizer.h (100%) rename {dflash => server}/src/server/tool_hint.cpp (100%) rename {dflash => server}/src/server/tool_hint.h (100%) rename {dflash => server}/src/server/tool_memory.cpp (100%) rename {dflash => server}/src/server/tool_memory.h (100%) rename {dflash => server}/src/server/tool_parser.cpp (100%) rename {dflash => server}/src/server/tool_parser.h (100%) rename {dflash => server}/src/server/utf8_utils.h (100%) rename {dflash => server}/test/bench_laguna_generate.cpp (100%) rename {dflash => server}/test/bench_laguna_pflash.cpp (100%) rename {dflash => 
server}/test/bench_laguna_ttft.cpp (100%) rename {dflash => server}/test/pflash_daemon.cpp (100%) rename {dflash => server}/test/smoke_draft_graph.cpp (100%) rename {dflash => server}/test/smoke_laguna_forward.cpp (100%) rename {dflash => server}/test/smoke_load_draft.cpp (100%) rename {dflash => server}/test/smoke_load_target.cpp (100%) rename {dflash => server}/test/smoke_load_target_laguna.cpp (100%) rename {dflash => server}/test/smoke_qwen3_forward.cpp (100%) rename {dflash => server}/test/smoke_target_forward.cpp (100%) rename {dflash => server}/test/spike_thin_copy.cpp (100%) rename {dflash => server}/test/test_dflash.cpp (100%) rename {dflash => server}/test/test_flash_attn_sparse.cpp (100%) rename {dflash => server}/test/test_flashprefill_kernels.cpp (100%) rename {dflash => server}/test/test_generate.cpp (100%) rename {dflash => server}/test/test_gguf_mmap.cpp (100%) rename {dflash => server}/test/test_kv_quant.cpp (100%) rename {dflash => server}/test/test_laguna_daemon.cpp (100%) rename {dflash => server}/test/test_mtp_converter.sh (100%) rename {dflash => server}/test/test_mtp_e2e.sh (100%) rename {dflash => server}/test/test_qwen35moe_expert_placement.cpp (100%) rename {dflash => server}/test/test_qwen35moe_routing_stats.cpp (100%) rename {dflash => server}/test/test_qwen35moe_swap_manager.cpp (100%) rename {dflash => server}/test/test_restore_delta.cpp (100%) rename {dflash => server}/test/test_server_unit.cpp (100%) rename {dflash => server}/test/test_tokenizer_harness.cpp (100%) rename {dflash => server}/test/test_vs_oracle.cpp (100%) rename {dflash => server}/tests/test_server_comprehensive.py (100%) rename {dflash => server}/tests/test_server_smoke.py (100%) rename {dflash => server}/tests/test_tokenizer.py (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1b3a8c1a3..705bbf09b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,7 +46,7 @@ jobs: - name: Build dflash (smoke + server) run: | - cd 
dflash + cd server cmake -B build \ -DCMAKE_CUDA_ARCHITECTURES="86" \ -DDFLASH27B_ENABLE_BSA=OFF \ @@ -59,13 +59,13 @@ jobs: - name: Run C++ server unit tests run: | - cd dflash/build + cd server/build ctest --output-on-failure -R server_unit --no-tests=error - name: Run Python server unit tests run: | pip install pytest fastapi httpx transformers - cd dflash/scripts + cd server/scripts python3 -m pytest test_server.py -v - name: Populate venv with cu128 torch + setuptools diff --git a/.gitmodules b/.gitmodules index d664da54e..c3b57efdc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "dflash/deps/llama.cpp"] - path = dflash/deps/llama.cpp + path = server/deps/llama.cpp url = https://github.com/Luce-Org/llama.cpp-dflash-ggml.git branch = luce-dflash [submodule "dflash/deps/Block-Sparse-Attention"] - path = dflash/deps/Block-Sparse-Attention + path = server/deps/Block-Sparse-Attention url = https://github.com/mit-han-lab/Block-Sparse-Attention.git diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7ff50cf8f..d11cc7d13 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,7 @@ Thanks for considering a contribution. Lucebox is a hub of self-contained optimi On Ubuntu 22.04 or 24.04, one script installs all system dependencies — `build-essential`, `cmake`, `git`, `git-lfs`, and the CUDA Toolkit from NVIDIA's repo: ```bash -sudo dflash/scripts/setup_system.sh +sudo server/scripts/setup_system.sh ``` The script is idempotent and configures `nvcc` on PATH for both bash and zsh. For other distros see the [CUDA installation guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/). 
@@ -51,11 +51,11 @@ uv sync --extra megakernel # also compile the megakernel CUDA extension bash scripts/check_uv_workspace.sh # lockfile + frozen-sync import smoke # C++/CUDA decoder -cmake -B dflash/build -S dflash -DCMAKE_BUILD_TYPE=Release -cmake --build dflash/build --target test_dflash -j +cmake -B server/build -S server -DCMAKE_BUILD_TYPE=Release +cmake --build server/build --target test_dflash -j ``` -> If cmake was previously run without CUDA, wipe the build directory first (`rm -rf dflash/build`) to avoid a stale compiler cache. +> If cmake was previously run without CUDA, wipe the build directory first (`rm -rf server/build`) to avoid a stale compiler cache. --- diff --git a/README.md b/README.md index f284ba42f..f1222cded 100644 --- a/README.md +++ b/README.md @@ -27,13 +27,13 @@ Each directory is a self-contained project with setup instructions and benchmark notes.

- Megakernel + Megakernel    - DFlash 27B + DFlash 27B

- PFlash speculative prefill + PFlash speculative prefill

--- @@ -69,7 +69,7 @@ server wrapper: ```bash LUCEBOX_SERVER_BACKEND=cpp \ -DFLASH_SERVER_BIN=dflash/build/dflash_server \ +DFLASH_SERVER_BIN=server/build/dflash_server \ MAX_CTX=32768 BUDGET=22 VERIFY_MODE=ddtree \ harness/clients/run_codex.sh ``` @@ -90,7 +90,7 @@ uv sync --extra megakernel # builds the CUDA extension; torch is auto-i uv run --directory megakernel python final_bench.py ``` -> Don't have `uv`? Install with `curl -LsSf https://astral.sh/uv/install.sh | sh` or see [astral.sh/uv](https://astral.sh/uv/). The legacy `python -m venv` + `pip install -e . --no-build-isolation` flow still works from inside `megakernel/`. +> Don't have `uv`? Install with `curl -LsSf https://astral.sh/uv/install.sh | sh` or see [astral.sh/uv](https://astral.sh/uv/). The legacy `python -m venv` + `pip install -e . --no-build-isolation` flow still works from inside `optimizations/megakernel/`. | Method | Prefill pp520 | Decode tg128 | tok/J | |--------|:-------------:|:------------:|:-----:| @@ -100,9 +100,9 @@ uv run --directory megakernel python final_bench.py Implementation notes: 82 blocks, 512 threads, cooperative grid sync, no CPU round trips between layers, and weights streamed from Hugging Face on first run. -[Full writeup →](megakernel/README.md) · [Benchmarks →](megakernel/RESULTS.md) · [Blog post →](https://lucebox.com/blog/megakernel) +[Full writeup →](optimizations/megakernel/README.md) · [Benchmarks →](optimizations/megakernel/RESULTS.md) · [Blog post →](https://lucebox.com/blog/megakernel) -> **Blackwell (RTX 5090, DGX Spark / GB10):** auto-detected by setup; NVFP4 decode path lands ~194 tok/s tg128 on GB10. See [megakernel/README.md#blackwell-sm_120--sm_121a](megakernel/README.md). +> **Blackwell (RTX 5090, DGX Spark / GB10):** auto-detected by setup; NVFP4 decode path lands ~194 tok/s tg128 on GB10. See [optimizations/megakernel/README.md#blackwell-sm_120--sm_121a](optimizations/megakernel/README.md). --- @@ -127,14 +127,14 @@ uv sync # 3. 
build the C++/CUDA decoder (CUDA 12+, CMake 3.18+) # Default compiles for Pascal/Volta/Turing/Ampere (60/61/62/70/75/86; +120 on CUDA 12.8+, +sm_121/DGX Spark on CUDA 12.9+, +sm_110/Thor on CUDA 13.0+) so the binary runs on every supported card. # 3090-only users can add -DCMAKE_CUDA_ARCHITECTURES=86 to skip the other archs and build faster (~3 min). -cmake -B dflash/build -S dflash -DCMAKE_BUILD_TYPE=Release -cmake --build dflash/build --target test_dflash -j -cmake --build dflash/build --target test_generate -j -cmake --build dflash/build --target dflash_server -j +cmake -B server/build -S server -DCMAKE_BUILD_TYPE=Release +cmake --build server/build --target test_dflash -j +cmake --build server/build --target test_generate -j +cmake --build server/build --target dflash_server -j # 4. fetch weights: ~16 GB Q4_K_M target + 1.84 GB Lucebox Q8_0 GGUF DFlash draft -uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir dflash/models/ -uv run hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir dflash/models/draft/ +uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/ +uv run hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/ # 5a. one-shot streaming generate uv run --directory dflash python scripts/run.py --prompt "def fibonacci(n):" @@ -163,7 +163,7 @@ Implemented here: ### Running on other GPUs (4090, 5090, DGX Spark / GB10, Jetson AGX Thor) -Supported out of the box; the build just needs the right CUDA toolkit. `dflash/CMakeLists.txt` already auto-adds Blackwell archs when your nvcc is new enough, so the main quickstart above works as-is on newer cards. +Supported out of the box; the build just needs the right CUDA toolkit. `server/CMakeLists.txt` already auto-adds Blackwell archs when your nvcc is new enough, so the main quickstart above works as-is on newer cards. 
| GPU | Arch | Min CUDA | Status | |-----|:----:|:--------:|--------| @@ -203,9 +203,9 @@ cmake --build build --target test_dflash -j **Retune per GPU:** - **DDTree `budget=22`** tuned for 3090 + Q4_K_M + 24 GB. On the RTX 5090, budget=40 is optimal (swept). On GB10 (128 GB unified), re-sweep — larger tree = more verify throughput until memory bandwidth saturates. `scripts/bench_llm.py --budget N` has the sweep hooks. - **TQ3_0 KV cache + sliding `target_feat` ring** was shaped by 24 GB (fits up to 256K context on a 3090). On GB10 (128 GB unified) / 5090 (32 GB) you can push context further or skip quantization entirely and keep F16 KV. -- **Perf numbers** (207 tok/s demo, 129.5 HumanEval, 2.8× vs SGLang AWQ) are RTX 3090 @ stock. RTX 5090 numbers (205 tok/s HumanEval, 4.84×) are in [RESULTS.md](dflash/RESULTS.md). Ada/GB10/Thor not yet swept, PRs with `RESULTS.md` entries welcome. +- **Perf numbers** (207 tok/s demo, 129.5 HumanEval, 2.8× vs SGLang AWQ) are RTX 3090 @ stock. RTX 5090 numbers (205 tok/s HumanEval, 4.84×) are in [RESULTS.md](server/RESULTS.md). Ada/GB10/Thor not yet swept, PRs with `RESULTS.md` entries welcome. -[Full writeup →](dflash/README.md) · [Benchmarks →](dflash/RESULTS.md) · [Blog post →](https://lucebox.com/blog/dflash27b) +[Full writeup →](server/README.md) · [Benchmarks →](server/RESULTS.md) · [Blog post →](https://lucebox.com/blog/dflash27b) --- @@ -245,7 +245,7 @@ DFLASH_FP_USE_BSA=1 DFLASH_FP_ALPHA=0.85 \ Daemon stdin commands: `compress` runs the drafter with FlashPrefill block-sparse attention and returns the compressed token-id stream; `generate` runs the target on that stream with normal speculative decode + DDTree. `park` / `unpark` / `free drafter` swap weights in and out of VRAM so target + drafter coexist on a 24 GB card. 
-**Runtime tunables** (full list in [`dflash/src/flashprefill.h`](dflash/src/flashprefill.h)): +**Runtime tunables** (full list in [`server/src/flashprefill.h`](server/src/flashprefill.h)): ``` DFLASH_FP_USE_BSA=1 # dispatch sparse FA forward through BSA (sm_80+) DFLASH_FP_ALPHA=0.85 # block-selection threshold; higher = stricter = fewer K-blocks per Q-row @@ -254,11 +254,11 @@ DFLASH_FP_PROFILE=1 # log mean / score / select / forward stage timings **What's ours, what isn't.** Algorithms are from [Cross-Family Speculative Prefill (Liu et al., ICLR 2026)](https://arxiv.org/abs/2603.02631) for the scoring + selection layer and [FlashPrefill (Fan et al., 2026)](https://arxiv.org/abs/2603.06199) for the drafter sparse-attention forward. What we built: - C++/CUDA daemon-resident speculative prefill in front of a quantized GGUF target — no PyTorch, no Triton, no per-request subprocess. -- BSA wired without `libtorch` via a 3-header ATen/c10 stub set under `dflash/deps/bsa_stubs/`. +- BSA wired without `libtorch` via a 3-header ATen/c10 stub set under `server/deps/bsa_stubs/`. - Custom Qwen3-0.6B forward (`qwen3_0p6b_*`) so the drafter runs through the same ggml allocator as the 27B target. - 4 CUDA kernels (`flashprefill_kernels.cu`) for the FlashPrefill `mean_K / score / select / sparse_fwd` algorithm. -[Full writeup →](pflash/README.md) · [Daemon-side build / tunables →](dflash/docs/SPEC_PREFILL.md) · [Blog post →](https://lucebox.com/blog/pflash) +[Full writeup →](optimizations/pflash/README.md) · [Daemon-side build / tunables →](server/docs/SPEC_PREFILL.md) · [Blog post →](https://lucebox.com/blog/pflash) --- @@ -282,7 +282,7 @@ cmake --build build --target test_dflash -j **Per-arch DDTree tuning**: gfx1151 (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. gfx1100 (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). 
Run `scripts/bench_he.py --ddtree-budget N` to verify on your card. -**Drafter recipe for max decode**: target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `dflash/scripts/quantize_draft_q8.py`. The matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness. +**Drafter recipe for max decode**: target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `server/scripts/quantize_draft_q8.py`. The matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness. [Blog post →](https://lucebox.com/blog/amd) · [PR #119 →](https://github.com/Luce-Org/lucebox-hub/pull/119) · [PR #156 cross-arch perf plan →](https://github.com/Luce-Org/lucebox-hub/pull/156) @@ -309,9 +309,9 @@ All experiments in this repo are built, tuned, and benchmarked on NVIDIA RTX 309 - **Jetson AGX Thor** (sm_110): supported, CUDA 13+. - **Turing** (sm_75, RTX 2080): supported, CUDA 12+. -PyTorch 2.0+. `dflash/` needs CMake 3.18+ and `--recurse-submodules` for the pinned `Luce-Org/llama.cpp@luce-dflash` fork (three tree-mode ggml ops); multi-arch build is automatic (see [Running on other GPUs](#running-on-other-gpus-4090-5090-dgx-spark--gb10-jetson-agx-thor)). +PyTorch 2.0+. `server/` needs CMake 3.18+ and `--recurse-submodules` for the pinned `Luce-Org/llama.cpp@luce-dflash` fork (three tree-mode ggml ops); multi-arch build is automatic (see [Running on other GPUs](#running-on-other-gpus-4090-5090-dgx-spark--gb10-jetson-agx-thor)). -**Megakernel porting note.** `megakernel/setup.py` auto-detects the GPU arch and SM count at build time via `torch.cuda.get_device_capability()`. The decode grid is persistent (one block per SM) and is clamped to the resident-block ceiling at runtime, so no manual tuning is needed. On SM < 80 (Turing), the kernel uses FP16 instead of BF16 via a compile-time `TARGET_SM` flag; on SM >= 80 (Ampere+), BF16 is used. 
From the workspace root, `uv sync --extra megakernel` builds the extension; the legacy `pip install -e . --no-build-isolation` flow still works from inside `megakernel/`. +**Megakernel porting note.** `optimizations/megakernel/setup.py` auto-detects the GPU arch and SM count at build time via `torch.cuda.get_device_capability()`. The decode grid is persistent (one block per SM) and is clamped to the resident-block ceiling at runtime, so no manual tuning is needed. On SM < 80 (Turing), the kernel uses FP16 instead of BF16 via a compile-time `TARGET_SM` flag; on SM >= 80 (Ampere+), BF16 is used. From the workspace root, `uv sync --extra megakernel` builds the extension; the legacy `pip install -e . --no-build-isolation` flow still works from inside `optimizations/megakernel/`. **Optional, find your GPU's sweet spot:** `sudo nvidia-smi -pl 220` (megakernel hits best tok/J at 220 W on 3090; re-sweep for other cards). @@ -321,9 +321,9 @@ PyTorch 2.0+. `dflash/` needs CMake 3.18+ and `--recurse-submodules` for the pin ``` lucebox-hub/ -├── megakernel/ · fused forward pass for Qwen 3.5-0.8B -├── dflash/ · DFlash speculative decoding port for Qwen 3.5/3.6-27B on RTX 3090 -├── pflash/ · speculative-prefill harness in front of dflash (12.5× TTFT at 128K) +├── optimizations/megakernel/ · fused forward pass for Qwen 3.5-0.8B +├── server/ · DFlash speculative decoding port for Qwen 3.5/3.6-27B on RTX 3090 +├── optimizations/pflash/ · speculative-prefill harness in front of dflash (12.5× TTFT at 128K) └── assets/ · banners, cards, diagrams ``` diff --git a/docs/specs/model-cards.md b/docs/specs/model-cards.md index d44980f6a..a2a5788f6 100644 --- a/docs/specs/model-cards.md +++ b/docs/specs/model-cards.md @@ -74,7 +74,7 @@ Examples: ### Cards directory search path The server probes (in order, matching -`find_model_cards_dir` in `dflash/src/server/model_card.cpp`): +`find_model_cards_dir` in `server/src/server/model_card.cpp`): 1. 
`/share/model_cards/` — an optional explicit directory passed by the embedding application (e.g. tests). Not @@ -145,7 +145,7 @@ first source supplying a value wins: values: `max_tokens=16000`, `hard_limit_reply_budget=512`, `think_max_tokens = max_tokens − hard_limit_reply_budget = 15488`. These also match the `ServerConfig` defaults in - `dflash/src/server/http_server.h`. + `server/src/server/http_server.h`. The startup banner prints each tunable's value and which source supplied it, e.g.: @@ -241,7 +241,7 @@ Rounding note: `low` and `medium` use nearest-integer rounding (`int(x + 0.5)`); `x-high` uses C++ integer division (truncation toward zero). For odd or non-divisible `think_max` values this produces deterministic but distinct off-by-one outcomes; see -`compute_default_tiers` in `dflash/src/server/model_card.cpp`. +`compute_default_tiers` in `server/src/server/model_card.cpp`. The `reasoning_effort_tiers` field exists because the ratio-based defaults don't fit every model. A smaller model that caps at 8192 diff --git a/docs/specs/thinking-budget.md b/docs/specs/thinking-budget.md index bd4c6735f..5ebc731be 100644 --- a/docs/specs/thinking-budget.md +++ b/docs/specs/thinking-budget.md @@ -125,7 +125,7 @@ Fields: | `verified_at` | ISO date the values were last checked against the source. | | `max_tokens` | The card's standard recommended combined cap. Drives `default_max_tokens`. | | `complex_problem_max_tokens` | Optional. The card's recommendation for hard reasoning / benchmark workloads. Drives the `x-high` and `max` effort tiers, which sit *above* `default_max_tokens` when this field is present — they are admissible as long as they fit under `max_ctx − hard_limit_reply_budget`. If omitted, both collapse to the `high` tier value. | -| `hard_limit_reply_budget` | Optional. 
Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `dflash/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. | +| `hard_limit_reply_budget` | Optional. Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `server/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. | | `sampling` | Recommended sampler params. Used as defaults when the request doesn't pin sampler values. | | `reasoning_effort_tiers` | Explicit phase-1 budgets per tier. Override any computed default. 
Whichever tiers are present win; missing tiers fall through to the computed defaults below. | diff --git a/harness/README.md b/harness/README.md index b3a4cae64..dfc5fa8b0 100644 --- a/harness/README.md +++ b/harness/README.md @@ -47,14 +47,14 @@ Use the native C++ server instead of the Python server: LUCEBOX_SERVER_BACKEND=cpp harness/clients/run_codex.sh ``` -The native server binary defaults to `dflash/build/dflash_server`. Override the +The native server binary defaults to `server/build/dflash_server`. Override the paths and profile the same way as the Python backend: ```bash LUCEBOX_SERVER_BACKEND=cpp \ -DFLASH_SERVER_BIN=dflash/build/dflash_server \ -TARGET=dflash/models/Qwen3.6-27B-Q4_K_M.gguf \ -DRAFT=dflash/models/draft/dflash-draft-3.6-q8_0.gguf \ +DFLASH_SERVER_BIN=server/build/dflash_server \ +TARGET=server/models/Qwen3.6-27B-Q4_K_M.gguf \ +DRAFT=server/models/draft/dflash-draft-3.6-q8_0.gguf \ MODEL_ID=luce-dflash \ MAX_CTX=32768 MAX_TOKENS=512 \ BUDGET=22 VERIFY_MODE=ddtree FA_WINDOW=2048 \ @@ -64,8 +64,8 @@ harness/clients/run_codex.sh To test an already-running native server: ```bash -dflash/build/dflash_server dflash/models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft dflash/models/draft/dflash-draft-3.6-q8_0.gguf \ +server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \ + --draft server/models/draft/dflash-draft-3.6-q8_0.gguf \ --host 127.0.0.1 --port 18080 \ --max-ctx 32768 --max-tokens 512 \ --fa-window 2048 \ @@ -83,7 +83,7 @@ need different context limits on a 24 GB card. 
## Test a server change -If you already have `dflash/scripts/server.py` running, use `probe`: +If you already have `server/scripts/server.py` running, use `probe`: ```bash python3 harness/client_test_runner.py probe \ @@ -99,9 +99,9 @@ For a GPU sweep, let the runner start Lucebox for each profile: ```bash python3 harness/client_test_runner.py sweep \ - --target dflash/models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft dflash/models/draft \ - --bin dflash/build/test_dflash \ + --target server/models/Qwen3.6-27B-Q4_K_M.gguf \ + --draft server/models/draft \ + --bin server/build/test_dflash \ --profiles rtx3090_dflash_safe,rtx3090_dflash_long \ --clients all \ --json-out /tmp/lucebox_harness_sweep.json diff --git a/harness/benchmarks/run_lucebox_vs_llamacpp.sh b/harness/benchmarks/run_lucebox_vs_llamacpp.sh index acce47e0d..a800e3a24 100755 --- a/harness/benchmarks/run_lucebox_vs_llamacpp.sh +++ b/harness/benchmarks/run_lucebox_vs_llamacpp.sh @@ -7,10 +7,10 @@ RUN_DIR="${RUN_DIR:-$REPO_DIR/.harness-runs}" STAMP="${STAMP:-generation-baseline-$(date +%Y%m%d-%H%M%S)}" LOG_DIR="$RUN_DIR/$STAMP" -TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}" -DRAFT="${DRAFT:-$REPO_DIR/dflash/models/draft/dflash-draft-3.6-q8_0.gguf}" -DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/dflash/build/test_dflash}" -LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/dflash/deps/llama.cpp/build/bin/llama-server}" +TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}" +DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}" +DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/server/build/test_dflash}" +LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/server/deps/llama.cpp/build/bin/llama-server}" HOST="${HOST:-127.0.0.1}" LUCEBOX_PORT="${LUCEBOX_PORT:-18080}" @@ -30,7 +30,7 @@ API_KEY="${API_KEY:-sk-lucebox}" PROMPTS="${PROMPTS:-$SCRIPT_DIR/prompts/generation_smoke.jsonl}" LUCEBOX_SERVER_BACKEND="${LUCEBOX_SERVER_BACKEND:-python}" 
-DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/dflash/build/dflash_server}" +DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}" mkdir -p "$LOG_DIR" @@ -115,8 +115,8 @@ if [[ "$LUCEBOX_SERVER_BACKEND" == "cpp" ]]; then if [[ ! -x "$DFLASH_SERVER_BIN" ]]; then echo "dflash_server not found or not executable: $DFLASH_SERVER_BIN" >&2 echo "Build it first, for example:" >&2 - echo " cmake -S $REPO_DIR/dflash -B $REPO_DIR/dflash/build -DGGML_CUDA=ON" >&2 - echo " cmake --build $REPO_DIR/dflash/build --target dflash_server -j\$(nproc)" >&2 + echo " cmake -S $REPO_DIR/server -B $REPO_DIR/server/build -DGGML_CUDA=ON" >&2 + echo " cmake --build $REPO_DIR/server/build --target dflash_server -j\$(nproc)" >&2 exit 1 fi local_ddtree_args=() @@ -142,7 +142,7 @@ if [[ "$LUCEBOX_SERVER_BACKEND" == "cpp" ]]; then > "$LUCEBOX_LOG" 2>&1 & LUCEBOX_PID=$! else - python3 -u dflash/scripts/server.py \ + python3 -u server/scripts/server.py \ --host "$HOST" \ --port "$LUCEBOX_PORT" \ --target "$TARGET" \ diff --git a/harness/clients/README.md b/harness/clients/README.md index 00041e222..9d7458cb3 100644 --- a/harness/clients/README.md +++ b/harness/clients/README.md @@ -11,16 +11,16 @@ cd /workspace/lucebox-hub-harness harness/clients/run_codex.sh ``` -Each launcher starts `dflash/scripts/server.py`, runs the client, writes logs +Each launcher starts `server/scripts/server.py`, runs the client, writes logs under `/workspace/lucebox-client-harness-runs`, then stops the server. Set `LUCEBOX_SERVER_BACKEND=cpp` to run the native C++ HTTP server instead. -The launcher will start `dflash/build/dflash_server` by default, or the path in +The launcher will start `server/build/dflash_server` by default, or the path in `DFLASH_SERVER_BIN`. 
```bash LUCEBOX_SERVER_BACKEND=cpp \ -DFLASH_SERVER_BIN=dflash/build/dflash_server \ +DFLASH_SERVER_BIN=server/build/dflash_server \ MAX_CTX=32768 MAX_TOKENS=512 \ BUDGET=22 VERIFY_MODE=ddtree \ harness/clients/run_codex.sh diff --git a/harness/clients/common.sh b/harness/clients/common.sh index e5dd8a585..f00122edc 100755 --- a/harness/clients/common.sh +++ b/harness/clients/common.sh @@ -8,12 +8,12 @@ REPO_DIR="${REPO_DIR:-/workspace/lucebox-hub-harness}" CLIENT_WORK_DIR="${CLIENT_WORK_DIR:-/workspace/lucebox-harness-work}" RUN_DIR="${RUN_DIR:-/workspace/lucebox-client-harness-runs}" -TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}" -DRAFT="${DRAFT:-$REPO_DIR/dflash/models/draft/dflash-draft-3.6-q8_0.gguf}" -DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/dflash/build/test_dflash}" +TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}" +DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}" +DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/server/build/test_dflash}" MODEL_SERVER="${MODEL_SERVER:-lucebox}" LUCEBOX_SERVER_BACKEND="${LUCEBOX_SERVER_BACKEND:-python}" -DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/dflash/build/dflash_server}" +DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}" LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}" LLAMA_N_GPU_LAYERS="${LLAMA_N_GPU_LAYERS:-999}" LLAMA_FLASH_ATTN="${LLAMA_FLASH_ATTN:-1}" @@ -74,7 +74,7 @@ start_lucebox_server() { # EXTRA_SERVER_ARGS="--lazy-draft --prefill-compression auto" read -r -a extra_args <<< "$EXTRA_SERVER_ARGS" fi - python3 -u dflash/scripts/server.py \ + python3 -u server/scripts/server.py \ --host "$HOST" \ --port "$PORT" \ --target "$TARGET" \ @@ -97,8 +97,8 @@ start_dflash_native_server() { if [[ ! 
-x "$DFLASH_SERVER_BIN" ]]; then echo "dflash_server not found or not executable: $DFLASH_SERVER_BIN" >&2 echo "Build it first, for example:" >&2 - echo " cmake -S $REPO_DIR/dflash -B $REPO_DIR/dflash/build -DGGML_CUDA=ON" >&2 - echo " cmake --build $REPO_DIR/dflash/build --target dflash_server -j\$(nproc)" >&2 + echo " cmake -S $REPO_DIR/server -B $REPO_DIR/server/build -DGGML_CUDA=ON" >&2 + echo " cmake --build $REPO_DIR/server/build --target dflash_server -j\$(nproc)" >&2 return 1 fi local extra_args=() @@ -134,7 +134,7 @@ start_llamacpp_server() { if [[ ! -x "$LLAMA_SERVER_BIN" ]]; then echo "llama-server not found or not executable: $LLAMA_SERVER_BIN" >&2 echo "Build it first, for example:" >&2 - echo " cmake -S $REPO_DIR/dflash/deps/llama.cpp -B /workspace/llama-cpp-server-build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DLLAMA_BUILD_SERVER=ON -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_CURL=OFF" >&2 + echo " cmake -S $REPO_DIR/server/deps/llama.cpp -B /workspace/llama-cpp-server-build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DLLAMA_BUILD_SERVER=ON -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_CURL=OFF" >&2 echo " cmake --build /workspace/llama-cpp-server-build --target llama-server -j2" >&2 return 1 fi diff --git a/harness/clients/run_claude_llamacpp_decode_check.sh b/harness/clients/run_claude_llamacpp_decode_check.sh index 8cf2902c7..1111aa952 100755 --- a/harness/clients/run_claude_llamacpp_decode_check.sh +++ b/harness/clients/run_claude_llamacpp_decode_check.sh @@ -4,7 +4,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_DIR="${REPO_DIR:-$(cd "$SCRIPT_DIR/../.." 
&& pwd)}" RUN_ROOT="${RUN_ROOT:-${RUN_DIR:-/workspace/lucebox-client-harness-runs/claude-llamacpp-decode-check}}" -TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}" +TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}" LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}" STAMP="${STAMP:-q8_32k_decode_check}" PROMPT="${PROMPT:-Reply with exactly: OK_DONE}" diff --git a/harness/clients/run_claude_llamacpp_matrix.sh b/harness/clients/run_claude_llamacpp_matrix.sh index 5ffd0b538..fd9a71382 100755 --- a/harness/clients/run_claude_llamacpp_matrix.sh +++ b/harness/clients/run_claude_llamacpp_matrix.sh @@ -4,7 +4,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_DIR="${REPO_DIR:-$(cd "$SCRIPT_DIR/../.." && pwd)}" RUN_ROOT="${RUN_ROOT:-${RUN_DIR:-/workspace/lucebox-client-harness-runs/claude-llamacpp-matrix}}" -TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}" +TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}" LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}" PROMPT="${PROMPT:-Write exactly 120 words about why a repo-level agent benchmark should run the client harness instead of a handcrafted HTTP request. End with OK_DONE.}" MARKER="${MARKER:-OK_DONE}" diff --git a/megakernel/README.md b/optimizations/megakernel/README.md similarity index 99% rename from megakernel/README.md rename to optimizations/megakernel/README.md index e1c43241d..be066d94f 100644 --- a/megakernel/README.md +++ b/optimizations/megakernel/README.md @@ -116,7 +116,7 @@ uv sync --extra megakernel uv run --directory megakernel python final_bench.py # runs pp520 tg128 (properly warmed), prints tok/s ``` -The legacy standalone flow still works from inside `megakernel/`: create a +The legacy standalone flow still works from inside `optimizations/megakernel/`: create a virtualenv, install `torch`, then run `pip install -e . 
--no-build-isolation` so `setup.py` can import torch while compiling the CUDA extension. diff --git a/megakernel/README_PASCAL.md b/optimizations/megakernel/README_PASCAL.md similarity index 100% rename from megakernel/README_PASCAL.md rename to optimizations/megakernel/README_PASCAL.md diff --git a/megakernel/RESULTS.md b/optimizations/megakernel/RESULTS.md similarity index 92% rename from megakernel/RESULTS.md rename to optimizations/megakernel/RESULTS.md index 17bc24896..4d1bb0e71 100644 --- a/megakernel/RESULTS.md +++ b/optimizations/megakernel/RESULTS.md @@ -77,12 +77,12 @@ PyTorch reference on the pp520 prompt from `final_bench.py`. ```bash # Auto-dispatches to the NVFP4 path on Blackwell -python megakernel/final_bench.py +python optimizations/megakernel/final_bench.py # Or force a backend -python megakernel/final_bench.py --backend nvfp4 -python megakernel/final_bench.py --backend bf16 +python optimizations/megakernel/final_bench.py --backend nvfp4 +python optimizations/megakernel/final_bench.py --backend bf16 # Switch prefill mode (default is "hybrid"; "raw" uses prefill_megakernel_nvfp4) -MEGAKERNEL_PREFILL_MODE=raw python megakernel/final_bench.py --backend nvfp4 +MEGAKERNEL_PREFILL_MODE=raw python optimizations/megakernel/final_bench.py --backend nvfp4 ``` diff --git a/megakernel/_phase2_variant.py b/optimizations/megakernel/_phase2_variant.py similarity index 100% rename from megakernel/_phase2_variant.py rename to optimizations/megakernel/_phase2_variant.py diff --git a/megakernel/bench.py b/optimizations/megakernel/bench.py similarity index 100% rename from megakernel/bench.py rename to optimizations/megakernel/bench.py diff --git a/megakernel/bench_pp_tg.py b/optimizations/megakernel/bench_pp_tg.py similarity index 100% rename from megakernel/bench_pp_tg.py rename to optimizations/megakernel/bench_pp_tg.py diff --git a/megakernel/bench_pp_tg_nvfp4.py b/optimizations/megakernel/bench_pp_tg_nvfp4.py similarity index 100% rename from 
megakernel/bench_pp_tg_nvfp4.py rename to optimizations/megakernel/bench_pp_tg_nvfp4.py diff --git a/megakernel/build_corpus.py b/optimizations/megakernel/build_corpus.py similarity index 100% rename from megakernel/build_corpus.py rename to optimizations/megakernel/build_corpus.py diff --git a/megakernel/corpus/baseline.json b/optimizations/megakernel/corpus/baseline.json similarity index 100% rename from megakernel/corpus/baseline.json rename to optimizations/megakernel/corpus/baseline.json diff --git a/megakernel/corpus/wmma.json b/optimizations/megakernel/corpus/wmma.json similarity index 100% rename from megakernel/corpus/wmma.json rename to optimizations/megakernel/corpus/wmma.json diff --git a/megakernel/corpus/wmma_p3.json b/optimizations/megakernel/corpus/wmma_p3.json similarity index 100% rename from megakernel/corpus/wmma_p3.json rename to optimizations/megakernel/corpus/wmma_p3.json diff --git a/megakernel/corpus/wmma_p4.json b/optimizations/megakernel/corpus/wmma_p4.json similarity index 100% rename from megakernel/corpus/wmma_p4.json rename to optimizations/megakernel/corpus/wmma_p4.json diff --git a/megakernel/corpus/wmma_p6cleanup.json b/optimizations/megakernel/corpus/wmma_p6cleanup.json similarity index 100% rename from megakernel/corpus/wmma_p6cleanup.json rename to optimizations/megakernel/corpus/wmma_p6cleanup.json diff --git a/megakernel/corpus/wmma_p7.json b/optimizations/megakernel/corpus/wmma_p7.json similarity index 100% rename from megakernel/corpus/wmma_p7.json rename to optimizations/megakernel/corpus/wmma_p7.json diff --git a/megakernel/corpus/wmma_p8.json b/optimizations/megakernel/corpus/wmma_p8.json similarity index 100% rename from megakernel/corpus/wmma_p8.json rename to optimizations/megakernel/corpus/wmma_p8.json diff --git a/megakernel/diag_phase2_metrics.py b/optimizations/megakernel/diag_phase2_metrics.py similarity index 100% rename from megakernel/diag_phase2_metrics.py rename to 
optimizations/megakernel/diag_phase2_metrics.py diff --git a/megakernel/diag_prefill_kernels.py b/optimizations/megakernel/diag_prefill_kernels.py similarity index 100% rename from megakernel/diag_prefill_kernels.py rename to optimizations/megakernel/diag_prefill_kernels.py diff --git a/megakernel/final_bench.py b/optimizations/megakernel/final_bench.py similarity index 100% rename from megakernel/final_bench.py rename to optimizations/megakernel/final_bench.py diff --git a/megakernel/final_bench_nvfp4.py b/optimizations/megakernel/final_bench_nvfp4.py similarity index 100% rename from megakernel/final_bench_nvfp4.py rename to optimizations/megakernel/final_bench_nvfp4.py diff --git a/megakernel/half_type.h b/optimizations/megakernel/half_type.h similarity index 100% rename from megakernel/half_type.h rename to optimizations/megakernel/half_type.h diff --git a/megakernel/hero.png b/optimizations/megakernel/hero.png similarity index 100% rename from megakernel/hero.png rename to optimizations/megakernel/hero.png diff --git a/megakernel/hero.raw.png b/optimizations/megakernel/hero.raw.png similarity index 100% rename from megakernel/hero.raw.png rename to optimizations/megakernel/hero.raw.png diff --git a/megakernel/kernel.cu b/optimizations/megakernel/kernel.cu similarity index 100% rename from megakernel/kernel.cu rename to optimizations/megakernel/kernel.cu diff --git a/megakernel/kernel_gb10_nvfp4.cu b/optimizations/megakernel/kernel_gb10_nvfp4.cu similarity index 100% rename from megakernel/kernel_gb10_nvfp4.cu rename to optimizations/megakernel/kernel_gb10_nvfp4.cu diff --git a/megakernel/model.py b/optimizations/megakernel/model.py similarity index 100% rename from megakernel/model.py rename to optimizations/megakernel/model.py diff --git a/megakernel/model_nvfp4.py b/optimizations/megakernel/model_nvfp4.py similarity index 100% rename from megakernel/model_nvfp4.py rename to optimizations/megakernel/model_nvfp4.py diff --git a/megakernel/prefill.cu 
b/optimizations/megakernel/prefill.cu similarity index 100% rename from megakernel/prefill.cu rename to optimizations/megakernel/prefill.cu diff --git a/megakernel/prefill_bw.cu b/optimizations/megakernel/prefill_bw.cu similarity index 100% rename from megakernel/prefill_bw.cu rename to optimizations/megakernel/prefill_bw.cu diff --git a/megakernel/prefill_megakernel.cu b/optimizations/megakernel/prefill_megakernel.cu similarity index 100% rename from megakernel/prefill_megakernel.cu rename to optimizations/megakernel/prefill_megakernel.cu diff --git a/megakernel/pyproject.toml b/optimizations/megakernel/pyproject.toml similarity index 100% rename from megakernel/pyproject.toml rename to optimizations/megakernel/pyproject.toml diff --git a/megakernel/setup.py b/optimizations/megakernel/setup.py similarity index 100% rename from megakernel/setup.py rename to optimizations/megakernel/setup.py diff --git a/megakernel/torch_bindings.cpp b/optimizations/megakernel/torch_bindings.cpp similarity index 100% rename from megakernel/torch_bindings.cpp rename to optimizations/megakernel/torch_bindings.cpp diff --git a/pflash/README.md b/optimizations/pflash/README.md similarity index 92% rename from pflash/README.md rename to optimizations/pflash/README.md index 22be8a48e..47b9f81cd 100644 --- a/pflash/README.md +++ b/optimizations/pflash/README.md @@ -41,7 +41,7 @@ Long-context prefill is O(S²): vanilla llama.cpp on a single RTX 3090 takes **~ - C++/CUDA daemon-resident drafter + scoring + target generation, all in one process, one ggml allocator. - Custom Qwen3-0.6B BF16 forward (`qwen3_0p6b_loader.cpp` + `qwen3_0p6b_graph.cpp`) — no libllama. - 4 CUDA kernels for the FlashPrefill `mean_K → score → select → sparse_fwd` algorithm (`flashprefill_kernels.cu`). 
-- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention), FA-2 derived, sm_80+) for the long-context drafter forward, wired without `libtorch` via 3 ATen/c10 header stubs (`dflash/deps/bsa_stubs/`). +- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention), FA-2 derived, sm_80+) for the long-context drafter forward, wired without `libtorch` via 3 ATen/c10 header stubs (`server/deps/bsa_stubs/`). - 128K → 2.6K span selection at `keep_ratio=0.05`, NIAH retrieved at every measured context, decode ~74 tok/s downstream. ## Results @@ -57,7 +57,7 @@ Decode after prefill: ~74 tok/s (dflash spec decode + DDTree). The pipeline is t ## Quick start -PFlash is the algorithm. The implementation lives in [`../dflash/`](../dflash/) as part of the dflash daemon. The `pflash/` directory in this repo only contains the Python tooling for **benchmarking** (NIAH case generation, bench harness around the daemon stdin protocol). Production deploys hit the dflash daemon directly. +PFlash is the algorithm. The implementation lives in [`../../server/`](../../server/) as part of the dflash daemon. The `optimizations/pflash/` directory in this repo only contains the Python tooling for **benchmarking** (NIAH case generation, bench harness around the daemon stdin protocol). Production deploys hit the dflash daemon directly. ```bash # 1. from the repo root, install Python deps and build dflash with the BSA @@ -65,34 +65,34 @@ PFlash is the algorithm. The implementation lives in [`../dflash/`](../dflash/) cd lucebox-hub uv sync git submodule update --init --recursive -cmake -B dflash/build -S dflash -DCMAKE_BUILD_TYPE=Release \ +cmake -B server/build -S server -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CUDA_ARCHITECTURES=86 \ -DDFLASH27B_ENABLE_BSA=ON -cmake --build dflash/build --target test_dflash test_flashprefill_kernels -j +cmake --build server/build --target test_dflash test_flashprefill_kernels -j # 2. 
fetch weights (target + spec-decode draft + drafter scorer) -uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir dflash/models/ -uv run hf download Qwen/Qwen3-0.6B model.safetensors tokenizer.json --local-dir dflash/models/drafter/ -uv run hf download z-lab/Qwen3.6-27B-DFlash model.safetensors --local-dir dflash/models/draft/ +uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/ +uv run hf download Qwen/Qwen3-0.6B model.safetensors tokenizer.json --local-dir server/models/drafter/ +uv run hf download z-lab/Qwen3.6-27B-DFlash model.safetensors --local-dir server/models/draft/ # 2b. convert the drafter (Qwen3-0.6B HF) to a BF16 GGUF for the C++ scorer. # The submodule already vendors llama.cpp at deps/llama.cpp. -uv run python dflash/deps/llama.cpp/convert_hf_to_gguf.py dflash/models/drafter \ - --outtype bf16 --outfile dflash/models/Qwen3-0.6B-BF16.gguf +uv run python server/deps/llama.cpp/convert_hf_to_gguf.py server/models/drafter \ + --outtype bf16 --outfile server/models/Qwen3-0.6B-BF16.gguf # 3. 
generate NIAH cases + run head-to-head bench against the C++ daemon uv run --directory pflash python tests/niah_gen.py --n 1 --ctx 131072 --out /tmp/niah_128k.jsonl uv run --directory pflash python tests/bench_niah_cpp.py \ - --bin ../dflash/build/test_dflash \ - --target ../dflash/models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft-spec ../dflash/models/draft/model.safetensors \ - --drafter-gguf ../dflash/models/Qwen3-0.6B-BF16.gguf \ + --bin ../../server/build/test_dflash \ + --target ../../server/models/Qwen3.6-27B-Q4_K_M.gguf \ + --draft-spec ../../server/models/draft/model.safetensors \ + --drafter-gguf ../../server/models/Qwen3-0.6B-BF16.gguf \ --cases /tmp/niah_128k.jsonl --keep-ratio 0.05 --n-gen 256 ``` ## OpenAI server flags -For an OpenAI-compatible server with transparent compression on long prompts, run [`dflash/scripts/server.py`](../dflash/scripts/server.py) with these flags: +For an OpenAI-compatible server with transparent compression on long prompts, run [`server/scripts/server.py`](../../server/scripts/server.py) with these flags: | Flag | Choices / type | Default | Effect | |---|---|:---:|---| @@ -105,14 +105,14 @@ For an OpenAI-compatible server with transparent compression on long prompts, ru When `--prefill-compression != off`, the server auto-sets `DFLASH27B_LM_HEAD_FIX=0` and `DFLASH27B_FA_WINDOW=0` (matching the bench harness — needed so the post-compress draft graph fits on a 24 GB card without OOM). 
```bash -python dflash/scripts/server.py \ - --target dflash/models/Qwen3.6-27B-Q4_K_M.gguf \ - --draft dflash/models/draft/model.safetensors \ +python server/scripts/server.py \ + --target server/models/Qwen3.6-27B-Q4_K_M.gguf \ + --draft server/models/draft/model.safetensors \ --max-ctx 8192 --budget 16 --fa-window 0 \ --prefill-compression auto \ --prefill-threshold 4096 \ --prefill-keep-ratio 0.02 \ - --prefill-drafter dflash/models/Qwen3-0.6B-BF16.gguf + --prefill-drafter server/models/Qwen3-0.6B-BF16.gguf ``` Below the threshold the server runs the standard target generate (no compression). Above it, the server transparently runs `compress` on the daemon, swaps the prompt for the compressed text, and continues the normal `/v1/chat/completions` flow. Tool-calling requests (`req.tools` non-empty) skip compression so JSON tool definitions stay intact. @@ -137,7 +137,7 @@ Typical flow at 128K on a 24 GB card: `park target` → `compress` → `free dra ## Runtime tunables -Everything is configured via env vars on the daemon process. Full list in [`../dflash/src/flashprefill.h`](../dflash/src/flashprefill.h). +Everything is configured via env vars on the daemon process. Full list in [`../../server/src/flashprefill.h`](../../server/src/flashprefill.h). | Env var | Default | Purpose | |---|:---:|---| @@ -205,7 +205,7 @@ The algorithms are not ours: What we built: - C++/CUDA port of the FlashPrefill algorithm: 4 kernels (`mean_K / score / select / sparse_fwd`), no Triton dependency. -- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention)) wired without `libtorch` via 3 ATen/c10 header stubs (`dflash/deps/bsa_stubs/`). +- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention)) wired without `libtorch` via 3 ATen/c10 header stubs (`server/deps/bsa_stubs/`). - Custom Qwen3-0.6B BF16 forward so the drafter runs through the same ggml allocator as the 27B target. 
- Daemon stdin protocol (`compress` / `generate` / `park` / `unpark` / `free drafter`) so target + drafter coexist on a 24 GB card. - NIAH harness against `llama-bench` for end-to-end validation. diff --git a/pflash/demo.gif b/optimizations/pflash/demo.gif similarity index 100% rename from pflash/demo.gif rename to optimizations/pflash/demo.gif diff --git a/pflash/hero.png b/optimizations/pflash/hero.png similarity index 100% rename from pflash/hero.png rename to optimizations/pflash/hero.png diff --git a/pflash/pflash/__init__.py b/optimizations/pflash/pflash/__init__.py similarity index 100% rename from pflash/pflash/__init__.py rename to optimizations/pflash/pflash/__init__.py diff --git a/pflash/pflash/config.py b/optimizations/pflash/pflash/config.py similarity index 100% rename from pflash/pflash/config.py rename to optimizations/pflash/pflash/config.py diff --git a/pflash/pflash/dflash_client.py b/optimizations/pflash/pflash/dflash_client.py similarity index 100% rename from pflash/pflash/dflash_client.py rename to optimizations/pflash/pflash/dflash_client.py diff --git a/pflash/pyproject.toml b/optimizations/pflash/pyproject.toml similarity index 100% rename from pflash/pyproject.toml rename to optimizations/pflash/pyproject.toml diff --git a/pflash/tests/bench_niah_cpp.py b/optimizations/pflash/tests/bench_niah_cpp.py similarity index 100% rename from pflash/tests/bench_niah_cpp.py rename to optimizations/pflash/tests/bench_niah_cpp.py diff --git a/pflash/tests/niah_gen.py b/optimizations/pflash/tests/niah_gen.py similarity index 100% rename from pflash/tests/niah_gen.py rename to optimizations/pflash/tests/niah_gen.py diff --git a/pyproject.toml b/pyproject.toml index 9fa072776..30dc2ee58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ package = false no-build-isolation-package = ["qwen35-megakernel-bf16"] [tool.uv.workspace] -members = ["dflash", "megakernel", "pflash"] +members = ["server", "optimizations/megakernel", 
"optimizations/pflash"] [tool.uv.sources] lucebox-dflash = { workspace = true } diff --git a/scripts/check_uv_workspace.sh b/scripts/check_uv_workspace.sh index 9015a48ed..618c57011 100755 --- a/scripts/check_uv_workspace.sh +++ b/scripts/check_uv_workspace.sh @@ -21,14 +21,14 @@ import numpy import transformers import uvicorn -sys.path.insert(0, "dflash/scripts") +sys.path.insert(0, "server/scripts") import server # noqa: F401 print("workspace import OK from repo root") PY ( - cd dflash + cd server uv run --frozen --no-sync python - <<'PY' from pathlib import Path import sys @@ -38,6 +38,6 @@ assert Path(sys.prefix).resolve() == root_venv, (sys.prefix, root_venv) sys.path.insert(0, "scripts") import server # noqa: F401 -print("workspace discovery OK from dflash/") +print("workspace discovery OK from server/") PY ) diff --git a/dflash/.gitignore b/server/.gitignore similarity index 100% rename from dflash/.gitignore rename to server/.gitignore diff --git a/dflash/CMakeLists.txt b/server/CMakeLists.txt similarity index 100% rename from dflash/CMakeLists.txt rename to server/CMakeLists.txt diff --git a/dflash/CODEX.md b/server/CODEX.md similarity index 100% rename from dflash/CODEX.md rename to server/CODEX.md diff --git a/dflash/DEVELOPER.md b/server/DEVELOPER.md similarity index 96% rename from dflash/DEVELOPER.md rename to server/DEVELOPER.md index 6cda569a0..1c6ed16c2 100644 --- a/dflash/DEVELOPER.md +++ b/server/DEVELOPER.md @@ -22,7 +22,7 @@ build-essential cmake git git-lfs nvcc (CUDA Toolkit) A setup script is provided that installs everything (run as root): ```bash -sudo bash dflash/scripts/setup_system.sh +sudo bash server/scripts/setup_system.sh ``` This installs build tools, `hf` (via pipx), and the CUDA Toolkit. @@ -71,7 +71,7 @@ cmake -B build -S . -DCMAKE_BUILD_TYPE=Release cmake --build build --target test_dflash -j ``` -The binary lands at `dflash/build/test_dflash`. +The binary lands at `server/build/test_dflash`. 
### CMake options @@ -90,16 +90,16 @@ Download models before running the server: ```bash # Target model (Q4_K_M quantized Qwen3.6-27B) -hf download --local-dir dflash/models/ +hf download --local-dir server/models/ # Draft model (1.84 GB default Qwen3.6 GGUF draft) -hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir dflash/models/draft/ +hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/ ``` Expected layout: ``` -dflash/models/ +server/models/ ├── Qwen3.6-27B-Q4_K_M.gguf # --target (GGUF) └── draft/ └── dflash-draft-3.6-q8_0.gguf # --draft (GGUF) @@ -155,7 +155,7 @@ python scripts/server.py These tests **do not** require a GPU or running daemon — they use mocked backends: ```bash -cd dflash/scripts +cd server/scripts python -m pytest test_server.py -v ``` @@ -186,7 +186,7 @@ run the baseline tests above to validate code changes. After building: ```bash -cd dflash/build +cd server/build # Numerics tests ./test_vs_oracle --target ../models/Qwen3.6-27B-Q4_K_M.gguf \ @@ -203,7 +203,7 @@ cd dflash/build These scripts start their own server subprocess and need the daemon binary + models: ```bash -cd dflash/scripts +cd server/scripts python test_server_prefix_cache.py python test_multi_turn_prefix_cache.py python test_full_compress_cache.py @@ -214,7 +214,7 @@ python test_full_compress_cache.py ## Project structure ``` -dflash/ +server/ ├── CMakeLists.txt # C++ build (cmake) ├── include/ # C++ headers ├── src/ # C++ sources (target/draft graph, KV cache, FlashPrefill) @@ -267,7 +267,7 @@ No `env_key` is needed for local use. 
```bash # Start the server -python dflash/scripts/server.py --port 8080 +python server/scripts/server.py --port 8080 # In another terminal codex --provider dflash "Explain this codebase" diff --git a/dflash/README.md b/server/README.md similarity index 99% rename from dflash/README.md rename to server/README.md index 35bc2a2b7..2a7ee6343 100644 --- a/dflash/README.md +++ b/server/README.md @@ -273,7 +273,7 @@ The 131K `keep=0.10` run depends on token-boundary repair in `scripts/laguna_pfl The table above uses synthetic uniform filler. Pass `--filler-file ` to use a real corpus instead (file or directory; directories are recursively -concatenated). On `dflash/src` (1.3 MiB of C++/CUDA, ctx=16K, depth=0.5): +concatenated). On `server/src` (1.3 MiB of C++/CUDA, ctx=16K, depth=0.5): | keep | drafter compressed | NIAH | |:----:|-------------------:|:----:| @@ -457,7 +457,7 @@ only wire protocol used by [OpenAI Codex](https://github.com/openai/codex). ### 1. Start the DFlash server ```bash -python dflash/scripts/server.py \ +python server/scripts/server.py \ --target models/Qwen3.5-27B-Q4_K_M.gguf \ --draft models/Qwen3.5-3B-f16.safetensors \ --budget 22 --port 8080 diff --git a/dflash/RESULTS.md b/server/RESULTS.md similarity index 99% rename from dflash/RESULTS.md rename to server/RESULTS.md index b110e9c54..09fe73536 100644 --- a/dflash/RESULTS.md +++ b/server/RESULTS.md @@ -577,7 +577,7 @@ Draft: local Qwen3.6-27B DFlash safetensors + Qwen3-0.6B-BF16 PFlash drafter. Build: `cmake -B build-luce-sm120 -S . 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=120 -DDFLASH27B_USER_CUDA_ARCHITECTURES=120 -DDFLASH27B_ENABLE_BSA=ON` -Final ("V4") runtime config — driven via the `pflash/tests/bench_niah_cpp.py` +Final ("V4") runtime config — driven via the `optimizations/pflash/tests/bench_niah_cpp.py` CLI flags added in #90 plus daemon env vars (each bullet leads with the exact interface): @@ -589,7 +589,7 @@ exact interface): - `--n-gen=1024` Test set: 10 NIAH prompts at 117K tokens (margin under Qwen3.6-27B's 131K -native RoPE limit, generated with [`pflash/tests/niah_gen.py`](../pflash/tests/niah_gen.py) +native RoPE limit, generated with [`optimizations/pflash/tests/niah_gen.py`](../optimizations/pflash/tests/niah_gen.py) at calibrated `char_per_tok`). ### RTX 5090 long-ctx headline @@ -774,7 +774,7 @@ Linux 4090 should match or exceed the 3090 numbers. Running via `server.py` (OpenAI-compatible HTTP) with TQ3 KV cache and 128K context: ```bash -DFLASH27B_KV_TQ3=1 python dflash/scripts/server.py \ +DFLASH27B_KV_TQ3=1 python server/scripts/server.py \ --target Qwen3.6-27B-Q4_K_M.gguf --draft dflash-draft-3.6-q8_0.gguf \ --port 8082 --budget 28 --max-ctx 131072 ``` diff --git a/dflash/demo.gif b/server/demo.gif similarity index 100% rename from dflash/demo.gif rename to server/demo.gif diff --git a/dflash/deps/Block-Sparse-Attention b/server/deps/Block-Sparse-Attention similarity index 100% rename from dflash/deps/Block-Sparse-Attention rename to server/deps/Block-Sparse-Attention diff --git a/dflash/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h b/server/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h similarity index 100% rename from dflash/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h rename to server/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h diff --git a/dflash/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh b/server/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh similarity index 100% rename from dflash/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh rename to 
server/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh diff --git a/dflash/deps/bsa_stubs/README.md b/server/deps/bsa_stubs/README.md similarity index 95% rename from dflash/deps/bsa_stubs/README.md rename to server/deps/bsa_stubs/README.md index a0b0c6864..d0abfb125 100644 --- a/dflash/deps/bsa_stubs/README.md +++ b/server/deps/bsa_stubs/README.md @@ -16,7 +16,7 @@ the references BSA actually uses: returning `{seed, offset}` from the stub state. These headers are placed FIRST on the BSA include path -(`dflash/CMakeLists.txt`, gated on `DFLASH27B_ENABLE_BSA`). When BSA's +(`server/CMakeLists.txt`, gated on `DFLASH27B_ENABLE_BSA`). When BSA's generated CUDA includes ``, the compiler picks up this stub instead of trying to find PyTorch. diff --git a/dflash/deps/bsa_stubs/c10/cuda/CUDAException.h b/server/deps/bsa_stubs/c10/cuda/CUDAException.h similarity index 100% rename from dflash/deps/bsa_stubs/c10/cuda/CUDAException.h rename to server/deps/bsa_stubs/c10/cuda/CUDAException.h diff --git a/dflash/deps/llama.cpp b/server/deps/llama.cpp similarity index 100% rename from dflash/deps/llama.cpp rename to server/deps/llama.cpp diff --git a/dflash/docs/API.md b/server/docs/API.md similarity index 100% rename from dflash/docs/API.md rename to server/docs/API.md diff --git a/dflash/docs/ARCHITECTURE.md b/server/docs/ARCHITECTURE.md similarity index 99% rename from dflash/docs/ARCHITECTURE.md rename to server/docs/ARCHITECTURE.md index 1583aed71..dfba75840 100644 --- a/dflash/docs/ARCHITECTURE.md +++ b/server/docs/ARCHITECTURE.md @@ -36,7 +36,7 @@ using a uniform stdin/stdout protocol. ## Directory Structure ``` -dflash/src/ +server/src/ ├── common/ # Shared infrastructure (all backends) │ ├── model_backend.h # ModelBackend abstract interface │ ├── snapshot_backend.h # Platform-aware snapshot backend selection @@ -406,7 +406,7 @@ by Qwen35Backend. ### Step 6: Build and Test ```bash -cd dflash/build && cmake .. -DCMAKE_BUILD_TYPE=Release && cmake --build . 
-j$(nproc) +cd server/build && cmake .. -DCMAKE_BUILD_TYPE=Release && cmake --build . -j$(nproc) # AR baseline ./test_dflash daemon --target ../../models/gemma4.gguf diff --git a/dflash/docs/HIP_PERF_PLAN.md b/server/docs/HIP_PERF_PLAN.md similarity index 100% rename from dflash/docs/HIP_PERF_PLAN.md rename to server/docs/HIP_PERF_PLAN.md diff --git a/dflash/docs/MIXED_BACKEND.md b/server/docs/MIXED_BACKEND.md similarity index 100% rename from dflash/docs/MIXED_BACKEND.md rename to server/docs/MIXED_BACKEND.md diff --git a/dflash/docs/PREFIX_CACHE.md b/server/docs/PREFIX_CACHE.md similarity index 100% rename from dflash/docs/PREFIX_CACHE.md rename to server/docs/PREFIX_CACHE.md diff --git a/dflash/docs/SPEC_PREFILL.md b/server/docs/SPEC_PREFILL.md similarity index 97% rename from dflash/docs/SPEC_PREFILL.md rename to server/docs/SPEC_PREFILL.md index cb2837dce..05ff617d0 100644 --- a/dflash/docs/SPEC_PREFILL.md +++ b/server/docs/SPEC_PREFILL.md @@ -4,7 +4,7 @@ In-process speculative-prefill + speculative-decode daemon (C++/CUDA only, no Python, no Triton, no PyTorch at runtime). This doc is the build / runtime / tunables reference for the C++ daemon -path described in [`pflash/README.md`](../../pflash/README.md) and on the +path described in [`optimizations/pflash/README.md`](../../optimizations/pflash/README.md) and on the [blog post](https://lucebox.com/blog/pflash): - **Drafter** (Qwen3-0.6B) loaded via a custom forward (`qwen3_*`) diff --git a/dflash/docs/laguna_integration_plan.md b/server/docs/laguna_integration_plan.md similarity index 93% rename from dflash/docs/laguna_integration_plan.md rename to server/docs/laguna_integration_plan.md index fcdc55057..60423e922 100644 --- a/dflash/docs/laguna_integration_plan.md +++ b/server/docs/laguna_integration_plan.md @@ -12,26 +12,26 @@ Status: scaffolding. PR #115 in lucebox-hub bumps llama.cpp submodule to `luce-d ## Constraint -No libllama dependency in dflash runtime. Keep ggml-only stack. 
(libllama+LAGUNA arch from PR #7 is used by quantize/inspect tools at /workspace/lucebox-hub/dflash/deps/llama.cpp/build-standalone/ and at HF upload time, not by the daemon.) +No libllama dependency in dflash runtime. Keep ggml-only stack. (libllama+LAGUNA arch from PR #7 is used by quantize/inspect tools at /workspace/lucebox-hub/server/deps/llama.cpp/build-standalone/ and at HF upload time, not by the daemon.) ## Implementation outline ### Files to add -1. `dflash/src/laguna_internal.h` (NEW, ~200 LOC) — structs: +1. `server/src/laguna_internal.h` (NEW, ~200 LOC) — structs: - `LagunaTargetLayer` — per-layer tensors (attn_norm, wq/wk/wv/wo, q_norm/k_norm, attn_gate, ffn_norm, dense MLP for layer 0, MoE: ffn_gate_inp + ffn_exp_probs_b + ffn_gate_exps + ffn_up_exps + ffn_down_exps + ffn_gate_shexp + ffn_up_shexp + ffn_down_shexp) - `LagunaTargetWeights` — collection of layers + tok_embd + output_norm + output, plus metadata (n_layer=40, n_head_per_layer[40] = [48,64,64,64]*10, n_head_kv=8, head_dim=128, n_embd=2048, n_ff=8192, n_ff_exp=512, n_ff_shexp=512, n_expert=256, n_expert_used=8, expert_weights_scale=2.5, sliding_window=512, rope_freq_base_full=500000, rope_freq_base_swa=10000, n_rot_full=64, n_rot_swa=128, eos_id=2, eot_id=24) - `LagunaTargetCache` — KV cache (Q8_0, per layer, max_ctx tokens), no SSM/conv state - `LagunaGraphInputs` / `LagunaGraphOutputs` -2. `dflash/src/laguna_target_loader.cpp` (NEW, ~500 LOC): +2. 
`server/src/laguna_target_loader.cpp` (NEW, ~500 LOC): - `load_target_gguf_laguna(path, backend, LagunaTargetWeights & out)` - Validates `arch == "laguna"`, reads all hparams, mmaps GGUF, copies tensors to ggml_backend buffer - Per-layer head count: reads `laguna.attention.head_count` as ARRAY (length 40) into `n_head_arr` - Tensor naming: matches gguf-py's MODEL_ARCH.LAGUNA list (token_embd, output_norm, output, blk..{attn_norm, attn_q, attn_k, attn_v, attn_output, attn_q_norm, attn_k_norm, attn_gate, ffn_norm, ffn_gate, ffn_down, ffn_up, ffn_gate_inp, ffn_gate_exps, ffn_down_exps, ffn_up_exps, ffn_gate_shexp, ffn_up_shexp, ffn_down_shexp, exp_probs_b}) - Layer 0: dense MLP (ffn_gate/down/up). Layers 1-39: sparse MoE (ffn_*_exps + shexp + gate_inp + exp_probs_b) -3. `dflash/src/laguna_target_graph.cpp` (NEW, ~1500 LOC — multi-session): +3. `server/src/laguna_target_graph.cpp` (NEW, ~1500 LOC — multi-session): - `build_laguna_full_attn_block` — full attention layer with YaRN RoPE (theta=500K, factor=32, partial_rotary=0.5, n_rot=64), per-head softplus gate, head_count from per-layer arr (48 on full) - `build_laguna_swa_block` — sliding-window attention layer (window=512, theta=10K, partial_rotary=1.0, n_rot=128), per-head softplus gate, head_count=64 - `build_laguna_dense_mlp` — SwiGLU dense MLP (layer 0) @@ -42,23 +42,23 @@ No libllama dependency in dflash runtime. Keep ggml-only stack. (libllama+LAGUNA - Reuses `flash_prefill_forward_bf16` for sparse prefill on full-attention layers (for sliding-window layers, use dense FA since window=512 is small) - Cache mgmt: `create_laguna_target_cache`, `free_laguna_target_cache`, `reset_laguna_target_cache`, `snapshot_laguna_target_cache`, `restore_laguna_target_cache` -4. Modify `dflash/src/gguf_target_loader.cpp` (~30 LOC added): +4. Modify `server/src/gguf_target_loader.cpp` (~30 LOC added): - Pre-detect arch string from GGUF header - Dispatch: arch == "qwen35" → existing path, arch == "laguna" → new path -5. 
Modify `dflash/src/internal.h` (~50 LOC added): +5. Modify `server/src/internal.h` (~50 LOC added): - `enum class TargetArch { Qwen35, Laguna }` to tag the loaded weights - Forward decls for Laguna structs / functions (or include `laguna_internal.h`) -6. Modify `dflash/CMakeLists.txt`: +6. Modify `server/CMakeLists.txt`: - Add `src/laguna_target_loader.cpp` and `src/laguna_target_graph.cpp` to `dflash27b` library sources -7. Modify `dflash/test/test_dflash.cpp` (substantial changes — multi-session): +7. Modify `server/test/test_dflash.cpp` (substantial changes — multi-session): - Detect arch from loaded weights - For Laguna arch, use `LagunaTargetCache` + `build_laguna_graph` instead of qwen35 equivalents - Adjust per-layer-head-count in attention buffer sizing - PFlash drafter call unchanged (drafter is Qwen3-0.6B regardless of target) - - Cross-tokenizer mapping (Qwen3 IDs → Laguna IDs): byte-level round-trip via existing pflash/ Python module OR port to C++ helper + - Cross-tokenizer mapping (Qwen3 IDs → Laguna IDs): byte-level round-trip via existing optimizations/pflash/ Python module OR port to C++ helper ## Phasing diff --git a/dflash/eval/README.md b/server/eval/README.md similarity index 94% rename from dflash/eval/README.md rename to server/eval/README.md index 0b7bcb716..8397fb16c 100644 --- a/dflash/eval/README.md +++ b/server/eval/README.md @@ -8,7 +8,7 @@ produce different bytes for the same prompt?", not "is the answer correct". ## quick A/B (`scripts/quality_ab_simple.py`) Runs ~7 short conversational prompts against several server configs. For each -config it spawns a fresh `dflash/scripts/server.py`, fires the prompts in +config it spawns a fresh `server/scripts/server.py`, fires the prompts in sequence, then tears the server down. At the end it prints a markdown table comparing each config against the matching baseline (configs ending in `_f16` are compared against `baseline_f16` so attention precision is held constant). 
@@ -16,9 +16,9 @@ are compared against `baseline_f16` so attention precision is held constant). ``` PFLASH_TARGET=/path/to/target.gguf \ PFLASH_DRAFT=/path/to/draft-dir-or-safetensors \ -PFLASH_BIN=dflash/build/test_dflash \ +PFLASH_BIN=server/build/test_dflash \ PFLASH_DRAFTER=/path/to/Qwen3-0.6B-BF16.gguf \ -python3 dflash/scripts/quality_ab_simple.py +python3 server/scripts/quality_ab_simple.py ``` Configs are defined in `CONFIGS` near the top of the script. Each spawns one diff --git a/dflash/eval/humaneval_plus/SOURCE.txt b/server/eval/humaneval_plus/SOURCE.txt similarity index 100% rename from dflash/eval/humaneval_plus/SOURCE.txt rename to server/eval/humaneval_plus/SOURCE.txt diff --git a/dflash/eval/humaneval_plus/humanevalplus.jsonl b/server/eval/humaneval_plus/humanevalplus.jsonl similarity index 100% rename from dflash/eval/humaneval_plus/humanevalplus.jsonl rename to server/eval/humaneval_plus/humanevalplus.jsonl diff --git a/dflash/eval/mt_bench/SOURCE.txt b/server/eval/mt_bench/SOURCE.txt similarity index 100% rename from dflash/eval/mt_bench/SOURCE.txt rename to server/eval/mt_bench/SOURCE.txt diff --git a/dflash/eval/mt_bench/question.jsonl b/server/eval/mt_bench/question.jsonl similarity index 100% rename from dflash/eval/mt_bench/question.jsonl rename to server/eval/mt_bench/question.jsonl diff --git a/dflash/examples/chat.py b/server/examples/chat.py similarity index 100% rename from dflash/examples/chat.py rename to server/examples/chat.py diff --git a/dflash/hero.png b/server/hero.png similarity index 100% rename from dflash/hero.png rename to server/hero.png diff --git a/dflash/hero.raw.png b/server/hero.raw.png similarity index 100% rename from dflash/hero.raw.png rename to server/hero.raw.png diff --git a/dflash/hip_compat/cuda_bf16.h b/server/hip_compat/cuda_bf16.h similarity index 100% rename from dflash/hip_compat/cuda_bf16.h rename to server/hip_compat/cuda_bf16.h diff --git a/dflash/hip_compat/cuda_fp16.h 
b/server/hip_compat/cuda_fp16.h similarity index 100% rename from dflash/hip_compat/cuda_fp16.h rename to server/hip_compat/cuda_fp16.h diff --git a/dflash/hip_compat/cuda_runtime.h b/server/hip_compat/cuda_runtime.h similarity index 100% rename from dflash/hip_compat/cuda_runtime.h rename to server/hip_compat/cuda_runtime.h diff --git a/dflash/hip_compat/mma.h b/server/hip_compat/mma.h similarity index 100% rename from dflash/hip_compat/mma.h rename to server/hip_compat/mma.h diff --git a/dflash/include/dflash27b.h b/server/include/dflash27b.h similarity index 100% rename from dflash/include/dflash27b.h rename to server/include/dflash27b.h diff --git a/dflash/pyproject.toml b/server/pyproject.toml similarity index 100% rename from dflash/pyproject.toml rename to server/pyproject.toml diff --git a/dflash/scripts/_prefill_hook.py b/server/scripts/_prefill_hook.py similarity index 100% rename from dflash/scripts/_prefill_hook.py rename to server/scripts/_prefill_hook.py diff --git a/dflash/scripts/bench_agent.py b/server/scripts/bench_agent.py similarity index 100% rename from dflash/scripts/bench_agent.py rename to server/scripts/bench_agent.py diff --git a/dflash/scripts/bench_agent_loop.py b/server/scripts/bench_agent_loop.py similarity index 100% rename from dflash/scripts/bench_agent_loop.py rename to server/scripts/bench_agent_loop.py diff --git a/dflash/scripts/bench_daemon.py b/server/scripts/bench_daemon.py similarity index 100% rename from dflash/scripts/bench_daemon.py rename to server/scripts/bench_daemon.py diff --git a/dflash/scripts/bench_he.py b/server/scripts/bench_he.py similarity index 100% rename from dflash/scripts/bench_he.py rename to server/scripts/bench_he.py diff --git a/dflash/scripts/bench_he_http.py b/server/scripts/bench_he_http.py similarity index 100% rename from dflash/scripts/bench_he_http.py rename to server/scripts/bench_he_http.py diff --git a/dflash/scripts/bench_llm.py b/server/scripts/bench_llm.py similarity index 100% rename 
from dflash/scripts/bench_llm.py rename to server/scripts/bench_llm.py diff --git a/dflash/scripts/bench_server.py b/server/scripts/bench_server.py similarity index 100% rename from dflash/scripts/bench_server.py rename to server/scripts/bench_server.py diff --git a/dflash/scripts/convert_dflash_to_gguf.py b/server/scripts/convert_dflash_to_gguf.py similarity index 100% rename from dflash/scripts/convert_dflash_to_gguf.py rename to server/scripts/convert_dflash_to_gguf.py diff --git a/dflash/scripts/detokenize.py b/server/scripts/detokenize.py similarity index 100% rename from dflash/scripts/detokenize.py rename to server/scripts/detokenize.py diff --git a/dflash/scripts/fixtures/agent_prompts/codex_apply_patch.md b/server/scripts/fixtures/agent_prompts/codex_apply_patch.md similarity index 100% rename from dflash/scripts/fixtures/agent_prompts/codex_apply_patch.md rename to server/scripts/fixtures/agent_prompts/codex_apply_patch.md diff --git a/dflash/scripts/fixtures/agent_prompts/codex_gpt52.md b/server/scripts/fixtures/agent_prompts/codex_gpt52.md similarity index 100% rename from dflash/scripts/fixtures/agent_prompts/codex_gpt52.md rename to server/scripts/fixtures/agent_prompts/codex_gpt52.md diff --git a/dflash/scripts/fixtures/agent_prompts/codex_gpt52_codex.md b/server/scripts/fixtures/agent_prompts/codex_gpt52_codex.md similarity index 100% rename from dflash/scripts/fixtures/agent_prompts/codex_gpt52_codex.md rename to server/scripts/fixtures/agent_prompts/codex_gpt52_codex.md diff --git a/dflash/scripts/fixtures/agent_prompts/codex_gpt5_codex.md b/server/scripts/fixtures/agent_prompts/codex_gpt5_codex.md similarity index 100% rename from dflash/scripts/fixtures/agent_prompts/codex_gpt5_codex.md rename to server/scripts/fixtures/agent_prompts/codex_gpt5_codex.md diff --git a/dflash/scripts/fixtures/swe_bench/swe_bench_verified.parquet b/server/scripts/fixtures/swe_bench/swe_bench_verified.parquet similarity index 100% rename from 
dflash/scripts/fixtures/swe_bench/swe_bench_verified.parquet rename to server/scripts/fixtures/swe_bench/swe_bench_verified.parquet diff --git a/dflash/scripts/gen_oracle.py b/server/scripts/gen_oracle.py similarity index 100% rename from dflash/scripts/gen_oracle.py rename to server/scripts/gen_oracle.py diff --git a/dflash/scripts/laguna_pflash_niah.py b/server/scripts/laguna_pflash_niah.py similarity index 100% rename from dflash/scripts/laguna_pflash_niah.py rename to server/scripts/laguna_pflash_niah.py diff --git a/dflash/scripts/parity_laguna.py b/server/scripts/parity_laguna.py similarity index 100% rename from dflash/scripts/parity_laguna.py rename to server/scripts/parity_laguna.py diff --git a/dflash/scripts/phase_split_dual_gpu.py b/server/scripts/phase_split_dual_gpu.py similarity index 100% rename from dflash/scripts/phase_split_dual_gpu.py rename to server/scripts/phase_split_dual_gpu.py diff --git a/dflash/scripts/placement/__init__.py b/server/scripts/placement/__init__.py similarity index 100% rename from dflash/scripts/placement/__init__.py rename to server/scripts/placement/__init__.py diff --git a/dflash/scripts/placement/backend_device.py b/server/scripts/placement/backend_device.py similarity index 100% rename from dflash/scripts/placement/backend_device.py rename to server/scripts/placement/backend_device.py diff --git a/dflash/scripts/placement/server_resolver.py b/server/scripts/placement/server_resolver.py similarity index 100% rename from dflash/scripts/placement/server_resolver.py rename to server/scripts/placement/server_resolver.py diff --git a/dflash/scripts/placement/test_dflash_args.py b/server/scripts/placement/test_dflash_args.py similarity index 100% rename from dflash/scripts/placement/test_dflash_args.py rename to server/scripts/placement/test_dflash_args.py diff --git a/dflash/scripts/prefix_cache.py b/server/scripts/prefix_cache.py similarity index 100% rename from dflash/scripts/prefix_cache.py rename to 
server/scripts/prefix_cache.py diff --git a/dflash/scripts/quality_ab_simple.py b/server/scripts/quality_ab_simple.py similarity index 100% rename from dflash/scripts/quality_ab_simple.py rename to server/scripts/quality_ab_simple.py diff --git a/dflash/scripts/quality_humaneval_plus.py b/server/scripts/quality_humaneval_plus.py similarity index 100% rename from dflash/scripts/quality_humaneval_plus.py rename to server/scripts/quality_humaneval_plus.py diff --git a/dflash/scripts/quantize_draft_q8.py b/server/scripts/quantize_draft_q8.py similarity index 100% rename from dflash/scripts/quantize_draft_q8.py rename to server/scripts/quantize_draft_q8.py diff --git a/dflash/scripts/quantize_gemma_dflash_q8.py b/server/scripts/quantize_gemma_dflash_q8.py similarity index 100% rename from dflash/scripts/quantize_gemma_dflash_q8.py rename to server/scripts/quantize_gemma_dflash_q8.py diff --git a/dflash/scripts/run.py b/server/scripts/run.py similarity index 100% rename from dflash/scripts/run.py rename to server/scripts/run.py diff --git a/dflash/scripts/server.py b/server/scripts/server.py similarity index 100% rename from dflash/scripts/server.py rename to server/scripts/server.py diff --git a/dflash/scripts/setup_system.sh b/server/scripts/setup_system.sh similarity index 100% rename from dflash/scripts/setup_system.sh rename to server/scripts/setup_system.sh diff --git a/dflash/scripts/test_full_compress_cache.py b/server/scripts/test_full_compress_cache.py similarity index 100% rename from dflash/scripts/test_full_compress_cache.py rename to server/scripts/test_full_compress_cache.py diff --git a/dflash/scripts/test_multi_turn_prefix_cache.py b/server/scripts/test_multi_turn_prefix_cache.py similarity index 100% rename from dflash/scripts/test_multi_turn_prefix_cache.py rename to server/scripts/test_multi_turn_prefix_cache.py diff --git a/dflash/scripts/test_prefix_cache.py b/server/scripts/test_prefix_cache.py similarity index 100% rename from 
dflash/scripts/test_prefix_cache.py rename to server/scripts/test_prefix_cache.py diff --git a/dflash/scripts/test_server.py b/server/scripts/test_server.py similarity index 100% rename from dflash/scripts/test_server.py rename to server/scripts/test_server.py diff --git a/dflash/scripts/test_server_integration.py b/server/scripts/test_server_integration.py similarity index 100% rename from dflash/scripts/test_server_integration.py rename to server/scripts/test_server_integration.py diff --git a/dflash/scripts/test_server_prefix_cache.py b/server/scripts/test_server_prefix_cache.py similarity index 100% rename from dflash/scripts/test_server_prefix_cache.py rename to server/scripts/test_server_prefix_cache.py diff --git a/dflash/scripts/test_tool_memory.py b/server/scripts/test_tool_memory.py similarity index 100% rename from dflash/scripts/test_tool_memory.py rename to server/scripts/test_tool_memory.py diff --git a/dflash/scripts/tokenize_prompt.py b/server/scripts/tokenize_prompt.py similarity index 100% rename from dflash/scripts/tokenize_prompt.py rename to server/scripts/tokenize_prompt.py diff --git a/dflash/scripts/tool_memory.py b/server/scripts/tool_memory.py similarity index 100% rename from dflash/scripts/tool_memory.py rename to server/scripts/tool_memory.py diff --git a/dflash/src/bsa_fwd_inst.cu b/server/src/bsa_fwd_inst.cu similarity index 100% rename from dflash/src/bsa_fwd_inst.cu rename to server/src/bsa_fwd_inst.cu diff --git a/dflash/src/bsa_launcher.cu b/server/src/bsa_launcher.cu similarity index 100% rename from dflash/src/bsa_launcher.cu rename to server/src/bsa_launcher.cu diff --git a/dflash/src/bsa_launcher_hip.cu b/server/src/bsa_launcher_hip.cu similarity index 100% rename from dflash/src/bsa_launcher_hip.cu rename to server/src/bsa_launcher_hip.cu diff --git a/dflash/src/common/attn_masks.h b/server/src/common/attn_masks.h similarity index 100% rename from dflash/src/common/attn_masks.h rename to server/src/common/attn_masks.h diff 
--git a/dflash/src/common/backend_factory.cpp b/server/src/common/backend_factory.cpp similarity index 100% rename from dflash/src/common/backend_factory.cpp rename to server/src/common/backend_factory.cpp diff --git a/dflash/src/common/backend_factory.h b/server/src/common/backend_factory.h similarity index 100% rename from dflash/src/common/backend_factory.h rename to server/src/common/backend_factory.h diff --git a/dflash/src/common/backend_ipc.cpp b/server/src/common/backend_ipc.cpp similarity index 100% rename from dflash/src/common/backend_ipc.cpp rename to server/src/common/backend_ipc.cpp diff --git a/dflash/src/common/backend_ipc.h b/server/src/common/backend_ipc.h similarity index 100% rename from dflash/src/common/backend_ipc.h rename to server/src/common/backend_ipc.h diff --git a/dflash/src/common/daemon_loop.cpp b/server/src/common/daemon_loop.cpp similarity index 100% rename from dflash/src/common/daemon_loop.cpp rename to server/src/common/daemon_loop.cpp diff --git a/dflash/src/common/daemon_loop.h b/server/src/common/daemon_loop.h similarity index 100% rename from dflash/src/common/daemon_loop.h rename to server/src/common/daemon_loop.h diff --git a/dflash/src/common/ddtree.cpp b/server/src/common/ddtree.cpp similarity index 100% rename from dflash/src/common/ddtree.cpp rename to server/src/common/ddtree.cpp diff --git a/dflash/src/common/ddtree.h b/server/src/common/ddtree.h similarity index 100% rename from dflash/src/common/ddtree.h rename to server/src/common/ddtree.h diff --git a/dflash/src/common/device_placement.h b/server/src/common/device_placement.h similarity index 100% rename from dflash/src/common/device_placement.h rename to server/src/common/device_placement.h diff --git a/dflash/src/common/dflash_capture.cpp b/server/src/common/dflash_capture.cpp similarity index 100% rename from dflash/src/common/dflash_capture.cpp rename to server/src/common/dflash_capture.cpp diff --git a/dflash/src/common/dflash_capture.h 
b/server/src/common/dflash_capture.h similarity index 100% rename from dflash/src/common/dflash_capture.h rename to server/src/common/dflash_capture.h diff --git a/dflash/src/common/dflash_draft_graph.cpp b/server/src/common/dflash_draft_graph.cpp similarity index 100% rename from dflash/src/common/dflash_draft_graph.cpp rename to server/src/common/dflash_draft_graph.cpp diff --git a/dflash/src/common/dflash_draft_graph.h b/server/src/common/dflash_draft_graph.h similarity index 100% rename from dflash/src/common/dflash_draft_graph.h rename to server/src/common/dflash_draft_graph.h diff --git a/dflash/src/common/dflash_draft_ipc.cpp b/server/src/common/dflash_draft_ipc.cpp similarity index 100% rename from dflash/src/common/dflash_draft_ipc.cpp rename to server/src/common/dflash_draft_ipc.cpp diff --git a/dflash/src/common/dflash_draft_ipc.h b/server/src/common/dflash_draft_ipc.h similarity index 100% rename from dflash/src/common/dflash_draft_ipc.h rename to server/src/common/dflash_draft_ipc.h diff --git a/dflash/src/common/dflash_draft_ipc_daemon.cpp b/server/src/common/dflash_draft_ipc_daemon.cpp similarity index 100% rename from dflash/src/common/dflash_draft_ipc_daemon.cpp rename to server/src/common/dflash_draft_ipc_daemon.cpp diff --git a/dflash/src/common/dflash_feature_ring.cpp b/server/src/common/dflash_feature_ring.cpp similarity index 100% rename from dflash/src/common/dflash_feature_ring.cpp rename to server/src/common/dflash_feature_ring.cpp diff --git a/dflash/src/common/dflash_feature_ring.h b/server/src/common/dflash_feature_ring.h similarity index 100% rename from dflash/src/common/dflash_feature_ring.h rename to server/src/common/dflash_feature_ring.h diff --git a/dflash/src/common/dflash_layer_split_runtime.h b/server/src/common/dflash_layer_split_runtime.h similarity index 100% rename from dflash/src/common/dflash_layer_split_runtime.h rename to server/src/common/dflash_layer_split_runtime.h diff --git 
a/dflash/src/common/dflash_spec_decode.cpp b/server/src/common/dflash_spec_decode.cpp similarity index 100% rename from dflash/src/common/dflash_spec_decode.cpp rename to server/src/common/dflash_spec_decode.cpp diff --git a/dflash/src/common/dflash_spec_decode.h b/server/src/common/dflash_spec_decode.h similarity index 100% rename from dflash/src/common/dflash_spec_decode.h rename to server/src/common/dflash_spec_decode.h diff --git a/dflash/src/common/dflash_target.h b/server/src/common/dflash_target.h similarity index 100% rename from dflash/src/common/dflash_target.h rename to server/src/common/dflash_target.h diff --git a/dflash/src/common/gguf_inspect.cpp b/server/src/common/gguf_inspect.cpp similarity index 100% rename from dflash/src/common/gguf_inspect.cpp rename to server/src/common/gguf_inspect.cpp diff --git a/dflash/src/common/gguf_inspect.h b/server/src/common/gguf_inspect.h similarity index 100% rename from dflash/src/common/gguf_inspect.h rename to server/src/common/gguf_inspect.h diff --git a/dflash/src/common/gguf_mmap.h b/server/src/common/gguf_mmap.h similarity index 100% rename from dflash/src/common/gguf_mmap.h rename to server/src/common/gguf_mmap.h diff --git a/dflash/src/common/gpu_runtime_compat.h b/server/src/common/gpu_runtime_compat.h similarity index 100% rename from dflash/src/common/gpu_runtime_compat.h rename to server/src/common/gpu_runtime_compat.h diff --git a/dflash/src/common/io_utils.h b/server/src/common/io_utils.h similarity index 100% rename from dflash/src/common/io_utils.h rename to server/src/common/io_utils.h diff --git a/dflash/src/common/layer_split_utils.cpp b/server/src/common/layer_split_utils.cpp similarity index 100% rename from dflash/src/common/layer_split_utils.cpp rename to server/src/common/layer_split_utils.cpp diff --git a/dflash/src/common/layer_split_utils.h b/server/src/common/layer_split_utils.h similarity index 100% rename from dflash/src/common/layer_split_utils.h rename to 
server/src/common/layer_split_utils.h diff --git a/dflash/src/common/model_backend.h b/server/src/common/model_backend.h similarity index 100% rename from dflash/src/common/model_backend.h rename to server/src/common/model_backend.h diff --git a/dflash/src/common/peer_access.cpp b/server/src/common/peer_access.cpp similarity index 100% rename from dflash/src/common/peer_access.cpp rename to server/src/common/peer_access.cpp diff --git a/dflash/src/common/peer_access.h b/server/src/common/peer_access.h similarity index 100% rename from dflash/src/common/peer_access.h rename to server/src/common/peer_access.h diff --git a/dflash/src/common/pflash_drafter_ipc.cpp b/server/src/common/pflash_drafter_ipc.cpp similarity index 100% rename from dflash/src/common/pflash_drafter_ipc.cpp rename to server/src/common/pflash_drafter_ipc.cpp diff --git a/dflash/src/common/pflash_drafter_ipc.h b/server/src/common/pflash_drafter_ipc.h similarity index 100% rename from dflash/src/common/pflash_drafter_ipc.h rename to server/src/common/pflash_drafter_ipc.h diff --git a/dflash/src/common/pflash_drafter_ipc_daemon.cpp b/server/src/common/pflash_drafter_ipc_daemon.cpp similarity index 100% rename from dflash/src/common/pflash_drafter_ipc_daemon.cpp rename to server/src/common/pflash_drafter_ipc_daemon.cpp diff --git a/dflash/src/common/restore_delta.h b/server/src/common/restore_delta.h similarity index 100% rename from dflash/src/common/restore_delta.h rename to server/src/common/restore_delta.h diff --git a/dflash/src/common/sampler.cpp b/server/src/common/sampler.cpp similarity index 100% rename from dflash/src/common/sampler.cpp rename to server/src/common/sampler.cpp diff --git a/dflash/src/common/sampler.h b/server/src/common/sampler.h similarity index 100% rename from dflash/src/common/sampler.h rename to server/src/common/sampler.h diff --git a/dflash/src/common/snapshot_backend.h b/server/src/common/snapshot_backend.h similarity index 100% rename from 
dflash/src/common/snapshot_backend.h rename to server/src/common/snapshot_backend.h diff --git a/dflash/src/common/step_graph.h b/server/src/common/step_graph.h similarity index 100% rename from dflash/src/common/step_graph.h rename to server/src/common/step_graph.h diff --git a/dflash/src/cuda_cross_device_copy.cpp b/server/src/cuda_cross_device_copy.cpp similarity index 100% rename from dflash/src/cuda_cross_device_copy.cpp rename to server/src/cuda_cross_device_copy.cpp diff --git a/dflash/src/delta_net_chunked.cpp b/server/src/delta_net_chunked.cpp similarity index 100% rename from dflash/src/delta_net_chunked.cpp rename to server/src/delta_net_chunked.cpp diff --git a/dflash/src/delta_net_chunked.h b/server/src/delta_net_chunked.h similarity index 100% rename from dflash/src/delta_net_chunked.h rename to server/src/delta_net_chunked.h diff --git a/dflash/src/device_runtime.h b/server/src/device_runtime.h similarity index 100% rename from dflash/src/device_runtime.h rename to server/src/device_runtime.h diff --git a/dflash/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp similarity index 100% rename from dflash/src/draft/draft_gguf_loader.cpp rename to server/src/draft/draft_gguf_loader.cpp diff --git a/dflash/src/draft/draft_graph.cpp b/server/src/draft/draft_graph.cpp similarity index 100% rename from dflash/src/draft/draft_graph.cpp rename to server/src/draft/draft_graph.cpp diff --git a/dflash/src/draft/draft_graph.h b/server/src/draft/draft_graph.h similarity index 100% rename from dflash/src/draft/draft_graph.h rename to server/src/draft/draft_graph.h diff --git a/dflash/src/draft/draft_safetensors_loader.cpp b/server/src/draft/draft_safetensors_loader.cpp similarity index 100% rename from dflash/src/draft/draft_safetensors_loader.cpp rename to server/src/draft/draft_safetensors_loader.cpp diff --git a/dflash/src/errors.cpp b/server/src/errors.cpp similarity index 100% rename from dflash/src/errors.cpp rename to 
server/src/errors.cpp diff --git a/dflash/src/flashprefill.cpp b/server/src/flashprefill.cpp similarity index 100% rename from dflash/src/flashprefill.cpp rename to server/src/flashprefill.cpp diff --git a/dflash/src/flashprefill.h b/server/src/flashprefill.h similarity index 100% rename from dflash/src/flashprefill.h rename to server/src/flashprefill.h diff --git a/dflash/src/flashprefill_f16.cu b/server/src/flashprefill_f16.cu similarity index 100% rename from dflash/src/flashprefill_f16.cu rename to server/src/flashprefill_f16.cu diff --git a/dflash/src/flashprefill_kernels.cu b/server/src/flashprefill_kernels.cu similarity index 100% rename from dflash/src/flashprefill_kernels.cu rename to server/src/flashprefill_kernels.cu diff --git a/dflash/src/flashprefill_kernels.hip.cu b/server/src/flashprefill_kernels.hip.cu similarity index 100% rename from dflash/src/flashprefill_kernels.hip.cu rename to server/src/flashprefill_kernels.hip.cu diff --git a/dflash/src/flashprefill_q8.cpp b/server/src/flashprefill_q8.cpp similarity index 100% rename from dflash/src/flashprefill_q8.cpp rename to server/src/flashprefill_q8.cpp diff --git a/dflash/src/flashprefill_scalar.cu b/server/src/flashprefill_scalar.cu similarity index 100% rename from dflash/src/flashprefill_scalar.cu rename to server/src/flashprefill_scalar.cu diff --git a/dflash/src/flashprefill_select.cpp b/server/src/flashprefill_select.cpp similarity index 100% rename from dflash/src/flashprefill_select.cpp rename to server/src/flashprefill_select.cpp diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp similarity index 100% rename from dflash/src/gemma4/gemma4_backend.cpp rename to server/src/gemma4/gemma4_backend.cpp diff --git a/dflash/src/gemma4/gemma4_backend.h b/server/src/gemma4/gemma4_backend.h similarity index 100% rename from dflash/src/gemma4/gemma4_backend.h rename to server/src/gemma4/gemma4_backend.h diff --git a/dflash/src/gemma4/gemma4_daemon.cpp 
b/server/src/gemma4/gemma4_daemon.cpp similarity index 100% rename from dflash/src/gemma4/gemma4_daemon.cpp rename to server/src/gemma4/gemma4_daemon.cpp diff --git a/dflash/src/gemma4/gemma4_daemon.h b/server/src/gemma4/gemma4_daemon.h similarity index 100% rename from dflash/src/gemma4/gemma4_daemon.h rename to server/src/gemma4/gemma4_daemon.h diff --git a/dflash/src/gemma4/gemma4_dflash_target.cpp b/server/src/gemma4/gemma4_dflash_target.cpp similarity index 100% rename from dflash/src/gemma4/gemma4_dflash_target.cpp rename to server/src/gemma4/gemma4_dflash_target.cpp diff --git a/dflash/src/gemma4/gemma4_dflash_target.h b/server/src/gemma4/gemma4_dflash_target.h similarity index 100% rename from dflash/src/gemma4/gemma4_dflash_target.h rename to server/src/gemma4/gemma4_dflash_target.h diff --git a/dflash/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp similarity index 100% rename from dflash/src/gemma4/gemma4_graph.cpp rename to server/src/gemma4/gemma4_graph.cpp diff --git a/dflash/src/gemma4/gemma4_internal.h b/server/src/gemma4/gemma4_internal.h similarity index 100% rename from dflash/src/gemma4/gemma4_internal.h rename to server/src/gemma4/gemma4_internal.h diff --git a/dflash/src/gemma4/gemma4_loader.cpp b/server/src/gemma4/gemma4_loader.cpp similarity index 100% rename from dflash/src/gemma4/gemma4_loader.cpp rename to server/src/gemma4/gemma4_loader.cpp diff --git a/dflash/src/hip_compat/cuda_bf16.h b/server/src/hip_compat/cuda_bf16.h similarity index 100% rename from dflash/src/hip_compat/cuda_bf16.h rename to server/src/hip_compat/cuda_bf16.h diff --git a/dflash/src/hip_compat/cuda_fp16.h b/server/src/hip_compat/cuda_fp16.h similarity index 100% rename from dflash/src/hip_compat/cuda_fp16.h rename to server/src/hip_compat/cuda_fp16.h diff --git a/dflash/src/internal.h b/server/src/internal.h similarity index 100% rename from dflash/src/internal.h rename to server/src/internal.h diff --git a/dflash/src/ipc/backend_ipc_main.cpp 
b/server/src/ipc/backend_ipc_main.cpp similarity index 100% rename from dflash/src/ipc/backend_ipc_main.cpp rename to server/src/ipc/backend_ipc_main.cpp diff --git a/dflash/src/kv_cache.cpp b/server/src/kv_cache.cpp similarity index 100% rename from dflash/src/kv_cache.cpp rename to server/src/kv_cache.cpp diff --git a/dflash/src/kv_quant.cpp b/server/src/kv_quant.cpp similarity index 100% rename from dflash/src/kv_quant.cpp rename to server/src/kv_quant.cpp diff --git a/dflash/src/kv_quant.h b/server/src/kv_quant.h similarity index 100% rename from dflash/src/kv_quant.h rename to server/src/kv_quant.h diff --git a/dflash/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp similarity index 100% rename from dflash/src/laguna/laguna_backend.cpp rename to server/src/laguna/laguna_backend.cpp diff --git a/dflash/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h similarity index 100% rename from dflash/src/laguna/laguna_backend.h rename to server/src/laguna/laguna_backend.h diff --git a/dflash/src/laguna/laguna_daemon.cpp b/server/src/laguna/laguna_daemon.cpp similarity index 100% rename from dflash/src/laguna/laguna_daemon.cpp rename to server/src/laguna/laguna_daemon.cpp diff --git a/dflash/src/laguna/laguna_daemon.h b/server/src/laguna/laguna_daemon.h similarity index 100% rename from dflash/src/laguna/laguna_daemon.h rename to server/src/laguna/laguna_daemon.h diff --git a/dflash/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h similarity index 100% rename from dflash/src/laguna/laguna_internal.h rename to server/src/laguna/laguna_internal.h diff --git a/dflash/src/laguna/laguna_target_graph.cpp b/server/src/laguna/laguna_target_graph.cpp similarity index 100% rename from dflash/src/laguna/laguna_target_graph.cpp rename to server/src/laguna/laguna_target_graph.cpp diff --git a/dflash/src/laguna/laguna_target_loader.cpp b/server/src/laguna/laguna_target_loader.cpp similarity index 100% rename from 
dflash/src/laguna/laguna_target_loader.cpp rename to server/src/laguna/laguna_target_loader.cpp diff --git a/dflash/src/pflash_ggml_adapter.cpp b/server/src/pflash_ggml_adapter.cpp similarity index 100% rename from dflash/src/pflash_ggml_adapter.cpp rename to server/src/pflash_ggml_adapter.cpp diff --git a/dflash/src/pflash_ggml_adapter.h b/server/src/pflash_ggml_adapter.h similarity index 100% rename from dflash/src/pflash_ggml_adapter.h rename to server/src/pflash_ggml_adapter.h diff --git a/dflash/src/placement/pflash_placement.h b/server/src/placement/pflash_placement.h similarity index 100% rename from dflash/src/placement/pflash_placement.h rename to server/src/placement/pflash_placement.h diff --git a/dflash/src/placement/placement_backend.h b/server/src/placement/placement_backend.h similarity index 100% rename from dflash/src/placement/placement_backend.h rename to server/src/placement/placement_backend.h diff --git a/dflash/src/placement/placement_config.h b/server/src/placement/placement_config.h similarity index 100% rename from dflash/src/placement/placement_config.h rename to server/src/placement/placement_config.h diff --git a/dflash/src/placement/remote_draft_config.h b/server/src/placement/remote_draft_config.h similarity index 100% rename from dflash/src/placement/remote_draft_config.h rename to server/src/placement/remote_draft_config.h diff --git a/dflash/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp similarity index 100% rename from dflash/src/qwen3/qwen3_backend.cpp rename to server/src/qwen3/qwen3_backend.cpp diff --git a/dflash/src/qwen3/qwen3_backend.h b/server/src/qwen3/qwen3_backend.h similarity index 100% rename from dflash/src/qwen3/qwen3_backend.h rename to server/src/qwen3/qwen3_backend.h diff --git a/dflash/src/qwen3/qwen3_daemon.cpp b/server/src/qwen3/qwen3_daemon.cpp similarity index 100% rename from dflash/src/qwen3/qwen3_daemon.cpp rename to server/src/qwen3/qwen3_daemon.cpp diff --git 
a/dflash/src/qwen3/qwen3_daemon.h b/server/src/qwen3/qwen3_daemon.h similarity index 100% rename from dflash/src/qwen3/qwen3_daemon.h rename to server/src/qwen3/qwen3_daemon.h diff --git a/dflash/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp similarity index 100% rename from dflash/src/qwen3/qwen3_drafter.cpp rename to server/src/qwen3/qwen3_drafter.cpp diff --git a/dflash/src/qwen3/qwen3_drafter.h b/server/src/qwen3/qwen3_drafter.h similarity index 100% rename from dflash/src/qwen3/qwen3_drafter.h rename to server/src/qwen3/qwen3_drafter.h diff --git a/dflash/src/qwen3/qwen3_drafter_model.h b/server/src/qwen3/qwen3_drafter_model.h similarity index 100% rename from dflash/src/qwen3/qwen3_drafter_model.h rename to server/src/qwen3/qwen3_drafter_model.h diff --git a/dflash/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp similarity index 100% rename from dflash/src/qwen3/qwen3_graph.cpp rename to server/src/qwen3/qwen3_graph.cpp diff --git a/dflash/src/qwen3/qwen3_loader.cpp b/server/src/qwen3/qwen3_loader.cpp similarity index 100% rename from dflash/src/qwen3/qwen3_loader.cpp rename to server/src/qwen3/qwen3_loader.cpp diff --git a/dflash/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp similarity index 100% rename from dflash/src/qwen35/gguf_target_loader.cpp rename to server/src/qwen35/gguf_target_loader.cpp diff --git a/dflash/src/qwen35/graph_builders.cpp b/server/src/qwen35/graph_builders.cpp similarity index 100% rename from dflash/src/qwen35/graph_builders.cpp rename to server/src/qwen35/graph_builders.cpp diff --git a/dflash/src/qwen35/graph_builders.h b/server/src/qwen35/graph_builders.h similarity index 100% rename from dflash/src/qwen35/graph_builders.h rename to server/src/qwen35/graph_builders.h diff --git a/dflash/src/qwen35/layer_split_daemon.cpp b/server/src/qwen35/layer_split_daemon.cpp similarity index 100% rename from dflash/src/qwen35/layer_split_daemon.cpp rename to 
server/src/qwen35/layer_split_daemon.cpp diff --git a/dflash/src/qwen35/layer_split_daemon.h b/server/src/qwen35/layer_split_daemon.h similarity index 100% rename from dflash/src/qwen35/layer_split_daemon.h rename to server/src/qwen35/layer_split_daemon.h diff --git a/dflash/src/qwen35/layer_split_daemon_loop.cpp b/server/src/qwen35/layer_split_daemon_loop.cpp similarity index 100% rename from dflash/src/qwen35/layer_split_daemon_loop.cpp rename to server/src/qwen35/layer_split_daemon_loop.cpp diff --git a/dflash/src/qwen35/layer_split_daemon_loop.h b/server/src/qwen35/layer_split_daemon_loop.h similarity index 100% rename from dflash/src/qwen35/layer_split_daemon_loop.h rename to server/src/qwen35/layer_split_daemon_loop.h diff --git a/dflash/src/qwen35/layer_split_forward.cpp b/server/src/qwen35/layer_split_forward.cpp similarity index 100% rename from dflash/src/qwen35/layer_split_forward.cpp rename to server/src/qwen35/layer_split_forward.cpp diff --git a/dflash/src/qwen35/layer_split_forward.h b/server/src/qwen35/layer_split_forward.h similarity index 100% rename from dflash/src/qwen35/layer_split_forward.h rename to server/src/qwen35/layer_split_forward.h diff --git a/dflash/src/qwen35/layer_split_types.h b/server/src/qwen35/layer_split_types.h similarity index 100% rename from dflash/src/qwen35/layer_split_types.h rename to server/src/qwen35/layer_split_types.h diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp similarity index 100% rename from dflash/src/qwen35/qwen35_backend.cpp rename to server/src/qwen35/qwen35_backend.cpp diff --git a/dflash/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h similarity index 100% rename from dflash/src/qwen35/qwen35_backend.h rename to server/src/qwen35/qwen35_backend.h diff --git a/dflash/src/qwen35/qwen35_daemon.cpp b/server/src/qwen35/qwen35_daemon.cpp similarity index 100% rename from dflash/src/qwen35/qwen35_daemon.cpp rename to server/src/qwen35/qwen35_daemon.cpp 
diff --git a/dflash/src/qwen35/qwen35_daemon.h b/server/src/qwen35/qwen35_daemon.h similarity index 100% rename from dflash/src/qwen35/qwen35_daemon.h rename to server/src/qwen35/qwen35_daemon.h diff --git a/dflash/src/qwen35/qwen35_dflash_target.cpp b/server/src/qwen35/qwen35_dflash_target.cpp similarity index 100% rename from dflash/src/qwen35/qwen35_dflash_target.cpp rename to server/src/qwen35/qwen35_dflash_target.cpp diff --git a/dflash/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h similarity index 100% rename from dflash/src/qwen35/qwen35_dflash_target.h rename to server/src/qwen35/qwen35_dflash_target.h diff --git a/dflash/src/qwen35/qwen35_layer_split.h b/server/src/qwen35/qwen35_layer_split.h similarity index 100% rename from dflash/src/qwen35/qwen35_layer_split.h rename to server/src/qwen35/qwen35_layer_split.h diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp b/server/src/qwen35/qwen35_layer_split_dflash_target.cpp similarity index 100% rename from dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp rename to server/src/qwen35/qwen35_layer_split_dflash_target.cpp diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.h b/server/src/qwen35/qwen35_layer_split_dflash_target.h similarity index 100% rename from dflash/src/qwen35/qwen35_layer_split_dflash_target.h rename to server/src/qwen35/qwen35_layer_split_dflash_target.h diff --git a/dflash/src/qwen35/qwen35_ops.h b/server/src/qwen35/qwen35_ops.h similarity index 100% rename from dflash/src/qwen35/qwen35_ops.h rename to server/src/qwen35/qwen35_ops.h diff --git a/dflash/src/qwen35/qwen35_target_graph.cpp b/server/src/qwen35/qwen35_target_graph.cpp similarity index 100% rename from dflash/src/qwen35/qwen35_target_graph.cpp rename to server/src/qwen35/qwen35_target_graph.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp similarity index 100% rename from 
dflash/src/qwen35moe/qwen35moe_backend.cpp rename to server/src/qwen35moe/qwen35moe_backend.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_backend.h rename to server/src/qwen35moe/qwen35moe_backend.h diff --git a/dflash/src/qwen35moe/qwen35moe_daemon.cpp b/server/src/qwen35moe/qwen35moe_daemon.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_daemon.cpp rename to server/src/qwen35moe/qwen35moe_daemon.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_daemon.h b/server/src/qwen35moe/qwen35moe_daemon.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_daemon.h rename to server/src/qwen35moe/qwen35moe_daemon.h diff --git a/dflash/src/qwen35moe/qwen35moe_expert_placement.cpp b/server/src/qwen35moe/qwen35moe_expert_placement.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_expert_placement.cpp rename to server/src/qwen35moe/qwen35moe_expert_placement.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_expert_placement.h b/server/src/qwen35moe/qwen35moe_expert_placement.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_expert_placement.h rename to server/src/qwen35moe/qwen35moe_expert_placement.h diff --git a/dflash/src/qwen35moe/qwen35moe_ffn.cpp b/server/src/qwen35moe/qwen35moe_ffn.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_ffn.cpp rename to server/src/qwen35moe/qwen35moe_ffn.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_ffn.h b/server/src/qwen35moe/qwen35moe_ffn.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_ffn.h rename to server/src/qwen35moe/qwen35moe_ffn.h diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp rename to server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp diff --git 
a/dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h rename to server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_storage.cpp b/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_hybrid_storage.cpp rename to server/src/qwen35moe/qwen35moe_hybrid_storage.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_storage.h b/server/src/qwen35moe/qwen35moe_hybrid_storage.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_hybrid_storage.h rename to server/src/qwen35moe/qwen35moe_hybrid_storage.h diff --git a/dflash/src/qwen35moe/qwen35moe_routing_stats.cpp b/server/src/qwen35moe/qwen35moe_routing_stats.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_routing_stats.cpp rename to server/src/qwen35moe/qwen35moe_routing_stats.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_routing_stats.h b/server/src/qwen35moe/qwen35moe_routing_stats.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_routing_stats.h rename to server/src/qwen35moe/qwen35moe_routing_stats.h diff --git a/dflash/src/qwen35moe/qwen35moe_swap_manager.cpp b/server/src/qwen35moe/qwen35moe_swap_manager.cpp similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_swap_manager.cpp rename to server/src/qwen35moe/qwen35moe_swap_manager.cpp diff --git a/dflash/src/qwen35moe/qwen35moe_swap_manager.h b/server/src/qwen35moe/qwen35moe_swap_manager.h similarity index 100% rename from dflash/src/qwen35moe/qwen35moe_swap_manager.h rename to server/src/qwen35moe/qwen35moe_swap_manager.h diff --git a/dflash/src/rms_norm_hip.cu b/server/src/rms_norm_hip.cu similarity index 100% rename from dflash/src/rms_norm_hip.cu rename to server/src/rms_norm_hip.cu diff --git a/dflash/src/server/api_types.h b/server/src/server/api_types.h 
similarity index 100% rename from dflash/src/server/api_types.h rename to server/src/server/api_types.h diff --git a/dflash/src/server/chat_template.cpp b/server/src/server/chat_template.cpp similarity index 100% rename from dflash/src/server/chat_template.cpp rename to server/src/server/chat_template.cpp diff --git a/dflash/src/server/chat_template.h b/server/src/server/chat_template.h similarity index 100% rename from dflash/src/server/chat_template.h rename to server/src/server/chat_template.h diff --git a/dflash/src/server/disk_prefix_cache.cpp b/server/src/server/disk_prefix_cache.cpp similarity index 100% rename from dflash/src/server/disk_prefix_cache.cpp rename to server/src/server/disk_prefix_cache.cpp diff --git a/dflash/src/server/disk_prefix_cache.h b/server/src/server/disk_prefix_cache.h similarity index 100% rename from dflash/src/server/disk_prefix_cache.h rename to server/src/server/disk_prefix_cache.h diff --git a/dflash/src/server/http_server.cpp b/server/src/server/http_server.cpp similarity index 100% rename from dflash/src/server/http_server.cpp rename to server/src/server/http_server.cpp diff --git a/dflash/src/server/http_server.h b/server/src/server/http_server.h similarity index 100% rename from dflash/src/server/http_server.h rename to server/src/server/http_server.h diff --git a/dflash/src/server/model_card.cpp b/server/src/server/model_card.cpp similarity index 100% rename from dflash/src/server/model_card.cpp rename to server/src/server/model_card.cpp diff --git a/dflash/src/server/model_card.h b/server/src/server/model_card.h similarity index 100% rename from dflash/src/server/model_card.h rename to server/src/server/model_card.h diff --git a/dflash/src/server/prefix_cache.cpp b/server/src/server/prefix_cache.cpp similarity index 100% rename from dflash/src/server/prefix_cache.cpp rename to server/src/server/prefix_cache.cpp diff --git a/dflash/src/server/prefix_cache.h b/server/src/server/prefix_cache.h similarity index 100% rename 
from dflash/src/server/prefix_cache.h rename to server/src/server/prefix_cache.h diff --git a/dflash/src/server/rax.c b/server/src/server/rax.c similarity index 100% rename from dflash/src/server/rax.c rename to server/src/server/rax.c diff --git a/dflash/src/server/rax.h b/server/src/server/rax.h similarity index 100% rename from dflash/src/server/rax.h rename to server/src/server/rax.h diff --git a/dflash/src/server/reasoning.cpp b/server/src/server/reasoning.cpp similarity index 100% rename from dflash/src/server/reasoning.cpp rename to server/src/server/reasoning.cpp diff --git a/dflash/src/server/reasoning.h b/server/src/server/reasoning.h similarity index 100% rename from dflash/src/server/reasoning.h rename to server/src/server/reasoning.h diff --git a/dflash/src/server/server_main.cpp b/server/src/server/server_main.cpp similarity index 100% rename from dflash/src/server/server_main.cpp rename to server/src/server/server_main.cpp diff --git a/dflash/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp similarity index 100% rename from dflash/src/server/sse_emitter.cpp rename to server/src/server/sse_emitter.cpp diff --git a/dflash/src/server/sse_emitter.h b/server/src/server/sse_emitter.h similarity index 100% rename from dflash/src/server/sse_emitter.h rename to server/src/server/sse_emitter.h diff --git a/dflash/src/server/tokenizer.cpp b/server/src/server/tokenizer.cpp similarity index 100% rename from dflash/src/server/tokenizer.cpp rename to server/src/server/tokenizer.cpp diff --git a/dflash/src/server/tokenizer.h b/server/src/server/tokenizer.h similarity index 100% rename from dflash/src/server/tokenizer.h rename to server/src/server/tokenizer.h diff --git a/dflash/src/server/tool_hint.cpp b/server/src/server/tool_hint.cpp similarity index 100% rename from dflash/src/server/tool_hint.cpp rename to server/src/server/tool_hint.cpp diff --git a/dflash/src/server/tool_hint.h b/server/src/server/tool_hint.h similarity index 100% rename from 
dflash/src/server/tool_hint.h rename to server/src/server/tool_hint.h diff --git a/dflash/src/server/tool_memory.cpp b/server/src/server/tool_memory.cpp similarity index 100% rename from dflash/src/server/tool_memory.cpp rename to server/src/server/tool_memory.cpp diff --git a/dflash/src/server/tool_memory.h b/server/src/server/tool_memory.h similarity index 100% rename from dflash/src/server/tool_memory.h rename to server/src/server/tool_memory.h diff --git a/dflash/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp similarity index 100% rename from dflash/src/server/tool_parser.cpp rename to server/src/server/tool_parser.cpp diff --git a/dflash/src/server/tool_parser.h b/server/src/server/tool_parser.h similarity index 100% rename from dflash/src/server/tool_parser.h rename to server/src/server/tool_parser.h diff --git a/dflash/src/server/utf8_utils.h b/server/src/server/utf8_utils.h similarity index 100% rename from dflash/src/server/utf8_utils.h rename to server/src/server/utf8_utils.h diff --git a/dflash/test/bench_laguna_generate.cpp b/server/test/bench_laguna_generate.cpp similarity index 100% rename from dflash/test/bench_laguna_generate.cpp rename to server/test/bench_laguna_generate.cpp diff --git a/dflash/test/bench_laguna_pflash.cpp b/server/test/bench_laguna_pflash.cpp similarity index 100% rename from dflash/test/bench_laguna_pflash.cpp rename to server/test/bench_laguna_pflash.cpp diff --git a/dflash/test/bench_laguna_ttft.cpp b/server/test/bench_laguna_ttft.cpp similarity index 100% rename from dflash/test/bench_laguna_ttft.cpp rename to server/test/bench_laguna_ttft.cpp diff --git a/dflash/test/pflash_daemon.cpp b/server/test/pflash_daemon.cpp similarity index 100% rename from dflash/test/pflash_daemon.cpp rename to server/test/pflash_daemon.cpp diff --git a/dflash/test/smoke_draft_graph.cpp b/server/test/smoke_draft_graph.cpp similarity index 100% rename from dflash/test/smoke_draft_graph.cpp rename to server/test/smoke_draft_graph.cpp 
diff --git a/dflash/test/smoke_laguna_forward.cpp b/server/test/smoke_laguna_forward.cpp similarity index 100% rename from dflash/test/smoke_laguna_forward.cpp rename to server/test/smoke_laguna_forward.cpp diff --git a/dflash/test/smoke_load_draft.cpp b/server/test/smoke_load_draft.cpp similarity index 100% rename from dflash/test/smoke_load_draft.cpp rename to server/test/smoke_load_draft.cpp diff --git a/dflash/test/smoke_load_target.cpp b/server/test/smoke_load_target.cpp similarity index 100% rename from dflash/test/smoke_load_target.cpp rename to server/test/smoke_load_target.cpp diff --git a/dflash/test/smoke_load_target_laguna.cpp b/server/test/smoke_load_target_laguna.cpp similarity index 100% rename from dflash/test/smoke_load_target_laguna.cpp rename to server/test/smoke_load_target_laguna.cpp diff --git a/dflash/test/smoke_qwen3_forward.cpp b/server/test/smoke_qwen3_forward.cpp similarity index 100% rename from dflash/test/smoke_qwen3_forward.cpp rename to server/test/smoke_qwen3_forward.cpp diff --git a/dflash/test/smoke_target_forward.cpp b/server/test/smoke_target_forward.cpp similarity index 100% rename from dflash/test/smoke_target_forward.cpp rename to server/test/smoke_target_forward.cpp diff --git a/dflash/test/spike_thin_copy.cpp b/server/test/spike_thin_copy.cpp similarity index 100% rename from dflash/test/spike_thin_copy.cpp rename to server/test/spike_thin_copy.cpp diff --git a/dflash/test/test_dflash.cpp b/server/test/test_dflash.cpp similarity index 100% rename from dflash/test/test_dflash.cpp rename to server/test/test_dflash.cpp diff --git a/dflash/test/test_flash_attn_sparse.cpp b/server/test/test_flash_attn_sparse.cpp similarity index 100% rename from dflash/test/test_flash_attn_sparse.cpp rename to server/test/test_flash_attn_sparse.cpp diff --git a/dflash/test/test_flashprefill_kernels.cpp b/server/test/test_flashprefill_kernels.cpp similarity index 100% rename from dflash/test/test_flashprefill_kernels.cpp rename to 
server/test/test_flashprefill_kernels.cpp diff --git a/dflash/test/test_generate.cpp b/server/test/test_generate.cpp similarity index 100% rename from dflash/test/test_generate.cpp rename to server/test/test_generate.cpp diff --git a/dflash/test/test_gguf_mmap.cpp b/server/test/test_gguf_mmap.cpp similarity index 100% rename from dflash/test/test_gguf_mmap.cpp rename to server/test/test_gguf_mmap.cpp diff --git a/dflash/test/test_kv_quant.cpp b/server/test/test_kv_quant.cpp similarity index 100% rename from dflash/test/test_kv_quant.cpp rename to server/test/test_kv_quant.cpp diff --git a/dflash/test/test_laguna_daemon.cpp b/server/test/test_laguna_daemon.cpp similarity index 100% rename from dflash/test/test_laguna_daemon.cpp rename to server/test/test_laguna_daemon.cpp diff --git a/dflash/test/test_mtp_converter.sh b/server/test/test_mtp_converter.sh similarity index 100% rename from dflash/test/test_mtp_converter.sh rename to server/test/test_mtp_converter.sh diff --git a/dflash/test/test_mtp_e2e.sh b/server/test/test_mtp_e2e.sh similarity index 100% rename from dflash/test/test_mtp_e2e.sh rename to server/test/test_mtp_e2e.sh diff --git a/dflash/test/test_qwen35moe_expert_placement.cpp b/server/test/test_qwen35moe_expert_placement.cpp similarity index 100% rename from dflash/test/test_qwen35moe_expert_placement.cpp rename to server/test/test_qwen35moe_expert_placement.cpp diff --git a/dflash/test/test_qwen35moe_routing_stats.cpp b/server/test/test_qwen35moe_routing_stats.cpp similarity index 100% rename from dflash/test/test_qwen35moe_routing_stats.cpp rename to server/test/test_qwen35moe_routing_stats.cpp diff --git a/dflash/test/test_qwen35moe_swap_manager.cpp b/server/test/test_qwen35moe_swap_manager.cpp similarity index 100% rename from dflash/test/test_qwen35moe_swap_manager.cpp rename to server/test/test_qwen35moe_swap_manager.cpp diff --git a/dflash/test/test_restore_delta.cpp b/server/test/test_restore_delta.cpp similarity index 100% rename from 
dflash/test/test_restore_delta.cpp rename to server/test/test_restore_delta.cpp diff --git a/dflash/test/test_server_unit.cpp b/server/test/test_server_unit.cpp similarity index 100% rename from dflash/test/test_server_unit.cpp rename to server/test/test_server_unit.cpp diff --git a/dflash/test/test_tokenizer_harness.cpp b/server/test/test_tokenizer_harness.cpp similarity index 100% rename from dflash/test/test_tokenizer_harness.cpp rename to server/test/test_tokenizer_harness.cpp diff --git a/dflash/test/test_vs_oracle.cpp b/server/test/test_vs_oracle.cpp similarity index 100% rename from dflash/test/test_vs_oracle.cpp rename to server/test/test_vs_oracle.cpp diff --git a/dflash/tests/test_server_comprehensive.py b/server/tests/test_server_comprehensive.py similarity index 100% rename from dflash/tests/test_server_comprehensive.py rename to server/tests/test_server_comprehensive.py diff --git a/dflash/tests/test_server_smoke.py b/server/tests/test_server_smoke.py similarity index 100% rename from dflash/tests/test_server_smoke.py rename to server/tests/test_server_smoke.py diff --git a/dflash/tests/test_tokenizer.py b/server/tests/test_tokenizer.py similarity index 100% rename from dflash/tests/test_tokenizer.py rename to server/tests/test_tokenizer.py