diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1b3a8c1a3..705bbf09b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,7 +46,7 @@ jobs:
- name: Build dflash (smoke + server)
run: |
- cd dflash
+ cd server
cmake -B build \
-DCMAKE_CUDA_ARCHITECTURES="86" \
-DDFLASH27B_ENABLE_BSA=OFF \
@@ -59,13 +59,13 @@ jobs:
- name: Run C++ server unit tests
run: |
- cd dflash/build
+ cd server/build
ctest --output-on-failure -R server_unit --no-tests=error
- name: Run Python server unit tests
run: |
pip install pytest fastapi httpx transformers
- cd dflash/scripts
+ cd server/scripts
python3 -m pytest test_server.py -v
- name: Populate venv with cu128 torch + setuptools
diff --git a/.gitmodules b/.gitmodules
index d664da54e..c3b57efdc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
[submodule "dflash/deps/llama.cpp"]
- path = dflash/deps/llama.cpp
+ path = server/deps/llama.cpp
url = https://github.com/Luce-Org/llama.cpp-dflash-ggml.git
branch = luce-dflash
[submodule "dflash/deps/Block-Sparse-Attention"]
- path = dflash/deps/Block-Sparse-Attention
+ path = server/deps/Block-Sparse-Attention
url = https://github.com/mit-han-lab/Block-Sparse-Attention.git
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7ff50cf8f..d11cc7d13 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -23,7 +23,7 @@ Thanks for considering a contribution. Lucebox is a hub of self-contained optimi
On Ubuntu 22.04 or 24.04, one script installs all system dependencies — `build-essential`, `cmake`, `git`, `git-lfs`, and the CUDA Toolkit from NVIDIA's repo:
```bash
-sudo dflash/scripts/setup_system.sh
+sudo server/scripts/setup_system.sh
```
The script is idempotent and configures `nvcc` on PATH for both bash and zsh. For other distros see the [CUDA installation guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/).
@@ -51,11 +51,11 @@ uv sync --extra megakernel # also compile the megakernel CUDA extension
bash scripts/check_uv_workspace.sh # lockfile + frozen-sync import smoke
# C++/CUDA decoder
-cmake -B dflash/build -S dflash -DCMAKE_BUILD_TYPE=Release
-cmake --build dflash/build --target test_dflash -j
+cmake -B server/build -S server -DCMAKE_BUILD_TYPE=Release
+cmake --build server/build --target test_dflash -j
```
-> If cmake was previously run without CUDA, wipe the build directory first (`rm -rf dflash/build`) to avoid a stale compiler cache.
+> If cmake was previously run without CUDA, wipe the build directory first (`rm -rf server/build`) to avoid a stale compiler cache.
---
diff --git a/README.md b/README.md
index f284ba42f..f1222cded 100644
--- a/README.md
+++ b/README.md
@@ -27,13 +27,13 @@
Each directory is a self-contained project with setup instructions and benchmark notes.
-
+
-
+
-
+
---
@@ -69,7 +69,7 @@ server wrapper:
```bash
LUCEBOX_SERVER_BACKEND=cpp \
-DFLASH_SERVER_BIN=dflash/build/dflash_server \
+DFLASH_SERVER_BIN=server/build/dflash_server \
MAX_CTX=32768 BUDGET=22 VERIFY_MODE=ddtree \
harness/clients/run_codex.sh
```
@@ -90,7 +90,7 @@ uv sync --extra megakernel # builds the CUDA extension; torch is auto-i
-uv run --directory megakernel python final_bench.py
+uv run --directory optimizations/megakernel python final_bench.py
```
-> Don't have `uv`? Install with `curl -LsSf https://astral.sh/uv/install.sh | sh` or see [astral.sh/uv](https://astral.sh/uv/). The legacy `python -m venv` + `pip install -e . --no-build-isolation` flow still works from inside `megakernel/`.
+> Don't have `uv`? Install with `curl -LsSf https://astral.sh/uv/install.sh | sh` or see [astral.sh/uv](https://astral.sh/uv/). The legacy `python -m venv` + `pip install -e . --no-build-isolation` flow still works from inside `optimizations/megakernel/`.
| Method | Prefill pp520 | Decode tg128 | tok/J |
|--------|:-------------:|:------------:|:-----:|
@@ -100,9 +100,9 @@ uv run --directory megakernel python final_bench.py
Implementation notes: 82 blocks, 512 threads, cooperative grid sync, no CPU round trips between layers, and weights streamed from Hugging Face on first run.
-[Full writeup →](megakernel/README.md) · [Benchmarks →](megakernel/RESULTS.md) · [Blog post →](https://lucebox.com/blog/megakernel)
+[Full writeup →](optimizations/megakernel/README.md) · [Benchmarks →](optimizations/megakernel/RESULTS.md) · [Blog post →](https://lucebox.com/blog/megakernel)
-> **Blackwell (RTX 5090, DGX Spark / GB10):** auto-detected by setup; NVFP4 decode path lands ~194 tok/s tg128 on GB10. See [megakernel/README.md#blackwell-sm_120--sm_121a](megakernel/README.md).
+> **Blackwell (RTX 5090, DGX Spark / GB10):** auto-detected by setup; NVFP4 decode path lands ~194 tok/s tg128 on GB10. See [optimizations/megakernel/README.md#blackwell-sm_120--sm_121a](optimizations/megakernel/README.md).
---
@@ -127,14 +127,14 @@ uv sync
# 3. build the C++/CUDA decoder (CUDA 12+, CMake 3.18+)
# Default compiles for Pascal/Volta/Turing/Ampere (60/61/62/70/75/86; +120 on CUDA 12.8+, +sm_121/DGX Spark on CUDA 12.9+, +sm_110/Thor on CUDA 13.0+) so the binary runs on every supported card.
# 3090-only users can add -DCMAKE_CUDA_ARCHITECTURES=86 to skip the other archs and build faster (~3 min).
-cmake -B dflash/build -S dflash -DCMAKE_BUILD_TYPE=Release
-cmake --build dflash/build --target test_dflash -j
-cmake --build dflash/build --target test_generate -j
-cmake --build dflash/build --target dflash_server -j
+cmake -B server/build -S server -DCMAKE_BUILD_TYPE=Release
+cmake --build server/build --target test_dflash -j
+cmake --build server/build --target test_generate -j
+cmake --build server/build --target dflash_server -j
# 4. fetch weights: ~16 GB Q4_K_M target + 1.84 GB Lucebox Q8_0 GGUF DFlash draft
-uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir dflash/models/
-uv run hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir dflash/models/draft/
+uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/
+uv run hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/
# 5a. one-shot streaming generate
-uv run --directory dflash python scripts/run.py --prompt "def fibonacci(n):"
+uv run --directory server python scripts/run.py --prompt "def fibonacci(n):"
@@ -163,7 +163,7 @@ Implemented here:
### Running on other GPUs (4090, 5090, DGX Spark / GB10, Jetson AGX Thor)
-Supported out of the box; the build just needs the right CUDA toolkit. `dflash/CMakeLists.txt` already auto-adds Blackwell archs when your nvcc is new enough, so the main quickstart above works as-is on newer cards.
+Supported out of the box; the build just needs the right CUDA toolkit. `server/CMakeLists.txt` already auto-adds Blackwell archs when your nvcc is new enough, so the main quickstart above works as-is on newer cards.
| GPU | Arch | Min CUDA | Status |
|-----|:----:|:--------:|--------|
@@ -203,9 +203,9 @@ cmake --build build --target test_dflash -j
**Retune per GPU:**
- **DDTree `budget=22`** tuned for 3090 + Q4_K_M + 24 GB. On the RTX 5090, budget=40 is optimal (swept). On GB10 (128 GB unified), re-sweep — larger tree = more verify throughput until memory bandwidth saturates. `scripts/bench_llm.py --budget N` has the sweep hooks.
- **TQ3_0 KV cache + sliding `target_feat` ring** was shaped by 24 GB (fits up to 256K context on a 3090). On GB10 (128 GB unified) / 5090 (32 GB) you can push context further or skip quantization entirely and keep F16 KV.
-- **Perf numbers** (207 tok/s demo, 129.5 HumanEval, 2.8× vs SGLang AWQ) are RTX 3090 @ stock. RTX 5090 numbers (205 tok/s HumanEval, 4.84×) are in [RESULTS.md](dflash/RESULTS.md). Ada/GB10/Thor not yet swept, PRs with `RESULTS.md` entries welcome.
+- **Perf numbers** (207 tok/s demo, 129.5 HumanEval, 2.8× vs SGLang AWQ) are RTX 3090 @ stock. RTX 5090 numbers (205 tok/s HumanEval, 4.84×) are in [RESULTS.md](server/RESULTS.md). Ada/GB10/Thor not yet swept, PRs with `RESULTS.md` entries welcome.
-[Full writeup →](dflash/README.md) · [Benchmarks →](dflash/RESULTS.md) · [Blog post →](https://lucebox.com/blog/dflash27b)
+[Full writeup →](server/README.md) · [Benchmarks →](server/RESULTS.md) · [Blog post →](https://lucebox.com/blog/dflash27b)
---
@@ -245,7 +245,7 @@ DFLASH_FP_USE_BSA=1 DFLASH_FP_ALPHA=0.85 \
Daemon stdin commands: `compress` runs the drafter with FlashPrefill block-sparse attention and returns the compressed token-id stream; `generate` runs the target on that stream with normal speculative decode + DDTree. `park` / `unpark` / `free drafter` swap weights in and out of VRAM so target + drafter coexist on a 24 GB card.
-**Runtime tunables** (full list in [`dflash/src/flashprefill.h`](dflash/src/flashprefill.h)):
+**Runtime tunables** (full list in [`server/src/flashprefill.h`](server/src/flashprefill.h)):
```
DFLASH_FP_USE_BSA=1 # dispatch sparse FA forward through BSA (sm_80+)
DFLASH_FP_ALPHA=0.85 # block-selection threshold; higher = stricter = fewer K-blocks per Q-row
@@ -254,11 +254,11 @@ DFLASH_FP_PROFILE=1 # log mean / score / select / forward stage timings
**What's ours, what isn't.** Algorithms are from [Cross-Family Speculative Prefill (Liu et al., ICLR 2026)](https://arxiv.org/abs/2603.02631) for the scoring + selection layer and [FlashPrefill (Fan et al., 2026)](https://arxiv.org/abs/2603.06199) for the drafter sparse-attention forward. What we built:
- C++/CUDA daemon-resident speculative prefill in front of a quantized GGUF target — no PyTorch, no Triton, no per-request subprocess.
-- BSA wired without `libtorch` via a 3-header ATen/c10 stub set under `dflash/deps/bsa_stubs/`.
+- BSA wired without `libtorch` via a 3-header ATen/c10 stub set under `server/deps/bsa_stubs/`.
- Custom Qwen3-0.6B forward (`qwen3_0p6b_*`) so the drafter runs through the same ggml allocator as the 27B target.
- 4 CUDA kernels (`flashprefill_kernels.cu`) for the FlashPrefill `mean_K / score / select / sparse_fwd` algorithm.
-[Full writeup →](pflash/README.md) · [Daemon-side build / tunables →](dflash/docs/SPEC_PREFILL.md) · [Blog post →](https://lucebox.com/blog/pflash)
+[Full writeup →](optimizations/pflash/README.md) · [Daemon-side build / tunables →](server/docs/SPEC_PREFILL.md) · [Blog post →](https://lucebox.com/blog/pflash)
---
@@ -282,7 +282,7 @@ cmake --build build --target test_dflash -j
**Per-arch DDTree tuning**: gfx1151 (Strix Halo iGPU, bandwidth-bound on LPDDR5X) peaks at `--ddtree-budget=22`. gfx1100 (7900 XTX, GDDR6) prefers `budget=8` per the [PR #156 cross-arch perf plan](https://github.com/Luce-Org/lucebox-hub/pull/156). Run `scripts/bench_he.py --ddtree-budget N` to verify on your card.
-**Drafter recipe for max decode**: target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `dflash/scripts/quantize_draft_q8.py`. The matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness.
+**Drafter recipe for max decode**: target = Qwen3.5-27B Q4_K_M, drafter = same gen quantized to Q8_0 via `server/scripts/quantize_draft_q8.py`. The matching Q8_0 GGUF on the unsloth Qwen3.6 target needs `DFLASH27B_DRAFT_SWA=2048` for sliding-window correctness.
[Blog post →](https://lucebox.com/blog/amd) · [PR #119 →](https://github.com/Luce-Org/lucebox-hub/pull/119) · [PR #156 cross-arch perf plan →](https://github.com/Luce-Org/lucebox-hub/pull/156)
@@ -309,9 +309,9 @@ All experiments in this repo are built, tuned, and benchmarked on NVIDIA RTX 309
- **Jetson AGX Thor** (sm_110): supported, CUDA 13+.
- **Turing** (sm_75, RTX 2080): supported, CUDA 12+.
-PyTorch 2.0+. `dflash/` needs CMake 3.18+ and `--recurse-submodules` for the pinned `Luce-Org/llama.cpp@luce-dflash` fork (three tree-mode ggml ops); multi-arch build is automatic (see [Running on other GPUs](#running-on-other-gpus-4090-5090-dgx-spark--gb10-jetson-agx-thor)).
+PyTorch 2.0+. `server/` needs CMake 3.18+ and `--recurse-submodules` for the pinned `Luce-Org/llama.cpp@luce-dflash` fork (three tree-mode ggml ops); multi-arch build is automatic (see [Running on other GPUs](#running-on-other-gpus-4090-5090-dgx-spark--gb10-jetson-agx-thor)).
-**Megakernel porting note.** `megakernel/setup.py` auto-detects the GPU arch and SM count at build time via `torch.cuda.get_device_capability()`. The decode grid is persistent (one block per SM) and is clamped to the resident-block ceiling at runtime, so no manual tuning is needed. On SM < 80 (Turing), the kernel uses FP16 instead of BF16 via a compile-time `TARGET_SM` flag; on SM >= 80 (Ampere+), BF16 is used. From the workspace root, `uv sync --extra megakernel` builds the extension; the legacy `pip install -e . --no-build-isolation` flow still works from inside `megakernel/`.
+**Megakernel porting note.** `optimizations/megakernel/setup.py` auto-detects the GPU arch and SM count at build time via `torch.cuda.get_device_capability()`. The decode grid is persistent (one block per SM) and is clamped to the resident-block ceiling at runtime, so no manual tuning is needed. On SM < 80 (Turing), the kernel uses FP16 instead of BF16 via a compile-time `TARGET_SM` flag; on SM >= 80 (Ampere+), BF16 is used. From the workspace root, `uv sync --extra megakernel` builds the extension; the legacy `pip install -e . --no-build-isolation` flow still works from inside `optimizations/megakernel/`.
**Optional, find your GPU's sweet spot:** `sudo nvidia-smi -pl 220` (megakernel hits best tok/J at 220 W on 3090; re-sweep for other cards).
@@ -321,9 +321,9 @@ PyTorch 2.0+. `dflash/` needs CMake 3.18+ and `--recurse-submodules` for the pin
```
lucebox-hub/
-├── megakernel/ · fused forward pass for Qwen 3.5-0.8B
-├── dflash/ · DFlash speculative decoding port for Qwen 3.5/3.6-27B on RTX 3090
-├── pflash/ · speculative-prefill harness in front of dflash (12.5× TTFT at 128K)
+├── optimizations/megakernel/ · fused forward pass for Qwen 3.5-0.8B
+├── server/ · DFlash speculative decoding port for Qwen 3.5/3.6-27B on RTX 3090
+├── optimizations/pflash/ · speculative-prefill harness in front of dflash (12.5× TTFT at 128K)
└── assets/ · banners, cards, diagrams
```
diff --git a/docs/specs/model-cards.md b/docs/specs/model-cards.md
index d44980f6a..a2a5788f6 100644
--- a/docs/specs/model-cards.md
+++ b/docs/specs/model-cards.md
@@ -74,7 +74,7 @@ Examples:
### Cards directory search path
The server probes (in order, matching
-`find_model_cards_dir` in `dflash/src/server/model_card.cpp`):
+`find_model_cards_dir` in `server/src/server/model_card.cpp`):
1. `/share/model_cards/` — an optional explicit
directory passed by the embedding application (e.g. tests). Not
@@ -145,7 +145,7 @@ first source supplying a value wins:
values: `max_tokens=16000`, `hard_limit_reply_budget=512`,
`think_max_tokens = max_tokens − hard_limit_reply_budget = 15488`.
These also match the `ServerConfig` defaults in
- `dflash/src/server/http_server.h`.
+ `server/src/server/http_server.h`.
The startup banner prints each tunable's value and which source
supplied it, e.g.:
@@ -241,7 +241,7 @@ Rounding note: `low` and `medium` use nearest-integer rounding
(`int(x + 0.5)`); `x-high` uses C++ integer division (truncation
toward zero). For odd or non-divisible `think_max` values this
produces deterministic but distinct off-by-one outcomes; see
-`compute_default_tiers` in `dflash/src/server/model_card.cpp`.
+`compute_default_tiers` in `server/src/server/model_card.cpp`.
The `reasoning_effort_tiers` field exists because the ratio-based
defaults don't fit every model. A smaller model that caps at 8192
diff --git a/docs/specs/thinking-budget.md b/docs/specs/thinking-budget.md
index bd4c6735f..5ebc731be 100644
--- a/docs/specs/thinking-budget.md
+++ b/docs/specs/thinking-budget.md
@@ -125,7 +125,7 @@ Fields:
| `verified_at` | ISO date the values were last checked against the source. |
| `max_tokens` | The card's standard recommended combined cap. Drives `default_max_tokens`. |
| `complex_problem_max_tokens` | Optional. The card's recommendation for hard reasoning / benchmark workloads. Drives the `x-high` and `max` effort tiers, which sit *above* `default_max_tokens` when this field is present — they are admissible as long as they fit under `max_ctx − hard_limit_reply_budget`. If omitted, both collapse to the `high` tier value. |
-| `hard_limit_reply_budget` | Optional. Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `dflash/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. |
+| `hard_limit_reply_budget` | Optional. Tokens reserved post-`` for the visible answer phase, used both to derive `think_max_tokens = max_tokens − hard_limit_reply_budget` and as the force-close trigger inside `do_ar_decode` / `do_spec_decode` (when `n_gen − generated ≤ hard_limit_reply_budget`, the engine overrides the next sampled token with ``). Default 4096 (raised from 512 on 2026-05-25). The original 512 came from `ds4_eval.c`, sized for DeepSeek-V4-flash's terse style, but it silently truncated almost every other model mid-answer — bench results from `server/docs/experiments/gemma4-26b-thinking-control-2026-05-25.md` showed every force-closed thinking probe getting cut off mid-coordinate-geometry-proof at 512. Without priors on a specific model, 4096 is the safer default; terse models should override down. Qwen3.6, Gemma 4 26B, Gemma 4 31B all ship 4096 in their sidecars. |
| `sampling` | Recommended sampler params. Used as defaults when the request doesn't pin sampler values. |
| `reasoning_effort_tiers` | Explicit phase-1 budgets per tier. Override any computed default. Whichever tiers are present win; missing tiers fall through to the computed defaults below. |
diff --git a/harness/README.md b/harness/README.md
index b3a4cae64..dfc5fa8b0 100644
--- a/harness/README.md
+++ b/harness/README.md
@@ -47,14 +47,14 @@ Use the native C++ server instead of the Python server:
LUCEBOX_SERVER_BACKEND=cpp harness/clients/run_codex.sh
```
-The native server binary defaults to `dflash/build/dflash_server`. Override the
+The native server binary defaults to `server/build/dflash_server`. Override the
paths and profile the same way as the Python backend:
```bash
LUCEBOX_SERVER_BACKEND=cpp \
-DFLASH_SERVER_BIN=dflash/build/dflash_server \
-TARGET=dflash/models/Qwen3.6-27B-Q4_K_M.gguf \
-DRAFT=dflash/models/draft/dflash-draft-3.6-q8_0.gguf \
+DFLASH_SERVER_BIN=server/build/dflash_server \
+TARGET=server/models/Qwen3.6-27B-Q4_K_M.gguf \
+DRAFT=server/models/draft/dflash-draft-3.6-q8_0.gguf \
MODEL_ID=luce-dflash \
MAX_CTX=32768 MAX_TOKENS=512 \
BUDGET=22 VERIFY_MODE=ddtree FA_WINDOW=2048 \
@@ -64,8 +64,8 @@ harness/clients/run_codex.sh
To test an already-running native server:
```bash
-dflash/build/dflash_server dflash/models/Qwen3.6-27B-Q4_K_M.gguf \
- --draft dflash/models/draft/dflash-draft-3.6-q8_0.gguf \
+server/build/dflash_server server/models/Qwen3.6-27B-Q4_K_M.gguf \
+ --draft server/models/draft/dflash-draft-3.6-q8_0.gguf \
--host 127.0.0.1 --port 18080 \
--max-ctx 32768 --max-tokens 512 \
--fa-window 2048 \
@@ -83,7 +83,7 @@ need different context limits on a 24 GB card.
## Test a server change
-If you already have `dflash/scripts/server.py` running, use `probe`:
+If you already have `server/scripts/server.py` running, use `probe`:
```bash
python3 harness/client_test_runner.py probe \
@@ -99,9 +99,9 @@ For a GPU sweep, let the runner start Lucebox for each profile:
```bash
python3 harness/client_test_runner.py sweep \
- --target dflash/models/Qwen3.6-27B-Q4_K_M.gguf \
- --draft dflash/models/draft \
- --bin dflash/build/test_dflash \
+ --target server/models/Qwen3.6-27B-Q4_K_M.gguf \
+ --draft server/models/draft \
+ --bin server/build/test_dflash \
--profiles rtx3090_dflash_safe,rtx3090_dflash_long \
--clients all \
--json-out /tmp/lucebox_harness_sweep.json
diff --git a/harness/benchmarks/run_lucebox_vs_llamacpp.sh b/harness/benchmarks/run_lucebox_vs_llamacpp.sh
index acce47e0d..a800e3a24 100755
--- a/harness/benchmarks/run_lucebox_vs_llamacpp.sh
+++ b/harness/benchmarks/run_lucebox_vs_llamacpp.sh
@@ -7,10 +7,10 @@ RUN_DIR="${RUN_DIR:-$REPO_DIR/.harness-runs}"
STAMP="${STAMP:-generation-baseline-$(date +%Y%m%d-%H%M%S)}"
LOG_DIR="$RUN_DIR/$STAMP"
-TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}"
-DRAFT="${DRAFT:-$REPO_DIR/dflash/models/draft/dflash-draft-3.6-q8_0.gguf}"
-DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/dflash/build/test_dflash}"
-LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/dflash/deps/llama.cpp/build/bin/llama-server}"
+TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
+DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}"
+DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/server/build/test_dflash}"
+LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-$REPO_DIR/server/deps/llama.cpp/build/bin/llama-server}"
HOST="${HOST:-127.0.0.1}"
LUCEBOX_PORT="${LUCEBOX_PORT:-18080}"
@@ -30,7 +30,7 @@ API_KEY="${API_KEY:-sk-lucebox}"
PROMPTS="${PROMPTS:-$SCRIPT_DIR/prompts/generation_smoke.jsonl}"
LUCEBOX_SERVER_BACKEND="${LUCEBOX_SERVER_BACKEND:-python}"
-DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/dflash/build/dflash_server}"
+DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}"
mkdir -p "$LOG_DIR"
@@ -115,8 +115,8 @@ if [[ "$LUCEBOX_SERVER_BACKEND" == "cpp" ]]; then
if [[ ! -x "$DFLASH_SERVER_BIN" ]]; then
echo "dflash_server not found or not executable: $DFLASH_SERVER_BIN" >&2
echo "Build it first, for example:" >&2
- echo " cmake -S $REPO_DIR/dflash -B $REPO_DIR/dflash/build -DGGML_CUDA=ON" >&2
- echo " cmake --build $REPO_DIR/dflash/build --target dflash_server -j\$(nproc)" >&2
+ echo " cmake -S $REPO_DIR/server -B $REPO_DIR/server/build -DGGML_CUDA=ON" >&2
+ echo " cmake --build $REPO_DIR/server/build --target dflash_server -j\$(nproc)" >&2
exit 1
fi
local_ddtree_args=()
@@ -142,7 +142,7 @@ if [[ "$LUCEBOX_SERVER_BACKEND" == "cpp" ]]; then
> "$LUCEBOX_LOG" 2>&1 &
LUCEBOX_PID=$!
else
- python3 -u dflash/scripts/server.py \
+ python3 -u server/scripts/server.py \
--host "$HOST" \
--port "$LUCEBOX_PORT" \
--target "$TARGET" \
diff --git a/harness/clients/README.md b/harness/clients/README.md
index 00041e222..9d7458cb3 100644
--- a/harness/clients/README.md
+++ b/harness/clients/README.md
@@ -11,16 +11,16 @@ cd /workspace/lucebox-hub-harness
harness/clients/run_codex.sh
```
-Each launcher starts `dflash/scripts/server.py`, runs the client, writes logs
+Each launcher starts `server/scripts/server.py`, runs the client, writes logs
under `/workspace/lucebox-client-harness-runs`, then stops the server.
Set `LUCEBOX_SERVER_BACKEND=cpp` to run the native C++ HTTP server instead.
-The launcher will start `dflash/build/dflash_server` by default, or the path in
+The launcher will start `server/build/dflash_server` by default, or the path in
`DFLASH_SERVER_BIN`.
```bash
LUCEBOX_SERVER_BACKEND=cpp \
-DFLASH_SERVER_BIN=dflash/build/dflash_server \
+DFLASH_SERVER_BIN=server/build/dflash_server \
MAX_CTX=32768 MAX_TOKENS=512 \
BUDGET=22 VERIFY_MODE=ddtree \
harness/clients/run_codex.sh
diff --git a/harness/clients/common.sh b/harness/clients/common.sh
index e5dd8a585..f00122edc 100755
--- a/harness/clients/common.sh
+++ b/harness/clients/common.sh
@@ -8,12 +8,12 @@ REPO_DIR="${REPO_DIR:-/workspace/lucebox-hub-harness}"
CLIENT_WORK_DIR="${CLIENT_WORK_DIR:-/workspace/lucebox-harness-work}"
RUN_DIR="${RUN_DIR:-/workspace/lucebox-client-harness-runs}"
-TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}"
-DRAFT="${DRAFT:-$REPO_DIR/dflash/models/draft/dflash-draft-3.6-q8_0.gguf}"
-DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/dflash/build/test_dflash}"
+TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
+DRAFT="${DRAFT:-$REPO_DIR/server/models/draft/dflash-draft-3.6-q8_0.gguf}"
+DFLASH_BIN="${DFLASH_BIN:-$REPO_DIR/server/build/test_dflash}"
MODEL_SERVER="${MODEL_SERVER:-lucebox}"
LUCEBOX_SERVER_BACKEND="${LUCEBOX_SERVER_BACKEND:-python}"
-DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/dflash/build/dflash_server}"
+DFLASH_SERVER_BIN="${DFLASH_SERVER_BIN:-$REPO_DIR/server/build/dflash_server}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}"
LLAMA_N_GPU_LAYERS="${LLAMA_N_GPU_LAYERS:-999}"
LLAMA_FLASH_ATTN="${LLAMA_FLASH_ATTN:-1}"
@@ -74,7 +74,7 @@ start_lucebox_server() {
# EXTRA_SERVER_ARGS="--lazy-draft --prefill-compression auto"
read -r -a extra_args <<< "$EXTRA_SERVER_ARGS"
fi
- python3 -u dflash/scripts/server.py \
+ python3 -u server/scripts/server.py \
--host "$HOST" \
--port "$PORT" \
--target "$TARGET" \
@@ -97,8 +97,8 @@ start_dflash_native_server() {
if [[ ! -x "$DFLASH_SERVER_BIN" ]]; then
echo "dflash_server not found or not executable: $DFLASH_SERVER_BIN" >&2
echo "Build it first, for example:" >&2
- echo " cmake -S $REPO_DIR/dflash -B $REPO_DIR/dflash/build -DGGML_CUDA=ON" >&2
- echo " cmake --build $REPO_DIR/dflash/build --target dflash_server -j\$(nproc)" >&2
+ echo " cmake -S $REPO_DIR/server -B $REPO_DIR/server/build -DGGML_CUDA=ON" >&2
+ echo " cmake --build $REPO_DIR/server/build --target dflash_server -j\$(nproc)" >&2
return 1
fi
local extra_args=()
@@ -134,7 +134,7 @@ start_llamacpp_server() {
if [[ ! -x "$LLAMA_SERVER_BIN" ]]; then
echo "llama-server not found or not executable: $LLAMA_SERVER_BIN" >&2
echo "Build it first, for example:" >&2
- echo " cmake -S $REPO_DIR/dflash/deps/llama.cpp -B /workspace/llama-cpp-server-build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DLLAMA_BUILD_SERVER=ON -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_CURL=OFF" >&2
+ echo " cmake -S $REPO_DIR/server/deps/llama.cpp -B /workspace/llama-cpp-server-build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DLLAMA_BUILD_SERVER=ON -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_CURL=OFF" >&2
echo " cmake --build /workspace/llama-cpp-server-build --target llama-server -j2" >&2
return 1
fi
diff --git a/harness/clients/run_claude_llamacpp_decode_check.sh b/harness/clients/run_claude_llamacpp_decode_check.sh
index 8cf2902c7..1111aa952 100755
--- a/harness/clients/run_claude_llamacpp_decode_check.sh
+++ b/harness/clients/run_claude_llamacpp_decode_check.sh
@@ -4,7 +4,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${REPO_DIR:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
RUN_ROOT="${RUN_ROOT:-${RUN_DIR:-/workspace/lucebox-client-harness-runs/claude-llamacpp-decode-check}}"
-TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}"
+TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}"
STAMP="${STAMP:-q8_32k_decode_check}"
PROMPT="${PROMPT:-Reply with exactly: OK_DONE}"
diff --git a/harness/clients/run_claude_llamacpp_matrix.sh b/harness/clients/run_claude_llamacpp_matrix.sh
index 5ffd0b538..fd9a71382 100755
--- a/harness/clients/run_claude_llamacpp_matrix.sh
+++ b/harness/clients/run_claude_llamacpp_matrix.sh
@@ -4,7 +4,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="${REPO_DIR:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
RUN_ROOT="${RUN_ROOT:-${RUN_DIR:-/workspace/lucebox-client-harness-runs/claude-llamacpp-matrix}}"
-TARGET="${TARGET:-$REPO_DIR/dflash/models/Qwen3.6-27B-Q4_K_M.gguf}"
+TARGET="${TARGET:-$REPO_DIR/server/models/Qwen3.6-27B-Q4_K_M.gguf}"
LLAMA_SERVER_BIN="${LLAMA_SERVER_BIN:-/workspace/llama-cpp-server-build/bin/llama-server}"
PROMPT="${PROMPT:-Write exactly 120 words about why a repo-level agent benchmark should run the client harness instead of a handcrafted HTTP request. End with OK_DONE.}"
MARKER="${MARKER:-OK_DONE}"
diff --git a/megakernel/README.md b/optimizations/megakernel/README.md
similarity index 99%
rename from megakernel/README.md
rename to optimizations/megakernel/README.md
index e1c43241d..be066d94f 100644
--- a/megakernel/README.md
+++ b/optimizations/megakernel/README.md
@@ -116,7 +116,7 @@ uv sync --extra megakernel
uv run --directory megakernel python final_bench.py # runs pp520 tg128 (properly warmed), prints tok/s
```
-The legacy standalone flow still works from inside `megakernel/`: create a
+The legacy standalone flow still works from inside `optimizations/megakernel/`: create a
virtualenv, install `torch`, then run `pip install -e . --no-build-isolation`
so `setup.py` can import torch while compiling the CUDA extension.
diff --git a/megakernel/README_PASCAL.md b/optimizations/megakernel/README_PASCAL.md
similarity index 100%
rename from megakernel/README_PASCAL.md
rename to optimizations/megakernel/README_PASCAL.md
diff --git a/megakernel/RESULTS.md b/optimizations/megakernel/RESULTS.md
similarity index 92%
rename from megakernel/RESULTS.md
rename to optimizations/megakernel/RESULTS.md
index 17bc24896..4d1bb0e71 100644
--- a/megakernel/RESULTS.md
+++ b/optimizations/megakernel/RESULTS.md
@@ -77,12 +77,12 @@ PyTorch reference on the pp520 prompt from `final_bench.py`.
```bash
# Auto-dispatches to the NVFP4 path on Blackwell
-python megakernel/final_bench.py
+python optimizations/megakernel/final_bench.py
# Or force a backend
-python megakernel/final_bench.py --backend nvfp4
-python megakernel/final_bench.py --backend bf16
+python optimizations/megakernel/final_bench.py --backend nvfp4
+python optimizations/megakernel/final_bench.py --backend bf16
# Switch prefill mode (default is "hybrid"; "raw" uses prefill_megakernel_nvfp4)
-MEGAKERNEL_PREFILL_MODE=raw python megakernel/final_bench.py --backend nvfp4
+MEGAKERNEL_PREFILL_MODE=raw python optimizations/megakernel/final_bench.py --backend nvfp4
```
diff --git a/megakernel/_phase2_variant.py b/optimizations/megakernel/_phase2_variant.py
similarity index 100%
rename from megakernel/_phase2_variant.py
rename to optimizations/megakernel/_phase2_variant.py
diff --git a/megakernel/bench.py b/optimizations/megakernel/bench.py
similarity index 100%
rename from megakernel/bench.py
rename to optimizations/megakernel/bench.py
diff --git a/megakernel/bench_pp_tg.py b/optimizations/megakernel/bench_pp_tg.py
similarity index 100%
rename from megakernel/bench_pp_tg.py
rename to optimizations/megakernel/bench_pp_tg.py
diff --git a/megakernel/bench_pp_tg_nvfp4.py b/optimizations/megakernel/bench_pp_tg_nvfp4.py
similarity index 100%
rename from megakernel/bench_pp_tg_nvfp4.py
rename to optimizations/megakernel/bench_pp_tg_nvfp4.py
diff --git a/megakernel/build_corpus.py b/optimizations/megakernel/build_corpus.py
similarity index 100%
rename from megakernel/build_corpus.py
rename to optimizations/megakernel/build_corpus.py
diff --git a/megakernel/corpus/baseline.json b/optimizations/megakernel/corpus/baseline.json
similarity index 100%
rename from megakernel/corpus/baseline.json
rename to optimizations/megakernel/corpus/baseline.json
diff --git a/megakernel/corpus/wmma.json b/optimizations/megakernel/corpus/wmma.json
similarity index 100%
rename from megakernel/corpus/wmma.json
rename to optimizations/megakernel/corpus/wmma.json
diff --git a/megakernel/corpus/wmma_p3.json b/optimizations/megakernel/corpus/wmma_p3.json
similarity index 100%
rename from megakernel/corpus/wmma_p3.json
rename to optimizations/megakernel/corpus/wmma_p3.json
diff --git a/megakernel/corpus/wmma_p4.json b/optimizations/megakernel/corpus/wmma_p4.json
similarity index 100%
rename from megakernel/corpus/wmma_p4.json
rename to optimizations/megakernel/corpus/wmma_p4.json
diff --git a/megakernel/corpus/wmma_p6cleanup.json b/optimizations/megakernel/corpus/wmma_p6cleanup.json
similarity index 100%
rename from megakernel/corpus/wmma_p6cleanup.json
rename to optimizations/megakernel/corpus/wmma_p6cleanup.json
diff --git a/megakernel/corpus/wmma_p7.json b/optimizations/megakernel/corpus/wmma_p7.json
similarity index 100%
rename from megakernel/corpus/wmma_p7.json
rename to optimizations/megakernel/corpus/wmma_p7.json
diff --git a/megakernel/corpus/wmma_p8.json b/optimizations/megakernel/corpus/wmma_p8.json
similarity index 100%
rename from megakernel/corpus/wmma_p8.json
rename to optimizations/megakernel/corpus/wmma_p8.json
diff --git a/megakernel/diag_phase2_metrics.py b/optimizations/megakernel/diag_phase2_metrics.py
similarity index 100%
rename from megakernel/diag_phase2_metrics.py
rename to optimizations/megakernel/diag_phase2_metrics.py
diff --git a/megakernel/diag_prefill_kernels.py b/optimizations/megakernel/diag_prefill_kernels.py
similarity index 100%
rename from megakernel/diag_prefill_kernels.py
rename to optimizations/megakernel/diag_prefill_kernels.py
diff --git a/megakernel/final_bench.py b/optimizations/megakernel/final_bench.py
similarity index 100%
rename from megakernel/final_bench.py
rename to optimizations/megakernel/final_bench.py
diff --git a/megakernel/final_bench_nvfp4.py b/optimizations/megakernel/final_bench_nvfp4.py
similarity index 100%
rename from megakernel/final_bench_nvfp4.py
rename to optimizations/megakernel/final_bench_nvfp4.py
diff --git a/megakernel/half_type.h b/optimizations/megakernel/half_type.h
similarity index 100%
rename from megakernel/half_type.h
rename to optimizations/megakernel/half_type.h
diff --git a/megakernel/hero.png b/optimizations/megakernel/hero.png
similarity index 100%
rename from megakernel/hero.png
rename to optimizations/megakernel/hero.png
diff --git a/megakernel/hero.raw.png b/optimizations/megakernel/hero.raw.png
similarity index 100%
rename from megakernel/hero.raw.png
rename to optimizations/megakernel/hero.raw.png
diff --git a/megakernel/kernel.cu b/optimizations/megakernel/kernel.cu
similarity index 100%
rename from megakernel/kernel.cu
rename to optimizations/megakernel/kernel.cu
diff --git a/megakernel/kernel_gb10_nvfp4.cu b/optimizations/megakernel/kernel_gb10_nvfp4.cu
similarity index 100%
rename from megakernel/kernel_gb10_nvfp4.cu
rename to optimizations/megakernel/kernel_gb10_nvfp4.cu
diff --git a/megakernel/model.py b/optimizations/megakernel/model.py
similarity index 100%
rename from megakernel/model.py
rename to optimizations/megakernel/model.py
diff --git a/megakernel/model_nvfp4.py b/optimizations/megakernel/model_nvfp4.py
similarity index 100%
rename from megakernel/model_nvfp4.py
rename to optimizations/megakernel/model_nvfp4.py
diff --git a/megakernel/prefill.cu b/optimizations/megakernel/prefill.cu
similarity index 100%
rename from megakernel/prefill.cu
rename to optimizations/megakernel/prefill.cu
diff --git a/megakernel/prefill_bw.cu b/optimizations/megakernel/prefill_bw.cu
similarity index 100%
rename from megakernel/prefill_bw.cu
rename to optimizations/megakernel/prefill_bw.cu
diff --git a/megakernel/prefill_megakernel.cu b/optimizations/megakernel/prefill_megakernel.cu
similarity index 100%
rename from megakernel/prefill_megakernel.cu
rename to optimizations/megakernel/prefill_megakernel.cu
diff --git a/megakernel/pyproject.toml b/optimizations/megakernel/pyproject.toml
similarity index 100%
rename from megakernel/pyproject.toml
rename to optimizations/megakernel/pyproject.toml
diff --git a/megakernel/setup.py b/optimizations/megakernel/setup.py
similarity index 100%
rename from megakernel/setup.py
rename to optimizations/megakernel/setup.py
diff --git a/megakernel/torch_bindings.cpp b/optimizations/megakernel/torch_bindings.cpp
similarity index 100%
rename from megakernel/torch_bindings.cpp
rename to optimizations/megakernel/torch_bindings.cpp
diff --git a/pflash/README.md b/optimizations/pflash/README.md
similarity index 92%
rename from pflash/README.md
rename to optimizations/pflash/README.md
index 22be8a48e..47b9f81cd 100644
--- a/pflash/README.md
+++ b/optimizations/pflash/README.md
@@ -41,7 +41,7 @@ Long-context prefill is O(S²): vanilla llama.cpp on a single RTX 3090 takes **~
- C++/CUDA daemon-resident drafter + scoring + target generation, all in one process, one ggml allocator.
- Custom Qwen3-0.6B BF16 forward (`qwen3_0p6b_loader.cpp` + `qwen3_0p6b_graph.cpp`) — no libllama.
- 4 CUDA kernels for the FlashPrefill `mean_K → score → select → sparse_fwd` algorithm (`flashprefill_kernels.cu`).
-- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention), FA-2 derived, sm_80+) for the long-context drafter forward, wired without `libtorch` via 3 ATen/c10 header stubs (`dflash/deps/bsa_stubs/`).
+- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention), FA-2 derived, sm_80+) for the long-context drafter forward, wired without `libtorch` via 3 ATen/c10 header stubs (`server/deps/bsa_stubs/`).
- 128K → 2.6K span selection at `keep_ratio=0.05`, NIAH retrieved at every measured context, decode ~74 tok/s downstream.
## Results
@@ -57,7 +57,7 @@ Decode after prefill: ~74 tok/s (dflash spec decode + DDTree). The pipeline is t
## Quick start
-PFlash is the algorithm. The implementation lives in [`../dflash/`](../dflash/) as part of the dflash daemon. The `pflash/` directory in this repo only contains the Python tooling for **benchmarking** (NIAH case generation, bench harness around the daemon stdin protocol). Production deploys hit the dflash daemon directly.
+PFlash is the algorithm. The implementation lives in [`../../server/`](../../server/) as part of the dflash daemon. The `optimizations/pflash/` directory in this repo only contains the Python tooling for **benchmarking** (NIAH case generation, bench harness around the daemon stdin protocol). Production deploys hit the dflash daemon directly.
```bash
# 1. from the repo root, install Python deps and build dflash with the BSA
@@ -65,34 +65,34 @@ PFlash is the algorithm. The implementation lives in [`../dflash/`](../dflash/)
cd lucebox-hub
uv sync
git submodule update --init --recursive
-cmake -B dflash/build -S dflash -DCMAKE_BUILD_TYPE=Release \
+cmake -B server/build -S server -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=86 \
-DDFLASH27B_ENABLE_BSA=ON
-cmake --build dflash/build --target test_dflash test_flashprefill_kernels -j
+cmake --build server/build --target test_dflash test_flashprefill_kernels -j
# 2. fetch weights (target + spec-decode draft + drafter scorer)
-uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir dflash/models/
-uv run hf download Qwen/Qwen3-0.6B model.safetensors tokenizer.json --local-dir dflash/models/drafter/
-uv run hf download z-lab/Qwen3.6-27B-DFlash model.safetensors --local-dir dflash/models/draft/
+uv run hf download unsloth/Qwen3.6-27B-GGUF Qwen3.6-27B-Q4_K_M.gguf --local-dir server/models/
+uv run hf download Qwen/Qwen3-0.6B model.safetensors tokenizer.json --local-dir server/models/drafter/
+uv run hf download z-lab/Qwen3.6-27B-DFlash model.safetensors --local-dir server/models/draft/
# 2b. convert the drafter (Qwen3-0.6B HF) to a BF16 GGUF for the C++ scorer.
# The submodule already vendors llama.cpp at deps/llama.cpp.
-uv run python dflash/deps/llama.cpp/convert_hf_to_gguf.py dflash/models/drafter \
- --outtype bf16 --outfile dflash/models/Qwen3-0.6B-BF16.gguf
+uv run python server/deps/llama.cpp/convert_hf_to_gguf.py server/models/drafter \
+ --outtype bf16 --outfile server/models/Qwen3-0.6B-BF16.gguf
# 3. generate NIAH cases + run head-to-head bench against the C++ daemon
-uv run --directory pflash python tests/niah_gen.py --n 1 --ctx 131072 --out /tmp/niah_128k.jsonl
-uv run --directory pflash python tests/bench_niah_cpp.py \
- --bin ../dflash/build/test_dflash \
- --target ../dflash/models/Qwen3.6-27B-Q4_K_M.gguf \
- --draft-spec ../dflash/models/draft/model.safetensors \
- --drafter-gguf ../dflash/models/Qwen3-0.6B-BF16.gguf \
+uv run --directory optimizations/pflash python tests/niah_gen.py --n 1 --ctx 131072 --out /tmp/niah_128k.jsonl
+uv run --directory optimizations/pflash python tests/bench_niah_cpp.py \
+ --bin ../../server/build/test_dflash \
+ --target ../../server/models/Qwen3.6-27B-Q4_K_M.gguf \
+ --draft-spec ../../server/models/draft/model.safetensors \
+ --drafter-gguf ../../server/models/Qwen3-0.6B-BF16.gguf \
--cases /tmp/niah_128k.jsonl --keep-ratio 0.05 --n-gen 256
```
## OpenAI server flags
-For an OpenAI-compatible server with transparent compression on long prompts, run [`dflash/scripts/server.py`](../dflash/scripts/server.py) with these flags:
+For an OpenAI-compatible server with transparent compression on long prompts, run [`server/scripts/server.py`](../../server/scripts/server.py) with these flags:
| Flag | Choices / type | Default | Effect |
|---|---|:---:|---|
@@ -105,14 +105,14 @@ For an OpenAI-compatible server with transparent compression on long prompts, ru
When `--prefill-compression != off`, the server auto-sets `DFLASH27B_LM_HEAD_FIX=0` and `DFLASH27B_FA_WINDOW=0` (matching the bench harness — needed so the post-compress draft graph fits on a 24 GB card without OOM).
```bash
-python dflash/scripts/server.py \
- --target dflash/models/Qwen3.6-27B-Q4_K_M.gguf \
- --draft dflash/models/draft/model.safetensors \
+python server/scripts/server.py \
+ --target server/models/Qwen3.6-27B-Q4_K_M.gguf \
+ --draft server/models/draft/model.safetensors \
--max-ctx 8192 --budget 16 --fa-window 0 \
--prefill-compression auto \
--prefill-threshold 4096 \
--prefill-keep-ratio 0.02 \
- --prefill-drafter dflash/models/Qwen3-0.6B-BF16.gguf
+ --prefill-drafter server/models/Qwen3-0.6B-BF16.gguf
```
Below the threshold the server runs the standard target generate (no compression). Above it, the server transparently runs `compress` on the daemon, swaps the prompt for the compressed text, and continues the normal `/v1/chat/completions` flow. Tool-calling requests (`req.tools` non-empty) skip compression so JSON tool definitions stay intact.
@@ -137,7 +137,7 @@ Typical flow at 128K on a 24 GB card: `park target` → `compress` → `free dra
## Runtime tunables
-Everything is configured via env vars on the daemon process. Full list in [`../dflash/src/flashprefill.h`](../dflash/src/flashprefill.h).
+Everything is configured via env vars on the daemon process. Full list in [`../../server/src/flashprefill.h`](../../server/src/flashprefill.h).
| Env var | Default | Purpose |
|---|:---:|---|
@@ -205,7 +205,7 @@ The algorithms are not ours:
What we built:
- C++/CUDA port of the FlashPrefill algorithm: 4 kernels (`mean_K / score / select / sparse_fwd`), no Triton dependency.
-- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention)) wired without `libtorch` via 3 ATen/c10 header stubs (`dflash/deps/bsa_stubs/`).
+- BSA ([mit-han-lab/Block-Sparse-Attention](https://github.com/mit-han-lab/Block-Sparse-Attention)) wired without `libtorch` via 3 ATen/c10 header stubs (`server/deps/bsa_stubs/`).
- Custom Qwen3-0.6B BF16 forward so the drafter runs through the same ggml allocator as the 27B target.
- Daemon stdin protocol (`compress` / `generate` / `park` / `unpark` / `free drafter`) so target + drafter coexist on a 24 GB card.
- NIAH harness against `llama-bench` for end-to-end validation.
diff --git a/pflash/demo.gif b/optimizations/pflash/demo.gif
similarity index 100%
rename from pflash/demo.gif
rename to optimizations/pflash/demo.gif
diff --git a/pflash/hero.png b/optimizations/pflash/hero.png
similarity index 100%
rename from pflash/hero.png
rename to optimizations/pflash/hero.png
diff --git a/pflash/pflash/__init__.py b/optimizations/pflash/pflash/__init__.py
similarity index 100%
rename from pflash/pflash/__init__.py
rename to optimizations/pflash/pflash/__init__.py
diff --git a/pflash/pflash/config.py b/optimizations/pflash/pflash/config.py
similarity index 100%
rename from pflash/pflash/config.py
rename to optimizations/pflash/pflash/config.py
diff --git a/pflash/pflash/dflash_client.py b/optimizations/pflash/pflash/dflash_client.py
similarity index 100%
rename from pflash/pflash/dflash_client.py
rename to optimizations/pflash/pflash/dflash_client.py
diff --git a/pflash/pyproject.toml b/optimizations/pflash/pyproject.toml
similarity index 100%
rename from pflash/pyproject.toml
rename to optimizations/pflash/pyproject.toml
diff --git a/pflash/tests/bench_niah_cpp.py b/optimizations/pflash/tests/bench_niah_cpp.py
similarity index 100%
rename from pflash/tests/bench_niah_cpp.py
rename to optimizations/pflash/tests/bench_niah_cpp.py
diff --git a/pflash/tests/niah_gen.py b/optimizations/pflash/tests/niah_gen.py
similarity index 100%
rename from pflash/tests/niah_gen.py
rename to optimizations/pflash/tests/niah_gen.py
diff --git a/pyproject.toml b/pyproject.toml
index 9fa072776..30dc2ee58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ package = false
no-build-isolation-package = ["qwen35-megakernel-bf16"]
[tool.uv.workspace]
-members = ["dflash", "megakernel", "pflash"]
+members = ["server", "optimizations/megakernel", "optimizations/pflash"]
[tool.uv.sources]
lucebox-dflash = { workspace = true }
diff --git a/scripts/check_uv_workspace.sh b/scripts/check_uv_workspace.sh
index 9015a48ed..618c57011 100755
--- a/scripts/check_uv_workspace.sh
+++ b/scripts/check_uv_workspace.sh
@@ -21,14 +21,14 @@ import numpy
import transformers
import uvicorn
-sys.path.insert(0, "dflash/scripts")
+sys.path.insert(0, "server/scripts")
import server # noqa: F401
print("workspace import OK from repo root")
PY
(
- cd dflash
+ cd server
uv run --frozen --no-sync python - <<'PY'
from pathlib import Path
import sys
@@ -38,6 +38,6 @@ assert Path(sys.prefix).resolve() == root_venv, (sys.prefix, root_venv)
sys.path.insert(0, "scripts")
import server # noqa: F401
-print("workspace discovery OK from dflash/")
+print("workspace discovery OK from server/")
PY
)
diff --git a/dflash/.gitignore b/server/.gitignore
similarity index 100%
rename from dflash/.gitignore
rename to server/.gitignore
diff --git a/dflash/CMakeLists.txt b/server/CMakeLists.txt
similarity index 100%
rename from dflash/CMakeLists.txt
rename to server/CMakeLists.txt
diff --git a/dflash/CODEX.md b/server/CODEX.md
similarity index 100%
rename from dflash/CODEX.md
rename to server/CODEX.md
diff --git a/dflash/DEVELOPER.md b/server/DEVELOPER.md
similarity index 96%
rename from dflash/DEVELOPER.md
rename to server/DEVELOPER.md
index 6cda569a0..1c6ed16c2 100644
--- a/dflash/DEVELOPER.md
+++ b/server/DEVELOPER.md
@@ -22,7 +22,7 @@ build-essential cmake git git-lfs nvcc (CUDA Toolkit)
A setup script is provided that installs everything (run as root):
```bash
-sudo bash dflash/scripts/setup_system.sh
+sudo bash server/scripts/setup_system.sh
```
This installs build tools, `hf` (via pipx), and the CUDA Toolkit.
@@ -71,7 +71,7 @@ cmake -B build -S . -DCMAKE_BUILD_TYPE=Release
cmake --build build --target test_dflash -j
```
-The binary lands at `dflash/build/test_dflash`.
+The binary lands at `server/build/test_dflash`.
### CMake options
@@ -90,16 +90,16 @@ Download models before running the server:
```bash
# Target model (Q4_K_M quantized Qwen3.6-27B)
-hf download --local-dir dflash/models/
+hf download --local-dir server/models/
# Draft model (1.84 GB default Qwen3.6 GGUF draft)
-hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir dflash/models/draft/
+hf download Lucebox/Qwen3.6-27B-DFlash-GGUF dflash-draft-3.6-q8_0.gguf --local-dir server/models/draft/
```
Expected layout:
```
-dflash/models/
+server/models/
├── Qwen3.6-27B-Q4_K_M.gguf # --target (GGUF)
└── draft/
└── dflash-draft-3.6-q8_0.gguf # --draft (GGUF)
@@ -155,7 +155,7 @@ python scripts/server.py
These tests **do not** require a GPU or running daemon — they use mocked backends:
```bash
-cd dflash/scripts
+cd server/scripts
python -m pytest test_server.py -v
```
@@ -186,7 +186,7 @@ run the baseline tests above to validate code changes.
After building:
```bash
-cd dflash/build
+cd server/build
# Numerics tests
./test_vs_oracle --target ../models/Qwen3.6-27B-Q4_K_M.gguf \
@@ -203,7 +203,7 @@ cd dflash/build
These scripts start their own server subprocess and need the daemon binary + models:
```bash
-cd dflash/scripts
+cd server/scripts
python test_server_prefix_cache.py
python test_multi_turn_prefix_cache.py
python test_full_compress_cache.py
@@ -214,7 +214,7 @@ python test_full_compress_cache.py
## Project structure
```
-dflash/
+server/
├── CMakeLists.txt # C++ build (cmake)
├── include/ # C++ headers
├── src/ # C++ sources (target/draft graph, KV cache, FlashPrefill)
@@ -267,7 +267,7 @@ No `env_key` is needed for local use.
```bash
# Start the server
-python dflash/scripts/server.py --port 8080
+python server/scripts/server.py --port 8080
# In another terminal
codex --provider dflash "Explain this codebase"
diff --git a/dflash/README.md b/server/README.md
similarity index 99%
rename from dflash/README.md
rename to server/README.md
index 35bc2a2b7..2a7ee6343 100644
--- a/dflash/README.md
+++ b/server/README.md
@@ -273,7 +273,7 @@ The 131K `keep=0.10` run depends on token-boundary repair in `scripts/laguna_pfl
The table above uses synthetic uniform filler. Pass `--filler-file ` to
use a real corpus instead (file or directory; directories are recursively
-concatenated). On `dflash/src` (1.3 MiB of C++/CUDA, ctx=16K, depth=0.5):
+concatenated). On `server/src` (1.3 MiB of C++/CUDA, ctx=16K, depth=0.5):
| keep | drafter compressed | NIAH |
|:----:|-------------------:|:----:|
@@ -457,7 +457,7 @@ only wire protocol used by [OpenAI Codex](https://github.com/openai/codex).
### 1. Start the DFlash server
```bash
-python dflash/scripts/server.py \
+python server/scripts/server.py \
--target models/Qwen3.5-27B-Q4_K_M.gguf \
--draft models/Qwen3.5-3B-f16.safetensors \
--budget 22 --port 8080
diff --git a/dflash/RESULTS.md b/server/RESULTS.md
similarity index 99%
rename from dflash/RESULTS.md
rename to server/RESULTS.md
index b110e9c54..09fe73536 100644
--- a/dflash/RESULTS.md
+++ b/server/RESULTS.md
@@ -577,7 +577,7 @@ Draft: local Qwen3.6-27B DFlash safetensors + Qwen3-0.6B-BF16 PFlash drafter.
Build: `cmake -B build-luce-sm120 -S . -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=120 -DDFLASH27B_USER_CUDA_ARCHITECTURES=120 -DDFLASH27B_ENABLE_BSA=ON`
-Final ("V4") runtime config — driven via the `pflash/tests/bench_niah_cpp.py`
+Final ("V4") runtime config — driven via the `optimizations/pflash/tests/bench_niah_cpp.py`
CLI flags added in #90 plus daemon env vars (each bullet leads with the
exact interface):
@@ -589,7 +589,7 @@ exact interface):
- `--n-gen=1024`
Test set: 10 NIAH prompts at 117K tokens (margin under Qwen3.6-27B's 131K
-native RoPE limit, generated with [`pflash/tests/niah_gen.py`](../pflash/tests/niah_gen.py)
+native RoPE limit, generated with [`optimizations/pflash/tests/niah_gen.py`](../optimizations/pflash/tests/niah_gen.py)
at calibrated `char_per_tok`).
### RTX 5090 long-ctx headline
@@ -774,7 +774,7 @@ Linux 4090 should match or exceed the 3090 numbers.
Running via `server.py` (OpenAI-compatible HTTP) with TQ3 KV cache and 128K context:
```bash
-DFLASH27B_KV_TQ3=1 python dflash/scripts/server.py \
+DFLASH27B_KV_TQ3=1 python server/scripts/server.py \
--target Qwen3.6-27B-Q4_K_M.gguf --draft dflash-draft-3.6-q8_0.gguf \
--port 8082 --budget 28 --max-ctx 131072
```
diff --git a/dflash/demo.gif b/server/demo.gif
similarity index 100%
rename from dflash/demo.gif
rename to server/demo.gif
diff --git a/dflash/deps/Block-Sparse-Attention b/server/deps/Block-Sparse-Attention
similarity index 100%
rename from dflash/deps/Block-Sparse-Attention
rename to server/deps/Block-Sparse-Attention
diff --git a/dflash/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h b/server/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h
similarity index 100%
rename from dflash/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h
rename to server/deps/bsa_stubs/ATen/cuda/CUDAGeneratorImpl.h
diff --git a/dflash/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh b/server/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh
similarity index 100%
rename from dflash/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh
rename to server/deps/bsa_stubs/ATen/cuda/CUDAGraphsUtils.cuh
diff --git a/dflash/deps/bsa_stubs/README.md b/server/deps/bsa_stubs/README.md
similarity index 95%
rename from dflash/deps/bsa_stubs/README.md
rename to server/deps/bsa_stubs/README.md
index a0b0c6864..d0abfb125 100644
--- a/dflash/deps/bsa_stubs/README.md
+++ b/server/deps/bsa_stubs/README.md
@@ -16,7 +16,7 @@ the references BSA actually uses:
returning `{seed, offset}` from the stub state.
These headers are placed FIRST on the BSA include path
-(`dflash/CMakeLists.txt`, gated on `DFLASH27B_ENABLE_BSA`). When BSA's
+(`server/CMakeLists.txt`, gated on `DFLASH27B_ENABLE_BSA`). When BSA's
generated CUDA includes ``, the compiler picks up
this stub instead of trying to find PyTorch.
diff --git a/dflash/deps/bsa_stubs/c10/cuda/CUDAException.h b/server/deps/bsa_stubs/c10/cuda/CUDAException.h
similarity index 100%
rename from dflash/deps/bsa_stubs/c10/cuda/CUDAException.h
rename to server/deps/bsa_stubs/c10/cuda/CUDAException.h
diff --git a/dflash/deps/llama.cpp b/server/deps/llama.cpp
similarity index 100%
rename from dflash/deps/llama.cpp
rename to server/deps/llama.cpp
diff --git a/dflash/docs/API.md b/server/docs/API.md
similarity index 100%
rename from dflash/docs/API.md
rename to server/docs/API.md
diff --git a/dflash/docs/ARCHITECTURE.md b/server/docs/ARCHITECTURE.md
similarity index 99%
rename from dflash/docs/ARCHITECTURE.md
rename to server/docs/ARCHITECTURE.md
index 1583aed71..dfba75840 100644
--- a/dflash/docs/ARCHITECTURE.md
+++ b/server/docs/ARCHITECTURE.md
@@ -36,7 +36,7 @@ using a uniform stdin/stdout protocol.
## Directory Structure
```
-dflash/src/
+server/src/
├── common/ # Shared infrastructure (all backends)
│ ├── model_backend.h # ModelBackend abstract interface
│ ├── snapshot_backend.h # Platform-aware snapshot backend selection
@@ -406,7 +406,7 @@ by Qwen35Backend.
### Step 6: Build and Test
```bash
-cd dflash/build && cmake .. -DCMAKE_BUILD_TYPE=Release && cmake --build . -j$(nproc)
+cd server/build && cmake .. -DCMAKE_BUILD_TYPE=Release && cmake --build . -j$(nproc)
# AR baseline
./test_dflash daemon --target ../../models/gemma4.gguf
diff --git a/dflash/docs/HIP_PERF_PLAN.md b/server/docs/HIP_PERF_PLAN.md
similarity index 100%
rename from dflash/docs/HIP_PERF_PLAN.md
rename to server/docs/HIP_PERF_PLAN.md
diff --git a/dflash/docs/MIXED_BACKEND.md b/server/docs/MIXED_BACKEND.md
similarity index 100%
rename from dflash/docs/MIXED_BACKEND.md
rename to server/docs/MIXED_BACKEND.md
diff --git a/dflash/docs/PREFIX_CACHE.md b/server/docs/PREFIX_CACHE.md
similarity index 100%
rename from dflash/docs/PREFIX_CACHE.md
rename to server/docs/PREFIX_CACHE.md
diff --git a/dflash/docs/SPEC_PREFILL.md b/server/docs/SPEC_PREFILL.md
similarity index 97%
rename from dflash/docs/SPEC_PREFILL.md
rename to server/docs/SPEC_PREFILL.md
index cb2837dce..05ff617d0 100644
--- a/dflash/docs/SPEC_PREFILL.md
+++ b/server/docs/SPEC_PREFILL.md
@@ -4,7 +4,7 @@ In-process speculative-prefill + speculative-decode daemon (C++/CUDA only,
no Python, no Triton, no PyTorch at runtime).
This doc is the build / runtime / tunables reference for the C++ daemon
-path described in [`pflash/README.md`](../../pflash/README.md) and on the
+path described in [`optimizations/pflash/README.md`](../../optimizations/pflash/README.md) and on the
[blog post](https://lucebox.com/blog/pflash):
- **Drafter** (Qwen3-0.6B) loaded via a custom forward (`qwen3_*`)
diff --git a/dflash/docs/laguna_integration_plan.md b/server/docs/laguna_integration_plan.md
similarity index 93%
rename from dflash/docs/laguna_integration_plan.md
rename to server/docs/laguna_integration_plan.md
index fcdc55057..60423e922 100644
--- a/dflash/docs/laguna_integration_plan.md
+++ b/server/docs/laguna_integration_plan.md
@@ -12,26 +12,26 @@ Status: scaffolding. PR #115 in lucebox-hub bumps llama.cpp submodule to `luce-d
## Constraint
-No libllama dependency in dflash runtime. Keep ggml-only stack. (libllama+LAGUNA arch from PR #7 is used by quantize/inspect tools at /workspace/lucebox-hub/dflash/deps/llama.cpp/build-standalone/ and at HF upload time, not by the daemon.)
+No libllama dependency in dflash runtime. Keep ggml-only stack. (libllama+LAGUNA arch from PR #7 is used by quantize/inspect tools at /workspace/lucebox-hub/server/deps/llama.cpp/build-standalone/ and at HF upload time, not by the daemon.)
## Implementation outline
### Files to add
-1. `dflash/src/laguna_internal.h` (NEW, ~200 LOC) — structs:
+1. `server/src/laguna_internal.h` (NEW, ~200 LOC) — structs:
- `LagunaTargetLayer` — per-layer tensors (attn_norm, wq/wk/wv/wo, q_norm/k_norm, attn_gate, ffn_norm, dense MLP for layer 0, MoE: ffn_gate_inp + ffn_exp_probs_b + ffn_gate_exps + ffn_up_exps + ffn_down_exps + ffn_gate_shexp + ffn_up_shexp + ffn_down_shexp)
- `LagunaTargetWeights` — collection of layers + tok_embd + output_norm + output, plus metadata (n_layer=40, n_head_per_layer[40] = [48,64,64,64]*10, n_head_kv=8, head_dim=128, n_embd=2048, n_ff=8192, n_ff_exp=512, n_ff_shexp=512, n_expert=256, n_expert_used=8, expert_weights_scale=2.5, sliding_window=512, rope_freq_base_full=500000, rope_freq_base_swa=10000, n_rot_full=64, n_rot_swa=128, eos_id=2, eot_id=24)
- `LagunaTargetCache` — KV cache (Q8_0, per layer, max_ctx tokens), no SSM/conv state
- `LagunaGraphInputs` / `LagunaGraphOutputs`
-2. `dflash/src/laguna_target_loader.cpp` (NEW, ~500 LOC):
+2. `server/src/laguna_target_loader.cpp` (NEW, ~500 LOC):
- `load_target_gguf_laguna(path, backend, LagunaTargetWeights & out)`
- Validates `arch == "laguna"`, reads all hparams, mmaps GGUF, copies tensors to ggml_backend buffer
- Per-layer head count: reads `laguna.attention.head_count` as ARRAY (length 40) into `n_head_arr`
- Tensor naming: matches gguf-py's MODEL_ARCH.LAGUNA list (token_embd, output_norm, output, blk..{attn_norm, attn_q, attn_k, attn_v, attn_output, attn_q_norm, attn_k_norm, attn_gate, ffn_norm, ffn_gate, ffn_down, ffn_up, ffn_gate_inp, ffn_gate_exps, ffn_down_exps, ffn_up_exps, ffn_gate_shexp, ffn_up_shexp, ffn_down_shexp, exp_probs_b})
- Layer 0: dense MLP (ffn_gate/down/up). Layers 1-39: sparse MoE (ffn_*_exps + shexp + gate_inp + exp_probs_b)
-3. `dflash/src/laguna_target_graph.cpp` (NEW, ~1500 LOC — multi-session):
+3. `server/src/laguna_target_graph.cpp` (NEW, ~1500 LOC — multi-session):
- `build_laguna_full_attn_block` — full attention layer with YaRN RoPE (theta=500K, factor=32, partial_rotary=0.5, n_rot=64), per-head softplus gate, head_count from per-layer arr (48 on full)
- `build_laguna_swa_block` — sliding-window attention layer (window=512, theta=10K, partial_rotary=1.0, n_rot=128), per-head softplus gate, head_count=64
- `build_laguna_dense_mlp` — SwiGLU dense MLP (layer 0)
@@ -42,23 +42,23 @@ No libllama dependency in dflash runtime. Keep ggml-only stack. (libllama+LAGUNA
- Reuses `flash_prefill_forward_bf16` for sparse prefill on full-attention layers (for sliding-window layers, use dense FA since window=512 is small)
- Cache mgmt: `create_laguna_target_cache`, `free_laguna_target_cache`, `reset_laguna_target_cache`, `snapshot_laguna_target_cache`, `restore_laguna_target_cache`
-4. Modify `dflash/src/gguf_target_loader.cpp` (~30 LOC added):
+4. Modify `server/src/gguf_target_loader.cpp` (~30 LOC added):
- Pre-detect arch string from GGUF header
- Dispatch: arch == "qwen35" → existing path, arch == "laguna" → new path
-5. Modify `dflash/src/internal.h` (~50 LOC added):
+5. Modify `server/src/internal.h` (~50 LOC added):
- `enum class TargetArch { Qwen35, Laguna }` to tag the loaded weights
- Forward decls for Laguna structs / functions (or include `laguna_internal.h`)
-6. Modify `dflash/CMakeLists.txt`:
+6. Modify `server/CMakeLists.txt`:
- Add `src/laguna_target_loader.cpp` and `src/laguna_target_graph.cpp` to `dflash27b` library sources
-7. Modify `dflash/test/test_dflash.cpp` (substantial changes — multi-session):
+7. Modify `server/test/test_dflash.cpp` (substantial changes — multi-session):
- Detect arch from loaded weights
- For Laguna arch, use `LagunaTargetCache` + `build_laguna_graph` instead of qwen35 equivalents
- Adjust per-layer-head-count in attention buffer sizing
- PFlash drafter call unchanged (drafter is Qwen3-0.6B regardless of target)
- - Cross-tokenizer mapping (Qwen3 IDs → Laguna IDs): byte-level round-trip via existing pflash/ Python module OR port to C++ helper
+ - Cross-tokenizer mapping (Qwen3 IDs → Laguna IDs): byte-level round-trip via existing optimizations/pflash/ Python module OR port to C++ helper
## Phasing
diff --git a/dflash/eval/README.md b/server/eval/README.md
similarity index 94%
rename from dflash/eval/README.md
rename to server/eval/README.md
index 0b7bcb716..8397fb16c 100644
--- a/dflash/eval/README.md
+++ b/server/eval/README.md
@@ -8,7 +8,7 @@ produce different bytes for the same prompt?", not "is the answer correct".
## quick A/B (`scripts/quality_ab_simple.py`)
Runs ~7 short conversational prompts against several server configs. For each
-config it spawns a fresh `dflash/scripts/server.py`, fires the prompts in
+config it spawns a fresh `server/scripts/server.py`, fires the prompts in
sequence, then tears the server down. At the end it prints a markdown table
comparing each config against the matching baseline (configs ending in `_f16`
are compared against `baseline_f16` so attention precision is held constant).
@@ -16,9 +16,9 @@ are compared against `baseline_f16` so attention precision is held constant).
```
PFLASH_TARGET=/path/to/target.gguf \
PFLASH_DRAFT=/path/to/draft-dir-or-safetensors \
-PFLASH_BIN=dflash/build/test_dflash \
+PFLASH_BIN=server/build/test_dflash \
PFLASH_DRAFTER=/path/to/Qwen3-0.6B-BF16.gguf \
-python3 dflash/scripts/quality_ab_simple.py
+python3 server/scripts/quality_ab_simple.py
```
Configs are defined in `CONFIGS` near the top of the script. Each spawns one
diff --git a/dflash/eval/humaneval_plus/SOURCE.txt b/server/eval/humaneval_plus/SOURCE.txt
similarity index 100%
rename from dflash/eval/humaneval_plus/SOURCE.txt
rename to server/eval/humaneval_plus/SOURCE.txt
diff --git a/dflash/eval/humaneval_plus/humanevalplus.jsonl b/server/eval/humaneval_plus/humanevalplus.jsonl
similarity index 100%
rename from dflash/eval/humaneval_plus/humanevalplus.jsonl
rename to server/eval/humaneval_plus/humanevalplus.jsonl
diff --git a/dflash/eval/mt_bench/SOURCE.txt b/server/eval/mt_bench/SOURCE.txt
similarity index 100%
rename from dflash/eval/mt_bench/SOURCE.txt
rename to server/eval/mt_bench/SOURCE.txt
diff --git a/dflash/eval/mt_bench/question.jsonl b/server/eval/mt_bench/question.jsonl
similarity index 100%
rename from dflash/eval/mt_bench/question.jsonl
rename to server/eval/mt_bench/question.jsonl
diff --git a/dflash/examples/chat.py b/server/examples/chat.py
similarity index 100%
rename from dflash/examples/chat.py
rename to server/examples/chat.py
diff --git a/dflash/hero.png b/server/hero.png
similarity index 100%
rename from dflash/hero.png
rename to server/hero.png
diff --git a/dflash/hero.raw.png b/server/hero.raw.png
similarity index 100%
rename from dflash/hero.raw.png
rename to server/hero.raw.png
diff --git a/dflash/hip_compat/cuda_bf16.h b/server/hip_compat/cuda_bf16.h
similarity index 100%
rename from dflash/hip_compat/cuda_bf16.h
rename to server/hip_compat/cuda_bf16.h
diff --git a/dflash/hip_compat/cuda_fp16.h b/server/hip_compat/cuda_fp16.h
similarity index 100%
rename from dflash/hip_compat/cuda_fp16.h
rename to server/hip_compat/cuda_fp16.h
diff --git a/dflash/hip_compat/cuda_runtime.h b/server/hip_compat/cuda_runtime.h
similarity index 100%
rename from dflash/hip_compat/cuda_runtime.h
rename to server/hip_compat/cuda_runtime.h
diff --git a/dflash/hip_compat/mma.h b/server/hip_compat/mma.h
similarity index 100%
rename from dflash/hip_compat/mma.h
rename to server/hip_compat/mma.h
diff --git a/dflash/include/dflash27b.h b/server/include/dflash27b.h
similarity index 100%
rename from dflash/include/dflash27b.h
rename to server/include/dflash27b.h
diff --git a/dflash/pyproject.toml b/server/pyproject.toml
similarity index 100%
rename from dflash/pyproject.toml
rename to server/pyproject.toml
diff --git a/dflash/scripts/_prefill_hook.py b/server/scripts/_prefill_hook.py
similarity index 100%
rename from dflash/scripts/_prefill_hook.py
rename to server/scripts/_prefill_hook.py
diff --git a/dflash/scripts/bench_agent.py b/server/scripts/bench_agent.py
similarity index 100%
rename from dflash/scripts/bench_agent.py
rename to server/scripts/bench_agent.py
diff --git a/dflash/scripts/bench_agent_loop.py b/server/scripts/bench_agent_loop.py
similarity index 100%
rename from dflash/scripts/bench_agent_loop.py
rename to server/scripts/bench_agent_loop.py
diff --git a/dflash/scripts/bench_daemon.py b/server/scripts/bench_daemon.py
similarity index 100%
rename from dflash/scripts/bench_daemon.py
rename to server/scripts/bench_daemon.py
diff --git a/dflash/scripts/bench_he.py b/server/scripts/bench_he.py
similarity index 100%
rename from dflash/scripts/bench_he.py
rename to server/scripts/bench_he.py
diff --git a/dflash/scripts/bench_he_http.py b/server/scripts/bench_he_http.py
similarity index 100%
rename from dflash/scripts/bench_he_http.py
rename to server/scripts/bench_he_http.py
diff --git a/dflash/scripts/bench_llm.py b/server/scripts/bench_llm.py
similarity index 100%
rename from dflash/scripts/bench_llm.py
rename to server/scripts/bench_llm.py
diff --git a/dflash/scripts/bench_server.py b/server/scripts/bench_server.py
similarity index 100%
rename from dflash/scripts/bench_server.py
rename to server/scripts/bench_server.py
diff --git a/dflash/scripts/convert_dflash_to_gguf.py b/server/scripts/convert_dflash_to_gguf.py
similarity index 100%
rename from dflash/scripts/convert_dflash_to_gguf.py
rename to server/scripts/convert_dflash_to_gguf.py
diff --git a/dflash/scripts/detokenize.py b/server/scripts/detokenize.py
similarity index 100%
rename from dflash/scripts/detokenize.py
rename to server/scripts/detokenize.py
diff --git a/dflash/scripts/fixtures/agent_prompts/codex_apply_patch.md b/server/scripts/fixtures/agent_prompts/codex_apply_patch.md
similarity index 100%
rename from dflash/scripts/fixtures/agent_prompts/codex_apply_patch.md
rename to server/scripts/fixtures/agent_prompts/codex_apply_patch.md
diff --git a/dflash/scripts/fixtures/agent_prompts/codex_gpt52.md b/server/scripts/fixtures/agent_prompts/codex_gpt52.md
similarity index 100%
rename from dflash/scripts/fixtures/agent_prompts/codex_gpt52.md
rename to server/scripts/fixtures/agent_prompts/codex_gpt52.md
diff --git a/dflash/scripts/fixtures/agent_prompts/codex_gpt52_codex.md b/server/scripts/fixtures/agent_prompts/codex_gpt52_codex.md
similarity index 100%
rename from dflash/scripts/fixtures/agent_prompts/codex_gpt52_codex.md
rename to server/scripts/fixtures/agent_prompts/codex_gpt52_codex.md
diff --git a/dflash/scripts/fixtures/agent_prompts/codex_gpt5_codex.md b/server/scripts/fixtures/agent_prompts/codex_gpt5_codex.md
similarity index 100%
rename from dflash/scripts/fixtures/agent_prompts/codex_gpt5_codex.md
rename to server/scripts/fixtures/agent_prompts/codex_gpt5_codex.md
diff --git a/dflash/scripts/fixtures/swe_bench/swe_bench_verified.parquet b/server/scripts/fixtures/swe_bench/swe_bench_verified.parquet
similarity index 100%
rename from dflash/scripts/fixtures/swe_bench/swe_bench_verified.parquet
rename to server/scripts/fixtures/swe_bench/swe_bench_verified.parquet
diff --git a/dflash/scripts/gen_oracle.py b/server/scripts/gen_oracle.py
similarity index 100%
rename from dflash/scripts/gen_oracle.py
rename to server/scripts/gen_oracle.py
diff --git a/dflash/scripts/laguna_pflash_niah.py b/server/scripts/laguna_pflash_niah.py
similarity index 100%
rename from dflash/scripts/laguna_pflash_niah.py
rename to server/scripts/laguna_pflash_niah.py
diff --git a/dflash/scripts/parity_laguna.py b/server/scripts/parity_laguna.py
similarity index 100%
rename from dflash/scripts/parity_laguna.py
rename to server/scripts/parity_laguna.py
diff --git a/dflash/scripts/phase_split_dual_gpu.py b/server/scripts/phase_split_dual_gpu.py
similarity index 100%
rename from dflash/scripts/phase_split_dual_gpu.py
rename to server/scripts/phase_split_dual_gpu.py
diff --git a/dflash/scripts/placement/__init__.py b/server/scripts/placement/__init__.py
similarity index 100%
rename from dflash/scripts/placement/__init__.py
rename to server/scripts/placement/__init__.py
diff --git a/dflash/scripts/placement/backend_device.py b/server/scripts/placement/backend_device.py
similarity index 100%
rename from dflash/scripts/placement/backend_device.py
rename to server/scripts/placement/backend_device.py
diff --git a/dflash/scripts/placement/server_resolver.py b/server/scripts/placement/server_resolver.py
similarity index 100%
rename from dflash/scripts/placement/server_resolver.py
rename to server/scripts/placement/server_resolver.py
diff --git a/dflash/scripts/placement/test_dflash_args.py b/server/scripts/placement/test_dflash_args.py
similarity index 100%
rename from dflash/scripts/placement/test_dflash_args.py
rename to server/scripts/placement/test_dflash_args.py
diff --git a/dflash/scripts/prefix_cache.py b/server/scripts/prefix_cache.py
similarity index 100%
rename from dflash/scripts/prefix_cache.py
rename to server/scripts/prefix_cache.py
diff --git a/dflash/scripts/quality_ab_simple.py b/server/scripts/quality_ab_simple.py
similarity index 100%
rename from dflash/scripts/quality_ab_simple.py
rename to server/scripts/quality_ab_simple.py
diff --git a/dflash/scripts/quality_humaneval_plus.py b/server/scripts/quality_humaneval_plus.py
similarity index 100%
rename from dflash/scripts/quality_humaneval_plus.py
rename to server/scripts/quality_humaneval_plus.py
diff --git a/dflash/scripts/quantize_draft_q8.py b/server/scripts/quantize_draft_q8.py
similarity index 100%
rename from dflash/scripts/quantize_draft_q8.py
rename to server/scripts/quantize_draft_q8.py
diff --git a/dflash/scripts/quantize_gemma_dflash_q8.py b/server/scripts/quantize_gemma_dflash_q8.py
similarity index 100%
rename from dflash/scripts/quantize_gemma_dflash_q8.py
rename to server/scripts/quantize_gemma_dflash_q8.py
diff --git a/dflash/scripts/run.py b/server/scripts/run.py
similarity index 100%
rename from dflash/scripts/run.py
rename to server/scripts/run.py
diff --git a/dflash/scripts/server.py b/server/scripts/server.py
similarity index 100%
rename from dflash/scripts/server.py
rename to server/scripts/server.py
diff --git a/dflash/scripts/setup_system.sh b/server/scripts/setup_system.sh
similarity index 100%
rename from dflash/scripts/setup_system.sh
rename to server/scripts/setup_system.sh
diff --git a/dflash/scripts/test_full_compress_cache.py b/server/scripts/test_full_compress_cache.py
similarity index 100%
rename from dflash/scripts/test_full_compress_cache.py
rename to server/scripts/test_full_compress_cache.py
diff --git a/dflash/scripts/test_multi_turn_prefix_cache.py b/server/scripts/test_multi_turn_prefix_cache.py
similarity index 100%
rename from dflash/scripts/test_multi_turn_prefix_cache.py
rename to server/scripts/test_multi_turn_prefix_cache.py
diff --git a/dflash/scripts/test_prefix_cache.py b/server/scripts/test_prefix_cache.py
similarity index 100%
rename from dflash/scripts/test_prefix_cache.py
rename to server/scripts/test_prefix_cache.py
diff --git a/dflash/scripts/test_server.py b/server/scripts/test_server.py
similarity index 100%
rename from dflash/scripts/test_server.py
rename to server/scripts/test_server.py
diff --git a/dflash/scripts/test_server_integration.py b/server/scripts/test_server_integration.py
similarity index 100%
rename from dflash/scripts/test_server_integration.py
rename to server/scripts/test_server_integration.py
diff --git a/dflash/scripts/test_server_prefix_cache.py b/server/scripts/test_server_prefix_cache.py
similarity index 100%
rename from dflash/scripts/test_server_prefix_cache.py
rename to server/scripts/test_server_prefix_cache.py
diff --git a/dflash/scripts/test_tool_memory.py b/server/scripts/test_tool_memory.py
similarity index 100%
rename from dflash/scripts/test_tool_memory.py
rename to server/scripts/test_tool_memory.py
diff --git a/dflash/scripts/tokenize_prompt.py b/server/scripts/tokenize_prompt.py
similarity index 100%
rename from dflash/scripts/tokenize_prompt.py
rename to server/scripts/tokenize_prompt.py
diff --git a/dflash/scripts/tool_memory.py b/server/scripts/tool_memory.py
similarity index 100%
rename from dflash/scripts/tool_memory.py
rename to server/scripts/tool_memory.py
diff --git a/dflash/src/bsa_fwd_inst.cu b/server/src/bsa_fwd_inst.cu
similarity index 100%
rename from dflash/src/bsa_fwd_inst.cu
rename to server/src/bsa_fwd_inst.cu
diff --git a/dflash/src/bsa_launcher.cu b/server/src/bsa_launcher.cu
similarity index 100%
rename from dflash/src/bsa_launcher.cu
rename to server/src/bsa_launcher.cu
diff --git a/dflash/src/bsa_launcher_hip.cu b/server/src/bsa_launcher_hip.cu
similarity index 100%
rename from dflash/src/bsa_launcher_hip.cu
rename to server/src/bsa_launcher_hip.cu
diff --git a/dflash/src/common/attn_masks.h b/server/src/common/attn_masks.h
similarity index 100%
rename from dflash/src/common/attn_masks.h
rename to server/src/common/attn_masks.h
diff --git a/dflash/src/common/backend_factory.cpp b/server/src/common/backend_factory.cpp
similarity index 100%
rename from dflash/src/common/backend_factory.cpp
rename to server/src/common/backend_factory.cpp
diff --git a/dflash/src/common/backend_factory.h b/server/src/common/backend_factory.h
similarity index 100%
rename from dflash/src/common/backend_factory.h
rename to server/src/common/backend_factory.h
diff --git a/dflash/src/common/backend_ipc.cpp b/server/src/common/backend_ipc.cpp
similarity index 100%
rename from dflash/src/common/backend_ipc.cpp
rename to server/src/common/backend_ipc.cpp
diff --git a/dflash/src/common/backend_ipc.h b/server/src/common/backend_ipc.h
similarity index 100%
rename from dflash/src/common/backend_ipc.h
rename to server/src/common/backend_ipc.h
diff --git a/dflash/src/common/daemon_loop.cpp b/server/src/common/daemon_loop.cpp
similarity index 100%
rename from dflash/src/common/daemon_loop.cpp
rename to server/src/common/daemon_loop.cpp
diff --git a/dflash/src/common/daemon_loop.h b/server/src/common/daemon_loop.h
similarity index 100%
rename from dflash/src/common/daemon_loop.h
rename to server/src/common/daemon_loop.h
diff --git a/dflash/src/common/ddtree.cpp b/server/src/common/ddtree.cpp
similarity index 100%
rename from dflash/src/common/ddtree.cpp
rename to server/src/common/ddtree.cpp
diff --git a/dflash/src/common/ddtree.h b/server/src/common/ddtree.h
similarity index 100%
rename from dflash/src/common/ddtree.h
rename to server/src/common/ddtree.h
diff --git a/dflash/src/common/device_placement.h b/server/src/common/device_placement.h
similarity index 100%
rename from dflash/src/common/device_placement.h
rename to server/src/common/device_placement.h
diff --git a/dflash/src/common/dflash_capture.cpp b/server/src/common/dflash_capture.cpp
similarity index 100%
rename from dflash/src/common/dflash_capture.cpp
rename to server/src/common/dflash_capture.cpp
diff --git a/dflash/src/common/dflash_capture.h b/server/src/common/dflash_capture.h
similarity index 100%
rename from dflash/src/common/dflash_capture.h
rename to server/src/common/dflash_capture.h
diff --git a/dflash/src/common/dflash_draft_graph.cpp b/server/src/common/dflash_draft_graph.cpp
similarity index 100%
rename from dflash/src/common/dflash_draft_graph.cpp
rename to server/src/common/dflash_draft_graph.cpp
diff --git a/dflash/src/common/dflash_draft_graph.h b/server/src/common/dflash_draft_graph.h
similarity index 100%
rename from dflash/src/common/dflash_draft_graph.h
rename to server/src/common/dflash_draft_graph.h
diff --git a/dflash/src/common/dflash_draft_ipc.cpp b/server/src/common/dflash_draft_ipc.cpp
similarity index 100%
rename from dflash/src/common/dflash_draft_ipc.cpp
rename to server/src/common/dflash_draft_ipc.cpp
diff --git a/dflash/src/common/dflash_draft_ipc.h b/server/src/common/dflash_draft_ipc.h
similarity index 100%
rename from dflash/src/common/dflash_draft_ipc.h
rename to server/src/common/dflash_draft_ipc.h
diff --git a/dflash/src/common/dflash_draft_ipc_daemon.cpp b/server/src/common/dflash_draft_ipc_daemon.cpp
similarity index 100%
rename from dflash/src/common/dflash_draft_ipc_daemon.cpp
rename to server/src/common/dflash_draft_ipc_daemon.cpp
diff --git a/dflash/src/common/dflash_feature_ring.cpp b/server/src/common/dflash_feature_ring.cpp
similarity index 100%
rename from dflash/src/common/dflash_feature_ring.cpp
rename to server/src/common/dflash_feature_ring.cpp
diff --git a/dflash/src/common/dflash_feature_ring.h b/server/src/common/dflash_feature_ring.h
similarity index 100%
rename from dflash/src/common/dflash_feature_ring.h
rename to server/src/common/dflash_feature_ring.h
diff --git a/dflash/src/common/dflash_layer_split_runtime.h b/server/src/common/dflash_layer_split_runtime.h
similarity index 100%
rename from dflash/src/common/dflash_layer_split_runtime.h
rename to server/src/common/dflash_layer_split_runtime.h
diff --git a/dflash/src/common/dflash_spec_decode.cpp b/server/src/common/dflash_spec_decode.cpp
similarity index 100%
rename from dflash/src/common/dflash_spec_decode.cpp
rename to server/src/common/dflash_spec_decode.cpp
diff --git a/dflash/src/common/dflash_spec_decode.h b/server/src/common/dflash_spec_decode.h
similarity index 100%
rename from dflash/src/common/dflash_spec_decode.h
rename to server/src/common/dflash_spec_decode.h
diff --git a/dflash/src/common/dflash_target.h b/server/src/common/dflash_target.h
similarity index 100%
rename from dflash/src/common/dflash_target.h
rename to server/src/common/dflash_target.h
diff --git a/dflash/src/common/gguf_inspect.cpp b/server/src/common/gguf_inspect.cpp
similarity index 100%
rename from dflash/src/common/gguf_inspect.cpp
rename to server/src/common/gguf_inspect.cpp
diff --git a/dflash/src/common/gguf_inspect.h b/server/src/common/gguf_inspect.h
similarity index 100%
rename from dflash/src/common/gguf_inspect.h
rename to server/src/common/gguf_inspect.h
diff --git a/dflash/src/common/gguf_mmap.h b/server/src/common/gguf_mmap.h
similarity index 100%
rename from dflash/src/common/gguf_mmap.h
rename to server/src/common/gguf_mmap.h
diff --git a/dflash/src/common/gpu_runtime_compat.h b/server/src/common/gpu_runtime_compat.h
similarity index 100%
rename from dflash/src/common/gpu_runtime_compat.h
rename to server/src/common/gpu_runtime_compat.h
diff --git a/dflash/src/common/io_utils.h b/server/src/common/io_utils.h
similarity index 100%
rename from dflash/src/common/io_utils.h
rename to server/src/common/io_utils.h
diff --git a/dflash/src/common/layer_split_utils.cpp b/server/src/common/layer_split_utils.cpp
similarity index 100%
rename from dflash/src/common/layer_split_utils.cpp
rename to server/src/common/layer_split_utils.cpp
diff --git a/dflash/src/common/layer_split_utils.h b/server/src/common/layer_split_utils.h
similarity index 100%
rename from dflash/src/common/layer_split_utils.h
rename to server/src/common/layer_split_utils.h
diff --git a/dflash/src/common/model_backend.h b/server/src/common/model_backend.h
similarity index 100%
rename from dflash/src/common/model_backend.h
rename to server/src/common/model_backend.h
diff --git a/dflash/src/common/peer_access.cpp b/server/src/common/peer_access.cpp
similarity index 100%
rename from dflash/src/common/peer_access.cpp
rename to server/src/common/peer_access.cpp
diff --git a/dflash/src/common/peer_access.h b/server/src/common/peer_access.h
similarity index 100%
rename from dflash/src/common/peer_access.h
rename to server/src/common/peer_access.h
diff --git a/dflash/src/common/pflash_drafter_ipc.cpp b/server/src/common/pflash_drafter_ipc.cpp
similarity index 100%
rename from dflash/src/common/pflash_drafter_ipc.cpp
rename to server/src/common/pflash_drafter_ipc.cpp
diff --git a/dflash/src/common/pflash_drafter_ipc.h b/server/src/common/pflash_drafter_ipc.h
similarity index 100%
rename from dflash/src/common/pflash_drafter_ipc.h
rename to server/src/common/pflash_drafter_ipc.h
diff --git a/dflash/src/common/pflash_drafter_ipc_daemon.cpp b/server/src/common/pflash_drafter_ipc_daemon.cpp
similarity index 100%
rename from dflash/src/common/pflash_drafter_ipc_daemon.cpp
rename to server/src/common/pflash_drafter_ipc_daemon.cpp
diff --git a/dflash/src/common/restore_delta.h b/server/src/common/restore_delta.h
similarity index 100%
rename from dflash/src/common/restore_delta.h
rename to server/src/common/restore_delta.h
diff --git a/dflash/src/common/sampler.cpp b/server/src/common/sampler.cpp
similarity index 100%
rename from dflash/src/common/sampler.cpp
rename to server/src/common/sampler.cpp
diff --git a/dflash/src/common/sampler.h b/server/src/common/sampler.h
similarity index 100%
rename from dflash/src/common/sampler.h
rename to server/src/common/sampler.h
diff --git a/dflash/src/common/snapshot_backend.h b/server/src/common/snapshot_backend.h
similarity index 100%
rename from dflash/src/common/snapshot_backend.h
rename to server/src/common/snapshot_backend.h
diff --git a/dflash/src/common/step_graph.h b/server/src/common/step_graph.h
similarity index 100%
rename from dflash/src/common/step_graph.h
rename to server/src/common/step_graph.h
diff --git a/dflash/src/cuda_cross_device_copy.cpp b/server/src/cuda_cross_device_copy.cpp
similarity index 100%
rename from dflash/src/cuda_cross_device_copy.cpp
rename to server/src/cuda_cross_device_copy.cpp
diff --git a/dflash/src/delta_net_chunked.cpp b/server/src/delta_net_chunked.cpp
similarity index 100%
rename from dflash/src/delta_net_chunked.cpp
rename to server/src/delta_net_chunked.cpp
diff --git a/dflash/src/delta_net_chunked.h b/server/src/delta_net_chunked.h
similarity index 100%
rename from dflash/src/delta_net_chunked.h
rename to server/src/delta_net_chunked.h
diff --git a/dflash/src/device_runtime.h b/server/src/device_runtime.h
similarity index 100%
rename from dflash/src/device_runtime.h
rename to server/src/device_runtime.h
diff --git a/dflash/src/draft/draft_gguf_loader.cpp b/server/src/draft/draft_gguf_loader.cpp
similarity index 100%
rename from dflash/src/draft/draft_gguf_loader.cpp
rename to server/src/draft/draft_gguf_loader.cpp
diff --git a/dflash/src/draft/draft_graph.cpp b/server/src/draft/draft_graph.cpp
similarity index 100%
rename from dflash/src/draft/draft_graph.cpp
rename to server/src/draft/draft_graph.cpp
diff --git a/dflash/src/draft/draft_graph.h b/server/src/draft/draft_graph.h
similarity index 100%
rename from dflash/src/draft/draft_graph.h
rename to server/src/draft/draft_graph.h
diff --git a/dflash/src/draft/draft_safetensors_loader.cpp b/server/src/draft/draft_safetensors_loader.cpp
similarity index 100%
rename from dflash/src/draft/draft_safetensors_loader.cpp
rename to server/src/draft/draft_safetensors_loader.cpp
diff --git a/dflash/src/errors.cpp b/server/src/errors.cpp
similarity index 100%
rename from dflash/src/errors.cpp
rename to server/src/errors.cpp
diff --git a/dflash/src/flashprefill.cpp b/server/src/flashprefill.cpp
similarity index 100%
rename from dflash/src/flashprefill.cpp
rename to server/src/flashprefill.cpp
diff --git a/dflash/src/flashprefill.h b/server/src/flashprefill.h
similarity index 100%
rename from dflash/src/flashprefill.h
rename to server/src/flashprefill.h
diff --git a/dflash/src/flashprefill_f16.cu b/server/src/flashprefill_f16.cu
similarity index 100%
rename from dflash/src/flashprefill_f16.cu
rename to server/src/flashprefill_f16.cu
diff --git a/dflash/src/flashprefill_kernels.cu b/server/src/flashprefill_kernels.cu
similarity index 100%
rename from dflash/src/flashprefill_kernels.cu
rename to server/src/flashprefill_kernels.cu
diff --git a/dflash/src/flashprefill_kernels.hip.cu b/server/src/flashprefill_kernels.hip.cu
similarity index 100%
rename from dflash/src/flashprefill_kernels.hip.cu
rename to server/src/flashprefill_kernels.hip.cu
diff --git a/dflash/src/flashprefill_q8.cpp b/server/src/flashprefill_q8.cpp
similarity index 100%
rename from dflash/src/flashprefill_q8.cpp
rename to server/src/flashprefill_q8.cpp
diff --git a/dflash/src/flashprefill_scalar.cu b/server/src/flashprefill_scalar.cu
similarity index 100%
rename from dflash/src/flashprefill_scalar.cu
rename to server/src/flashprefill_scalar.cu
diff --git a/dflash/src/flashprefill_select.cpp b/server/src/flashprefill_select.cpp
similarity index 100%
rename from dflash/src/flashprefill_select.cpp
rename to server/src/flashprefill_select.cpp
diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/server/src/gemma4/gemma4_backend.cpp
similarity index 100%
rename from dflash/src/gemma4/gemma4_backend.cpp
rename to server/src/gemma4/gemma4_backend.cpp
diff --git a/dflash/src/gemma4/gemma4_backend.h b/server/src/gemma4/gemma4_backend.h
similarity index 100%
rename from dflash/src/gemma4/gemma4_backend.h
rename to server/src/gemma4/gemma4_backend.h
diff --git a/dflash/src/gemma4/gemma4_daemon.cpp b/server/src/gemma4/gemma4_daemon.cpp
similarity index 100%
rename from dflash/src/gemma4/gemma4_daemon.cpp
rename to server/src/gemma4/gemma4_daemon.cpp
diff --git a/dflash/src/gemma4/gemma4_daemon.h b/server/src/gemma4/gemma4_daemon.h
similarity index 100%
rename from dflash/src/gemma4/gemma4_daemon.h
rename to server/src/gemma4/gemma4_daemon.h
diff --git a/dflash/src/gemma4/gemma4_dflash_target.cpp b/server/src/gemma4/gemma4_dflash_target.cpp
similarity index 100%
rename from dflash/src/gemma4/gemma4_dflash_target.cpp
rename to server/src/gemma4/gemma4_dflash_target.cpp
diff --git a/dflash/src/gemma4/gemma4_dflash_target.h b/server/src/gemma4/gemma4_dflash_target.h
similarity index 100%
rename from dflash/src/gemma4/gemma4_dflash_target.h
rename to server/src/gemma4/gemma4_dflash_target.h
diff --git a/dflash/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp
similarity index 100%
rename from dflash/src/gemma4/gemma4_graph.cpp
rename to server/src/gemma4/gemma4_graph.cpp
diff --git a/dflash/src/gemma4/gemma4_internal.h b/server/src/gemma4/gemma4_internal.h
similarity index 100%
rename from dflash/src/gemma4/gemma4_internal.h
rename to server/src/gemma4/gemma4_internal.h
diff --git a/dflash/src/gemma4/gemma4_loader.cpp b/server/src/gemma4/gemma4_loader.cpp
similarity index 100%
rename from dflash/src/gemma4/gemma4_loader.cpp
rename to server/src/gemma4/gemma4_loader.cpp
diff --git a/dflash/src/hip_compat/cuda_bf16.h b/server/src/hip_compat/cuda_bf16.h
similarity index 100%
rename from dflash/src/hip_compat/cuda_bf16.h
rename to server/src/hip_compat/cuda_bf16.h
diff --git a/dflash/src/hip_compat/cuda_fp16.h b/server/src/hip_compat/cuda_fp16.h
similarity index 100%
rename from dflash/src/hip_compat/cuda_fp16.h
rename to server/src/hip_compat/cuda_fp16.h
diff --git a/dflash/src/internal.h b/server/src/internal.h
similarity index 100%
rename from dflash/src/internal.h
rename to server/src/internal.h
diff --git a/dflash/src/ipc/backend_ipc_main.cpp b/server/src/ipc/backend_ipc_main.cpp
similarity index 100%
rename from dflash/src/ipc/backend_ipc_main.cpp
rename to server/src/ipc/backend_ipc_main.cpp
diff --git a/dflash/src/kv_cache.cpp b/server/src/kv_cache.cpp
similarity index 100%
rename from dflash/src/kv_cache.cpp
rename to server/src/kv_cache.cpp
diff --git a/dflash/src/kv_quant.cpp b/server/src/kv_quant.cpp
similarity index 100%
rename from dflash/src/kv_quant.cpp
rename to server/src/kv_quant.cpp
diff --git a/dflash/src/kv_quant.h b/server/src/kv_quant.h
similarity index 100%
rename from dflash/src/kv_quant.h
rename to server/src/kv_quant.h
diff --git a/dflash/src/laguna/laguna_backend.cpp b/server/src/laguna/laguna_backend.cpp
similarity index 100%
rename from dflash/src/laguna/laguna_backend.cpp
rename to server/src/laguna/laguna_backend.cpp
diff --git a/dflash/src/laguna/laguna_backend.h b/server/src/laguna/laguna_backend.h
similarity index 100%
rename from dflash/src/laguna/laguna_backend.h
rename to server/src/laguna/laguna_backend.h
diff --git a/dflash/src/laguna/laguna_daemon.cpp b/server/src/laguna/laguna_daemon.cpp
similarity index 100%
rename from dflash/src/laguna/laguna_daemon.cpp
rename to server/src/laguna/laguna_daemon.cpp
diff --git a/dflash/src/laguna/laguna_daemon.h b/server/src/laguna/laguna_daemon.h
similarity index 100%
rename from dflash/src/laguna/laguna_daemon.h
rename to server/src/laguna/laguna_daemon.h
diff --git a/dflash/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h
similarity index 100%
rename from dflash/src/laguna/laguna_internal.h
rename to server/src/laguna/laguna_internal.h
diff --git a/dflash/src/laguna/laguna_target_graph.cpp b/server/src/laguna/laguna_target_graph.cpp
similarity index 100%
rename from dflash/src/laguna/laguna_target_graph.cpp
rename to server/src/laguna/laguna_target_graph.cpp
diff --git a/dflash/src/laguna/laguna_target_loader.cpp b/server/src/laguna/laguna_target_loader.cpp
similarity index 100%
rename from dflash/src/laguna/laguna_target_loader.cpp
rename to server/src/laguna/laguna_target_loader.cpp
diff --git a/dflash/src/pflash_ggml_adapter.cpp b/server/src/pflash_ggml_adapter.cpp
similarity index 100%
rename from dflash/src/pflash_ggml_adapter.cpp
rename to server/src/pflash_ggml_adapter.cpp
diff --git a/dflash/src/pflash_ggml_adapter.h b/server/src/pflash_ggml_adapter.h
similarity index 100%
rename from dflash/src/pflash_ggml_adapter.h
rename to server/src/pflash_ggml_adapter.h
diff --git a/dflash/src/placement/pflash_placement.h b/server/src/placement/pflash_placement.h
similarity index 100%
rename from dflash/src/placement/pflash_placement.h
rename to server/src/placement/pflash_placement.h
diff --git a/dflash/src/placement/placement_backend.h b/server/src/placement/placement_backend.h
similarity index 100%
rename from dflash/src/placement/placement_backend.h
rename to server/src/placement/placement_backend.h
diff --git a/dflash/src/placement/placement_config.h b/server/src/placement/placement_config.h
similarity index 100%
rename from dflash/src/placement/placement_config.h
rename to server/src/placement/placement_config.h
diff --git a/dflash/src/placement/remote_draft_config.h b/server/src/placement/remote_draft_config.h
similarity index 100%
rename from dflash/src/placement/remote_draft_config.h
rename to server/src/placement/remote_draft_config.h
diff --git a/dflash/src/qwen3/qwen3_backend.cpp b/server/src/qwen3/qwen3_backend.cpp
similarity index 100%
rename from dflash/src/qwen3/qwen3_backend.cpp
rename to server/src/qwen3/qwen3_backend.cpp
diff --git a/dflash/src/qwen3/qwen3_backend.h b/server/src/qwen3/qwen3_backend.h
similarity index 100%
rename from dflash/src/qwen3/qwen3_backend.h
rename to server/src/qwen3/qwen3_backend.h
diff --git a/dflash/src/qwen3/qwen3_daemon.cpp b/server/src/qwen3/qwen3_daemon.cpp
similarity index 100%
rename from dflash/src/qwen3/qwen3_daemon.cpp
rename to server/src/qwen3/qwen3_daemon.cpp
diff --git a/dflash/src/qwen3/qwen3_daemon.h b/server/src/qwen3/qwen3_daemon.h
similarity index 100%
rename from dflash/src/qwen3/qwen3_daemon.h
rename to server/src/qwen3/qwen3_daemon.h
diff --git a/dflash/src/qwen3/qwen3_drafter.cpp b/server/src/qwen3/qwen3_drafter.cpp
similarity index 100%
rename from dflash/src/qwen3/qwen3_drafter.cpp
rename to server/src/qwen3/qwen3_drafter.cpp
diff --git a/dflash/src/qwen3/qwen3_drafter.h b/server/src/qwen3/qwen3_drafter.h
similarity index 100%
rename from dflash/src/qwen3/qwen3_drafter.h
rename to server/src/qwen3/qwen3_drafter.h
diff --git a/dflash/src/qwen3/qwen3_drafter_model.h b/server/src/qwen3/qwen3_drafter_model.h
similarity index 100%
rename from dflash/src/qwen3/qwen3_drafter_model.h
rename to server/src/qwen3/qwen3_drafter_model.h
diff --git a/dflash/src/qwen3/qwen3_graph.cpp b/server/src/qwen3/qwen3_graph.cpp
similarity index 100%
rename from dflash/src/qwen3/qwen3_graph.cpp
rename to server/src/qwen3/qwen3_graph.cpp
diff --git a/dflash/src/qwen3/qwen3_loader.cpp b/server/src/qwen3/qwen3_loader.cpp
similarity index 100%
rename from dflash/src/qwen3/qwen3_loader.cpp
rename to server/src/qwen3/qwen3_loader.cpp
diff --git a/dflash/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp
similarity index 100%
rename from dflash/src/qwen35/gguf_target_loader.cpp
rename to server/src/qwen35/gguf_target_loader.cpp
diff --git a/dflash/src/qwen35/graph_builders.cpp b/server/src/qwen35/graph_builders.cpp
similarity index 100%
rename from dflash/src/qwen35/graph_builders.cpp
rename to server/src/qwen35/graph_builders.cpp
diff --git a/dflash/src/qwen35/graph_builders.h b/server/src/qwen35/graph_builders.h
similarity index 100%
rename from dflash/src/qwen35/graph_builders.h
rename to server/src/qwen35/graph_builders.h
diff --git a/dflash/src/qwen35/layer_split_daemon.cpp b/server/src/qwen35/layer_split_daemon.cpp
similarity index 100%
rename from dflash/src/qwen35/layer_split_daemon.cpp
rename to server/src/qwen35/layer_split_daemon.cpp
diff --git a/dflash/src/qwen35/layer_split_daemon.h b/server/src/qwen35/layer_split_daemon.h
similarity index 100%
rename from dflash/src/qwen35/layer_split_daemon.h
rename to server/src/qwen35/layer_split_daemon.h
diff --git a/dflash/src/qwen35/layer_split_daemon_loop.cpp b/server/src/qwen35/layer_split_daemon_loop.cpp
similarity index 100%
rename from dflash/src/qwen35/layer_split_daemon_loop.cpp
rename to server/src/qwen35/layer_split_daemon_loop.cpp
diff --git a/dflash/src/qwen35/layer_split_daemon_loop.h b/server/src/qwen35/layer_split_daemon_loop.h
similarity index 100%
rename from dflash/src/qwen35/layer_split_daemon_loop.h
rename to server/src/qwen35/layer_split_daemon_loop.h
diff --git a/dflash/src/qwen35/layer_split_forward.cpp b/server/src/qwen35/layer_split_forward.cpp
similarity index 100%
rename from dflash/src/qwen35/layer_split_forward.cpp
rename to server/src/qwen35/layer_split_forward.cpp
diff --git a/dflash/src/qwen35/layer_split_forward.h b/server/src/qwen35/layer_split_forward.h
similarity index 100%
rename from dflash/src/qwen35/layer_split_forward.h
rename to server/src/qwen35/layer_split_forward.h
diff --git a/dflash/src/qwen35/layer_split_types.h b/server/src/qwen35/layer_split_types.h
similarity index 100%
rename from dflash/src/qwen35/layer_split_types.h
rename to server/src/qwen35/layer_split_types.h
diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/server/src/qwen35/qwen35_backend.cpp
similarity index 100%
rename from dflash/src/qwen35/qwen35_backend.cpp
rename to server/src/qwen35/qwen35_backend.cpp
diff --git a/dflash/src/qwen35/qwen35_backend.h b/server/src/qwen35/qwen35_backend.h
similarity index 100%
rename from dflash/src/qwen35/qwen35_backend.h
rename to server/src/qwen35/qwen35_backend.h
diff --git a/dflash/src/qwen35/qwen35_daemon.cpp b/server/src/qwen35/qwen35_daemon.cpp
similarity index 100%
rename from dflash/src/qwen35/qwen35_daemon.cpp
rename to server/src/qwen35/qwen35_daemon.cpp
diff --git a/dflash/src/qwen35/qwen35_daemon.h b/server/src/qwen35/qwen35_daemon.h
similarity index 100%
rename from dflash/src/qwen35/qwen35_daemon.h
rename to server/src/qwen35/qwen35_daemon.h
diff --git a/dflash/src/qwen35/qwen35_dflash_target.cpp b/server/src/qwen35/qwen35_dflash_target.cpp
similarity index 100%
rename from dflash/src/qwen35/qwen35_dflash_target.cpp
rename to server/src/qwen35/qwen35_dflash_target.cpp
diff --git a/dflash/src/qwen35/qwen35_dflash_target.h b/server/src/qwen35/qwen35_dflash_target.h
similarity index 100%
rename from dflash/src/qwen35/qwen35_dflash_target.h
rename to server/src/qwen35/qwen35_dflash_target.h
diff --git a/dflash/src/qwen35/qwen35_layer_split.h b/server/src/qwen35/qwen35_layer_split.h
similarity index 100%
rename from dflash/src/qwen35/qwen35_layer_split.h
rename to server/src/qwen35/qwen35_layer_split.h
diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp b/server/src/qwen35/qwen35_layer_split_dflash_target.cpp
similarity index 100%
rename from dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp
rename to server/src/qwen35/qwen35_layer_split_dflash_target.cpp
diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.h b/server/src/qwen35/qwen35_layer_split_dflash_target.h
similarity index 100%
rename from dflash/src/qwen35/qwen35_layer_split_dflash_target.h
rename to server/src/qwen35/qwen35_layer_split_dflash_target.h
diff --git a/dflash/src/qwen35/qwen35_ops.h b/server/src/qwen35/qwen35_ops.h
similarity index 100%
rename from dflash/src/qwen35/qwen35_ops.h
rename to server/src/qwen35/qwen35_ops.h
diff --git a/dflash/src/qwen35/qwen35_target_graph.cpp b/server/src/qwen35/qwen35_target_graph.cpp
similarity index 100%
rename from dflash/src/qwen35/qwen35_target_graph.cpp
rename to server/src/qwen35/qwen35_target_graph.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_backend.cpp b/server/src/qwen35moe/qwen35moe_backend.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_backend.cpp
rename to server/src/qwen35moe/qwen35moe_backend.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_backend.h b/server/src/qwen35moe/qwen35moe_backend.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_backend.h
rename to server/src/qwen35moe/qwen35moe_backend.h
diff --git a/dflash/src/qwen35moe/qwen35moe_daemon.cpp b/server/src/qwen35moe/qwen35moe_daemon.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_daemon.cpp
rename to server/src/qwen35moe/qwen35moe_daemon.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_daemon.h b/server/src/qwen35moe/qwen35moe_daemon.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_daemon.h
rename to server/src/qwen35moe/qwen35moe_daemon.h
diff --git a/dflash/src/qwen35moe/qwen35moe_expert_placement.cpp b/server/src/qwen35moe/qwen35moe_expert_placement.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_expert_placement.cpp
rename to server/src/qwen35moe/qwen35moe_expert_placement.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_expert_placement.h b/server/src/qwen35moe/qwen35moe_expert_placement.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_expert_placement.h
rename to server/src/qwen35moe/qwen35moe_expert_placement.h
diff --git a/dflash/src/qwen35moe/qwen35moe_ffn.cpp b/server/src/qwen35moe/qwen35moe_ffn.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_ffn.cpp
rename to server/src/qwen35moe/qwen35moe_ffn.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_ffn.h b/server/src/qwen35moe/qwen35moe_ffn.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_ffn.h
rename to server/src/qwen35moe/qwen35moe_ffn.h
diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
rename to server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h b/server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
rename to server/src/qwen35moe/qwen35moe_hybrid_ffn_eval.h
diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_storage.cpp b/server/src/qwen35moe/qwen35moe_hybrid_storage.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_hybrid_storage.cpp
rename to server/src/qwen35moe/qwen35moe_hybrid_storage.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_hybrid_storage.h b/server/src/qwen35moe/qwen35moe_hybrid_storage.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_hybrid_storage.h
rename to server/src/qwen35moe/qwen35moe_hybrid_storage.h
diff --git a/dflash/src/qwen35moe/qwen35moe_routing_stats.cpp b/server/src/qwen35moe/qwen35moe_routing_stats.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_routing_stats.cpp
rename to server/src/qwen35moe/qwen35moe_routing_stats.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_routing_stats.h b/server/src/qwen35moe/qwen35moe_routing_stats.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_routing_stats.h
rename to server/src/qwen35moe/qwen35moe_routing_stats.h
diff --git a/dflash/src/qwen35moe/qwen35moe_swap_manager.cpp b/server/src/qwen35moe/qwen35moe_swap_manager.cpp
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_swap_manager.cpp
rename to server/src/qwen35moe/qwen35moe_swap_manager.cpp
diff --git a/dflash/src/qwen35moe/qwen35moe_swap_manager.h b/server/src/qwen35moe/qwen35moe_swap_manager.h
similarity index 100%
rename from dflash/src/qwen35moe/qwen35moe_swap_manager.h
rename to server/src/qwen35moe/qwen35moe_swap_manager.h
diff --git a/dflash/src/rms_norm_hip.cu b/server/src/rms_norm_hip.cu
similarity index 100%
rename from dflash/src/rms_norm_hip.cu
rename to server/src/rms_norm_hip.cu
diff --git a/dflash/src/server/api_types.h b/server/src/server/api_types.h
similarity index 100%
rename from dflash/src/server/api_types.h
rename to server/src/server/api_types.h
diff --git a/dflash/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
similarity index 100%
rename from dflash/src/server/chat_template.cpp
rename to server/src/server/chat_template.cpp
diff --git a/dflash/src/server/chat_template.h b/server/src/server/chat_template.h
similarity index 100%
rename from dflash/src/server/chat_template.h
rename to server/src/server/chat_template.h
diff --git a/dflash/src/server/disk_prefix_cache.cpp b/server/src/server/disk_prefix_cache.cpp
similarity index 100%
rename from dflash/src/server/disk_prefix_cache.cpp
rename to server/src/server/disk_prefix_cache.cpp
diff --git a/dflash/src/server/disk_prefix_cache.h b/server/src/server/disk_prefix_cache.h
similarity index 100%
rename from dflash/src/server/disk_prefix_cache.h
rename to server/src/server/disk_prefix_cache.h
diff --git a/dflash/src/server/http_server.cpp b/server/src/server/http_server.cpp
similarity index 100%
rename from dflash/src/server/http_server.cpp
rename to server/src/server/http_server.cpp
diff --git a/dflash/src/server/http_server.h b/server/src/server/http_server.h
similarity index 100%
rename from dflash/src/server/http_server.h
rename to server/src/server/http_server.h
diff --git a/dflash/src/server/model_card.cpp b/server/src/server/model_card.cpp
similarity index 100%
rename from dflash/src/server/model_card.cpp
rename to server/src/server/model_card.cpp
diff --git a/dflash/src/server/model_card.h b/server/src/server/model_card.h
similarity index 100%
rename from dflash/src/server/model_card.h
rename to server/src/server/model_card.h
diff --git a/dflash/src/server/prefix_cache.cpp b/server/src/server/prefix_cache.cpp
similarity index 100%
rename from dflash/src/server/prefix_cache.cpp
rename to server/src/server/prefix_cache.cpp
diff --git a/dflash/src/server/prefix_cache.h b/server/src/server/prefix_cache.h
similarity index 100%
rename from dflash/src/server/prefix_cache.h
rename to server/src/server/prefix_cache.h
diff --git a/dflash/src/server/rax.c b/server/src/server/rax.c
similarity index 100%
rename from dflash/src/server/rax.c
rename to server/src/server/rax.c
diff --git a/dflash/src/server/rax.h b/server/src/server/rax.h
similarity index 100%
rename from dflash/src/server/rax.h
rename to server/src/server/rax.h
diff --git a/dflash/src/server/reasoning.cpp b/server/src/server/reasoning.cpp
similarity index 100%
rename from dflash/src/server/reasoning.cpp
rename to server/src/server/reasoning.cpp
diff --git a/dflash/src/server/reasoning.h b/server/src/server/reasoning.h
similarity index 100%
rename from dflash/src/server/reasoning.h
rename to server/src/server/reasoning.h
diff --git a/dflash/src/server/server_main.cpp b/server/src/server/server_main.cpp
similarity index 100%
rename from dflash/src/server/server_main.cpp
rename to server/src/server/server_main.cpp
diff --git a/dflash/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
similarity index 100%
rename from dflash/src/server/sse_emitter.cpp
rename to server/src/server/sse_emitter.cpp
diff --git a/dflash/src/server/sse_emitter.h b/server/src/server/sse_emitter.h
similarity index 100%
rename from dflash/src/server/sse_emitter.h
rename to server/src/server/sse_emitter.h
diff --git a/dflash/src/server/tokenizer.cpp b/server/src/server/tokenizer.cpp
similarity index 100%
rename from dflash/src/server/tokenizer.cpp
rename to server/src/server/tokenizer.cpp
diff --git a/dflash/src/server/tokenizer.h b/server/src/server/tokenizer.h
similarity index 100%
rename from dflash/src/server/tokenizer.h
rename to server/src/server/tokenizer.h
diff --git a/dflash/src/server/tool_hint.cpp b/server/src/server/tool_hint.cpp
similarity index 100%
rename from dflash/src/server/tool_hint.cpp
rename to server/src/server/tool_hint.cpp
diff --git a/dflash/src/server/tool_hint.h b/server/src/server/tool_hint.h
similarity index 100%
rename from dflash/src/server/tool_hint.h
rename to server/src/server/tool_hint.h
diff --git a/dflash/src/server/tool_memory.cpp b/server/src/server/tool_memory.cpp
similarity index 100%
rename from dflash/src/server/tool_memory.cpp
rename to server/src/server/tool_memory.cpp
diff --git a/dflash/src/server/tool_memory.h b/server/src/server/tool_memory.h
similarity index 100%
rename from dflash/src/server/tool_memory.h
rename to server/src/server/tool_memory.h
diff --git a/dflash/src/server/tool_parser.cpp b/server/src/server/tool_parser.cpp
similarity index 100%
rename from dflash/src/server/tool_parser.cpp
rename to server/src/server/tool_parser.cpp
diff --git a/dflash/src/server/tool_parser.h b/server/src/server/tool_parser.h
similarity index 100%
rename from dflash/src/server/tool_parser.h
rename to server/src/server/tool_parser.h
diff --git a/dflash/src/server/utf8_utils.h b/server/src/server/utf8_utils.h
similarity index 100%
rename from dflash/src/server/utf8_utils.h
rename to server/src/server/utf8_utils.h
diff --git a/dflash/test/bench_laguna_generate.cpp b/server/test/bench_laguna_generate.cpp
similarity index 100%
rename from dflash/test/bench_laguna_generate.cpp
rename to server/test/bench_laguna_generate.cpp
diff --git a/dflash/test/bench_laguna_pflash.cpp b/server/test/bench_laguna_pflash.cpp
similarity index 100%
rename from dflash/test/bench_laguna_pflash.cpp
rename to server/test/bench_laguna_pflash.cpp
diff --git a/dflash/test/bench_laguna_ttft.cpp b/server/test/bench_laguna_ttft.cpp
similarity index 100%
rename from dflash/test/bench_laguna_ttft.cpp
rename to server/test/bench_laguna_ttft.cpp
diff --git a/dflash/test/pflash_daemon.cpp b/server/test/pflash_daemon.cpp
similarity index 100%
rename from dflash/test/pflash_daemon.cpp
rename to server/test/pflash_daemon.cpp
diff --git a/dflash/test/smoke_draft_graph.cpp b/server/test/smoke_draft_graph.cpp
similarity index 100%
rename from dflash/test/smoke_draft_graph.cpp
rename to server/test/smoke_draft_graph.cpp
diff --git a/dflash/test/smoke_laguna_forward.cpp b/server/test/smoke_laguna_forward.cpp
similarity index 100%
rename from dflash/test/smoke_laguna_forward.cpp
rename to server/test/smoke_laguna_forward.cpp
diff --git a/dflash/test/smoke_load_draft.cpp b/server/test/smoke_load_draft.cpp
similarity index 100%
rename from dflash/test/smoke_load_draft.cpp
rename to server/test/smoke_load_draft.cpp
diff --git a/dflash/test/smoke_load_target.cpp b/server/test/smoke_load_target.cpp
similarity index 100%
rename from dflash/test/smoke_load_target.cpp
rename to server/test/smoke_load_target.cpp
diff --git a/dflash/test/smoke_load_target_laguna.cpp b/server/test/smoke_load_target_laguna.cpp
similarity index 100%
rename from dflash/test/smoke_load_target_laguna.cpp
rename to server/test/smoke_load_target_laguna.cpp
diff --git a/dflash/test/smoke_qwen3_forward.cpp b/server/test/smoke_qwen3_forward.cpp
similarity index 100%
rename from dflash/test/smoke_qwen3_forward.cpp
rename to server/test/smoke_qwen3_forward.cpp
diff --git a/dflash/test/smoke_target_forward.cpp b/server/test/smoke_target_forward.cpp
similarity index 100%
rename from dflash/test/smoke_target_forward.cpp
rename to server/test/smoke_target_forward.cpp
diff --git a/dflash/test/spike_thin_copy.cpp b/server/test/spike_thin_copy.cpp
similarity index 100%
rename from dflash/test/spike_thin_copy.cpp
rename to server/test/spike_thin_copy.cpp
diff --git a/dflash/test/test_dflash.cpp b/server/test/test_dflash.cpp
similarity index 100%
rename from dflash/test/test_dflash.cpp
rename to server/test/test_dflash.cpp
diff --git a/dflash/test/test_flash_attn_sparse.cpp b/server/test/test_flash_attn_sparse.cpp
similarity index 100%
rename from dflash/test/test_flash_attn_sparse.cpp
rename to server/test/test_flash_attn_sparse.cpp
diff --git a/dflash/test/test_flashprefill_kernels.cpp b/server/test/test_flashprefill_kernels.cpp
similarity index 100%
rename from dflash/test/test_flashprefill_kernels.cpp
rename to server/test/test_flashprefill_kernels.cpp
diff --git a/dflash/test/test_generate.cpp b/server/test/test_generate.cpp
similarity index 100%
rename from dflash/test/test_generate.cpp
rename to server/test/test_generate.cpp
diff --git a/dflash/test/test_gguf_mmap.cpp b/server/test/test_gguf_mmap.cpp
similarity index 100%
rename from dflash/test/test_gguf_mmap.cpp
rename to server/test/test_gguf_mmap.cpp
diff --git a/dflash/test/test_kv_quant.cpp b/server/test/test_kv_quant.cpp
similarity index 100%
rename from dflash/test/test_kv_quant.cpp
rename to server/test/test_kv_quant.cpp
diff --git a/dflash/test/test_laguna_daemon.cpp b/server/test/test_laguna_daemon.cpp
similarity index 100%
rename from dflash/test/test_laguna_daemon.cpp
rename to server/test/test_laguna_daemon.cpp
diff --git a/dflash/test/test_mtp_converter.sh b/server/test/test_mtp_converter.sh
similarity index 100%
rename from dflash/test/test_mtp_converter.sh
rename to server/test/test_mtp_converter.sh
diff --git a/dflash/test/test_mtp_e2e.sh b/server/test/test_mtp_e2e.sh
similarity index 100%
rename from dflash/test/test_mtp_e2e.sh
rename to server/test/test_mtp_e2e.sh
diff --git a/dflash/test/test_qwen35moe_expert_placement.cpp b/server/test/test_qwen35moe_expert_placement.cpp
similarity index 100%
rename from dflash/test/test_qwen35moe_expert_placement.cpp
rename to server/test/test_qwen35moe_expert_placement.cpp
diff --git a/dflash/test/test_qwen35moe_routing_stats.cpp b/server/test/test_qwen35moe_routing_stats.cpp
similarity index 100%
rename from dflash/test/test_qwen35moe_routing_stats.cpp
rename to server/test/test_qwen35moe_routing_stats.cpp
diff --git a/dflash/test/test_qwen35moe_swap_manager.cpp b/server/test/test_qwen35moe_swap_manager.cpp
similarity index 100%
rename from dflash/test/test_qwen35moe_swap_manager.cpp
rename to server/test/test_qwen35moe_swap_manager.cpp
diff --git a/dflash/test/test_restore_delta.cpp b/server/test/test_restore_delta.cpp
similarity index 100%
rename from dflash/test/test_restore_delta.cpp
rename to server/test/test_restore_delta.cpp
diff --git a/dflash/test/test_server_unit.cpp b/server/test/test_server_unit.cpp
similarity index 100%
rename from dflash/test/test_server_unit.cpp
rename to server/test/test_server_unit.cpp
diff --git a/dflash/test/test_tokenizer_harness.cpp b/server/test/test_tokenizer_harness.cpp
similarity index 100%
rename from dflash/test/test_tokenizer_harness.cpp
rename to server/test/test_tokenizer_harness.cpp
diff --git a/dflash/test/test_vs_oracle.cpp b/server/test/test_vs_oracle.cpp
similarity index 100%
rename from dflash/test/test_vs_oracle.cpp
rename to server/test/test_vs_oracle.cpp
diff --git a/dflash/tests/test_server_comprehensive.py b/server/tests/test_server_comprehensive.py
similarity index 100%
rename from dflash/tests/test_server_comprehensive.py
rename to server/tests/test_server_comprehensive.py
diff --git a/dflash/tests/test_server_smoke.py b/server/tests/test_server_smoke.py
similarity index 100%
rename from dflash/tests/test_server_smoke.py
rename to server/tests/test_server_smoke.py
diff --git a/dflash/tests/test_tokenizer.py b/server/tests/test_tokenizer.py
similarity index 100%
rename from dflash/tests/test_tokenizer.py
rename to server/tests/test_tokenizer.py