diff --git a/benchmark/base.py b/benchmark/base.py
index fc176fbd..d1c99df4 100644
--- a/benchmark/base.py
+++ b/benchmark/base.py
@@ -232,6 +232,78 @@ def get_supported_modalities(self):
         }
 
 
+class MingFlashOmni(Model):
+    """Ming-flash-omni-2.0 (inclusionAI), the Ling-2.0 sparse-MoE omni model
+    (100B total / 6B active params) released 2026-02-11.
+
+    Reachable today via the vllm-omni server using
+    ``vllm_omni/deploy/ming_flash_omni.yaml`` (thinker+talker) or
+    ``ming_flash_omni_thinker_only.yaml`` (text-only). The native ``ours`` /
+    ``ours_openai`` backends will work once the mstar-side port under
+    ``mstar/model/ming_omni_flash/`` is finished — until then, point the
+    benchmark at a vllm-omni instance with ``--inference-system vllm_omni``.
+
+    Wire shape mirrors :class:`Qwen3Omni`: standard OpenAI
+    ``/v1/chat/completions`` with multimodal content parts. The role remap
+    from OpenAI's ``user``/``assistant``/``system`` to Ming's internal
+    ``HUMAN``/``ASSISTANT``/``SYSTEM`` happens inside the jinja chat_template
+    shipped in ``tokenizer_config.json`` — vllm-omni renders prompts via
+    ``tokenizer.apply_chat_template`` which uses that jinja, so the benchmark
+    sends the standard OpenAI shape unchanged.
+
+    Caveat: Ming ALSO ships a Python-side ``BailingMM2Processor.apply_chat_template``
+    (in the Ming source repo) that is strict about uppercase roles and would
+    AssertionError on ``user``/``assistant``. mstar's native port uses that
+    processor for full multimodal preprocessing (vision/audio feature
+    extraction) and remaps roles in ``process_prompt`` accordingly — see
+    ``mstar/model/ming_omni_flash/`` and its tokenizer tests.
+    """
+
+    def get_hf_url(self):
+        return "inclusionAI/Ming-flash-omni-2.0"
+
+    def get_openai_system_message(self) -> Optional[dict]:
+        # Ming-flash-omni-2.0's cookbook uses ``sys_prompt_exp=None`` and
+        # ``use_cot_system_prompt=False`` by default — there's no required
+        # "You are Ming…"-style preamble equivalent to Qwen3-Omni's. The HF
+        # processor's chat_template fills in any internal system text on its
+        # own, and vllm-omni's serving layer goes through that template via
+        # ``trust_remote_code``. Sending an explicit system message here only
+        # risks overriding the model's own defaults, so default to None.
+        return None
+
+    def get_model_kwargs(self, request_type: RequestType):
+        # Cap thinker output at 256 tokens for cross-system fairness — same
+        # rationale as Qwen3Omni: comparable runs need a fixed decode budget.
+        # vllm-omni's released stage default is ``max_tokens: 2048`` (see
+        # ``vllm_omni/deploy/ming_flash_omni.yaml`` stage 0); we lower it for
+        # benchmark parity. Send both ``max_tokens`` (OpenAI convention) and
+        # ``max_output_tokens`` (mstar's native kwarg) so the cap survives
+        # whichever ``--inference-system`` is in use.
+        #
+        # Greedy thinker decoding (deterministic text) is not set in this dict:
+        # VLLMOmni.send_request puts ``temperature=0.0`` at payload top-level.
+        # The talker's sampling defaults live server-side in the deploy yaml
+        # (``stage_id: 1`` → ``temperature: 0.0`` per the released config) —
+        # we don't override them here.
+        return {
+            "max_tokens": 256,
+            "max_output_tokens": 256,
+        }
+
+    def get_supported_modalities(self):
+        return {
+            RequestType.T2T,
+            RequestType.T2S,
+            RequestType.I2T,
+            RequestType.I2S,
+            RequestType.A2T,
+            RequestType.A2S,
+            RequestType.V2T,
+            RequestType.V2S,
+        }
+
+
 class Pi05(Model):
     """Physical Intelligence Pi0.5 VLA model.
 
@@ -286,6 +358,7 @@ class ModelType(Enum):
     BAGEL = "bagel"
     ORPHEUS = "orpheus"
     QWEN3OMNI = "qwen3omni"
+    MING_FLASH_OMNI = "ming_flash_omni"
     PI05 = "pi05"
     VJEPA2AC = "vjepa2ac"
 
@@ -296,6 +369,8 @@ def inst(self, **kwargs) -> Model:
             return Orpheus(**kwargs)
         if self == ModelType.QWEN3OMNI:
             return Qwen3Omni(**kwargs)
+        if self == ModelType.MING_FLASH_OMNI:
+            return MingFlashOmni(**kwargs)
         if self == ModelType.PI05:
             return Pi05(**kwargs)
         if self == ModelType.VJEPA2AC:
diff --git a/benchmark/vllm_omni_instructions.md b/benchmark/vllm_omni_instructions.md
index 2934c6c9..3e534544 100644
--- a/benchmark/vllm_omni_instructions.md
+++ b/benchmark/vllm_omni_instructions.md
@@ -21,4 +21,93 @@ CUDA_VISIBLE_DEVICES=3 vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8000
 ### for qwen3-omni:
 ```
 vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
-```
\ No newline at end of file
+```
+
+### for ming-flash-omni-2.0:
+
+The released `inclusionAI/Ming-flash-omni-2.0` ckpt (~238 GB / 42 shards)
+does NOT load cleanly into vllm-omni's `MingFlashOmniForConditionalGeneration`
+class as-is. Two patches are needed (one-time setup):
+
+1. **Replace metadata files.** vllm-omni's model class uses
+   `Qwen2VLImageProcessor` + `MingWhisperFeatureExtractor` (its own
+   registered classes), while the inclusionAI snapshot declares the
+   `BailingMM2*` processor variants via `auto_map` and `trust_remote_code`.
+   Use `Jonathan1909/Ming-flash-omni-2.0`'s `preprocessor_config.json`,
+   `config.json` (auto_map stripped), and `tokenizer*.json` instead.
+
+2. **Replace the talker weights.** vllm-omni's `MingFlashOmniTalker` expects
+   weights under `audio_vae.*` but the inclusionAI talker safetensors uses
+   `audio.*` prefix. Jonathan1909 reshipped the talker with renamed weights
+   (~1.5 GB).
+
+Building a hybrid snapshot avoids re-downloading the 200+ GB thinker weights:
+
+```bash
+# 1. Make sure the inclusionAI thinker shards are cached
+huggingface-cli download inclusionAI/Ming-flash-omni-2.0 \
+    --include="model-*.safetensors" --include="model.safetensors.index.json"
+
+# 2. Pull only Jonathan1909's metadata + talker (no thinker weights)
+huggingface-cli download Jonathan1909/Ming-flash-omni-2.0 \
+    --include="*.json" --include="*.py" --include="*.txt" --include="*.mvn" \
+    --include="talker/**" \
+    --cache-dir /dev/shm/hf-cache    # or any path with ~3 GB free
+
+# 3. Stitch the two together
+INCL=$(huggingface-cli scan-cache | grep inclusionAI/Ming-flash-omni-2.0 \
+       | awk '{print $NF}')/snapshots/$(ls ~/.cache/huggingface/hub/models--inclusionAI--Ming-flash-omni-2.0/snapshots | head -1)
+JONA=/dev/shm/hf-cache/models--Jonathan1909--Ming-flash-omni-2.0/snapshots/*
+HYBRID=/dev/shm/ming-hybrid
+mkdir -p $HYBRID
+for f in $INCL/model-*.safetensors; do ln -s "$f" "$HYBRID/$(basename $f)"; done
+for f in $JONA/*; do
+    base=$(basename "$f")
+    [ -L "$HYBRID/$base" ] && rm "$HYBRID/$base"
+    ln -s "$f" "$HYBRID/$base"
+done
+```
+
+Then serve and benchmark:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve /dev/shm/ming-hybrid \
+  --omni --port 8091 --host 0.0.0.0 --trust-remote-code \
+  --stage-configs-path /tmp/vllm-omni/vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml
+
+# Wait for "Application startup complete" then:
+MODEL=ming_flash_omni INF_SYS=vllm_omni TASK=text_to_text \
+  URL=http://0.0.0.0:8091 ./benchmark/run_benchmark.sh
+```
+
+NOTE: vllm-omni's `/v1/chat/completions` rejects unknown model ids, so the
+client must send `"model": "/dev/shm/ming-hybrid"` (the served path), not
+`"inclusionAI/Ming-flash-omni-2.0"`. Easiest is to monkey-patch
+`MingFlashOmni.get_hf_url` before calling the benchmark runner:
+
+```python
+from benchmark.base import MingFlashOmni
+MingFlashOmni.get_hf_url = lambda self: "/dev/shm/ming-hybrid"
+```
+
+Or pass `--served-model-name inclusionAI/Ming-flash-omni-2.0` to `vllm serve`
+(untested; would also work in principle).
+
+#### Modalities exercised on a local 4×H100 run (2026-06-06)
+
+| Task | Status | Notes |
+|---|---|---|
+| T2T (text → text) | ✅ | offline B=1: 110 tok/s, closed-loop C=32: **1060 tok/s** (full scaling sweep in [`results/ming_t2t_sweep/SUMMARY.md`](../results/ming_t2t_sweep/SUMMARY.md)) |
+| I2T (image → text) | ✅ | TTFT 87 ms, ~100 tok/s on Food101 |
+| A2T (audio → text) | ✅ | English transcription + Chinese audio QA both work |
+| T2S (text → speech) | ✅ | RTF 0.14, 24 kHz mono PCM via harness; 44.1 kHz via direct OpenAI path |
+| V2T (video → text) | ✅ | Local Ming demo mp4s; coherent descriptions (`yoga.mp4` → yoga pose narration, `cup_change.mp4` → "shell game") |
+| V2S (video → speech) | ✅ | Local Ming demo mp4s; 2-3 MB WAV/clip @ 44.1 kHz |
+| I2S (image → speech) | ✅ | Food101 in, ~7 s/req for ~48 s of audio |
+| A2S (audio → speech) | ✅ | Ming sample wavs; 0.5-3 MB WAV/clip @ 44.1 kHz |
+| T2I / I2I (image gen) | not wired | requires `ming_flash_omni_image.yaml` + a benchmark wrapper similar to BAGEL's `/v1/images/generations` path |
+
+The V2T/V2S/A2S runs sidestep the bench harness's `UCF101Dataset` and
+`LibriSpeechDataset` (both want fresh HF-Hub downloads) by hitting
+`/v1/chat/completions` directly with base64-inlined media from local files
+(Ming repo's `figures/cases/*.mp4` and `data/wavs/*.wav`).
\ No newline at end of file
diff --git a/configs/ming_flash_omni.yaml b/configs/ming_flash_omni.yaml
new file mode 100644
index 00000000..df88e507
--- /dev/null
+++ b/configs/ming_flash_omni.yaml
@@ -0,0 +1,36 @@
+# Ming-flash-omni-2.0 — full omni deploy (text/image/audio/video in, text + speech out).
+#
+# Node→rank mapping for the native mstar port
+# (mstar/model/ming_omni_flash/). The model registers these nodes
+# (see MingFlashOmniModel.get_node_engine_types):
+#
+#   * Thinker        (KV_CACHE, TP)  — Ling-2.0 sparse MoE LLM, the
+#                                      multimodal understanding core.
+#   * vision_encoder (STATELESS)     — Qwen3-MoE ViT + projector.
+#   * audio_encoder  (STATELESS)     — Whisper encoder + projector.
+#   * Talker         (STATELESS)     — CFM talker; the AudioVAE is wrapped
+#                                      INSIDE the Talker submodule (it is
+#                                      NOT a separate graph node).
+#
+# Thinker runs TP=8 across all 8 H100s here to leave room for the
+# colocated talker + encoders. TP=4 also fits the thinker (~57-62 GB/rank,
+# verified 2026-06-12); an earlier fp32-allocation bug made it OOM at
+# ~78.5 GB and is now fixed (see get_submodule). The stateless encoders +
+# the talker are small (~1.5 GB each) and colocate on rank 0.
+#
+# The Thinker→Talker bridge passes DETOKENIZED TEXT (re-tokenized with
+# the talker's own talker/llm tokenizer), so the talker is a near-
+# standalone TTS partition fed by a streaming connection — see
+# MingFlashOmniModel.get_partition_topology.
+
+model: "ming_flash_omni"
+max_seq_len: 32768
+node_groups:
+  # Stateless encoders + the talker colocate on rank 0.
+  - node_names: [vision_encoder, audio_encoder, Talker]
+    ranks: [0]
+
+  # Thinker sharded across all 8 GPUs.
+  - node_names: [Thinker]
+    ranks: [0, 1, 2, 3, 4, 5, 6, 7]
+    tp_size: 8
diff --git a/configs/ming_flash_omni_thinker_only.yaml b/configs/ming_flash_omni_thinker_only.yaml
new file mode 100644
index 00000000..539201f5
--- /dev/null
+++ b/configs/ming_flash_omni_thinker_only.yaml
@@ -0,0 +1,21 @@
+# Ming-flash-omni-2.0 — thinker-only deploy (text out, no talker).
+#
+# TP=8 across 8 H100s. Per-rank shard_inter = 1024/8 = 128;
+# experts.gate_up_proj is (256, 2*128, 4096) per rank, ~33 GB across
+# 31 MoE layers. With embed + lm_head + attention + dense layer 0 +
+# KV cache, ~40 GB per rank fits the 80 GB H100s comfortably.
+#
+# TP=4 also fits (~57-62 GB/rank, verified 2026-06-12) — see
+# configs/ming_flash_omni_thinker_only_tp4.yaml. An earlier fp32-allocation
+# bug made TP=4 OOM at ~78.5 GB; fixed in get_submodule (cast meta model to
+# bf16 before to_empty). TP=8 still leaves the most headroom.
+#
+# Audio / vision / talker / image-gen are step 4+; this config is for
+# text-only T2T benchmarking and the first mstar-served Ming forward.
+
+model: "ming_flash_omni"
+max_seq_len: 32768
+node_groups:
+  - node_names: [Thinker]
+    ranks: [0, 1, 2, 3, 4, 5, 6, 7]
+    tp_size: 8
diff --git a/configs/ming_flash_omni_thinker_only_tp4.yaml b/configs/ming_flash_omni_thinker_only_tp4.yaml
new file mode 100644
index 00000000..fc90e6bd
--- /dev/null
+++ b/configs/ming_flash_omni_thinker_only_tp4.yaml
@@ -0,0 +1,20 @@
+# Ming-flash-omni-2.0 — thinker-only deploy, TP=4 (4-GPU layout).
+#
+# Pinned to 4 ranks. Launch with CUDA_VISIBLE_DEVICES=4,5,6,7 so physical
+# GPUs 4-7 map to logical ranks 0-3.
+#
+# TP=4 fits in ~57-62 GB per rank (verified 2026-06-12). Earlier notes
+# claimed TP=4 OOM'd at ~78.5/80 GB; that was a load-time bug — params were
+# allocated in fp32 before the bf16 cast, doubling the allocation peak. Fixed
+# in MingFlashOmniModel.get_submodule (cast the meta model to bf16 BEFORE
+# to_empty, so allocation happens directly in bf16).
+#
+# TP=4 IS dimensionally valid: 32 heads/4=8, 4 KV heads/4=1, 256 experts/4=64,
+# hidden 4096/4=1024, moe_inter 1024/4=256 — all divide.
+
+model: "ming_flash_omni"
+max_seq_len: 32768
+node_groups:
+  - node_names: [Thinker]
+    ranks: [0, 1, 2, 3]
+    tp_size: 4
diff --git a/mstar/model/base.py b/mstar/model/base.py
index 54a7e90d..fa58d1d3 100644
--- a/mstar/model/base.py
+++ b/mstar/model/base.py
@@ -253,14 +253,24 @@ def get_worker_graphs(self, config_path: str) -> list[WorkerGraph]:
         if node_groups is None:
             raise KeyError("Config must define `node_groups`.")
 
+        # Nodes this deploy actually provides. A graph walk referencing a
+        # node absent from node_groups (e.g. the encoder / talker walks in
+        # a thinker-only deploy) is skipped rather than KeyError'ing during
+        # worker-graph division — that deploy simply can't serve the walk.
+        available_nodes: set[str] = set()
+        for group in node_groups:
+            available_nodes.update(group["node_names"])
+
         # TODO: merge identical worker graphs from different graph walks
-        return sum(
-            [
+        worker_graphs: list[WorkerGraph] = []
+        for graph_walk, graph in self.get_graph_walk_graphs().items():
+            required = set(graph.get_nodes().keys())
+            if not required <= available_nodes:
+                continue
+            worker_graphs.extend(
                 self._get_worker_graphs_for_graph_walk(graph_walk, graph, node_groups)
-                for graph_walk, graph in self.get_graph_walk_graphs().items()
-            ],
-            start=[],
-        )
+            )
+        return worker_graphs
 
     def get_sharding_config(self, config_path: str) -> ShardingConfig:
         with open(config_path, "r") as f:
diff --git a/mstar/model/ming_omni_flash/PORTING_NOTES.md b/mstar/model/ming_omni_flash/PORTING_NOTES.md
new file mode 100644
index 00000000..c6631733
--- /dev/null
+++ b/mstar/model/ming_omni_flash/PORTING_NOTES.md
@@ -0,0 +1,1056 @@
+# Ming-flash-omni-2.0 — porting notes
+
+Native mstar port of `inclusionAI/Ming-flash-omni-2.0`. This directory began as
+a scaffold; the punch list below tracks what has landed and what remains.
+
+## Status
+
+- `benchmark/base.py` has `MingFlashOmni` + `ModelType.MING_FLASH_OMNI`.
+  Benchmarking against a vllm-omni server **works today** with
+  `--inference-system vllm_omni` (see `benchmark/vllm_omni_instructions.md`).
+- Step 1 (config port) — DONE. `mstar/model/ming_omni_flash/config.py`
+  loads the released ckpt; 10 tests in `test/modular/test_ming_flash_omni_config.py`.
+- Step 2 (tokenizer + processor wiring) — DONE.
+  `MingFlashOmniModel.__init__` resolves the snapshot, stages Ming source
+  files (see "Ming source dependency" below), and loads
+  `BailingTokenizer` + `BailingMM2Processor` with graceful fallback;
+  11 tests in `test/modular/test_ming_flash_omni_tokenizer.py`.
+- Steps 3a–3f (thinker port, TP, text-only `/generate`) and 4a–5b (encoder
+  modules + submodules) are marked DONE in the punch list below;
+  `MingFlashOmniModel.__init__` no longer raises `NotImplementedError`.
+
+## Ming source dependency (loading the tokenizer/processor)
+
+The released HF checkpoint `inclusionAI/Ming-flash-omni-2.0` ships
+**only weights and sub-dir configs**. The tokenizer/processor Python
+modules (`configuration_bailingmm2.py`, `tokenization_bailing.py`,
+`processing_bailingmm2.py`, etc.) live in the source repo at
+https://github.com/inclusionAI/Ming . To load the tokenizer/processor:
+
+```bash
+# 1. Clone the source repo
+git clone https://github.com/inclusionAI/Ming.git /path/to/Ming
+
+# 2. Install extra Python deps Ming's modules depend on
+pip install opencv-python-headless openai-whisper
+
+# 3. Tell mstar where to find the source repo
+export MING_CODE_DIR=/path/to/Ming
+# (or pass ming_code_dir="/path/to/Ming" to MingFlashOmniModel)
+```
+
+`MingFlashOmniModel.__init__` (via `_prepare_tokenizer_dir`) symlinks
+the required .py and .json files from `$MING_CODE_DIR` alongside the
+snapshot's `config.json` so transformers' `trust_remote_code` machinery
+can resolve them. The snapshot dir is also pushed onto `sys.path` so
+the dynamic-module loader's sibling imports resolve.
+
+## Role-handling nuance (chat templates)
+
+Ming-flash-omni-2.0 ships **two** chat-template implementations with
+**different role conventions**:
+
+- `tokenizer.apply_chat_template(messages)` — uses the **jinja template
+  in `tokenizer_config.json`**. Accepts standard OpenAI roles
+  (`user` / `assistant` / `system`) and remaps them to Ming's uppercase
+  `HUMAN` / `ASSISTANT` / `SYSTEM` inside the template. This is the path
+  vllm-omni's serving layer uses → the benchmark side works unchanged.
+
+- `processor.apply_chat_template(messages, sys_prompt_exp=..., use_cot_system_prompt=...)`
+  — uses the **Python implementation in `BailingMM2Processor`** (Ming
+  source repo). **Strict**: asserts `role in [HUMAN, ASSISTANT]` and
+  raises `AssertionError` on lowercase OpenAI roles. The native mstar
+  `process_prompt` (step 7) will need this path for the multimodal
+  preprocessing (vision feature extraction, audio padding, etc.) and
+  must explicitly remap roles before calling.
+
+## Upstream reference
+
+Treat the vllm-omni port as the source of truth for architecture. Files to
+read (~6.5 KLOC in total):
+
+| Concern | vllm-omni file |
+|---|---|
+| Pipeline glue | `vllm_omni/model_executor/models/ming_flash_omni/pipeline.py` (141 LOC) |
+| Top-level model | `ming_flash_omni.py` (255 LOC) |
+| Thinker (Ling-2.0 MoE + multimodal) | `ming_flash_omni_thinker.py` (1,164 LOC) |
+| Talker (CFM + LLM) | `ming_flash_omni_talker.py` (586) + `talker_module.py` (1,145) |
+| Audio VAE | `audio_vae.py` (392) |
+| Audio encoder | `audio_encoder.py` (246) |
+| Vision encoder | `vision_encoder.py` (125) + `projectors.py` (184) |
+| Ling MoE backbone | `modeling_bailing_moe_v2.py` (892) |
+| Prompt utils | `prompt_utils.py` (134) — `IMAGE_PATCH_TOKEN`, `DEFAULT_NUM_QUERY_TOKENS=256`, TTS caption template |
+| Text processing | `text_processing.py` (535) |
+| Speaker presets | `spk_embedding.py` (44) + `voice_presets.py` (289) |
+| Config | `vllm_omni/transformers_utils/configs/ming_flash_omni.py` (420) |
+| Stage input processor | `vllm_omni/model_executor/stage_input_processors/ming_flash_omni.py` |
+| ImageGen pipeline | `vllm_omni/diffusion/models/ming_flash_omni/` |
+| Deploy yamls | `vllm_omni/deploy/ming_flash_omni{,_image,_thinker_only,_tts}.yaml` |
+
+## mstar parallels
+
+Mirror the structure of `mstar/model/qwen3_omni/` end-to-end. That model is
+the closest analog (multimodal thinker + speech talker + vocoder), and the
+graph-walk / partition / streaming patterns transfer 1:1.
+
+| mstar surface | Qwen3-Omni reference | Ming-flash-omni equivalent |
+|---|---|---|
+| Model class | `qwen3_omni_model.py` (1,529) | `ming_omni_flash_model.py` |
+| Submodules | `submodules.py` (2,016) | `submodules.py` (TODO) |
+| Config | `config.py` (544) | `config.py` |
+| Talker | `components/talker.py` (549) + `code2wav.py` (534) | `components/talker.py` + `audio_vae.py` (TODO) |
+| Thinker | `components/thinker.py` (259) | `components/thinker.py` (TODO) |
+| Attention / RoPE | `components/attention.py` + `rope.py` | likely shareable; check Ling-2.0 attention shape |
+
+## Punch list (in order)
+
+1. **Config port — DONE.** `mstar/model/ming_omni_flash/config.py`
+   loads `config.json` + sibling subdir configs (talker / image-gen) into
+   a dataclass tree. Verified via 10 tests in
+   `test/modular/test_ming_flash_omni_config.py`.
+
+2. **Tokenizer + processor — DONE.** `MingFlashOmniModel.__init__`
+   resolves the snapshot, stages Ming source files alongside it (see
+   "Ming source dependency" above), and loads `BailingTokenizer` +
+   `BailingMM2Processor` with graceful fallback. The chat-template role
+   handling has two paths (see "Role-handling nuance" above); the native
+   `process_prompt` (step 7) will use the strict processor path and must
+   remap roles. Verified via 11 tests in
+   `test/modular/test_ming_flash_omni_tokenizer.py`.
+
+3. **Ling-2.0 thinker LLM port — IN PROGRESS.**
+   - **3a — DONE** (`components/router.py`, `rope.py`, `attention.py`):
+     architecture-novel pieces (MultiRouter group-limited top-k, partial
+     3D `video_rope`, QK-norm attention). 12 tests in
+     `test/modular/test_ming_flash_omni_components.py`.
+   - **3b — DONE** (`components/moe.py`, `decoder_layer.py`, `model.py`):
+     `LingMoeBlock` (3-router text/image/audio with `torch.where`
+     per-token swap), `LingDecoderLayer` (hybrid dense/MoE per
+     `first_k_dense_replace`), full `LingMoeModel` (embed + N layers +
+     RMSNorm + lm_head). 9 tests in `test_ming_flash_omni_model.py`.
+   - **3c — DONE** (`loader.py`): weight loader that maps the released
+     ckpt's `model.model.*` namespace to `LingMoeModel`'s state_dict,
+     with per-expert gate/up/down fusion into the packed
+     `experts.gate_up_proj` tensor via mstar's existing
+     `WeightConverter` machinery. Real-ckpt smoke test loads embed +
+     dense layer 0 + lm_head from the released shards and runs a
+     forward — output is finite bf16 logits at the expected
+     `(T, vocab_size)` shape. 6 tests in
+     `test_ming_flash_omni_loader.py` (4 pure-Python + 2 CUDA+snapshot).
+   - **3e — DONE** (TP-aware variants): `LingAttention` uses
+     `QKVParallelLinear` + `RowParallelLinear` (per-rank heads + dense
+     row-parallel); `LingMoeBlock` shards fused experts by
+     `shard_inter = moe_intermediate_size / tp_size` and uses mstar's
+     existing `_gate_up_weight_loader` / `_down_proj_weight_loader`
+     for per-rank weight slicing; dense layer-0 MLP uses
+     `ParallelGatedMLP`; `LingMoeModel` threads `comm_group` through
+     every decoder layer. Weight loader refactored onto mstar's
+     `load_hf_weights` + 770 `StackedParamRule`s (3 per expert ×
+     num_experts + dense MLP + synthetic QKV). The packed
+     `attention.query_key_value.weight` from the checkpoint is split
+     into synthetic `q_proj` / `k_proj` / `v_proj` keys by
+     `_split_packed_qkv` so `QKVParallelLinear`'s standard weight
+     loader handles per-rank head slicing.
+
+     **Verified via TP=8 mstar-serve smoke** (8 H100s): server starts,
+     all 8 workers load 507 thinker params each (one per packed
+     parameter; per-rank ~40 GB), KVCacheEngine warmup_and_capture
+     completes, torch.compile applies, dedicated GPU threads spin up,
+     port 8092 listens. Per-rank model + KV cache is well under 80 GB.
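+     TP=4 was tried first and OOMed at 78.58 GB / 80 GB — later traced to an fp32-allocation bug in `get_submodule` (since fixed; TP=4 now fits
+     in ~57-62 GB/rank, see `configs/ming_flash_omni_thinker_only_tp4.yaml`). TP=8 has plenty of headroom.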
+
+     **Known gap (resolved in 3f)**: see step 3f.
+
+   - **3d — DONE** (cache wiring + submodule + engine integration):
+     `LingAttention` now uses `cache_handle.run_attention` for paged
+     KV-cache attention (keeps the custom partial-3D rope inline);
+     `BailingMoeV2ThinkerSubmodule` in `submodules.py` implements
+     `prepare_inputs` / `preprocess` / `forward` / `check_stop` for
+     the prefill + decode walks; `MingFlashOmniModel.__init__` no
+     longer raises NotImplementedError and all Model ABC methods
+     (`get_kv_cache_config`, `get_graph_walk_graphs`, `get_partitions`,
+     `process_prompt`, `postprocess`, `get_submodule`, etc.) are
+     implemented for the text-only path. 12 tests in
+     `test_ming_flash_omni_model.py` + the existing 30+ Ming tests
+     still pass.
+
+     **Verified via `mstar-serve` smoke**: the engine instantiates the
+     model class, calls `get_submodule("Thinker")`, and reaches
+     `load_thinker_weights` — failing with OOM on a single GPU
+     (loaded ~75 GB before exhausting the 80 GB H100). The engine
+     plumbing itself works; **single-GPU OOM is the expected blocker
+     until step 3e brings TP-aware variants**. To actually serve the
+     full 100B model we need TP=4 distributing the experts + attention
+     across 4 H100s.
+
+   - **3f — DONE** (graph wiring for the text-only generate loop):
+     two model-side bugs blocked the first end-to-end `/generate`
+     response on top of step 3e.
+
+     (a) `BailingMoeV2ThinkerSubmodule` had no `postprocess` hook.
+     The decode loop's output edge is named `text_inputs` so the
+     loop feeds the previous sampled token back into the next
+     iteration. `submodule.forward` returns `{"logits": [...]}`;
+     the KV-cache engine samples into `{"new_token": [...]}`; but
+     the graph router needs a `text_inputs` key under that name.
+     Added `postprocess` that rebinds `new_token → text_inputs`,
+     mirroring `OrpheusLLMSubmodule.postprocess`. Without
+     this, every decode iteration hit `IndexError` at
+     `prepare_inputs` (`text_inputs` list arrived empty), which
+     is the same symptom the 3e notes called out.
+
+     (b) The prefill / decode output edges used `EMPTY_DESTINATION`
+     + `conductor_new_token=True` rather than
+     `EMIT_TO_CLIENT` + `output_modality="text"`. With (a) fixed
+     the loop produced tokens, but the API server received
+     `{"outputs": {}}` because no edge routed `new_token` to the
+     client. Switched to Qwen3-Omni's pattern: prefill emits its
+     first token to the client and the decode-loop section emits
+     each subsequent sampled token via a parallel
+     `EMIT_TO_CLIENT, name="new_token", output_modality="text"`
+     edge alongside the `text_inputs` loopback.
+
+     **Environment / dependency patches collected along the way**
+     (not Ming code, but required on this box to reach a working
+     forward):
+
+     * `BailingTokenizer` doesn't load under transformers >= 5.0:
+       (i) accessor properties reference `self.verbose`, removed
+       in 5.x — set a class-level `verbose = False`; (ii)
+       `__init__` sets `self.add_bos_token` before
+       `super().__init__()` and the 5.x setter calls
+       `update_post_processor()` which dereferences the not-yet-
+       built `self._tokenizer`. Both patches live in
+       `_patch_bailing_tokenizer_for_transformers5` in
+       `ming_omni_flash_model.py`, applied once after the first
+       `AutoTokenizer.from_pretrained` raises an `AttributeError`
+       matching either signature.
+
+     * `LingMoeBlock._dispatch_tp` always called
+       `mstar.utils.fused_moe.fused_experts`, which hard-requires
+       `sgl_kernel`. On boxes where the installed `sgl_kernel.so`
+       has an ABI mismatch against the running torch (the
+       importlib-level error doesn't propagate as a normal
+       `ImportError` until you actually call into the .so), this
+       crashes mid-forward. Added a naive fallback that calls
+       `dispatch_experts_fused` on each rank's expert shard then
+       all-reduces; math is equivalent because sum-over-TP and
+       sum-over-top-k commute.
+
+     * `flashinfer-python` 0.6.6 ships a Python wrapper that
+       passes 10 args to the bundled `top_p_sampling_from_probs`
+       op while `flashinfer-jit-cache` 0.6.2 expects 8. Pin
+       `flashinfer-python==0.6.2` (via `pip install --no-deps`)
+       to match the jit-cache; the alternative would be rebuilding
+       the cache against 0.6.6.
+
+     **Verified via `mstar-serve` smoke (TP=8 on 8 H100s)**:
+     /generate returns real model text. _TODO: details to be filled in
+     by the verification curl in step 3g (benchmark wiring)._
+
+   Note: expert layout doesn't share with Qwen3-Omni's MoE block —
+   `MultiRouter` (3 gates + modality masks) is Ling-specific, and
+   the per-expert fused weight tensor has its own shape constraints.
+
+4. **Vision + audio encoders.** Stateless graph nodes. Port
+   `vision_encoder.py` + `projectors.py` and `audio_encoder.py`. Wire into
+   the prefill graph walks.
+
+   - **4a — DONE** (`components/projectors.py`,
+     `components/vision_encoder.py`, `components/audio_encoder.py`):
+     pure-port encoder + projector modules with weight-key parity
+     against the released ckpt's top-level prefixes
+     (`vision.*`, `audio.*`, `linear_proj.*`, `linear_proj_audio.*`).
+
+     * `MingVisionProjector` / `MingAudioProjector` mirror the
+       `nn.Sequential` chains built inline in
+       `modeling_bailingmm2.py` (Linear→GELU→Linear for vision,
+       Conv1d→Transpose→GELU→Linear→Transpose for audio). Layer
+       indices match the on-disk keys (`linear_proj.{0,2}` vision,
+       `linear_proj_audio.{0,3}` audio).
+
+     * `build_vision_encoder` constructs Ming's
+       `Qwen3MoeVisionTransformer` via dynamic import from the staged
+       Ming source dir (same path used by the tokenizer + processor).
+       Reused as-is rather than forked — no vLLM dep, ~1 GB at bf16,
+       runs on a single GPU.
+
+     * `MingAudioEncoder` is a self-contained port of vllm-omni's
+       packed-sequence Whisper encoder (~250 LOC) — no
+       `openai-whisper` runtime dep, optional flash-attn varlen fast
+       path with a manual fallback. Param names match upstream
+       Whisper (`query` / `key` / `value` / `out`,
+       `mlp.{0,2}.{weight,bias}`) so the released ckpt's
+       `audio.blocks.N.*` keys load by state-dict equality.
+
+     * 17 tests in `test/modular/test_ming_flash_omni_encoders.py`:
+       12 pure-Python (projector shape / layer indices / forward /
+       audio encoder weight-key parity / packed-attention fallback
+       shape) + 1 snapshot-gated (vision encoder builds from the
+       real `VisionEncoderConfig`) + 1 CUDA-gated (forward smoke
+       under eager attention — currently skipped on this box for
+       missing libnvrtc-builtins, not a code bug).
+
+   - **4b — DONE** (encoder weight loading): `loader.py` now exposes
+     `load_vision_encoder_weights`, `load_audio_encoder_weights`,
+     `load_vision_projector_weights`, `load_audio_projector_weights`
+     on top of a shared `_load_prefixed_state_dict` helper. None of
+     these are TP-aware — vision + audio encoders colocate on rank 0
+     in the typical topology (see `configs/ming_flash_omni.yaml`) so
+     a plain prefix-strip + `load_state_dict` path suffices. The
+     projector loaders also prepend `proj.` to the stripped key so
+     the on-disk `linear_proj.{0,2}.*` / `linear_proj_audio.{0,3}.*`
+     keys hit the `nn.Sequential` slot by integer index.
+
+     Verified by 4 snapshot-gated tests in
+     `test_ming_flash_omni_encoders.py` against the real
+     `/dev/shm/ming-hybrid` ckpt — all four prefixes load strictly
+     (no missing / unexpected). The audio encoder's
+     `positional_embedding` is loaded as a buffer (overrides the
+     sinusoidal init); the vision encoder loads all 27 blocks +
+     merger + deepstack_merger_list cleanly.
+
+5. **Thinker graph walks.** `prefill_text`, `prefill_audio`, `prefill_vision`,
+   `prefill_video`, `thinker_decode`. Follow Qwen3-Omni's pattern for
+   conditional walks based on `input_modalities`.
+
+   - **5a — DONE** (`submodules.py`, `ming_omni_flash_model.py`): the two
+     encoder NodeSubmodules and their construction paths.
+
+     * `VisionEncoderSubmodule` wraps Ming's `Qwen3MoeVisionTransformer`
+       + `MingVisionProjector`, mirrors
+       `modeling_bailingmm2.extract_image_feature` (encoder → projector
+       → L2 norm). `prepare_inputs` raises clearly on missing
+       `pixel_values` / `image_grid_thw` and promotes 1-D
+       `[T, H, W]` grid_thw to `(1, 3)`.
+
+     * `AudioEncoderSubmodule` wraps `MingAudioEncoder` +
+       `MingAudioProjector`. Accepts either a single `(n_mels, T)` clip
+       or a `(B, n_mels, T)` batched tensor and optionally trims the
+       padded tail using `audio_seqlens`. Per-clip embeddings are
+       concatenated along time; L2-norm is applied when
+       `audio_config.norm_query_embeds` is set (true on the released
+       ckpt — matches `modeling_bailingmm2.extract_audio_feature`).
+
+     * `get_node_engine_types` now registers
+       `vision_encoder` / `audio_encoder` as `EngineType.STATELESS`
+       alongside the KV-cache Thinker. Construction routes through
+       new `_create_vision_encoder_submodule` /
+       `_create_audio_encoder_submodule` helpers that build, dtype-cast,
+       and weight-load via the loaders from step 4b.
+
+     * 12 tests in `test/modular/test_ming_flash_omni_submodules.py`:
+       10 pure-Python (input-validation, output shape, L2 norm,
+       audio batched-vs-single equivalence, audio_seqlens trim,
+       grid_thw promotion, node-type registration, friendly error on
+       unknown node) + 2 snapshot-gated (full
+       `_create_audio_encoder_submodule` on the real ckpt — verifies
+       Conv1 + projector params are non-zero post-load).
+
+   - **5b — DONE** (Thinker prefill dispatch + position helpers):
+     `BailingMoeV2ThinkerSubmodule.prepare_inputs` now dispatches on
+     `graph_walk` and emits either `input_ids` (text-only walks) or
+     `input_embeds` + `custom_pos_ids` (multimodal walks). `preprocess`
+     and `forward` route both shapes through to `LingMoeModel`'s
+     existing dual input_ids/input_embeds + 1D/3D position_ids
+     handling — no new model.py path needed.
+
+     Three new position-id helpers live in `components/positions.py`,
+     each producing `(3, T)` long tensors compatible with
+     `LingPartialMRotaryEmbedding`'s `video_rope` branch:
+
+     * `get_rope_index_text(seq_len, start_pos)` — three identical
+       sequential rows. Matches `modeling_bailing_moe_v2.get_rope_index`'s
+       pure-text branch (`:658-675`).
+     * `get_rope_index_audio` — alias to the text helper (Ming
+       does not special-case audio in `get_rope_index`).
+     * `get_rope_index_vision(grid_thw, start_pos, spatial_merge_size,
+       second_per_grid_t=None, tokens_per_second=2)` — per-image
+       3D grid math from `modeling_bailing_moe_v2` (`:625-647`).
+       Optional video timestamp scaling via
+       `second_per_grid_t * tokens_per_second`.
+
+     The Thinker dispatch:
+
+     * `prefill` / `prefill_text` — backward-compat text path
+       (unchanged from step 3f).
+     * `prefill_audio` — wraps `audio_embeds` with `audio_start`
+       / `audio_end` sentinel embeddings, builds text-like positions
+       for the span.
+     * `prefill_vision` / `prefill_video` — wraps `vision_embeds`
+       with `image_start`/`image_end` (or `video_start`/`video_end`),
+       builds grid-aware 3D positions; `eos` sentinel sits at
+       `global_max(vision_pos) + 1` so the next walk's text positions
+       can resume without collision (matches Ming source's
+       `llm_pos_ids_list[-1].max() + 1` accounting).
+     * `decode` / `thinker_decode` — single-token AR step (unchanged).
+
+     Sentinel embeds are lazily computed per device on first use.
+     The model.py construction now passes `config=self.config` to the
+     submodule so it can read `vision.spatial_merge_size`,
+     `thinker_llm.tokens_per_second`, and the `*_start_token` /
+     `*_end_token` ids.
+
+     Step 5b restricts to single-image / single-clip requests
+     (multi-image splice via `Sequential` graph wiring lands in 5c).
+
+     21 new tests across `test_ming_flash_omni_positions.py` (11) and
+     `test_ming_flash_omni_submodules.py` (10): position-id shape /
+     offset / abs-time math, missing-input error paths,
+     multi-image rejection, sentinel embed correctness for audio /
+     image / video walks, start_pos advancement, legacy `prefill`
+     walk name compat. All green.
+
+   - **5c — DONE** (graph wiring + multimodal scheduling):
+     `get_graph_walk_graphs` now returns five walks instead of the
+     step 3f text-only `prefill` / `decode` pair:
+
+     * `prefill_text` — bare `Thinker` node.
+     * `prefill_audio` — `Sequential([audio_encoder, Thinker])`
+       where the encoder emits `audio_embeds` into the Thinker.
+     * `prefill_vision` — `Sequential([vision_encoder, Thinker])`;
+       `image_grid_thw` routes to BOTH the encoder (for spatial
+       positions on the patches) AND the Thinker (for 3D MRoPE math
+       around the vision span).
+     * `prefill_video` — same shape as `prefill_vision` plus
+       `video_second_per_grid` routed into the Thinker.
+     * `thinker_decode` — AR loop, renamed from step 3f's `decode`.
+
+     `get_partitions` lists all five walks under the single `Thinker`
+     partition with `initial_walk="prefill_text"`. Two new helpers
+     drive the scheduling:
+
+     * `_build_thinker_prefill_schedule(input_modalities, input_signals)`
+       — one schedule step per modality, in `input_modalities` order;
+       each step is `(walk_name, {input_name: TensorPointerInfo})`.
+       Modalities listed without matching tensors in `input_signals`
+       are silently skipped (parity with qwen3_omni).
+     * `_get_thinker_prefill_inputs(metadata, input_signals)` — emits
+       one `GraphEdge` per input for the current step, routing each
+       to the right node (encoder vs Thinker), including the dual
+       `image_grid_thw` edge for vision walks.
+
+     `get_initial_forward_pass_args` builds the schedule, picks the
+     first walk, and stashes the schedule + step counter on the
+     metadata. `get_partition_forward_pass_args` is the Thinker state
+     machine: advance schedule → transition to `thinker_decode` →
+     return `request_done=True` after the decode loop unwinds. Mirrors
+     `mstar/model/qwen3_omni/qwen3_omni_model.py:765+` minus the
+     Talker / Code2Wav partitions (which land in step 6+).
+
+     Empty-schedule edge case (no usable modalities) short-circuits
+     to `request_done=True` so the conductor doesn't hang.
+
+     21 tests in `test/modular/test_ming_flash_omni_graph.py`:
+     graph-walk structure (5 walks, encoder→Thinker chaining, dual
+     grid_thw edge, loop feedback edge), partition listing, prefill
+     schedule construction for text-only / text+audio+image / video /
+     unknown-modality / no-inputs cases, edge routing for each walk
+     type, full state-machine drive across a text+audio request
+     (init → audio prefill → decode → done).
+
+6. **Talker + Audio VAE.** Port `ming_flash_omni_talker.py` + `talker_module.py`
+   + `audio_vae.py`. The talker is CFM-based (conditional flow matching over
+   continuous latents) rather than discrete-codec-AR like Qwen3-Omni's, so the
+   streaming topology will differ. Re-read `mstar/streaming/topology.py`
+   before wiring connections.
+
+   Broken out into sub-steps because the upstream code is ~2,100 LOC
+   across three files (`ming_flash_omni_talker.py` 586 LOC +
+   `talker_module.py` 1,145 LOC + `audio_vae.py` 392 LOC):
+
+   - **6a — DONE** (config port): replaced the step-1 raw-dict
+     skeleton `TalkerConfig` with typed sub-config dataclasses so the
+     modeling code (CFM head + DiT blocks + Aggregator + AudioVAE)
+     can read dims off `config.talker.*` directly.
+
+     New dataclasses in `components/config.py` (under `TalkerConfig`):
+     * `TalkerLLMConfig` — Qwen2 backbone (896-dim, 24L, 14H/2KV,
+       sliding-window=False, RoPE θ=1e6). Distinct from
+       `ThinkerLLMConfig` (different vocab, no MoE, smaller dims).
+       `head_dim` property computes 896/14=64.
+     * `DiTBlockConfig` — shared shape for `flowmodel` and
+       `aggregator` (depth=8, hidden_size=1024, num_heads=16,
+       mlp_ratio=4, in_channels=64); only `dropout` differs (0 vs
+       0.1 on the released ckpt). `head_dim` / `intermediate_size`
+       properties for convenience.
+     * `AudioVAEConfig` — encoder + decoder dims (latent_dim=64,
+       input_dim=80, hop_size=320, output_dim=882),
+       `sample_rate=44100`, `patch_size=4`. Encoder/decoder Qwen2
+       backbones kept as raw dicts (`enc_backbone` /
+       `dec_backbone`) for the eventual block-builder to lift.
+       Discriminator + loss-weight fields retained for round-trip
+       fidelity but not consumed at inference.
+
+     `TalkerConfig.from_subdir` now constructs the typed sub-configs
+     directly (was raw-dict assignment); `vae_sample_rate` /
+     `vae_patch_size` retained as `@property` accessors for backward
+     compat with `Model.get_output_sample_rate`.
+
+     8 new tests in `test_ming_flash_omni_config.py` (7 freshly
+     authored + 1 updated to assert the new typed shape):
+     - `TalkerLLMConfig` defaults / head_dim / unknown-key filter
+     - `DiTBlockConfig` intermediate_size / head_dim derivations
+     - `AudioVAEConfig` enc/dec kwarg lifting + fallback when
+       enc_kwargs missing latent_dim
+     - `TalkerConfig.from_subdir` end-to-end with synthetic tmp dirs
+       (round-trips all three sub-configs)
+     - Default-factory check that `TalkerConfig()` with no args yields
+       typed sub-configs
+
+     Verified by re-running the existing snapshot-gated
+     `test_subdir_configs_load_when_present` against the real
+     `/dev/shm/ming-hybrid/talker/` tree — typed fields read
+     correctly (LLM hidden_size=896, VAE sample_rate=44100,
+     flowmodel depth=8, aggregator dropout=0.1).
+
+   - **6b — DONE** (CFM + DiT building blocks): new
+     `components/talker_dit.py` ports the modeling primitives from
+     upstream `talker_module.py:1-402`. Module names mirror upstream
+     so the released ckpt's `talker/model.safetensors` keys
+     (`flowmodel.blocks.N.attn.to_q.weight`,
+     `flowmodel.blocks.N.mlp.ff.0.0.weight` etc.) will load by
+     state-dict equality once the loader path lands.
+
+     Two external deps replaced with minimal in-tree ports:
+     * `DiTTimestepEmbedding` — sinusoidal pos-emb + Linear+SiLU+Linear
+       MLP, matching vllm-omni's `timestep_embedding.DiTTimestepEmbedding`.
+     * `RotaryEmbedding` — non-xpos 1-D RoPE matching
+       `x_transformers.RotaryEmbedding.forward_from_seq_len` exactly,
+       including the INTERLEAVED-pair `rotate_half(x1, x2) = (-x2, x1)`
+       layout. This is DIFFERENT from Ling-2.0 thinker's neox-cat
+       layout — adjacent freq pairs share the same value here, while
+       Ling's halves repeat across the split. Required so the released
+       weights line up with the same RoPE shape they were trained
+       against.
+
+     The CFM module wraps the DiT and integrates an ODE/SDE step grid
+     from `get_epss_timesteps` with classifier-free guidance.
+     Sway-sampling-coef remap is honored (`-1.0` default packs more
+     steps near `t=0`). The released ckpt's `steps=10` schedule is
+     the predefined `[0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32] / 32`.
+
+     Skipped from `talker_module.py`: `CFMGraphExecutor`,
+     `CFMGraphExecutorPool` (vllm-specific batching), `Aggregator`
+     (lands in 6c), and the resampling / silence-trim /
+     `build_tts_input` / `MingAudioGenerator` orchestration utilities
+     (these land in 6e, where the Talker submodule wires the streaming
+     graph).
+
+     New factory `build_talker_cfm(talker_config, llm_cond_dim=None,
+     dtype=..., device=...)` constructs DiT + CFM directly from a
+     `TalkerConfig` so 6e's `_create_talker_submodule` will be a
+     one-liner. `llm_cond_dim` defaults to `talker.llm.hidden_size`
+     (896 on the released ckpt).
+
+     28 tests in `test_ming_flash_omni_talker_dit.py`:
+     - RotaryEmbedding layout: rotate-half pair negation,
+       freqs.shape `(1, T, dim)`, adjacent-pair-shared-frequency
+       invariant, partial-rotary apply preserves passed-through tail.
+     - DiTTimestepEmbedding: shape, dtype-stability, even-dim guard.
+     - RMSNorm normalises to unit-rms per row.
+     - FeedForward layer indices align with the ckpt's
+       `ff.0.0` / `ff.0.1` / `ff.2` keys.
+     - Attention: `to_q/to_k/to_v/to_out.0` param names, qk_norm
+       branches, rope on/off shape preservation, unknown-qk_norm
+       rejection.
+     - DiTBlock + FinalLayer + CondEmbedder round-trip.
+     - DiT.forward output `(B, 1 + his + patch, out_channels)` for
+       no-spk and `(B, 2 + his + patch, ...)` with spk; CFG forward
+       returns trailing `x.shape[1]` rows.
+     - CFM.sample shape preservation + length / sde_rnd validation,
+       sway=None branch.
+     - `build_talker_cfm` from real `TalkerConfig` defaults yields
+       the expected DiT dims (1024 hidden, 8 layers, 16 heads,
+       cond_embedder input = 896) + `llm_cond_dim` override.
+
+   - **6c — DONE** (Aggregator + Qwen2 backbone + heads):
+
+     `_Attention` / `_DiTBlock` grew a `mask` parameter to match
+     upstream API exactly. For the CFM path the caller passes
+     `mask=None`, so behaviour is unchanged; the Aggregator's mask
+     branch is now exercised. Mask semantics mirror upstream's
+     `talker_module.Attention.forward`:
+     * `attn_mask_enabled=True` builds an SDPA `attn_mask` from the
+       (B, T) key-padding mask so padded keys are excluded from
+       softmax.
+     * Regardless of `attn_mask_enabled`, the masked-out output rows
+       are zeroed via `masked_fill(~mask, 0)` — matches upstream's
+       unconditional zeroing branch.
+
+     `Aggregator` (port of `talker_module.Aggregator:702-744`): same
+     DiTBlock stack as the CFM head, but the input embedder is
+     `nn.Linear` (audio-latent → hidden) plus a learnable [CLS]-style
+     `word_embedder` (`nn.Embedding(1, hidden_size)`) prepended to the
+     sequence. The output is the `[CLS]` row only, projected through
+     `final_layer` to `llm_input_dim` so the condition feedback loops
+     back into the talker LLM's embedding space.
+
+     `build_aggregator(talker_config, llm_input_dim=None, ...)` and
+     `build_talker_cfm(...)` both honor `attn_mask_enabled` from the
+     respective DiTBlockConfig (False on the released ckpt).
+
+     **Talker LLM backbone** — `build_talker_llm(talker_llm_config,
+     attn_implementation="sdpa", ...)` constructs a stock
+     `transformers.Qwen2Model` from `TalkerLLMConfig`. No custom modeling
+     path: the talker LLM colocates on a single rank in the typical
+     topology and the ckpt's `talker/model.safetensors` keys are
+     plain `model.*` Qwen2 keys, so reusing HF keeps the surface small
+     and inherits HF's KV-cache + attention impl. Matches what the
+     upstream `MingFlashOmniTalkerForConditionalGeneration.__init__`
+     does (line 116: `self.model = Qwen2Model(llm_config)`).
+
+     **Talker heads** — `build_talker_heads(talker_config,
+     spk_embed_dim=192, ...)` returns a dict of two `nn.Linear` heads:
+     * `stop_head` — `Linear(hidden_size, 2, bias=True)`: binary
+       end-of-audio classifier consumed during the generation loop.
+     * `spk_head` — `Linear(192, hidden_size, bias=True)`: projects
+       a CAMPPlus speaker embedding into the LLM hidden space; the
+       projected embedding is prepended to the prompt as a voice-
+       condition token.
+
+     13 new tests appended to `test_ming_flash_omni_talker_dit.py`:
+     - Attention mask output-zeroing (unconditional), SDPA attn_mask
+       branch (attn_mask_enabled=True), no-mask no-zeroing regression
+       guard.
+     - Aggregator: `[CLS]` row only output `(B, 1, llm_input_dim)`,
+       single-row `word_embedder`, mask propagation through DiT
+       blocks, shape stability across varying T, `build_aggregator`
+       from real TalkerConfig + `llm_input_dim` override.
+     - `build_talker_llm`: returns `transformers.Qwen2Model` with
+       correct dims; tiny-input forward returns hidden states.
+     - `build_talker_heads`: stop_head (h→2) + spk_head (192→h) with
+       biases; `spk_embed_dim` override.
+
+     Total talker_dit tests: 41 (28 from 6b + 13 from 6c). Full
+     Ming step-1..7 + 6a/6b/6c suite: **162 pass / 9 skipped / 0 fail
+     / 1 deselected** (deselected is pre-existing cuDNN-broken
+     attention forward, unrelated).
+
+   - **6d — DONE** (AudioVAE): new `components/audio_vae.py` ports
+     `vllm_omni/.../audio_vae.py` (~392 LOC). Module tree mirrors
+     upstream so the released ckpt's `talker/vae/model.safetensors`
+     keys load 1:1 by state-dict equality once the loader path
+     lands (6f).
+
+     Building blocks:
+     * `_ISTFT` — sliding-window OLA inverse-STFT. Two padding
+       modes: `"center"` wraps `torch.istft` directly; `"same"` is
+       the hand-rolled `F.fold` reconstruction with optional
+       streaming buffers (carries the trailing `win_length - hop`
+       samples + window envelope across chunks).
+     * `_ISTFTHead` — Linear → STFT mag (exp+clip) / phase → `_ISTFT`.
+     * `_StreamingLinearUpsample` — chunked linear upsampler with
+       1-step lookahead so chunked output matches single-shot output
+       at chunk boundaries.
+     * `_Encoder` — waveform → latent params. `get_frames` windows
+       the waveform with stride `hop_size`, `fc1` projects to hidden,
+       Qwen2 backbone runs, then optional `aggregator` (4-layer
+       Qwen2 + `cls_embed`) summarises each patch.
+     * `_Decoder` — latent → waveform. `fc1` to hidden, optional
+       `_StreamingLinearUpsample`, Qwen2 backbone with sliding-window
+       bridge for streaming KV cache, `_ISTFTHead` to audio.
+     * `AudioVAE` — wraps encoder+decoder, exposes `encode_latent`
+       (with an inline `_oobleck_sample()` so we don't depend on the
+       broken-on-this-box `diffusers` package) and `decode`.
+
+     **Defaults fixed**: `AudioVAEConfig.encoder_input_dim` /
+     `encoder_hop_size` were previously 80 / 320 (placeholder from
+     step 6a); updated to 882 / 882 to match the released ckpt
+     (`enc_kwargs: {hop_size: 882, input_dim: 882, latent_dim: 64}`).
+     The existing 6a tests still pass since they explicitly pass
+     overrides through `from_dict`.
+
+     `build_audio_vae(audio_vae_config, dtype, device, attn_implementation=None)`:
+     auto-picks `"sdpa"` on CPU and FA2 when available on CUDA;
+     caller can pin explicitly. Mirrors vllm-omni's runtime choice
+     for the talker LLM (`llm_config._attn_implementation = "sdpa"`).
+
+     18 tests in `test_ming_flash_omni_audio_vae.py` covering:
+     - Oobleck sampler shape + mean-collapse-on-small-scale.
+     - ISTFT padding-mode validation + center / same forward paths.
+     - StreamingLinearUpsample: single-shot path, deferred-first-chunk
+       path, **chunked-vs-single-shot equivalence** (the key
+       correctness property — proves boundary lookahead is wired
+       correctly so chunked streaming doesn't introduce artefacts).
+     - ISTFTHead output shape (audio + x_pred).
+     - Encoder: `get_frames` padding arithmetic, forward without
+       patching, forward with patching (aggregator path collapses
+       to per-patch latents).
+     - Decoder: non-streaming reconstruct shape, patching path
+       routes through the upsampler.
+     - AudioVAE: construction + encode_latent shape (incl. per-clip
+       frame counts) + decode end-to-end.
+     - **Snapshot-gated parity**: built `AudioVAE.state_dict()` keys
+       contain all representative entries present in
+       `talker/vae/model.safetensors` (fc1/fc2/fc3/norm/cls_embed,
+       encoder.encoder, encoder.aggregator, decoder.fc1,
+       decoder.head.out, decoder.head.istft.window, decoder.decoder)
+       and vice versa — proves the eventual loader will be a clean
+       prefix-strip + load_state_dict.
+
+   - **6e — DONE** (Talker submodule + graph walks): split into
+     6e-1 (orchestration helper) + 6e-2 (Talker submodule + node
+     registration) + 6e-3 (graph walks + Thinker→Talker bridge).
+
+     - **6e-1 — DONE** (`components/talker_generator.py`): port of
+       upstream `MingAudioGenerator` (talker_module.py:854-1146) plus
+       the streaming-decode utilities `silence_holder` /
+       `trim_trailing_silence`. Stateless-per-request `TalkerGenerator`
+       binds Qwen2 LLM + CFM + Aggregator + stop_head + AudioVAE and
+       exposes:
+       * `generate_latents(inputs_embeds, ...)` — the AR loop:
+         repeated (`llm_step` → `cfm_sample_step` → stop check). Each
+         step emits one `(B, patch_size, latent_dim)` latent; the
+         Aggregator output becomes the next step's `inputs_embeds`;
+         the stop_head softmax gates early termination after
+         `min_new_token` steps.
+       * `cfm_sample_step` — one CFM substep-integration + Aggregator
+         + stop classification.
+       * `llm_step` — single Qwen2 forward with `StaticCache`
+         `cache_position` bookkeeping on step > 0.
+       * `decode_to_waveform(latents, stream_decode=True)` — one-shot
+         or chunked AudioVAE decode; the streaming path threads
+         `silence_holder` + a sliding `decode_pad` window across chunks.
+       * `duration_capped_steps` — the text-length → max-steps prosody
+         heuristic.
+       * `_init_his_lat` / `_update_his_lat` — history-latent sliding
+         window (right-aligns a voice-prompt latent when supplied).
+
+       Skipped from upstream: `CFMGraphExecutorPool` / `CFMGraphExecutor`
+       (vllm CUDA-graph batching — mstar's engine handles capture);
+       `build_tts_input` / `_looks_like_music_prompt` (→ step 8).
+
+       24 tests in `test_ming_flash_omni_talker_generator.py`:
+       trim_trailing_silence (empty / short-clip / silent-tail trim /
+       weird-shape passthrough), silence_holder (cache init, sub-frame
+       buffering until last_chunk), generator construction (with /
+       without VAE), his-lat zeros + right-align + window update +
+       unsupported-shape guard, cfm_sample_step output shapes +
+       stop-softmax-sums-to-1, llm_step step-0 path, generate_latents
+       per-step collection + max_steps cap, duration_capped_steps
+       heuristic, decode_to_waveform one-shot / streaming / empty /
+       no-VAE-raises, instance trim_trailing_silence.
+
+     - **6e-2 — DONE** (TalkerSubmodule + construction + node
+       registration): the talker is a STATELESS node, not an AR /
+       streaming-codec node. Ming's thinker→talker bridge passes
+       DETOKENIZED TEXT (the talker re-encodes with its own
+       `talker/llm` tokenizer — see vllm-omni `pipeline.py`'s
+       `thinker2talker`), and the CFM step count is stop_head-
+       determined rather than a conductor decode loop. So the whole
+       per-request generation (LLM prefill + CFM AR decode + AudioVAE
+       decode) runs inside one `TalkerSubmodule.forward` call.
+
+       * `TalkerSubmodule` (`submodules.py`): `prepare_inputs` embeds
+         `talker_text_inputs` token ids via the talker LLM's
+         `embed_tokens`; `forward` runs `generate_latents` →
+         `decode_to_waveform` → `trim_trailing_silence` and returns
+         `{"audio_chunk": [waveform]}` (`(1, 1, num_samples)` at the
+         VAE sample rate). `get_stateless_flavor` returns
+         `"audio_codec"` (no autocast / no torch.compile — the CFM
+         ODE loop + ISTFT are numerically sensitive).
+
+       * `get_node_engine_types` registers `Talker` as
+         `EngineType.STATELESS` when the snapshot ships a `talker/`
+         subdir; thinker-only configs omit it.
+
+       * `_create_talker_submodule` builds the full stack
+         (`build_talker_llm` + `build_talker_cfm` + `build_aggregator`
+         + `build_talker_heads` + `build_audio_vae`), loads every
+         subtree via the step-6f loaders, wraps in a
+         `TalkerGenerator` → `TalkerSubmodule`.
+
+       12 tests across `test_ming_flash_omni_talker_submodule.py` (9)
+       + an updated `test_get_submodule_rejects_unknown_node`:
+       stateless flavor, prepare_inputs embed (1-D + 2-D ids) +
+       missing-input guard, forward returns finite audio_chunk,
+       node-type registration (with / without talker config),
+       `_create_talker_submodule` no-talker guard, plus a
+       snapshot-gated end-to-end that builds the full talker from
+       real weights and generates a finite waveform.
+
+     - **6e-3 — DONE** (graph walks + Thinker→Talker bridge): the
+       talker is now a second partition wired off the Thinker, gated
+       entirely on `config.talker is not None` (thinker-only configs
+       are byte-for-byte unchanged from step 5c).
+
+       Graph + partition additions (all in `ming_omni_flash_model.py`):
+       * `get_graph_walk_graphs` adds a `talker` walk — a single
+         `Talker` node consuming `thinker_tokens`, emitting one
+         `audio_chunk` `EMIT_TO_CLIENT` edge. The `thinker_decode`
+         loop gains a `StreamingGraphEdge(name="thinker_tokens",
+         target_partition="Talker")` so each decoded token streams to
+         the talker.
+       * `get_partition_topology` declares the Thinker→Talker
+         `Connection` with a `FixedChunkPolicy(chunk_size=1,
+         continue_after_done=True)` — the talker needs the FULL text
+         before it generates, so the policy keeps the consumer alive
+         past the Thinker's text EOS.
+       * `get_partitions` adds the `Talker` partition
+         (`producer_partitions=["Thinker"]`, `initial_walk=None`).
+       * `get_output_sample_rate("audio")` returns the talker VAE
+         sample rate (44.1 kHz).
+       * `get_initial_forward_pass_args` / `get_partition_forward_pass_args`
+         dispatch a Talker branch: `_get_talker_forward` waits for
+         `producer_done`, then fires the single `talker` walk once and
+         reports `request_done` on the next invocation.
+
+       Thinker→Talker text bridge: Ming passes DETOKENIZED TEXT, not
+       hidden states. `thinker_text_to_talker_inputs` decodes the
+       thinker output ids with the thinker tokenizer and re-encodes
+       with the talker's own `talker/llm` tokenizer (loaded lazily +
+       cached via `_get_talker_tokenizer`). `_create_talker_submodule`
+       injects this as the `TalkerSubmodule.text_bridge`, and
+       `TalkerSubmodule.prepare_inputs` accepts either pre-bridged
+       `talker_text_inputs` or raw `thinker_tokens` (running the
+       bridge in the latter case).
+
+       18 tests in `test_ming_flash_omni_talker_graph.py`: thinker-only
+       path unchanged (no talker walk / partition / streaming edge),
+       talker-enabled graph structure (walk, audio edge, streaming
+       edge to Talker), partition + topology + chunk-policy
+       continue-after-done, node-type registration, audio sample rate,
+       Talker state machine (waits for producer_done, fires once,
+       then done; audio-output gating), and the text bridge
+       (decode→re-encode round-trip + missing-tokenizer guard).
+       Updated two pre-existing tests that asserted Talker was an
+       unknown node/partition.
+
+     **Step 6 complete** — audio-out `/generate` is now wireable
+     end-to-end at the model layer (live bring-up still blocked by the
+     TP=4 OOM on the 4-GPU dev box; needs TP=8 thinker + talker on a
+     spare rank).
+
+   - **6f — DONE** (weight loaders): `loader.py` exposes five new
+     entry points on top of the step-4b `_load_prefixed_state_dict`
+     helper. The helper grew two args: `subdir` (relative to
+     `local_dir` — lets us point at `talker/` or `talker/vae/` instead
+     of the snapshot root) and `allow_unexpected` (set of post-rename
+     keys allowed to appear in the ckpt without a target module slot).
+
+     Five loaders:
+     * `load_talker_llm_weights` — strips `model.` from
+       `talker/model.safetensors` for a `transformers.Qwen2Model`.
+     * `load_talker_cfm_weights` — strips `cfm.` for a `CFM(DiT)`.
+       Allows the ckpt's `model.rotary_embed.inv_freq` (we register
+       it as `persistent=False` and recompute locally — deterministic
+       from head_dim + rope_theta).
+     * `load_talker_aggregator_weights` — strips `aggregator.` for
+       an `Aggregator`. Same `rotary_embed.inv_freq` allow.
+     * `load_talker_heads_weights` — loads `stop_head.*` +
+       `spk_head.*` into the dict produced by `build_talker_heads`.
+     * `load_talker_audio_vae_weights` — empty-prefix load from
+       `talker/vae/model.safetensors` (the ckpt's `encoder.*` /
+       `decoder.*` are top-level siblings with no shared prefix —
+       no strip needed).
+
+     7 snapshot-gated tests in `test_ming_flash_omni_talker_loader.py`
+     verify strict load against `/dev/shm/ming-hybrid/talker/`:
+     - Talker LLM: representative key parity + non-zero embed table
+       after load.
+     - CFM: `model.x_embedder.weight` / `model.blocks.0.attn.to_q.weight`
+       / `model.blocks.0.mlp.ff.0.0.weight` / `model.final_layer.linear.weight`.
+     - Aggregator: `x_embedder` / `word_embedder` / `blocks.0.attn.to_q`
+       / `final_layer.linear`.
+     - Heads: `stop_head` + `spk_head` weights both load; non-zero
+       post-load; missing-key guard fires before disk I/O.
+     - AudioVAE: full encoder + decoder + aggregator + ISTFT window
+       keys loaded; CPU end-to-end decode on a real-weights latent
+       produces a finite waveform (catches catastrophic
+       dtype/layout misloads that key-name parity alone wouldn't
+       surface).
+
+     Full Ming step-1..7 + 6a/6b/6c/6d/6f suite: 187 pass / 9 skipped
+     / 0 fail / 1 deselected.
+
+7. **Process_prompt — DONE.** `MingFlashOmniModel.process_prompt` now
+   produces the full `NameToTensorList` consumed by step 5c's prefill
+   scheduler. Strategy mirrors `qwen3_omni`'s `process_prompt`: apply
+   the chat template to TEXT-ONLY messages (so the tokenizer doesn't
+   insert placeholder tokens we'd later have to strip), then run the
+   image / video / audio sub-processors separately for each modality.
+   The Ming chat template path uses `tokenizer.apply_chat_template`
+   (jinja, accepts OpenAI roles `user`/`assistant`/`system`) rather
+   than `processor.apply_chat_template` (Python implementation in
+   `BailingMM2Processor`, which raises `AssertionError` on lowercase
+   OpenAI roles — see "Role-handling nuance" above).
+
+   Input convention (`tensors: NameToTensorList`):
+     * `image_inputs` — list of CHW float [0,1] tensors per image.
+       Internal `_image_to_processor_input` converts to HWC uint8 to
+       avoid the upstream's double-rescale near-zero bug
+       (`qwen3_omni_model.py:1033-1038` documents the same gotcha).
+       Single-channel inputs auto-broadcast to 3 channels.
+     * `audio_inputs` — list of either raw 1-D float tensors (sample
+       rate assumed to be the processor default, 16 kHz) or
+       `(waveform, sample_rate)` tuples.
+     * `video_inputs` — list of (T, C, H, W) float tensors. Per-frame
+       `second_per_grid` defaults to 1.0; override via
+       `kwargs["input_metadata"]["video"][i]["second_per_grid"]`.
+
+   Output keys consumed by `_build_thinker_prefill_schedule`:
+     * `text_inputs` — list of 1-D long tensors (one per text turn).
+     * `pixel_values`, `image_grid_thw` — one entry per image.
+     * `pixel_values_videos`, `video_grid_thw`,
+       `video_second_per_grid` — one entry per video clip.
+     * `audio_features` (n_mels, T) + `audio_seqlens` (length-1 long)
+       — one entry per audio clip. Note: upstream returns audio_feats
+       as (B, T, n_mels); we transpose to (n_mels, T) per clip so
+       `AudioEncoderSubmodule.prepare_inputs` can splice without a
+       reshape.
+
+   17 tests in `test/modular/test_ming_flash_omni_process_prompt.py`:
+   text-only happy path, no-prompt audio-only path, image conversion
+   correctness (CHW float [0,1] → HWC uint8, grayscale broadcast,
+   uint8 pass-through), per-modality dispatch, missing-processor
+   error paths, multi-image / mixed-modality combinations, video
+   metadata override, snapshot-gated text+image E2E with the real
+   `BailingMM2Processor`. 16 green + 1 env-skip on this box.
+
+   Image-gen-specific `<image><imagePatch>*256</image>` block (the
+   query-token expansion for the imagegen DiT path) is deferred to
+   step 9 (ImageGen partition), since today's prefill schedule only
+   covers text-out generation.
+
+8. **TTS caption template — DONE.** `components/prompt_utils.py` ports
+   vllm-omni's `prompt_utils.py` wholesale (self-contained, no torch /
+   model deps):
+   * `create_instruction(user_input)` + `BASE_CAPTION_TEMPLATE` — the
+     JSON caption builder for the `ming_flash_omni_tts` talker-only
+     deploy. Merges only known keys (序号 / 说话人 / 方言 / 风格 / 语速
+     / 基频 / 音量 / 情感 / BGM / IP) into a deep-copied template;
+     `ensure_ascii=False` keeps the Chinese field names readable.
+   * `maybe_expand_image_gen_prompt` + `IMAGE_PATCH_TOKEN` +
+     `DEFAULT_NUM_QUERY_TOKENS=256` — the
+     `<image><imagePatch>*256</image>` query-token expansion the
+     ImageGen path (step 9) needs; landed here so the constants live
+     in one place.
+   10 tests in `test_ming_flash_omni_prompt_utils.py`: query-token
+   expansion (default 256, custom count, no-op on already-expanded /
+   empty / non-string), caption build (defaults, known-key merge,
+   unknown-key ignore, no template mutation across calls, unescaped
+   unicode, shallow BGM merge).
+
+   **8b — DONE (image-gen prompt wiring):** `process_prompt` now calls
+   `maybe_expand_image_gen_prompt` when `output_modalities` contains
+   `"image"` AND the deploy ships an `ImageGenConfig` (thinker-only
+   deploys leave the prompt untouched). The expansion count comes from
+   `config.image_gen.num_query_tokens` (= sum of img_gen_scales², 256 by
+   default), so it tracks the checkpoint rather than the hard-coded
+   constant. This is the thinker-side half of the step-9 handoff; the DiT
+   that consumes the query embeddings is still 9b. 5 tests in
+   `test_ming_flash_omni_process_prompt.py`: block appended on image
+   output, count tracks img_gen_scales, no expansion on text output,
+   no-op without ImageGenConfig, no double-expansion.
+
+9. **ImageGen partition.** Separate from the omni pipeline; lives under
+   vllm-omni's diffusion tree (`diffusion/models/ming_flash_omni/`,
+   ~1,315 LOC). Wire as a fourth partition with its own graph walk.
+   Needs `FlowEngine`-style integration. Multi-commit step.
+
+   - **9a — DONE** (config port): `ImageGenConfig` fleshed out with
+     typed sub-config dataclasses parsed from the imagegen subdir tree:
+     * `ZImageDiTConfig` (transformer/config.json) — the diffusion DiT
+       (dim=3840, 30 layers + 2 refiner, 16-channel latents, 3D axial
+       RoPE via axes_dims=(32,48,48) / axes_lens=(1536,512,512)).
+     * `ImageVAEConfig` (vae/config.json) — AutoencoderKL, 16-channel
+       latent, scaling_factor=0.3611 / shift_factor=0.1159.
+     * `ImageGenSchedulerConfig` (scheduler/) —
+       FlowMatchEulerDiscreteScheduler (shift=3.0).
+     * `ByT5MapperConfig` (byt5/byt5.json) — ByT5-small glyph encoder +
+       T5EncoderBlockByT5Mapper (4 layers → sdxl_channels=2560) for the
+       text-rendering pathway.
+     * `connector` — Qwen2 LLM (1536-dim, 28L) kept as a raw dict
+       (built via the shared Qwen2 path at construction time).
+     `from_subdirs` reads each subdir into the typed fields; the
+     `mlp/config.json` knobs (img_gen_scales, diffusion_c_input_dim,
+     use_identity_mlp, dit_type) stay at the top level.
+     13 tests (7 new pure-Python + 6 existing, incl. updated
+     snapshot-gated assertions on dit.dim=3840 / vae.latent_channels=16
+     / scheduler.shift=3.0 / byt5.sdxl_channels=2560 / connector qwen2).
+
+   - **9b — DONE** (modeling + pipeline + wiring): the full image-gen
+     stack is ported into `components/` as native pure-torch (+ stock
+     transformers) modules, decoupled from vllm-omni / vllm TP / diffusers
+     internals:
+     * `t5_block_mapper.py` + `byte5_encoder.py` — ByT5 glyph mapper +
+       encoder. Built on **stock HF `T5Block`** (unfused q/k/v/o,
+       wi_0/wi_1/wo) so Ming's `byt5_mapper.pt` loads with a plain `copy_`,
+       no stacked-param remap. 11 mapper tests + 1 snapshot-gated encoder.
+     * `zimage_transformer.py` — ZImage DiT (`ZImageTransformer2DModel`) +
+       Ming's ref-latent subclass (`MingZImageTransformer2DModel`). Drops
+       vllm's TP linears / `CachedTransformer` / fused `Attention` /
+       `RotaryEmbedding` for plain `nn.Linear` +
+       `F.scaled_dot_product_attention`. Unfused param names
+       (`attention.to_q/to_k/to_v`, `feed_forward.w1/w3`) → direct load.
+       The interleaved (is_neox_style=False) RoPE, GLIDE/DiT
+       `timestep_embedding`, and FP32 RMSNorm match the vllm-omni reference
+       (RoPE parity verified maxdiff=0.0). 14 tests on a tiny config.
+       One intentional divergence: vllm-omni computes-but-does-not-apply the
+       attention pad mask; this port applies it (identical for the bsz-1
+       multiple-of-32 t2i path, correct when caption padding is nonzero).
+     * `condition_encoder.py` — Qwen2-connector condition path (proj_in →
+       bidirectional Qwen2 → proj_out → L2-normalize×1000). transformers
+       only (no diffusers). 7 tests with a stub connector.
+     * `imagegen_pipeline.py` — flow-matching + CFG denoise loop
+       (`MingImageDenoiser`, `combine_cfg`, `calculate_shift`) **decoupled
+       from diffusers** (DiT/scheduler/VAE injected), so the guidance math /
+       sign convention / scheduler stepping are unit-tested with stubs.
+       diffusers + transformers loading lives behind the lazy
+       `MingImagePipeline.from_checkpoint` classmethod (diffusers is broken
+       on this box — confirmed — so eager import is avoided). 11 tests.
+     * Wiring (`submodules.py` + `ming_omni_flash_model.py`):
+       `ImageGenSubmodule` (STATELESS consumer) + an `imagegen` graph walk +
+       `ImageGen` partition + `Thinker→ImageGen` streaming connection
+       (`continue_after_done`) + `_create_imagegen_submodule` factory, all
+       guarded on `config.image_gen`. Mirrors the Talker consumer pattern.
+
+     **Producer↔consumer handoff — DONE.** Both ends of the Thinker→ImageGen
+     stream are wired:
+     * Producer: the thinker prefill node carries a `thinker_hidden_states`
+       `StreamingGraphEdge` (added when `config.image_gen` is set), and
+       `BailingMoeV2ThinkerSubmodule.forward` detects the `<imagePatch>` token
+       in the prefill ids, runs `LingMoeModel.forward(return_hidden_states=True)`,
+       slices the patch positions via `extract_image_gen_hidden_states`, and
+       publishes them under `thinker_hidden_states`. No metadata plumbing —
+       the gate is the patch token's presence in the tokenized prompt.
+     * Consumer: `get_initial_forward_pass_args` / `get_partition_forward_pass_args`
+       gained an `ImageGen` branch + `_get_imagegen_forward` state machine
+       (fires the `imagegen` walk once the producer is done, then request_done),
+       mirroring `_get_talker_forward`.
+     ~30 graph/partition/submodule/producer tests (incl. patch-token-gated
+     emit, consumer state machine fire-once-then-done).
+
+     **Remaining live-bringup gap (not code):** end-to-end image output still
+     needs live multi-GPU serve (TP=8) + a working diffusers (broken on this
+     box). The full modeling + graph + producer/consumer wiring are complete
+     and unit-validated; only the live run is left.
+
+10. **Configs — DONE.** `configs/ming_flash_omni.yaml` rewritten to the
+    real registered node names: `vision_encoder` + `audio_encoder` +
+    `Talker` colocated on rank 0, `Thinker` TP=8 across all 8 GPUs.
+    Dropped the stale placeholders (`AudioVAE` is wrapped inside the
+    Talker submodule, not a separate node; TP=4 → TP=8 to match the
+    verified OOM finding). Node names cross-checked against
+    `get_node_engine_types` (a yaml-vs-registered assertion passes).
+    `configs/ming_flash_omni_thinker_only.yaml` unchanged (already
+    correct). An image-gen variant lands with step 9.
+
+11. **Benchmark `OursOpenAI` parity.** Once `mstar-serve` boots the model,
+    extend `benchmark/request.py:OursOpenAI` to route Ming TTS through the
+    correct endpoint (likely `/v1/chat/completions` with `modalities=["audio"]`,
+    matching the Qwen3-Omni path — `MingFlashOmni` declares no Orpheus-style
+    speech-only fallback).
+
+12. **Tests.** Add `test/modular/test_ming_flash_omni_*.py` covering config
+    load, submodule weight load on a tiny shard, and a smoke graph walk on
+    a single GPU. Mirror `test/modular/test_qwen3_omni_*.py` if present.
+
+## Things to verify against the released checkpoint (not in vllm-omni)
+
+- Exact `max_position_embeddings` and `rope_theta` for thinker vs talker
+  (read from `config.json`, not the deploy yaml).
+- Whether `default_sampling_params.repetition_penalty=1.05` from the deploy
+  yaml is a serving default or a hard requirement — affects
+  `benchmark/base.py:MingFlashOmni.get_model_kwargs`.
+- The output sample rate for the talker (Qwen3-Omni is 24 kHz; check
+  `audio_vae.py` for Ming's). Override
+  `Model.get_output_sample_rate` if it differs.
diff --git a/mstar/model/ming_omni_flash/__init__.py b/mstar/model/ming_omni_flash/__init__.py
new file mode 100644
index 00000000..c6152997
--- /dev/null
+++ b/mstar/model/ming_omni_flash/__init__.py
@@ -0,0 +1,21 @@
+from mstar.model.ming_omni_flash.components.model import (
+    LingMoeModel as LingMoeModel,
+)
+from mstar.model.ming_omni_flash.loader import (
+    load_audio_encoder_weights as load_audio_encoder_weights,
+)
+from mstar.model.ming_omni_flash.loader import (
+    load_audio_projector_weights as load_audio_projector_weights,
+)
+from mstar.model.ming_omni_flash.loader import (
+    load_thinker_weights as load_thinker_weights,
+)
+from mstar.model.ming_omni_flash.loader import (
+    load_vision_encoder_weights as load_vision_encoder_weights,
+)
+from mstar.model.ming_omni_flash.loader import (
+    load_vision_projector_weights as load_vision_projector_weights,
+)
+from mstar.model.ming_omni_flash.ming_omni_flash_model import (
+    MingFlashOmniModel as MingFlashOmniModel,
+)
diff --git a/mstar/model/ming_omni_flash/components/__init__.py b/mstar/model/ming_omni_flash/components/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mstar/model/ming_omni_flash/components/attention.py b/mstar/model/ming_omni_flash/components/attention.py
new file mode 100644
index 00000000..dbb7cac7
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/attention.py
@@ -0,0 +1,171 @@
+"""Ling-2.0 attention (TP-aware, packed-tokens, cache-handle-aware).
+
+Uses mstar's :class:`QKVParallelLinear` + :class:`RowParallelLinear` for
+TP-sharded projections. Per-rank head counts come from the QKV proj —
+when ``tp_size > 1``, attention runs on this rank's slice of heads and
+the output `dense` projection all-reduces across ranks.
+
+The architecture-specific bits (per-head QK-norm, partial 3D
+``video_rope`` rotation) stay inline — they only operate on this rank's
+heads, no cross-rank comm.
+
+Reference: mstar's :class:`ParallelAttention`
+(`mstar/model/components/distributed/attention.py`) +
+Qwen3-Omni's :class:`Qwen3OmniAttention`
+(`mstar/model/qwen3_omni/components/attention.py`).
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+
+from mstar.distributed.communication import TPCommGroup
+from mstar.engine.cache_manager import BatchedCacheManager
+from mstar.model.components.distributed.linear import (
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from mstar.model.components.norm import RMSNorm
+from mstar.model.ming_omni_flash.components.rope import LingPartialMRotaryEmbedding
+
+
+class LingAttention(nn.Module):
+    """Ling-2.0 attention layer (TP-aware).
+
+    Constructor takes TOTAL head counts; per-rank counts are derived from
+    ``qkv_proj.num_heads`` / ``qkv_proj.num_kv_heads`` after construction
+    (computed by :class:`QKVParallelLinear` based on ``comm_group.world_size``).
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        rotary: LingPartialMRotaryEmbedding,
+        use_qkv_bias: bool = False,
+        use_bias: bool = False,
+        comm_group: TPCommGroup | None = None,
+    ) -> None:
+        super().__init__()
+        if num_heads % num_kv_heads != 0:
+            raise ValueError(
+                f"num_heads={num_heads} must be divisible by "
+                f"num_kv_heads={num_kv_heads} for GQA"
+            )
+        if rotary.head_dim != head_dim:
+            raise ValueError(
+                f"rotary.head_dim={rotary.head_dim} must equal head_dim={head_dim}"
+            )
+        if comm_group is None:
+            comm_group = TPCommGroup.trivial()
+        self.comm_group = comm_group
+
+        self.hidden_size = hidden_size
+        self.head_dim = head_dim
+        self.total_num_heads = num_heads
+        self.total_num_kv_heads = num_kv_heads
+
+        # Packed QKV projection — TP-sharded along the heads axis.
+        # Q rows: total_num_heads * head_dim; K rows: total_num_kv_heads *
+        # head_dim; V rows: same. Stored ordered [Q, K, V] along dim 0 —
+        # same packing the released ckpt uses for ``query_key_value.weight``,
+        # so the manual q/k/v split in loader.py copies into the right
+        # slots automatically.
+        self.qkv_proj = QKVParallelLinear(
+            comm_group=comm_group,
+            hidden_size=hidden_size,
+            head_size=head_dim,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_kv_heads,
+            bias=use_qkv_bias,
+        )
+        # Per-rank head counts; everything downstream uses these.
+        self.num_heads = self.qkv_proj.num_heads
+        self.num_kv_heads = self.qkv_proj.num_kv_heads
+        self.kv_groups = self.num_heads // self.num_kv_heads
+        self.q_size = self.num_heads * head_dim
+        self.kv_size = self.num_kv_heads * head_dim
+        self.scaling = head_dim ** -0.5  # NOTE(review): never read in this class — confirm still needed
+
+        # Output projection — input dim is sharded (per-rank q_size),
+        # output dim is full hidden_size; row-parallel runs all-reduce
+        # across ranks.
+        self.dense = RowParallelLinear(
+            comm_group=comm_group,
+            input_size=num_heads * head_dim,  # full pre-shard input
+            output_size=hidden_size,
+            bias=use_bias,
+            input_is_parallel=True,
+            reduce_results=True,
+        )
+
+        # Per-head normalisation on q and k before rope. Operates on the
+        # head_dim axis, so identical math at each rank's local heads.
+        self.q_norm = RMSNorm(head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(head_dim, eps=rms_norm_eps)
+
+        self.rotary = rotary
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_handle: BatchedCacheManager,
+        position_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        """Engine-facing forward (packed tokens, cache-aware, TP-aware).
+
+        Args:
+            hidden_states: ``(num_tokens, hidden_size)``. NOT pre-sharded
+                — QKVParallelLinear takes the full hidden dim as input.
+            cache_handle: KV-cache handle; its ``run_attention(q=, k=, v=)`` computes attention.
+            position_ids: passed unchanged to ``self.rotary`` together with q and k.
+
+        Returns:
+            ``(num_tokens, hidden_size)`` — full hidden dim after the
+            row-parallel dense all-reduces across ranks.
+        """
+        num_tokens = hidden_states.shape[0]
+
+        # qkv_proj returns this rank's slice along the heads axis:
+        # (num_tokens, num_heads * head_dim + 2 * num_kv_heads * head_dim).
+        qkv = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = q.view(num_tokens, self.num_heads, self.head_dim)
+        k = k.view(num_tokens, self.num_kv_heads, self.head_dim)
+        v = v.view(num_tokens, self.num_kv_heads, self.head_dim)
+
+        # QK-norm: per-head RMSNorm on the head_dim axis. Each rank
+        # operates on its own slice of heads — no comm.
+        q = self.q_norm(q.reshape(-1, self.head_dim)).view(
+            num_tokens, self.num_heads, self.head_dim
+        )
+        k = self.k_norm(k.reshape(-1, self.head_dim)).view(
+            num_tokens, self.num_kv_heads, self.head_dim
+        )
+
+        # Partial 3D rope on this rank's heads (rope cos/sin are
+        # head_dim-shaped, identical at every rank).
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        q, k = self.rotary(q, k, position_ids)
+        q = q.transpose(0, 1).contiguous()
+        k = k.transpose(0, 1).contiguous()
+
+        # Cache attention on per-rank heads. mstar's BatchedCacheManager
+        # is per-worker, so its KV cache config already accounts for the
+        # per-rank head counts (worker derives this from ShardingConfig).
+        attn_output = cache_handle.run_attention(q=q, k=k, v=v)
+        attn_output = attn_output.reshape(num_tokens, self.q_size)
+        # dense is row-parallel: it consumes the per-rank slice along the
+        # input dim and all-reduces the (full hidden_size) output.
+        return self.dense(attn_output)
+
+    @staticmethod
+    def head_norm_check(q_after_norm: torch.Tensor) -> float:
+        """Diagnostic: max abs deviation from 1 of the RMS taken over the last dim."""
+        norms = q_after_norm.float().pow(2).mean(dim=-1).sqrt()
+        return (norms - 1.0).abs().max().item()
diff --git a/mstar/model/ming_omni_flash/components/audio_encoder.py b/mstar/model/ming_omni_flash/components/audio_encoder.py
new file mode 100644
index 00000000..37acefd3
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/audio_encoder.py
@@ -0,0 +1,343 @@
+"""Whisper-style audio encoder for Ming-flash-omni-2.0.
+
+Self-contained port of vllm-omni's
+``vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py`` (247
+LOC) — itself a re-implementation of the OpenAI Whisper encoder that
+supports packed variable-length inputs (the Ming source's
+``modeling_whisper_encoder.py`` uses padded batches and depends on
+``openai-whisper``; we avoid that runtime dep entirely).
+
+Weight-key parity with the upstream Whisper encoder:
+  - ``conv1.{weight,bias}``                  (kernel=3, stride=1, pad=1)
+  - ``conv2.{weight,bias}``                  (kernel=3, stride=2, pad=1)
+  - ``positional_embedding``                 buffer (sinusoidal, not loaded)
+  - ``blocks.{N}.attn.{query,key,value,out}.{weight,bias}`` (``key`` has no bias)
+  - ``blocks.{N}.attn_ln.{weight,bias}``
+  - ``blocks.{N}.mlp.{0,2}.{weight,bias}``   (Linear, GELU, Linear)
+  - ``blocks.{N}.mlp_ln.{weight,bias}``
+  - ``ln_post.{weight,bias}``
+
+The released Ming checkpoint stores these under the top-level prefix
+``audio.*`` (see ``model.safetensors.index.json``); the loader strips
+that prefix before applying state_dict here.
+"""
+
+from __future__ import annotations
+
+import logging
+import operator
+from itertools import accumulate
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Whisper primitives (auto-dtype-casting layers + sinusoidal embedding)
+# ---------------------------------------------------------------------------
+
+
+def _sinusoids(length: int, channels: int, max_timescale: int = 10000) -> torch.Tensor:
+    """Return the ``(length, channels)`` Whisper sinusoidal table (sin half, then cos half).
+
+    Args:
+        length:   number of positions (rows).
+        channels: embedding width; must be even, else ``ValueError``.
+        max_timescale: matches OpenAI Whisper's default (10_000).
+    """
+    if channels % 2 != 0:
+        raise ValueError(f"channels must be even, got {channels}")
+    log_timescale_increment = np.log(max_timescale) / max(channels // 2 - 1, 1)  # channels == 2: no 0-div/NaN
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+class _AutoCastConv1d(nn.Conv1d):
+    """``nn.Conv1d`` whose weight/bias follow the dtype of the incoming tensor.
+
+    Both parameters are cast to ``x.dtype`` on every forward, so the
+    encoder can hold bf16 weights yet accept fp32 mel inputs with no
+    explicit ``.to(bf16)`` at the call site (Whisper does this too).
+    """
+
+    def _conv_forward(self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:
+        cast_bias = bias if bias is None else bias.to(x.dtype)
+        cast_weight = weight.to(x.dtype)
+        return super()._conv_forward(x, cast_weight, cast_bias)
+
+
+class _AutoCastLinear(nn.Linear):
+    """``nn.Linear`` counterpart: weight/bias are cast to ``x.dtype`` on every call."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        cast_bias = self.bias if self.bias is None else self.bias.to(x.dtype)
+        cast_weight = self.weight.to(x.dtype)
+        return F.linear(x, cast_weight, cast_bias)
+
+
+# ---------------------------------------------------------------------------
+# Multi-head attention (packed sequence with optional FA2 fast path)
+# ---------------------------------------------------------------------------
+
+
+def _try_import_flash_attn():
+    """Look up ``flash_attn.flash_attn_varlen_func``; ``None`` if it cannot be imported.
+
+    Only ``ImportError`` is absorbed, so test boxes without flash-attn stay
+    green via the manual PyTorch fallback (same forward shape either way).
+    """
+    try:
+        from flash_attn import flash_attn_varlen_func as varlen_func  # type: ignore
+    except ImportError:
+        varlen_func = None
+    return varlen_func
+
+
+_FLASH_ATTN_VARLEN = _try_import_flash_attn()
+
+
+class _PackedMultiHeadAttention(nn.Module):
+    """Whisper-style MHA with variable-length packed sequences.
+
+    Param naming matches OpenAI Whisper (``query`` / ``key`` / ``value`` /
+    ``out`` — not ``q_proj`` / ``k_proj`` / etc.) so the checkpoint keys
+    load directly. ``key`` is built without a bias; the other three have one.
+    """
+
+    def __init__(self, n_state: int, n_head: int, use_flash_attn: bool = True) -> None:
+        super().__init__()
+        if n_state % n_head != 0:
+            raise ValueError(f"n_state={n_state} not divisible by n_head={n_head}")
+        self.n_head = n_head
+        self.query = _AutoCastLinear(n_state, n_state)
+        self.key = _AutoCastLinear(n_state, n_state, bias=False)
+        self.value = _AutoCastLinear(n_state, n_state)
+        self.out = _AutoCastLinear(n_state, n_state)
+
+        if use_flash_attn and _FLASH_ATTN_VARLEN is None:
+            logger.warning("flash-attn not available — falling back to manual attention.")
+        self.use_flash_attn = use_flash_attn and _FLASH_ATTN_VARLEN is not None
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
+        """Packed-sequence attention; returns a (total_tokens, n_state) tensor.
+
+        Args:
+            x:          (total_tokens, n_state) packed tensor.
+            cu_seqlens: (num_seqs + 1,) cumulative seq lengths,
+                        e.g. [0, len1, len1+len2, ...]. int32.
+        """
+        q = self.query(x)
+        k = self.key(x)
+        v = self.value(x)
+
+        n_tokens, n_state = q.shape
+        head_dim = n_state // self.n_head
+        q = q.view(n_tokens, self.n_head, head_dim)
+        k = k.view(n_tokens, self.n_head, head_dim)
+        v = v.view(n_tokens, self.n_head, head_dim)
+
+        if self.use_flash_attn and q.dtype in (torch.float16, torch.bfloat16):
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            attn_output = _FLASH_ATTN_VARLEN(
+                q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen,
+            )
+        else:
+            attn_output = self._manual_packed_attention(q, k, v, cu_seqlens)
+
+        attn_output = attn_output.contiguous().view(n_tokens, n_state)
+        return self.out(attn_output)
+
+    @staticmethod
+    def _manual_packed_attention(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        """Pad-attention-unpack fallback; returns (total_tokens, n_head, head_dim)."""
+        _, n_head, head_dim = q.shape
+        scale = head_dim ** -0.5
+
+        # One device->host copy for every boundary; avoids a sync per sequence.
+        bounds = cu_seqlens.tolist()
+        seqlens = [end - start for start, end in zip(bounds, bounds[1:])]
+        batch = len(seqlens)
+        max_len = max(seqlens)
+
+        # Pad each sequence to max_len so we can run a single batched matmul.
+        q_pad = torch.zeros(batch, max_len, n_head, head_dim, dtype=q.dtype, device=q.device)
+        k_pad = torch.zeros_like(q_pad)
+        v_pad = torch.zeros_like(q_pad)
+        for i, (start, end) in enumerate(zip(bounds, bounds[1:])):
+            q_pad[i, : end - start] = q[start:end]
+            k_pad[i, : end - start] = k[start:end]
+            v_pad[i, : end - start] = v[start:end]
+
+        # (B, H, T, D)
+        q_pad = q_pad.transpose(1, 2)
+        k_pad = k_pad.transpose(1, 2)
+        v_pad = v_pad.transpose(1, 2)
+
+        # Mask padding columns out of softmax.
+        padding_mask = (
+            torch.arange(max_len, device=q.device)[None, :]
+            >= torch.tensor(seqlens, device=q.device)[:, None]
+        )
+        attn_mask = torch.zeros(batch, 1, 1, max_len, dtype=q.dtype, device=q.device)
+        attn_mask = attn_mask.masked_fill(
+            padding_mask.unsqueeze(1).unsqueeze(2), -torch.finfo(q.dtype).max,
+        )
+
+        scores = torch.matmul(q_pad, k_pad.transpose(-2, -1)) * scale + attn_mask
+        weights = F.softmax(scores, dim=-1)
+        context = torch.matmul(weights, v_pad)  # (B, H, T, D)
+        context = context.transpose(1, 2).contiguous()  # (B, T, H, D)
+
+        # Unpack back to packed.
+        return torch.cat([context[i, :ln] for i, ln in enumerate(seqlens)], dim=0)
+
+
+# ---------------------------------------------------------------------------
+# Residual block (Whisper attn + FFN)
+# ---------------------------------------------------------------------------
+
+
+class _ResidualAttentionBlock(nn.Module):
+    """Residual block: LayerNorm → attention, then LayerNorm → MLP (Whisper param names)."""
+
+    def __init__(self, n_state: int, n_head: int, use_flash_attn: bool = True) -> None:
+        super().__init__()
+        self.attn = _PackedMultiHeadAttention(n_state, n_head, use_flash_attn=use_flash_attn)
+        self.attn_ln = nn.LayerNorm(n_state)
+
+        # Keep the Sequential (Linear, GELU, Linear) layout: checkpoint keys
+        # blocks.{N}.mlp.0.* / .2.* address the two Linears by integer index.
+        hidden = n_state * 4
+        self.mlp = nn.Sequential(
+            _AutoCastLinear(n_state, hidden),
+            nn.GELU(),
+            _AutoCastLinear(hidden, n_state),
+        )
+        self.mlp_ln = nn.LayerNorm(n_state)
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
+        attn_out = self.attn(self.attn_ln(x), cu_seqlens=cu_seqlens)
+        x = x + attn_out
+        return x + self.mlp(self.mlp_ln(x))
+
+
+# ---------------------------------------------------------------------------
+# Encoder — public API
+# ---------------------------------------------------------------------------
+
+
+class MingAudioEncoder(nn.Module):
+    """Whisper audio encoder with packed-sequence support.
+
+    Loadable from the released Ming-flash-omni-2.0 checkpoint's
+    ``audio.*`` weight subtree (caller strips the prefix). Defaults
+    match the released ckpt's ``audio_config.whisper_encoder_config``.
+
+    Deviation from the openai-whisper original: ``positional_embedding``
+    is a *buffer* with a fixed sinusoidal table sized to ``n_ctx`` (15000
+    on the released ckpt — enough for ~150 s of audio at the post-conv
+    frame rate). The Ming source's ``modeling_whisper_encoder.py`` notes
+    the same change — they drop the trainable parameter so they can
+    shrink the sequence length below the original 30 s pad. ``forward``
+    raises ``ValueError`` for a clip that encodes to more than ``n_ctx``.
+    """
+
+    def __init__(
+        self,
+        n_mels: int = 128,
+        n_ctx: int = 15000,
+        n_state: int = 1280,
+        n_head: int = 20,
+        n_layer: int = 32,
+        use_flash_attn: bool = True,
+    ) -> None:
+        super().__init__()
+        self.n_layer = n_layer
+        self.n_mels = n_mels
+        self.use_flash_attn = use_flash_attn
+        self.audio_emb_dim = n_state
+
+        self.conv1 = _AutoCastConv1d(n_mels, n_state, kernel_size=3, padding=1)
+        self.conv2 = _AutoCastConv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+        # Buffer (not Parameter) — checkpoint doesn't ship this; we
+        # recompute it. Keeps load_state_dict happy with the snapshot.
+        self.register_buffer("positional_embedding", _sinusoids(n_ctx, n_state))
+        self.blocks = nn.ModuleList(
+            [_ResidualAttentionBlock(n_state, n_head, use_flash_attn=use_flash_attn) for _ in range(n_layer)]
+        )
+        self.ln_post = nn.LayerNorm(n_state)
+
+    def forward(self, x_list: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run the encoder on a list of variable-length mel spectrograms.
+
+        Args:
+            x_list: non-empty list of (n_mels, T_i) mel features, one per clip.
+
+        Returns:
+            ``(packed, cu_seqlens)``: ``packed`` is (total_T', n_state), all
+            clips concatenated along time; ``cu_seqlens`` is the
+            (len(x_list) + 1,) int32 cumulative encoded lengths.
+
+        Raises:
+            ValueError: if ``x_list`` is empty or a clip exceeds ``n_ctx``.
+        """
+        if not x_list:
+            raise ValueError("x_list must contain at least one mel spectrogram")
+        target_dtype = self.conv1.weight.dtype
+        n_ctx = self.positional_embedding.shape[0]
+
+        encoded: list[torch.Tensor] = []
+        for mel in x_list:
+            x = mel.to(target_dtype).unsqueeze(0)         # (1, n_mels, T)
+            x = F.gelu(self.conv1(x))
+            x = F.gelu(self.conv2(x))
+            x = x.squeeze(0).transpose(0, 1)              # (T', n_state)
+            seq_len = x.shape[0]
+            if seq_len > n_ctx:  # slicing the table would truncate, then fail to broadcast
+                raise ValueError(f"audio clip encodes to {seq_len} frames, exceeding n_ctx={n_ctx}")
+            x = (x + self.positional_embedding[:seq_len, :]).to(x.dtype)
+            encoded.append(x)
+
+        packed = torch.cat(encoded, dim=0)                # (sum T', n_state)
+        cu_seqlens = torch.tensor(
+            list(accumulate((clip.shape[0] for clip in encoded), func=operator.add, initial=0)),
+            device=packed.device, dtype=torch.int32,
+        )
+        for block in self.blocks:
+            packed = block(packed, cu_seqlens=cu_seqlens)
+        return self.ln_post(packed), cu_seqlens
+
+
+def build_audio_encoder(
+    audio_config,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+    use_flash_attn: bool = True,
+) -> MingAudioEncoder:
+    """Build an eval-mode :class:`MingAudioEncoder` from an ``AudioEncoderConfig``.
+
+    Sizes are read from ``audio_config.whisper_encoder_config``; the module
+    is then moved to ``dtype`` / ``device``. Matches ``build_vision_encoder``'s
+    factory shape so the model class wires both modalities symmetrically.
+    """
+    whisper_cfg = audio_config.whisper_encoder_config
+    # Every Whisper size is coerced to int before reaching the constructor.
+    sizes = {
+        key: int(whisper_cfg[key])
+        for key in ("n_mels", "n_ctx", "n_state", "n_head", "n_layer")
+    }
+    encoder = MingAudioEncoder(use_flash_attn=use_flash_attn, **sizes)
+    encoder = encoder.to(dtype=dtype, device=device)
+    encoder.eval()
+    return encoder
+
+
+
+__all__ = ["MingAudioEncoder", "build_audio_encoder"]
diff --git a/mstar/model/ming_omni_flash/components/audio_vae.py b/mstar/model/ming_omni_flash/components/audio_vae.py
new file mode 100644
index 00000000..4eaadc90
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/audio_vae.py
@@ -0,0 +1,726 @@
+"""AudioVAE for Ming-flash-omni-2.0 (step 6d).
+
+Self-contained port of vllm-omni's
+``vllm_omni/model_executor/models/ming_flash_omni/audio_vae.py`` (392 LOC).
+The released ckpt ships the VAE under ``talker/vae/model.safetensors``
+with the top-level prefixes ``encoder.*`` and ``decoder.*``; we mirror
+the upstream module tree so the eventual loader is a plain prefix-strip
++ load_state_dict.
+
+Topology (released ckpt):
+
+  AudioVAE
+    .encoder (Encoder)                            # waveform → latent
+      .encoder (Qwen2Model, sliding-window=64)    # main backbone
+      .aggregator (Qwen2Model, 4 layers)          # patch-summarisation
+      .fc1 (Linear 882 → 896)
+      .fc2 (Linear 896 → 896)
+      .fc3 (Linear 896 → 128)                     # latent_dim*2 (mean+scale)
+      .norm (LayerNorm 896)
+      .cls_embed (Parameter (1, 1, 896))
+    .decoder (Decoder)                            # latent → waveform
+      .decoder (Qwen2Model, sliding-window=64)
+      .fc1 (Linear 64 → 896)
+      .head (ISTFTHead)
+        .out (Linear 896 → 3530 = n_fft + 2)
+        .istft (ISTFT, n_fft=3528, hop=882, win=3528)
+      .upsampling (StreamingLinearUpsample)       # only when patch_size != -1
+
+Two simplifications vs vllm-omni:
+
+  * `encode_latent` uses an inline `_oobleck_sample()` instead of
+    `diffusers.OobleckDiagonalGaussianDistribution` — same math
+    (mean/scale split, softplus on scale, reparameterised sample) but
+    no diffusers dep.  The full diffusers class also exposes
+    `kl_divergence` / `mode` for training; we only need `sample` at
+    inference, so the minimal helper is enough.
+
+  * `Decoder.low_level_reconstruct`'s streaming KV-cache fill path uses
+    HF `Cache` instances; the upstream's `past_key_values` tuple
+    fallback isn't needed on transformers >= 4.43.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+import torch
+import torch.nn.functional as F
+
+if TYPE_CHECKING:
+    from transformers import Qwen2Config
+from torch import nn
+
+logger = logging.getLogger(__name__)
+
+
+# ===========================================================================
+# Inline Oobleck-style Gaussian sampler (replaces diffusers dep)
+# ===========================================================================
+
+
+def _oobleck_sample(parameters: torch.Tensor) -> torch.Tensor:
+    """Draw one reparameterised sample from a diagonal Gaussian.
+
+    ``parameters`` stacks ``[mean, raw_scale]`` along ``dim=1``. Mirrors the
+    inference-time behaviour of diffusers'
+    ``OobleckDiagonalGaussianDistribution.sample``:
+
+      std = softplus(raw_scale) + 1e-4
+      sample = mean + std * eps,   eps ~ N(0, I)
+
+    Args:
+        parameters: ``(B, 2 * latent_dim, T)`` tensor — first half of
+            ``dim=1`` is the mean, second half the raw (pre-softplus) scale.
+
+    Returns:
+        ``(B, latent_dim, T)`` sample; stochastic (global torch RNG).
+    """
+    mu, raw_scale = torch.chunk(parameters, 2, dim=1)
+    noise = torch.randn_like(mu)
+    std = F.softplus(raw_scale) + 1e-4
+    return mu + std * noise
+
+
+# ===========================================================================
+# ISTFT — inverse-STFT reconstruction with optional streaming buffers
+# ===========================================================================
+
+
+class _ISTFT(nn.Module):
+    """Sliding-window OLA inverse STFT used by ISTFTHead.
+
+    Two padding modes:
+
+      * ``"center"`` — wraps ``torch.istft`` directly.
+      * ``"same"`` — hand-rolled F.fold reconstruction so we can
+        manage chunk boundaries via ``audio_buffer`` / ``window_buffer``
+        (essential for the streaming decode path).
+
+    The streaming variant preserves the trailing ``win_length - hop_length``
+    samples of audio + window envelope across chunks so adjacent chunks
+    sum-of-window-envelope-normalise correctly when concatenated.
+    """
+
+    def __init__(
+        self,
+        n_fft: int,
+        hop_length: int,
+        win_length: int,
+        padding: str = "same",
+    ) -> None:
+        super().__init__()
+        if padding not in ("center", "same"):
+            raise ValueError(f"Padding must be 'center' or 'same'; got {padding!r}.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.buffer_len = win_length - hop_length
+        self.register_buffer("window", torch.hann_window(win_length))
+
+    # ------------------------------------------------------------------
+    # Per-chunk buffer plumbing
+    # ------------------------------------------------------------------
+
+    def _buffer_process(
+        self,
+        x: torch.Tensor,
+        buffer: torch.Tensor | None,
+        pad: int,
+        last_chunk: bool,
+        streaming: bool,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Apply OLA buffering for the ``same`` padding mode.
+
+        Non-streaming: trim ``pad`` samples off both ends. Streaming: the first
+        chunk (``buffer is None``) drops ``pad`` leading samples, later chunks add
+        ``buffer`` into their head; the last ``buffer_len`` samples become the new
+        buffer and are withheld unless ``last_chunk`` (then ``pad`` is trimmed).
+        """
+        if streaming:
+            if buffer is None:
+                x = x[:, pad:]
+            else:
+                x = x.clone()
+                x[:, : self.buffer_len] = x[:, : self.buffer_len] + buffer
+            buffer = x[:, -self.buffer_len :]
+            if not last_chunk:
+                x = x[:, : -self.buffer_len]
+            else:
+                x = x[:, :-pad]
+        else:
+            x = x[:, pad:-pad]
+        return x, buffer
+
+    def forward(
+        self,
+        spec: torch.Tensor,
+        audio_buffer: torch.Tensor | None = None,
+        window_buffer: torch.Tensor | None = None,
+        streaming: bool = False,
+        last_chunk: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """Inverse-STFT reconstruction of a complex spectrogram.
+
+        Args:
+            spec: ``(B, n_fft//2 + 1, T)`` complex one-sided STFT coefficients.
+
+        Returns:
+            Tuple of ``(waveform, audio_buffer, window_buffer)``. ``"center"``
+            padding always returns ``None`` buffers; non-streaming ``"same"``
+            returns the buffer arguments unchanged.
+        """
+        if self.padding == "center":
+            y = torch.istft(
+                spec, self.n_fft, self.hop_length, self.win_length, self.window,
+                center=True,
+            )
+            return y, None, None
+
+        # same-padding path
+        pad = (self.win_length - self.hop_length) // 2
+        B, N, T = spec.shape
+
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = F.fold(
+            ifft,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        )[:, 0, 0, :]
+
+        y, audio_buffer = self._buffer_process(
+            y, audio_buffer, pad, last_chunk=last_chunk, streaming=streaming,
+        )
+
+        # Compute the per-position sum-of-window-squared so OLA averages
+        # correctly. Same fold over a (1, T, win_length) tile of the
+        # squared window.
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = (
+            F.fold(
+                window_sq,
+                output_size=(1, output_size),
+                kernel_size=(1, self.win_length),
+                stride=(1, self.hop_length),
+            )
+            .squeeze(0)
+            .squeeze(0)
+        )
+        window_envelope, window_buffer = self._buffer_process(
+            window_envelope, window_buffer, pad,
+            last_chunk=last_chunk, streaming=streaming,
+        )
+        window_envelope = window_envelope.squeeze()
+
+        if not (window_envelope > 1e-11).all():
+            raise RuntimeError(
+                "ISTFT window envelope has near-zero positions; "
+                "check hop_length / win_length / window choice."
+            )
+        y = y / window_envelope
+
+        return y, audio_buffer, window_buffer
+
+
+# ===========================================================================
+# ISTFTHead — Linear → STFT magnitude/phase → ISTFT → waveform
+# ===========================================================================
+
+
+class _ISTFTHead(nn.Module):
+    """Linear projection to log-magnitude + phase, followed by an ISTFT.
+
+    ``out`` emits ``n_fft + 2`` channels per frame: the first half are
+    log-magnitudes (exponentiated, then clipped at 1e2), the second half are
+    phases. Both are combined into a complex spectrogram for ``istft``.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        n_fft: int,
+        hop_length: int,
+        padding: str = "same",
+    ) -> None:
+        super().__init__()
+        # Window length is tied to n_fft.
+        istft_kwargs = dict(n_fft=n_fft, hop_length=hop_length, win_length=n_fft)
+        self.out = nn.Linear(dim, n_fft + 2)
+        self.istft = _ISTFT(padding=padding, **istft_kwargs)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        audio_buffer: torch.Tensor | None = None,
+        window_buffer: torch.Tensor | None = None,
+        streaming: bool = False,
+        last_chunk: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """Returns ``(audio, x_pred, audio_buffer, window_buffer)``.
+
+        ``audio`` is ``(B, 1, T_samples)``; ``x_pred`` is the raw
+        ``(B, n_fft + 2, T_frames)`` projection. The two buffers are the
+        ISTFT streaming state, passed through to / returned from ``istft``.
+        """
+        x_pred = self.out(x).transpose(1, 2)
+        log_mag, phase = torch.chunk(x_pred, 2, dim=1)
+        magnitude = torch.exp(log_mag).clip(max=1e2)
+        unit_phasor = torch.cos(phase) + 1j * torch.sin(phase)
+        audio, audio_buffer, window_buffer = self.istft(
+            magnitude * unit_phasor, audio_buffer=audio_buffer,
+            window_buffer=window_buffer, streaming=streaming, last_chunk=last_chunk,
+        )
+        return audio.unsqueeze(1), x_pred, audio_buffer, window_buffer
+
+
+# ===========================================================================
+# StreamingLinearUpsample — chunked linear upsample for patched latents
+# ===========================================================================
+
+
+class _StreamingLinearUpsample(nn.Module):
+    """Linear upsampling that produces consistent output across chunks.
+
+    Non-streaming: ``upsampler(x)`` directly.
+    Streaming: each ``(B, T, C)`` chunk is held back until one frame of right
+    lookahead arrives, so chunk boundaries match the non-chunked result.
+    ``state`` tracks ``prev_chunk`` (the held-back chunk), ``history_last``
+    (last frame before ``prev_chunk``; left context) and ``is_first``. Close
+    the stream with ``is_last=True``, passing a final chunk or ``x=None``.
+    """
+
+    def __init__(self, scale_factor: int = 4) -> None:
+        super().__init__()
+        self.scale_factor = scale_factor
+        self.upsampler = nn.Upsample(
+            scale_factor=scale_factor, mode="linear", align_corners=False,
+        )
+
+    def _flush(self, state: dict[str, Any]) -> torch.Tensor | None:
+        """Upsample the held-back chunk at stream end (left context only)."""
+        if state["prev_chunk"] is None:
+            return None
+        p = state["prev_chunk"].transpose(1, 2)
+        if state["history_last"] is None:  # held-back chunk is the whole stream
+            return self.upsampler(p).transpose(1, 2)
+        up = self.upsampler(torch.cat([state["history_last"], p], dim=2))
+        return up[:, :, self.scale_factor :].transpose(1, 2)
+
+    def forward(
+        self,
+        x: torch.Tensor | None,
+        state: dict[str, Any] | None = None,
+        is_last: bool = False,
+    ) -> tuple[torch.Tensor | None, dict[str, Any] | None]:
+        """Feed one chunk; returns ``(output or None, next state or None)``."""
+        if state is None:
+            state = {"prev_chunk": None, "history_last": None, "is_first": True}
+
+        if x is None:
+            # No new frames: a no-op, or (is_last) a flush of the held-back chunk.
+            return (self._flush(state), None) if is_last else (None, state)
+
+        # Single-chunk fast path: first AND last.
+        if state["is_first"] and is_last:
+            return self.upsampler(x.transpose(1, 2)).transpose(1, 2), None
+
+        if state["is_first"]:
+            state["prev_chunk"] = x
+            state["is_first"] = False
+            return None, state
+
+        output_chunks: list[torch.Tensor] = []
+        # Emit the deferred prev_chunk now that we have a right lookahead.
+        if state["prev_chunk"] is not None:
+            p = state["prev_chunk"].transpose(1, 2)
+            lookahead = x[:, :1, :].transpose(1, 2)
+            if state["history_last"] is None:
+                start = 0
+                inp = torch.cat([p, lookahead], dim=2)
+            else:
+                start = self.scale_factor
+                inp = torch.cat([state["history_last"], p, lookahead], dim=2)
+            end = start + p.size(2) * self.scale_factor
+            output_chunks.append(self.upsampler(inp)[:, :, start:end].transpose(1, 2))
+            state["history_last"] = p[:, :, -1:]
+            state["prev_chunk"] = x
+        if is_last:
+            output_chunks.append(self._flush(state))
+            state = None
+
+        final = torch.cat(output_chunks, dim=1) if output_chunks else None
+        return final, state
+
+
+# ===========================================================================
+# Encoder / Decoder (Qwen2-backed)
+# ===========================================================================
+
+
+def _build_vae_qwen2_config(backbone: dict, attn_implementation: str) -> "Qwen2Config":
+    """Create a ``Qwen2Config`` from the VAE ``backbone`` dict minus bookkeeping keys."""
+    from transformers import Qwen2Config
+
+    # Keys not forwarded as kwargs: HF serialisation metadata, every spelling
+    # of the attention-implementation setting (the caller's explicit argument
+    # is used instead), and upstream's extra `is_causal` field.
+    dropped = (
+        "is_causal", "transformers_version", "torch_dtype", "model_type",
+        "architectures", "_attn_implementation", "_attn_implementation_autoset",
+        "attn_implementation",
+    )
+    kwargs = {key: backbone[key] for key in backbone if key not in dropped}
+    return Qwen2Config(**kwargs, attn_implementation=attn_implementation)
+
+
+def _resolve_attn_implementation() -> str:
+    """Pick ``"flash_attention_2"`` if transformers reports it usable, else ``"sdpa"``."""
+    try:
+        from transformers.utils import is_flash_attn_2_available as fa2_usable
+        return "sdpa" if not fa2_usable() else "flash_attention_2"
+    except Exception:
+        return "sdpa"
+
+
+class _Decoder(nn.Module):
+    """Latent → waveform via Qwen2 backbone + ISTFTHead.
+
+    Module-tree mirrors upstream so the released ckpt's
+    ``decoder.decoder.layers.N.*`` (Qwen2Model), ``decoder.fc1``,
+    ``decoder.head.out``, ``decoder.head.istft.window`` keys all
+    land via plain state-dict equality.
+    """
+
+    def __init__(
+        self,
+        decoder_args: dict,
+        output_dim: int = 882,
+        latent_dim: int = 64,
+        patch_size: int = -1,
+        attn_implementation: str | None = None,
+    ) -> None:
+        super().__init__()
+        from transformers import Qwen2Model
+        if attn_implementation is None:
+            attn_implementation = _resolve_attn_implementation()
+        cfg = _build_vae_qwen2_config(decoder_args, attn_implementation=attn_implementation)
+        logger.info("AudioVAE Decoder: using attn_implementation=%r", cfg._attn_implementation)
+
+        self.decoder = Qwen2Model(cfg)
+        self.output_dim = output_dim
+        self.latent_dim = latent_dim
+        self.hop_length = output_dim
+        self.fc1 = nn.Linear(latent_dim, cfg.hidden_size)
+        self.head = _ISTFTHead(
+            dim=cfg.hidden_size,
+            n_fft=self.hop_length * 4,
+            hop_length=self.hop_length,
+            padding="same",
+        )
+        self.patch_size = patch_size
+        if self.patch_size != -1:
+            self.upsampling = _StreamingLinearUpsample(scale_factor=patch_size)
+
+    def low_level_reconstruct(
+        self,
+        x: torch.Tensor,
+        past_key_values=None,
+        use_cache: bool = False,
+        stream_state: tuple[Any, Any, Any] = (None, None, None),
+        last_chunk: bool = False,
+    ):
+        """Reconstruct ``(B, 1, T_samples)`` waveform from latent ``(B, T, latent_dim)``.
+
+        Non-streaming (``use_cache=False``): full upsample + backbone + head.
+        Streaming (``use_cache=True``): threads ``stream_state =
+        (upsample_state, audio_buffer, window_buffer)`` and the backbone's
+        ``past_key_values`` across chunks. Returns ``(waveform, stream_state,
+        past_key_values)``; the waveform has zero samples while the upsampler
+        is still withholding its first chunk.
+        """
+        upsample_state, audio_buffer, window_buffer = stream_state
+        bsz, device, dtype = x.size(0), x.device, x.dtype
+        x = self.fc1(x)
+        if self.patch_size != -1:
+            if use_cache:
+                x, upsample_state = self.upsampling(
+                    x, state=upsample_state, is_last=last_chunk,
+                )
+                if x is None:
+                    stream_state = (upsample_state, audio_buffer, window_buffer)
+                    return torch.empty(bsz, 1, 0, device=device, dtype=dtype), stream_state, past_key_values
+            else:
+                x = self.upsampling.upsampler(x.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states_list: list[torch.Tensor] = []
+
+        # Sliding-window bridge: when the cache holds fewer than
+        # (sliding_window - 1) tokens and this chunk would reach
+        # `sliding_window`, first run just enough tokens to fill the cache
+        # to (sliding_window - 1), then run the remainder on top of it.
+        if use_cache and getattr(self.decoder.config, "sliding_window", None) is not None:
+            sw_size = self.decoder.config.sliding_window
+            target_len = sw_size - 1
+            past_len = _get_past_len(past_key_values)
+            curr_len = x.shape[1]
+            if past_len < target_len and (past_len + curr_len) >= sw_size:
+                fill_len = target_len - past_len
+                x_fill = x[:, :fill_len, :]
+                outputs = self.decoder(
+                    inputs_embeds=x_fill, past_key_values=past_key_values, use_cache=True,
+                )
+                hidden_states_list.append(outputs.last_hidden_state)
+                past_key_values = outputs.past_key_values
+                x = x[:, fill_len:, :]
+
+        outputs = self.decoder(
+            inputs_embeds=x, past_key_values=past_key_values, use_cache=use_cache,
+        )
+        hidden_states_list.append(outputs.last_hidden_state)
+        past_key_values = outputs.past_key_values
+
+        full_hidden = (
+            torch.cat(hidden_states_list, dim=1)
+            if len(hidden_states_list) > 1
+            else hidden_states_list[0]
+        )
+        x_out, _x_pred, audio_buffer, window_buffer = self.head(
+            full_hidden,
+            streaming=use_cache,
+            audio_buffer=audio_buffer,
+            window_buffer=window_buffer,
+            last_chunk=last_chunk,
+        )
+        stream_state = (upsample_state, audio_buffer, window_buffer)
+        return x_out, stream_state, past_key_values
+
+
+def _get_past_len(past_key_values) -> int:
+    """Return the cached sequence length for ``None``, HF cache objects or legacy tuples."""
+    cache = past_key_values
+    if cache is not None and hasattr(cache, "get_seq_length"):
+        return int(cache.get_seq_length())
+    if isinstance(cache, tuple) and cache:
+        first_layer_key = cache[0][0]
+        return int(first_layer_key.shape[-2])
+    return 0
+
+
+class _Encoder(nn.Module):
+    """Waveform → latent via Qwen2 backbone + optional patch aggregator.
+
+    With ``patch_size != -1`` the encoder runs a second short Qwen2
+    backbone (4 layers) over each patch concatenated with a learnable
+    [CLS] embedding and outputs the [CLS] row only — same shape as
+    the Aggregator (`components/talker_dit.Aggregator`) but inside the
+    VAE encoder rather than at the talker output.
+    """
+
+    def __init__(
+        self,
+        encoder_args: dict,
+        input_dim: int = 882,
+        hop_size: int = 882,
+        latent_dim: int = 64,
+        patch_size: int = -1,
+        attn_implementation: str | None = None,
+    ) -> None:
+        super().__init__()
+        from transformers import Qwen2Model
+        if attn_implementation is None:
+            attn_implementation = _resolve_attn_implementation()
+        cfg = _build_vae_qwen2_config(encoder_args, attn_implementation=attn_implementation)
+        logger.info("AudioVAE Encoder: using attn_implementation=%r", cfg._attn_implementation)
+
+        self.encoder = Qwen2Model(cfg)
+        self.input_dim = input_dim
+        self.hop_size = hop_size
+        self.latent_dim = latent_dim
+
+        self.fc1 = nn.Linear(input_dim, cfg.hidden_size, bias=False)
+        self.fc2 = nn.Linear(cfg.hidden_size, cfg.hidden_size)
+        self.fc3 = nn.Linear(cfg.hidden_size, latent_dim * 2)
+        self.norm = nn.LayerNorm(cfg.hidden_size)  # NOTE(review): never applied in forward() below
+        self.patch_size = patch_size
+        if patch_size != -1:
+            # Aggregator is a 4-layer Qwen2 backbone (upstream
+            # explicitly overrides num_hidden_layers to 4).
+            agg_cfg = _build_vae_qwen2_config(
+                {**encoder_args, "num_hidden_layers": 4},
+                attn_implementation=attn_implementation,
+            )
+            self.aggregator = Qwen2Model(agg_cfg)
+            # Learnable CLS embedding appended after each patch's frames.
+            self.cls_embed = nn.Parameter(torch.empty(1, 1, cfg.hidden_size))
+            # Match upstream's normal_(0, 0.02) init so eager-init
+            # weights match if the loader is bypassed in tests.
+            nn.init.normal_(self.cls_embed, mean=0.0, std=0.02)
+
+    # ------------------------------------------------------------------
+    # Waveform → frames windowed slicing
+    # ------------------------------------------------------------------
+
+    def get_frames(self, x: torch.Tensor) -> torch.Tensor:
+        """Slide a ``(input_dim,)`` window over the waveform with stride hop_size.
+
+        Pads the right edge so the final window doesn't overshoot.
+        Returns ``(B, num_frames, input_dim)``.
+        """
+        num_frames_total = (x.size(-1) + self.hop_size - 1) // self.hop_size
+        expected_len = (num_frames_total - 1) * self.hop_size + self.input_dim
+        padding_needed = expected_len - x.size(-1)
+        waveform = F.pad(x, (0, padding_needed), value=0.0)
+        frames = waveform.unfold(dimension=-1, size=self.input_dim, step=self.hop_size)
+        return frames
+
+    def pad_patch_insert_cls(self, x: torch.Tensor) -> torch.Tensor:
+        """Group frames into patches of ``patch_size`` and append a CLS row to each."""
+        bsz, num_frame, dim = x.size()
+        r = num_frame % self.patch_size
+        pad_num = self.patch_size - r if r else 0
+        x = F.pad(x, (0, 0, 0, pad_num), value=0.0)
+        x = x.reshape(-1, self.patch_size, dim)
+        cls = self.cls_embed.expand(x.size(0), -1, -1)
+        x = torch.cat((x, cls), dim=1)
+        x = x.reshape(bsz, -1, dim)
+        return x
+
+    def forward(self, waveform: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """Returns ``(latent_params, waveform.unsqueeze(1))``.
+
+        ``latent_params`` is ``(B, T_latents, latent_dim*2)`` — the
+        first half is the Gaussian mean and the second half is the
+        raw scale; pass through `_oobleck_sample` to draw a latent.
+        """
+        x = self.get_frames(waveform)
+        x = self.fc1(x)
+        x = self.fc2(x)
+        h = self.encoder(inputs_embeds=x).last_hidden_state
+
+        if self.patch_size != -1:
+            h = self.pad_patch_insert_cls(h)
+            h = self.aggregator(inputs_embeds=h).last_hidden_state
+            bsz, _, dim = h.size()
+            h = h.reshape(-1, self.patch_size + 1, dim)
+            h = h[:, -1:, :].reshape(bsz, -1, dim)
+
+        h = self.fc3(h)
+        return h, waveform.unsqueeze(1)
+
+
+# ===========================================================================
+# AudioVAE — wraps Encoder + Decoder
+# ===========================================================================
+
+
+class AudioVAE(nn.Module):
+    """Top-level Audio VAE.
+
+    Plain nn.Module (not PreTrainedModel) so we don't inherit HF
+    config machinery — the dataclass `AudioVAEConfig` carries the dims
+    and the loader handles weights directly.
+    """
+
+    def __init__(
+        self,
+        audio_vae_config,
+        attn_implementation: str | None = None,
+    ) -> None:
+        super().__init__()
+        self.config = audio_vae_config
+        self.encoder = _Encoder(
+            encoder_args=audio_vae_config.enc_backbone,
+            input_dim=audio_vae_config.encoder_input_dim,
+            hop_size=audio_vae_config.encoder_hop_size,
+            latent_dim=audio_vae_config.latent_dim,
+            patch_size=audio_vae_config.patch_size,
+            attn_implementation=attn_implementation,
+        )
+        self.decoder = _Decoder(
+            decoder_args=audio_vae_config.dec_backbone,
+            output_dim=audio_vae_config.decoder_output_dim,
+            latent_dim=audio_vae_config.latent_dim,
+            patch_size=audio_vae_config.patch_size,
+            attn_implementation=attn_implementation,
+        )
+
+    @property
+    def sample_rate(self) -> int:
+        return self.config.sample_rate
+
+    def encode_latent(
+        self,
+        waveform: torch.Tensor,
+        waveform_length: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Run the encoder and sample a latent (Gaussian re-parameterised).
+
+        Returns ``(latent, frame_num)``.  ``latent`` is
+        ``(B, T_latents, latent_dim)`` (the layout ``decode`` consumes);
+        ``frame_num`` is the int32 per-clip latent count after patching.
+        """
+        frame_num = torch.ceil(
+            waveform_length / self.config.encoder_input_dim,
+        ).to(torch.int32)
+        if self.config.patch_size != -1:
+            frame_num = torch.ceil(frame_num / self.config.patch_size).to(torch.int32)
+        h, _y = self.encoder(waveform)
+        # encoder.fc3 emits (B, T, latent_dim*2) — transpose to channels-second
+        # for `_oobleck_sample` (chunks on dim=1).
+        h = h.transpose(1, 2)
+        latent = _oobleck_sample(h)
+        latent = latent.transpose(1, 2)
+        return latent, frame_num
+
+    def decode(
+        self,
+        latent: torch.Tensor,
+        past_key_values=None,
+        use_cache: bool = False,
+        stream_state: tuple[Any, Any, Any] = (None, None, None),
+        last_chunk: bool = False,
+    ):
+        """Decode latent → waveform; threads the streaming state for chunked TTS."""
+        waveform, stream_state, past_key_values = self.decoder.low_level_reconstruct(
+            latent,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            stream_state=stream_state,
+            last_chunk=last_chunk,
+        )
+        return waveform, stream_state, past_key_values
+
+
+def build_audio_vae(
+    audio_vae_config,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+    attn_implementation: str | None = None,
+) -> AudioVAE:
+    """Build an eval-mode `AudioVAE` from `AudioVAEConfig` on ``device``.
+
+    When ``attn_implementation`` is ``None`` it is chosen automatically:
+    ``"sdpa"`` for any device whose string form starts with ``"cpu"``,
+    otherwise whatever ``_resolve_attn_implementation()`` reports (FA2 when
+    usable, else sdpa). Pass ``"eager"`` for debugging, or ``"sdpa"`` to
+    mirror what vllm-omni's talker uses at runtime (it forces sdpa on the
+    talker LLM regardless of FA2 availability).
+
+    The module is cast/moved to ``dtype`` / ``device`` before returning.
+    """
+    if attn_implementation is None:
+        on_cpu = str(device).startswith("cpu")
+        attn_implementation = "sdpa" if on_cpu else _resolve_attn_implementation()
+    # Module.to() and Module.eval() both return the module itself.
+    audio_vae = AudioVAE(audio_vae_config, attn_implementation=attn_implementation)
+    audio_vae = audio_vae.to(dtype=dtype, device=device)
+    return audio_vae.eval()
+
+
+__all__ = ["AudioVAE", "build_audio_vae"]
diff --git a/mstar/model/ming_omni_flash/components/byte5_encoder.py b/mstar/model/ming_omni_flash/components/byte5_encoder.py
new file mode 100644
index 00000000..7787216b
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/byte5_encoder.py
@@ -0,0 +1,224 @@
+"""ByT5 glyph/text encoder for Ming-flash-omni-2.0 image generation.
+
+Native mstar port of vllm-omni's ``byte5_encoder.py``. Bundles the byt5
+tokenizer + HF T5 encoder + :class:`T5EncoderBlockByT5Mapper`. The released
+checkpoint's byt5 weights were trained with per-language font/color special
+tokens, so we replicate that vocabulary extension before loading — otherwise
+``byt5_model.pt`` shape-mismatches at the embedding table.
+
+Typical forward: a list of prompt strings (optionally carrying
+``<cn-font-N>`` / ``<color-N>`` markers) → ``[B, byt5_max_length,
+diffusion_c_input_dim]`` features, padded positions zeroed so the downstream
+``torch.cat`` onto cap_feats injects no garbage.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from types import SimpleNamespace
+
+import torch
+from torch import nn
+
+from mstar.model.ming_omni_flash.components.t5_block_mapper import (
+    T5EncoderBlockByT5Mapper,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _add_multilingual_special_tokens(
+    tokenizer,
+    text_encoder: nn.Module,
+    font_ann_path: Path,
+    color_ann_path: Path,
+    add_font: bool,
+    add_color: bool,
+    add_align: bool = False,
+) -> None:
+    """Extend the byt5 vocab with per-language font + color markers.
+
+    Mirrors ``add_special_token_multilingual`` in Ming's bizgen utils. Tokens
+    are appended in the order color, font, align; set and order must match the
+    checkpoint or the resized embedding table won't line up with its weights.
+    """
+    # JSON is UTF-8 by spec; never let the locale's default codec decide.
+    idx_font_dict = json.loads(Path(font_ann_path).read_text(encoding="utf-8"))
+    idx_color_dict = json.loads(Path(color_ann_path).read_text(encoding="utf-8"))
+
+    font_tokens: list[str] = []
+    for font_code, font_idx in idx_font_dict.items():
+        prefix = font_code[:3]
+        # Language-tagged codes keep their tag ("cn-..." -> "<cn-font-N>").
+        lang = prefix if prefix in ("cn-", "en-", "jp-", "kr-") else ""
+        font_tokens.append(f"<{lang}font-{font_idx}>")
+    color_tokens = [f"<color-{i}>" for i in range(len(idx_color_dict))]
+    align_tokens = [f"<align-{i}>" for i in range(3)]
+
+    extra: list[str] = []
+    if add_color:
+        extra += color_tokens
+    if add_font:
+        extra += font_tokens
+    if add_align:
+        extra += align_tokens
+    tokenizer.add_tokens(extra, special_tokens=True)
+    text_encoder.resize_token_embeddings(len(tokenizer))
+
+
+class MingByT5Encoder(nn.Module):
+    """Bundles byt5 tokenizer + T5 encoder + :class:`T5EncoderBlockByT5Mapper`.
+
+    Build via :meth:`from_checkpoint` when the checkpoint ships byt5 weights;
+    otherwise callers can skip this and the pipeline falls back to no-byt5
+    conditioning.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        text_encoder: nn.Module,
+        mapper: T5EncoderBlockByT5Mapper,
+        max_length: int,
+    ) -> None:
+        super().__init__()
+        self.tokenizer = tokenizer  # called in forward() to tokenize the prompt strings
+        self.text_encoder = text_encoder  # forward() also reads its first parameter for device/dtype
+        self.mapper = mapper  # applied to the encoder's hidden states in forward()
+        self.max_length = max_length  # pad/truncate length handed to the tokenizer
+
+    @classmethod
+    def from_checkpoint(
+        cls,
+        byte5_dir: Path,
+        *,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> MingByT5Encoder:
+        """Load tokenizer + encoder + mapper from the checkpoint's ``byt5`` dir.
+
+        Wrapped in ``torch.random.fork_rng`` so any ``nn.init`` inside
+        ``from_pretrained`` / vocab-resize cannot advance the default generator
+        — otherwise the diffusion pipeline's seeded noise becomes
+        order-dependent across requests (same-seed replays would diverge).
+        """
+        cuda_devs = list(range(torch.cuda.device_count())) if torch.cuda.is_available() else []
+        with torch.random.fork_rng(devices=cuda_devs, enabled=True):  # RNG state is restored on exit
+            return cls._from_checkpoint_impl(byte5_dir, device=device, dtype=dtype)
+
+    @classmethod
+    def _from_checkpoint_impl(
+        cls,
+        byte5_dir: Path,
+        *,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> MingByT5Encoder:
+        """Loader behind :meth:`from_checkpoint` (no RNG fork): tokenizer, encoder and mapper from ``byte5_dir``."""
+        from transformers import AutoTokenizer, T5ForConditionalGeneration
+
+        byte5_dir = Path(byte5_dir)
+        # Ming checkpoint uses ``byt5`` (no 'e') in filenames and JSON keys;
+        # the ``byte5_`` variable spelling below is kept for readability.
+        cfg_raw = json.loads((byte5_dir / "byt5.json").read_text(encoding="utf-8"))
+        cfg = SimpleNamespace(**cfg_raw)
+        byte5_config = cfg.byt5_config
+        mapper_config = cfg.byt5_mapper_config
+        max_length = int(cfg.byt5_max_length)
+
+        # ---- Tokenizer + T5 encoder (base). ``removeprefix`` drops only a literal
+        # leading "./"; ``lstrip("./")`` would eat every leading '.'/'/' (".x", "../x").
+        ckpt_key = byte5_config.get("byt5_ckpt_path")
+        byte5_ckpt_path = byte5_dir / ckpt_key.removeprefix("./")
+        tokenizer = AutoTokenizer.from_pretrained(byte5_ckpt_path, local_files_only=True)
+        text_encoder = T5ForConditionalGeneration.from_pretrained(byte5_ckpt_path, local_files_only=True).get_encoder()
+
+        # ---- Extend vocab with font/color markers so the shipped weights load.
+        if byte5_config.get("special_token"):
+            if not byte5_config.get("multilingual", True):
+                raise NotImplementedError(
+                    "Non-multilingual byt5 vocab extension is not ported; "
+                    "the released Ming checkpoint uses multilingual=True."
+                )
+            _add_multilingual_special_tokens(
+                tokenizer,
+                text_encoder,
+                font_ann_path=byte5_dir / byte5_config["font_ann_path"].removeprefix("./"),
+                color_ann_path=byte5_dir / byte5_config["color_ann_path"].removeprefix("./"),
+                add_font=bool(byte5_config.get("font_special_token")),
+                add_color=bool(byte5_config.get("color_special_token")),
+            )
+
+        # ---- Load byt5 text-encoder weights (Ming's two-step load). base.pt wraps
+        # the backbone as module.text_tower.encoder.*; byt5_model.pt is the top-level
+        # encoder state. NOTE: weights_only=False unpickles — trusted checkpoints only.
+        base_state = torch.load(byte5_dir / "byt5_model" / "base.pt", map_location="cpu", weights_only=False)
+        prefix = "module.text_tower.encoder."
+        base_filtered = {
+            name[len(prefix):]: state
+            for name, state in base_state["state_dict"].items()
+            if name.startswith(prefix)
+        }
+        text_encoder.load_state_dict(base_filtered, strict=True)
+        del base_state, base_filtered
+
+        encoder_state = torch.load(byte5_dir / "byt5_model" / "byt5_model.pt", map_location="cpu", weights_only=False)
+        text_encoder.load_state_dict(encoder_state)
+        del encoder_state
+
+        text_encoder.to(device=device, dtype=dtype).eval()
+
+        # ---- Mapper (stock HF T5Block layout ⇒ direct state_dict load).
+        mapper = T5EncoderBlockByT5Mapper(
+            byte5_config=text_encoder.config,
+            num_layers=int(mapper_config["num_layers"]),
+            sdxl_channels=int(mapper_config["sdxl_channels"]),
+        )
+        mapper_state = torch.load(byte5_dir / "byt5_mapper" / "byt5_mapper.pt", map_location="cpu", weights_only=False)
+        mapper.load_weights(mapper_state.items())
+        del mapper_state
+        mapper.to(device=device, dtype=dtype).eval()
+
+        logger.info(
+            "[MingByT5Encoder] ready: d_model=%d mapper_layers=%d sdxl_channels=%d max_length=%d vocab=%d",
+            text_encoder.config.d_model,
+            mapper_config["num_layers"],
+            mapper_config["sdxl_channels"],
+            max_length,
+            len(tokenizer),
+        )
+        return cls(tokenizer, text_encoder, mapper, max_length)
+
+    @torch.inference_mode()
+    def forward(self, texts: list[str]) -> torch.Tensor:
+        """Encode prompts: tokenizer → T5 encoder → mapper, then zero the padding.
+
+        Output is ``[B, max_length, sdxl_channels]`` in the encoder's dtype.
+        """
+        # The first encoder parameter tells us where (and in what dtype) to run.
+        ref = next(self.text_encoder.parameters())
+        run_device, out_dtype = ref.device, ref.dtype
+
+        batch = self.tokenizer(
+            texts,
+            padding="max_length",
+            max_length=self.max_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
+        ids = batch.input_ids.to(run_device)
+        mask = batch.attention_mask.to(run_device)
+
+        # The encoder is given a float copy of the mask; the mapper gets the mask as-is.
+        encoded = self.text_encoder(input_ids=ids, attention_mask=mask.float())
+        hidden = encoded[0]
+        mapped = self.mapper(hidden, mask)
+        # Broadcast the mask over the channel axis so padded positions become zero.
+        keep = mask.unsqueeze(-1).to(dtype=mapped.dtype)
+        return (mapped * keep).to(dtype=out_dtype)
+
+
+__all__ = ["MingByT5Encoder"]
diff --git a/mstar/model/ming_omni_flash/components/condition_encoder.py b/mstar/model/ming_omni_flash/components/condition_encoder.py
new file mode 100644
index 00000000..ac60969a
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/condition_encoder.py
@@ -0,0 +1,243 @@
+"""Ming-flash-omni-2.0 condition encoder for image generation (step 9b).
+
+Native mstar port of vllm-omni's ``condition_encoder.py``. Encodes the thinker
+hidden states (sliced at the learnable ``<imagePatch>`` query-token positions)
+into the DiT's ``cap_feats`` conditioning:
+
+    thinker hidden states [B, N, 4096]
+              │ proj_in (Linear, bias)        -> [B, N, 1536]
+              │ Qwen2 connector (bidirectional, non-causal)
+              │ proj_out (Linear, bias)       -> [B, N, 2560]
+              │ F.normalize(dim=-1) × 1000    (text_encoder_norm)
+              ▼
+       cap_feats consumed by ZImageTransformer2DModel
+
+Only transformers is required (the connector is a small Qwen2 backbone loaded
+via ``Qwen2ForCausalLM.from_pretrained``); there is no diffusers dependency, so
+the forward path is unit-testable with a stub connector.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+logger = logging.getLogger(__name__)
+
+
+class MingConditionEncoder(nn.Module):
+    """Qwen2 connector + proj_in/out + L2-normalize×1000 → DiT condition embeds.
+
+    The connector runs bidirectionally (``is_causal=False``) since it encodes a
+    fixed block of query-token hidden states rather than decoding
+    autoregressively. ``proj_in`` / ``proj_out`` / connector are populated by
+    :meth:`load_from_checkpoint`; before that the module is cheap to construct
+    (Identity projections), which keeps dummy-init and unit tests light.
+
+    Args:
+        image_gen_config: an ``ImageGenConfig`` (mstar) exposing
+            ``connector_subfolder`` / ``mlp_subfolder`` /
+            ``diffusion_c_input_dim`` / ``text_encoder_norm`` /
+            ``use_identity_mlp``.
+        thinker_hidden_size: hidden size of the thinker (BailingMoeV2); 4096 on
+            the released checkpoint.
+        device / dtype: optional placement applied after loading.
+    """
+
+    def __init__(
+        self,
+        image_gen_config,
+        *,
+        thinker_hidden_size: int = 4096,
+        device: torch.device | str | None = None,
+        dtype: torch.dtype | None = None,
+    ) -> None:
+        super().__init__()
+        self.config = image_gen_config
+        self.thinker_hidden_size = thinker_hidden_size
+        self._target_device = torch.device(device) if device is not None else None
+        self._target_dtype = dtype  # device/dtype are only applied at the end of load_from_checkpoint()
+
+        self.connector: nn.Module | None = None  # None until load_from_checkpoint(); forward() raises while unset
+        self.connector_hidden_size: int | None = None  # read from the connector config at load time
+        self.proj_in: nn.Module = nn.Identity()  # placeholder; load_from_checkpoint() swaps in nn.Linear
+        self.proj_out: nn.Module = nn.Identity()  # placeholder; load_from_checkpoint() swaps in nn.Linear
+        self.norm: nn.Module = nn.Identity()  # stays Identity; the L2-normalize happens in forward()
+
+    # ------------------------------------------------------------------
+    # Weight loading
+    # ------------------------------------------------------------------
+
+    def load_from_checkpoint(self, model_path: str | Path) -> None:
+        """Load the Qwen2 connector + proj_in/proj_out weights from disk."""
+        from transformers import AutoConfig, Qwen2ForCausalLM
+
+        model_path = Path(model_path)
+        connector_path = model_path / self.config.connector_subfolder
+        logger.info("[MingConditionEncoder] loading connector from %s", connector_path)
+
+        connector_cfg = AutoConfig.from_pretrained(connector_path, trust_remote_code=True, local_files_only=True)
+        connector_cfg.is_decoder = False  # connector is used as an encoder, not for autoregressive decoding
+        self.connector_hidden_size = int(connector_cfg.hidden_size)
+
+        connector = Qwen2ForCausalLM.from_pretrained(
+            connector_path,
+            config=connector_cfg,
+            torch_dtype=self._target_dtype,
+            local_files_only=True,
+        )
+        # Force bidirectional attention defensively — some transformers versions
+        # read ``self_attn.is_causal`` in forward.
+        for module in connector.modules():
+            if hasattr(module, "is_causal"):
+                module.is_causal = False
+
+        self.connector = getattr(connector, "model", connector)  # base encoder, no LM head
+
+        self.proj_in = nn.Linear(self.thinker_hidden_size, self.connector_hidden_size, bias=True)
+        # text_encoder_norm = L2 normalize on the final cap_feats (NOT an
+        # intermediate RMSNorm); applied explicitly in forward(). Keep
+        # self.norm as Identity.
+        self.norm = nn.Identity()
+        self.proj_out = nn.Linear(self.connector_hidden_size, self.config.diffusion_c_input_dim, bias=True)
+
+        mlp_path = model_path / self.config.mlp_subfolder
+        mlp_cfg_path = mlp_path / "config.json"  # optional: the check below is skipped when the file is absent
+        if mlp_cfg_path.exists() and not json.loads(mlp_cfg_path.read_text()).get("use_identity_mlp", False):
+            raise NotImplementedError(f"{mlp_cfg_path} has use_identity_mlp=False; ToClipMLP path not implemented.")
+        self._load_optional_mlp_weights(mlp_path)  # logs (does not raise) when weights are missing or mis-shaped
+
+        if self._target_device is not None:
+            self.to(self._target_device)  # whole module: connector + both projections
+        if self._target_dtype is not None:
+            self.to(dtype=self._target_dtype)  # also casts the freshly created Linear layers
+
+    def _load_optional_mlp_weights(self, mlp_path: Path) -> None:
+        """Copy proj_in / proj_out (+ optional norm) weights from ``mlp/``.
+
+        Expected keys (inclusionAI/Ming-flash-omni-2.0): ``proj_in.{weight,bias}``
+        [1536,4096]/[1536], ``proj_out.{weight,bias}`` [2560,1536]/[2560], and
+        ``query_tokens_dict.16x16`` [256,4096] which is consumed on the thinker
+        side (skipped here). Missing proj weights are logged as errors — the
+        conditioning is meaningless without them.
+        """
+        if not mlp_path.exists():
+            logger.warning("[MingConditionEncoder] mlp/ missing at %s — proj/norm stay random-init", mlp_path)
+            return
+
+        from safetensors.torch import load_file
+
+        candidates = sorted(mlp_path.glob("*.safetensors")) or sorted(mlp_path.glob("*.bin"))  # *.bin only as fallback
+        if not candidates:
+            logger.warning("[MingConditionEncoder] no weight files under %s", mlp_path)
+            return
+
+        state: dict[str, torch.Tensor] = {}
+        for p in candidates:  # sorted order; a later file overrides an earlier one on duplicate keys
+            if p.suffix == ".safetensors":
+                state.update(load_file(str(p)))
+            else:
+                state.update(torch.load(str(p), map_location="cpu"))
+
+        handled: set[str] = set()  # checkpoint keys accounted for; the rest are reported as leftovers
+
+        def _copy(dst: torch.Tensor, src_key: str) -> bool:
+            src = state.get(src_key)
+            if src is None:
+                logger.error("[MingConditionEncoder] mlp/ missing key %r", src_key)
+                return False
+            if tuple(src.shape) != tuple(dst.shape):
+                logger.error(
+                    "[MingConditionEncoder] mlp/%s shape mismatch: ckpt=%s module=%s",
+                    src_key,
+                    tuple(src.shape),
+                    tuple(dst.shape),
+                )
+                return False
+            with torch.no_grad():  # in-place write into the live parameter, outside autograd
+                dst.copy_(src.to(dtype=dst.dtype, device=dst.device))
+            handled.add(src_key)
+            return True
+
+        ok = all(  # list, not generator: every _copy runs (and logs) even after a failure
+            [
+                _copy(self.proj_in.weight, "proj_in.weight"),
+                _copy(self.proj_in.bias, "proj_in.bias"),
+                _copy(self.proj_out.weight, "proj_out.weight"),
+                _copy(self.proj_out.bias, "proj_out.bias"),
+            ]
+        )
+        if not ok:
+            logger.error("[MingConditionEncoder] proj_in/proj_out NOT fully loaded; conditioning will be garbage.")
+
+        if "norm.weight" in state and hasattr(self.norm, "weight"):  # no-op while self.norm is nn.Identity
+            _copy(self.norm.weight, "norm.weight")
+
+        for k in state:
+            if k.startswith("query_tokens_dict"):
+                handled.add(k)  # thinker-side; not loaded here
+
+        leftover = set(state.keys()) - handled
+        if leftover:
+            logger.warning("[MingConditionEncoder] mlp/ unhandled keys: %s", sorted(leftover))
+
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+
+    def forward(
+        self,
+        thinker_hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Encode ``[B, N, thinker_hidden_size]`` → ``[B, N, diffusion_c_input_dim]``."""
+        if self.connector is None:
+            raise RuntimeError("MingConditionEncoder.load_from_checkpoint() must run before forward().")
+        if thinker_hidden_states.dim() != 3:
+            raise ValueError(f"expected [B, N, H], got shape {tuple(thinker_hidden_states.shape)}")
+
+        b, n, _ = thinker_hidden_states.shape
+        x = self.proj_in(thinker_hidden_states)
+
+        # Ming passes a 4D all-ones mask [B, 1, N, N] to force full bidirectional
+        # self-attention over the query positions.
+        if attention_mask is None:
+            attention_mask = torch.ones((b, 1, n, n), dtype=x.dtype, device=x.device)
+        elif attention_mask.dim() == 2:  # [B, N] mask repeated over query rows; other ranks pass through as given
+            attention_mask = attention_mask.to(x.dtype)[:, None, None, :].expand(b, 1, n, n)
+
+        out = self.connector(
+            inputs_embeds=x,
+            attention_mask=attention_mask,
+            use_cache=False,
+            output_hidden_states=True,
+            return_dict=True,
+        )
+        hidden = out.hidden_states[-1]  # last entry of the returned hidden-state tuple
+        cap_feats = self.proj_out(hidden)
+
+        cap_feats = F.normalize(cap_feats, dim=-1)  # unit L2 norm per token; always applied
+        if self.config.text_encoder_norm:  # the flag gates only the ×1000 rescale
+            cap_feats = cap_feats * 1000.0
+        return cap_feats
+
+    @torch.no_grad()
+    def zero_negative(self, cap_feats: torch.Tensor) -> torch.Tensor:
+        """Zero tensor shaped like ``cap_feats`` for CFG negatives."""
+        return torch.zeros_like(cap_feats)  # new tensor; same shape, dtype and device as the input
+
+    def extra_repr(self) -> str:
+        """Comma-separated size/normalization summary for the module repr."""
+        cfg = self.config
+        fields = {"thinker_hidden_size": self.thinker_hidden_size, "connector_hidden_size": self.connector_hidden_size}
+        fields["diffusion_c_input_dim"] = cfg.diffusion_c_input_dim
+        fields["text_encoder_norm"] = cfg.text_encoder_norm
+        return ", ".join(f"{key}={value}" for key, value in fields.items())
+
+
+__all__ = ["MingConditionEncoder"]
diff --git a/mstar/model/ming_omni_flash/components/decoder_layer.py b/mstar/model/ming_omni_flash/components/decoder_layer.py
new file mode 100644
index 00000000..511e9730
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/decoder_layer.py
@@ -0,0 +1,111 @@
+"""Ling-2.0 decoder layer (TP-aware, hybrid dense / MoE)."""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+
+from mstar.distributed.communication import TPCommGroup
+from mstar.engine.cache_manager import BatchedCacheManager
+from mstar.model.components.distributed.mlp import ParallelGatedMLP
+from mstar.model.components.norm import RMSNorm
+from mstar.model.ming_omni_flash.components.attention import LingAttention
+from mstar.model.ming_omni_flash.components.moe import LingMoeBlock
+from mstar.model.ming_omni_flash.components.rope import (
+    LingPartialMRotaryEmbedding,
+)
+
+
+class LingDecoderLayer(nn.Module):
+    """One Ling-2.0 decoder layer; layer_idx decides dense-vs-MoE FFN.
+
+    All sub-modules receive ``comm_group`` (single-rank trivial when not set).
+    Layers with ``layer_idx < first_k_dense_replace`` are dense and use
+    :class:`ParallelGatedMLP`, so their `down_proj` all-reduces across ranks.
+    """
+
+    def __init__(
+        self,
+        layer_idx: int,
+        first_k_dense_replace: int,
+        hidden_size: int,
+        intermediate_size: int,
+        moe_intermediate_size: int,
+        num_attention_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        num_experts: int,
+        num_experts_per_tok: int,
+        num_shared_experts: int,
+        n_group: int,
+        topk_group: int,
+        routed_scaling_factor: float,
+        rotary: LingPartialMRotaryEmbedding,
+        use_qkv_bias: bool = False,
+        use_bias: bool = False,
+        comm_group: TPCommGroup | None = None,
+    ) -> None:
+        super().__init__()
+        if comm_group is None:
+            comm_group = TPCommGroup.trivial()
+        self.layer_idx = layer_idx
+        self.is_moe = layer_idx >= first_k_dense_replace  # the first k layers are dense, every later one is MoE
+
+        self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
+
+        self.self_attn = LingAttention(
+            hidden_size=hidden_size,
+            num_heads=num_attention_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            rotary=rotary,
+            use_qkv_bias=use_qkv_bias,
+            use_bias=use_bias,
+            comm_group=comm_group,
+        )
+
+        if self.is_moe:
+            self.mlp: nn.Module = LingMoeBlock(
+                hidden_size=hidden_size,
+                num_experts=num_experts,
+                num_experts_per_tok=num_experts_per_tok,
+                moe_intermediate_size=moe_intermediate_size,
+                num_shared_experts=num_shared_experts,
+                n_group=n_group,
+                topk_group=topk_group,
+                routed_scaling_factor=routed_scaling_factor,
+                comm_group=comm_group,
+            )
+        else:
+            # Dense (non-MoE) MLP — ParallelGatedMLP so its column-parallel
+            # gate/up + row-parallel down handle TP sharding internally.
+            self.mlp = ParallelGatedMLP(
+                comm_group=comm_group,
+                hidden_size=hidden_size,
+                intermediate_size=intermediate_size,
+                bias=False,  # fixed here: ``use_bias`` is forwarded to attention only
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_handle: BatchedCacheManager,
+        position_ids: torch.Tensor,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        h = self.input_layernorm(hidden_states)  # pre-norm: normalize before attention
+        h = self.self_attn(h, cache_handle, position_ids)
+        h = residual + h  # first residual connection (around attention)
+
+        residual = h
+        h = self.post_attention_layernorm(h)
+        if self.is_moe:
+            h = self.mlp(h, image_mask=image_mask, audio_mask=audio_mask)  # only the MoE block gets the modality masks
+        else:
+            h = self.mlp(h)
+        return residual + h  # second residual connection (around the FFN)
diff --git a/mstar/model/ming_omni_flash/components/imagegen_pipeline.py b/mstar/model/ming_omni_flash/components/imagegen_pipeline.py
new file mode 100644
index 00000000..2503ac84
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/imagegen_pipeline.py
@@ -0,0 +1,375 @@
+"""Ming-flash-omni-2.0 imagegen diffusion pipeline (step 9b).
+
+Native mstar port of vllm-omni's ``pipeline_ming_imagegen.py`` +
+``z_image/pipeline_z_image.py`` denoise loop. The upstream pipeline subclasses
+``ZImagePipeline`` (diffusers-/vllm_omni-coupled) and reads cross-stage tensors
+off a global forward-context. This port:
+
+  * keeps the **denoise loop pure** (``MingImageDenoiser.denoise``) — it takes
+    the DiT, scheduler, latents and prompt embeds as plain arguments, so the
+    flow-matching + classifier-free-guidance math is unit-testable with stubs
+    and has no diffusers dependency;
+  * pushes diffusers/transformers loading behind
+    :meth:`MingImagePipeline.from_checkpoint` (lazy import) so the module
+    imports cleanly even where diffusers is unavailable.
+
+Flow-matching denoise (Z-Image convention):
+  - latents start as Gaussian noise; timesteps come from
+    FlowMatchEulerDiscreteScheduler with dynamic shifting (``mu`` from
+    :func:`calculate_shift`);
+  - per step the DiT predicts velocity; CFG combines pos/neg; the prediction is
+    negated before ``scheduler.step`` (Z-Image sign convention);
+  - final latents are un-shifted/un-scaled and VAE-decoded to ``[-1, 1]``.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def calculate_shift(
+    image_seq_len: int,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+) -> float:
+    """Linearly map ``image_seq_len`` to the scheduler's dynamic-shift ``mu`` (Z-Image)."""
+    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    intercept = base_shift - slope * base_seq_len
+    return image_seq_len * slope + intercept
+
+
+@dataclass
+class MingImageGenSamplingParams:
+    """Resolved sampling knobs for one image-gen request."""
+
+    height: int = 1024
+    width: int = 1024
+    num_inference_steps: int = 50
+    guidance_scale: float = 2.0  # CFG scale; the denoiser treats values <= 0 as "CFG off"
+    seed: int | None = None  # presumably None = unseeded noise — consumer not shown here, TODO confirm
+    cfg_truncation: float = 1.0  # CFG is dropped once normalized time exceeds this
+    cfg_normalization: float = 0.0  # CFG renorm factor; 0 disables it
+
+
+def combine_cfg(
+    pos: torch.Tensor,
+    neg: torch.Tensor,
+    guidance_scale: float,
+    cfg_normalization: float = 0.0,
+) -> torch.Tensor:
+    """Blend conditional/unconditional predictions for classifier-free guidance.
+
+    Computes ``pos + guidance_scale * (pos - neg)`` in fp32. With a positive
+    ``cfg_normalization`` the blend is scaled down whenever its norm is larger
+    than ``cfg_normalization`` times the norm of ``pos`` (Z-Image's renorm
+    trick); both norms are taken over the whole tensor.
+    """
+    cond = pos.float()
+    uncond = neg.float()
+    guided = cond + guidance_scale * (cond - uncond)
+
+    # Falsy or non-positive factor: renormalization is off.
+    if not cfg_normalization or not (float(cfg_normalization) > 0.0):
+        return guided
+
+    limit = torch.linalg.vector_norm(cond) * float(cfg_normalization)
+    guided_norm = torch.linalg.vector_norm(guided)
+    shrink = (limit / guided_norm.clamp(min=1e-12)).to(guided.dtype)
+    factor = torch.where(guided_norm > limit, shrink, guided.new_tensor(1.0))
+    return guided * factor
+
+
+class MingImageDenoiser:
+    """Pure flow-matching + CFG denoise loop (no diffusers coupling).
+
+    Holds references to the DiT transformer and a diffusers-style scheduler
+    (anything exposing ``.step(model_output, t, sample) -> (prev_sample, ...)``
+    and ``.timesteps``). The loop math mirrors ZImagePipeline.forward steps 6.
+    """
+
+    def __init__(self, transformer, scheduler, dtype: torch.dtype = torch.float32) -> None:
+        self.transformer = transformer
+        self.scheduler = scheduler
+        self.dtype = dtype  # dtype the latents are cast to before every DiT call
+
+    def denoise(
+        self,
+        latents: torch.Tensor,
+        timesteps,
+        prompt_embeds: list[torch.Tensor],
+        negative_prompt_embeds: list[torch.Tensor] | None,
+        guidance_scale: float,
+        cfg_truncation: float = 1.0,
+        cfg_normalization: float = 0.0,
+    ) -> torch.Tensor:
+        """Run the denoising loop and return the final ``[B, C, H, W]`` latents.
+
+        Args:
+            latents: initial noise ``[B, C, H, W]`` (fp32).
+            timesteps: scheduler timesteps — a 1-D tensor or a plain numeric sequence.
+            prompt_embeds / negative_prompt_embeds: list[Tensor] one per item.
+            guidance_scale: CFG scale; ``> 0`` enables CFG (needs negatives).
+            cfg_truncation: disable CFG once normalized time exceeds this.
+            cfg_normalization: optional CFG renorm factor (0 = off).
+        """
+        actual_batch = latents.shape[0]
+        do_cfg = guidance_scale > 0 and negative_prompt_embeds is not None
+
+        ts = timesteps if isinstance(timesteps, torch.Tensor) else torch.as_tensor(timesteps)
+        norm_ts = ((1000 - ts.float()) / 1000).tolist()  # 0.0 at t=1000, 1.0 at t=0
+
+        for i, t in enumerate(timesteps):
+            if isinstance(t, torch.Tensor):
+                timestep = t.expand(actual_batch)
+            else:  # plain number: it has no device of its own, so follow the latents (not the CPU default)
+                timestep = torch.tensor([t] * actual_batch, device=latents.device)
+            timestep = (1000 - timestep) / 1000
+            t_norm = norm_ts[i]
+
+            current_scale = guidance_scale
+            if do_cfg and cfg_truncation is not None and float(cfg_truncation) <= 1 and t_norm > cfg_truncation:
+                current_scale = 0.0
+            apply_cfg = do_cfg and current_scale > 0
+
+            latents_typed = latents.to(self.dtype)
+            if apply_cfg:
+                # One batched DiT call: positives first, negatives second.
+                latent_model_input = latents_typed.repeat(2, 1, 1, 1)
+                embeds_input = prompt_embeds + negative_prompt_embeds
+                timestep_input = timestep.repeat(2)
+            else:
+                latent_model_input = latents_typed
+                embeds_input = prompt_embeds
+                timestep_input = timestep
+
+            # DiT expects a list of [C, F, H, W] (frame axis inserted at dim 2).
+            latent_model_input = latent_model_input.unsqueeze(2)
+            model_out = self.transformer(
+                list(latent_model_input.unbind(dim=0)),
+                timestep_input,
+                embeds_input,
+            )[0]
+
+            if apply_cfg:
+                pos_out = model_out[:actual_batch]
+                neg_out = model_out[actual_batch:]
+                noise_pred = torch.stack(
+                    [
+                        combine_cfg(pos_out[j], neg_out[j], current_scale, cfg_normalization)
+                        for j in range(actual_batch)
+                    ],
+                    dim=0,
+                )
+            else:
+                noise_pred = torch.stack([o.float() for o in model_out], dim=0)
+
+            noise_pred = noise_pred.squeeze(2)  # drop the frame axis added above
+            noise_pred = -noise_pred  # Z-Image sign convention
+
+            latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0]
+
+        return latents
+
+
+class MingImagePipeline:
+    """Text-to-image / img2img pipeline for Ming-flash-omni-2.0.
+
+    Construct via :meth:`from_checkpoint` (loads VAE / scheduler / DiT /
+    condition encoder / optional ByT5 — diffusers + transformers required) or
+    inject components directly (used by tests). The conditioning path is Ming's
+    own (Qwen2 connector), so there is no Z-Image text encoder / tokenizer.
+    """
+
+    def __init__(
+        self,
+        *,
+        transformer,
+        scheduler,
+        vae,
+        condition_encoder,
+        image_gen_config,
+        byte5=None,  # optional ByT5 encoder; None is stored as-is
+        device: torch.device | str = "cpu",
+        dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        self.transformer = transformer
+        self.scheduler = scheduler
+        self.vae = vae
+        self.condition_encoder = condition_encoder
+        self.image_gen_config = image_gen_config
+        self.byte5 = byte5
+        self.device = torch.device(device)
+        self.dtype = dtype
+        self.denoiser = MingImageDenoiser(transformer, scheduler, dtype=dtype)  # shares the objects stored above
+        self.vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) if vae is not None else 8  # 8 if no VAE
+
+    @classmethod
+    def from_checkpoint(cls, model_path, image_gen_config, *, device="cuda", dtype=torch.bfloat16):
+        """Load all components from the checkpoint (lazy diffusers import).
+
+        Kept separate from ``__init__`` so the module imports without diffusers;
+        only this path needs it.
+        """
+        from pathlib import Path
+
+        from diffusers import AutoencoderKL
+        from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+
+        from mstar.model.ming_omni_flash.components.byte5_encoder import MingByT5Encoder
+        from mstar.model.ming_omni_flash.components.condition_encoder import MingConditionEncoder
+        from mstar.model.ming_omni_flash.components.zimage_transformer import MingZImageTransformer2DModel
+
+        model_path = Path(model_path)
+        cfg = image_gen_config
+
+        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            model_path, subfolder=cfg.scheduler_subfolder, local_files_only=True
+        )
+        scheduler.config["use_dynamic_shifting"] = True
+
+        vae = AutoencoderKL.from_pretrained(
+            model_path, subfolder=cfg.vae_subfolder, local_files_only=True, torch_dtype=dtype
+        ).to(device).eval()
+
+        transformer = MingZImageTransformer2DModel(
+            all_patch_size=tuple(cfg.dit.all_patch_size),
+            all_f_patch_size=tuple(cfg.dit.all_f_patch_size),
+            dim=cfg.dit.dim,
+            n_layers=cfg.dit.n_layers,
+            n_refiner_layers=cfg.dit.n_refiner_layers,
+            n_heads=cfg.dit.n_heads,
+            n_kv_heads=cfg.dit.n_kv_heads,
+            in_channels=cfg.dit.in_channels,
+            norm_eps=cfg.dit.norm_eps,
+            rope_theta=cfg.dit.rope_theta,
+            t_scale=cfg.dit.t_scale,
+            axes_dims=tuple(cfg.dit.axes_dims),
+            axes_lens=tuple(cfg.dit.axes_lens),
+            cap_feat_dim=cfg.diffusion_c_input_dim,
+        ).to(device, dtype=dtype).eval()
+
+        condition_encoder = MingConditionEncoder(
+            cfg, thinker_hidden_size=4096, device=device, dtype=dtype
+        )
+        condition_encoder.load_from_checkpoint(model_path)
+
+        byte5_dir = model_path / "byt5"
+        byte5 = None
+        if (byte5_dir / "byt5.json").exists():
+            byte5 = MingByT5Encoder.from_checkpoint(byte5_dir, device=torch.device(device), dtype=dtype)
+
+        return cls(
+            transformer=transformer,
+            scheduler=scheduler,
+            vae=vae,
+            condition_encoder=condition_encoder,
+            image_gen_config=cfg,
+            byte5=byte5,
+            device=device,
+            dtype=dtype,
+        )
+
+    def prepare_latents(self, batch_size, height, width, generator=None) -> torch.Tensor:
+        """Gaussian init latents ``[B, C, H/vae, W/vae]`` (fp32)."""
+        c = self.transformer.in_channels
+        vae_scale = self.vae_scale_factor * 2
+        shape = (batch_size, c, height // vae_scale, width // vae_scale)
+        return torch.randn(shape, generator=generator, device=self.device, dtype=torch.float32)
+
+    def build_cap_feats(
+        self,
+        thinker_hidden_states: torch.Tensor,
+        negative_hidden: torch.Tensor | None = None,
+        byte5_texts: list[str] | None = None,
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+        """Run the condition encoder (+ optional ByT5) → (pos, neg) embed lists.
+
+        Negatives default to zeros (Ming's CFG convention) unless explicit
+        negative thinker states are supplied. ByT5 glyph features are appended
+        along the sequence dim; the negative side gets zeros for that span so
+        CFG doesn't push away from rendered text.
+        """
+        if thinker_hidden_states.dim() == 2:
+            thinker_hidden_states = thinker_hidden_states.unsqueeze(0)
+        cap_feats = self.condition_encoder(thinker_hidden_states)
+
+        negative_cap_feats = None
+        if negative_hidden is not None:
+            if negative_hidden.dim() == 2:
+                negative_hidden = negative_hidden.unsqueeze(0)
+            negative_cap_feats = self.condition_encoder(negative_hidden)
+
+        if byte5_texts and self.byte5 is not None:
+            byte5_feats = self.byte5(byte5_texts).to(cap_feats.dtype)
+            cap_feats = torch.cat((cap_feats, byte5_feats), dim=1)
+            if negative_cap_feats is not None:
+                negative_cap_feats = torch.cat((negative_cap_feats, torch.zeros_like(byte5_feats)), dim=1)
+
+        prompt_embeds = [cap_feats[i] for i in range(cap_feats.shape[0])]
+        if negative_cap_feats is not None:
+            negative_prompt_embeds = [negative_cap_feats[i] for i in range(negative_cap_feats.shape[0])]
+        else:
+            negative_prompt_embeds = [self.condition_encoder.zero_negative(e) for e in prompt_embeds]
+        return prompt_embeds, negative_prompt_embeds
+
+    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """Un-shift/un-scale then VAE-decode to a ``[B, 3, H, W]`` image in [-1,1]."""
+        latents = latents.to(self.vae.dtype)
+        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+        return self.vae.decode(latents, return_dict=False)[0]
+
+    @torch.inference_mode()
+    def generate(
+        self,
+        thinker_hidden_states: torch.Tensor,
+        params: MingImageGenSamplingParams,
+        *,
+        negative_hidden: torch.Tensor | None = None,
+        byte5_texts: list[str] | None = None,
+    ) -> torch.Tensor:
+        """End-to-end text-to-image: condition → denoise → VAE decode."""
+        generator = None
+        if params.seed is not None:
+            generator = torch.Generator(device=self.device).manual_seed(int(params.seed))
+
+        prompt_embeds, negative_prompt_embeds = self.build_cap_feats(
+            thinker_hidden_states, negative_hidden, byte5_texts
+        )
+        latents = self.prepare_latents(len(prompt_embeds), params.height, params.width, generator)
+
+        image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2)
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        self.scheduler.sigma_min = 0.0
+        self.scheduler.set_timesteps(params.num_inference_steps, device=self.device, mu=mu)
+
+        latents = self.denoiser.denoise(
+            latents,
+            self.scheduler.timesteps,
+            prompt_embeds,
+            negative_prompt_embeds,
+            guidance_scale=params.guidance_scale,
+            cfg_truncation=params.cfg_truncation,
+            cfg_normalization=params.cfg_normalization,
+        )
+        return self.decode_latents(latents)
+
+
+# Public API of this module: the names exported by ``from ... import *``.
+__all__ = [
+    "MingImageDenoiser",
+    "MingImageGenSamplingParams",
+    "MingImagePipeline",
+    "calculate_shift",
+    "combine_cfg",
+]
diff --git a/mstar/model/ming_omni_flash/components/model.py b/mstar/model/ming_omni_flash/components/model.py
new file mode 100644
index 00000000..86b7bf69
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/model.py
@@ -0,0 +1,215 @@
+"""Ling-2.0 thinker LLM (full forward; attention runs through the engine's cache handle).
+
+Composes :class:`LingDecoderLayer` × N with a shared rope, vocab
+embedding, final RMSNorm, and an untied lm_head. The shape downstream
+mstar code will eventually wrap is one of these :class:`LingMoeModel`
+instances behind a :class:`NodeSubmodule` (step 3c).
+
+Reference structure: vllm-omni's :class:`BailingMoeV2Model` +
+:class:`BailingMoeV2ForCausalLM`
+``/tmp/vllm-omni/.../modeling_bailing_moe_v2.py:662-895``.
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+
+from mstar.distributed.communication import TPCommGroup
+from mstar.model.components.norm import RMSNorm
+from mstar.model.ming_omni_flash.components.decoder_layer import (
+    LingDecoderLayer,
+)
+from mstar.model.ming_omni_flash.components.rope import (
+    LingPartialMRotaryEmbedding,
+)
+
+
+class LingMoeModel(nn.Module):
+    """Full Ling-2.0 thinker forward (embed + layers + lm_head).
+
+    All shape-relevant config flattens into the constructor so callers
+    don't need a :class:`MingFlashOmniModelConfig` instance — useful for
+    small-dim unit tests. The eventual mstar submodule (step 3c) builds
+    one of these from the real config.
+
+    Args (all required, but small-dim test configs only need plausible
+    values; nothing here is hard-coded to Ming-specific dims):
+        vocab_size: e.g. 157184 on released ckpt.
+        hidden_size: e.g. 4096.
+        intermediate_size: dense layer-0 MLP intermediate; e.g. 9216.
+        moe_intermediate_size: per-expert intermediate; e.g. 1024.
+        num_hidden_layers: e.g. 32.
+        num_attention_heads, num_kv_heads, head_dim: e.g. 32 / 4 / 128.
+        rms_norm_eps: 1e-6.
+        rope_theta: 2_400_000.
+        max_position_embeddings: 32768.
+        partial_rotary_factor: 0.5.
+        mrope_section: [8, 12, 12].
+        num_experts: 256.
+        num_experts_per_tok: 8.
+        num_shared_experts: 1.
+        n_group: 8.
+        topk_group: 4.
+        routed_scaling_factor: 2.5.
+        first_k_dense_replace: 1.
+        tie_word_embeddings: False on released ckpt — lm_head is a
+            separate matrix from embed_tokens.
+    """
+
+    def __init__(
+        self,
+        *,
+        vocab_size: int,
+        hidden_size: int,
+        intermediate_size: int,
+        moe_intermediate_size: int,
+        num_hidden_layers: int,
+        num_attention_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        rope_theta: float,
+        max_position_embeddings: int,
+        partial_rotary_factor: float,
+        mrope_section: list[int],
+        num_experts: int,
+        num_experts_per_tok: int,
+        num_shared_experts: int,
+        n_group: int,
+        topk_group: int,
+        routed_scaling_factor: float,
+        first_k_dense_replace: int,
+        tie_word_embeddings: bool = False,
+        use_qkv_bias: bool = False,
+        use_bias: bool = False,
+        comm_group: TPCommGroup | None = None,
+    ) -> None:
+        super().__init__()
+        if comm_group is None:
+            comm_group = TPCommGroup.trivial()
+        self.comm_group = comm_group
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+
+        # embed_tokens + lm_head stay replicated. At hidden_size=4096
+        # they're 1.3 GB each — cheap compared to the layers.
+        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
+
+        # Single rotary instance shared across every layer — inv_freq is
+        # config-only, no per-layer state.
+        rotary = LingPartialMRotaryEmbedding(
+            head_dim=head_dim,
+            partial_rotary_factor=partial_rotary_factor,
+            mrope_section=mrope_section,
+            rope_theta=rope_theta,
+            max_position_embeddings=max_position_embeddings,
+        )
+
+        self.layers = nn.ModuleList([
+            LingDecoderLayer(
+                layer_idx=i,
+                first_k_dense_replace=first_k_dense_replace,
+                hidden_size=hidden_size,
+                intermediate_size=intermediate_size,
+                moe_intermediate_size=moe_intermediate_size,
+                num_attention_heads=num_attention_heads,
+                num_kv_heads=num_kv_heads,
+                head_dim=head_dim,
+                rms_norm_eps=rms_norm_eps,
+                num_experts=num_experts,
+                num_experts_per_tok=num_experts_per_tok,
+                num_shared_experts=num_shared_experts,
+                n_group=n_group,
+                topk_group=topk_group,
+                routed_scaling_factor=routed_scaling_factor,
+                rotary=rotary,
+                use_qkv_bias=use_qkv_bias,
+                use_bias=use_bias,
+                comm_group=comm_group,
+            )
+            for i in range(num_hidden_layers)
+        ])
+
+        self.norm = RMSNorm(hidden_size, eps=rms_norm_eps)
+        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
+        self.tie_word_embeddings = tie_word_embeddings
+        if tie_word_embeddings:
+            self.lm_head.weight = self.embed_tokens.weight
+
+    def forward(
+        self,
+        cache_handle,
+        input_ids: torch.Tensor | None = None,
+        input_embeds: torch.Tensor | None = None,
+        position_ids: torch.Tensor | None = None,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+        return_hidden_states: bool = False,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        """Run the full thinker forward.
+
+        Args:
+            cache_handle: :class:`BatchedCacheManager` from the engine
+                (or a unit-test mock with ``set_layer_idx`` +
+                ``run_attention``). Required — the attention layer
+                writes K/V to its paged cache and runs FlashInfer
+                attention against it.
+            input_ids: ``(T,)`` token ids — if provided, ``embed_tokens``
+                turns them into embeddings.
+            input_embeds: ``(T, hidden_size)`` precomputed embeddings —
+                used directly (multimodal callers pass this with vision /
+                audio embeddings already spliced in).
+            position_ids: ``(T,)`` for 1D rope, or ``(3, T)`` for 3D
+                video_rope. Defaults to ``torch.arange(T)`` if None.
+            image_mask, audio_mask: per-token modality masks for
+                :class:`LingMoeBlock`. ``None`` ⇒ all text routing.
+
+            return_hidden_states: when True, also return the post-norm
+                hidden states ``(T, hidden_size)`` as a second tuple element.
+                The image-gen path (step 9b) needs these at the
+                ``<imagePatch>`` query-token positions to condition the DiT —
+                ``lm_head`` logits are irrelevant there.
+
+        Returns:
+            ``(T, vocab_size)`` logits by default. The caller (the submodule)
+            slices the last position for next-token sampling. When
+            ``return_hidden_states`` is True, returns
+            ``(logits, hidden_states)`` where ``hidden_states`` is the
+            post-norm ``(T, hidden_size)`` tensor.
+        """
+        if (input_ids is None) == (input_embeds is None):
+            raise ValueError(
+                "Exactly one of input_ids / input_embeds must be provided"
+            )
+
+        if input_embeds is None:
+            assert input_ids is not None
+            h = self.embed_tokens(input_ids)
+        else:
+            h = input_embeds
+
+        if h.dim() != 2:
+            raise ValueError(
+                f"LingMoeModel expects packed (T, hidden) input; got "
+                f"shape {tuple(h.shape)}."
+            )
+
+        T = h.shape[0]
+        if position_ids is None:
+            position_ids = torch.arange(T, device=h.device)
+
+        for layer_idx, layer in enumerate(self.layers):
+            cache_handle.set_layer_idx(layer_idx)
+            h = layer(
+                h, cache_handle, position_ids,
+                image_mask=image_mask,
+                audio_mask=audio_mask,
+            )
+
+        h = self.norm(h)
+        logits = self.lm_head(h)
+        if return_hidden_states:
+            return logits, h
+        return logits
diff --git a/mstar/model/ming_omni_flash/components/moe.py b/mstar/model/ming_omni_flash/components/moe.py
new file mode 100644
index 00000000..9d7b5b4c
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/moe.py
@@ -0,0 +1,303 @@
+"""Ling-2.0 MoE block (TP-aware ``MultiRouter`` flavour).
+
+Same 3-router text/image/audio gate selection as step 3b, now with
+per-rank expert sharding when ``comm_group.world_size > 1``:
+
+  * Fused expert tensors hold ``(E, 2*shard_inter, hidden)`` and
+    ``(E, hidden, shard_inter)`` per rank, where
+    ``shard_inter = moe_intermediate_size // tp_size``.
+  * mstar's ``_gate_up_weight_loader`` / ``_down_proj_weight_loader``
+    handle per-rank slicing during checkpoint load — these get
+    attached to the params via the ``_attach_weight_loaders`` dance
+    that survives ``.to_empty`` / ``.to(...)``.
+  * Shared expert is a ``ParallelGatedMLP`` so its ``down_proj``
+    all-reduces internally.
+  * Forward TP path mirrors :class:`ParallelSparseMoeBlock._dispatch_tp`:
+    `fused_experts(..., reduce_results=False)` → ``all_reduce`` →
+    ``moe_sum_reduce_triton``.
+
+Routers (``LingMoeRouter``) stay replicated across ranks — gates must
+make identical decisions so every rank dispatches tokens to the same
+experts.
+
+Reference: vllm-omni's ``BailingMoeV2SparseMoeBlock`` (lines 304-433)
++ mstar's :class:`ParallelSparseMoeBlock`
+(`mstar/model/components/moe.py:318-414`).
+"""
+
+from __future__ import annotations
+
+from functools import partial
+
+import torch
+from torch import nn
+
+from mstar.distributed.communication import TPCommGroup
+from mstar.distributed.utils import divide
+from mstar.model.components.distributed.mlp import ParallelGatedMLP
+from mstar.model.components.mlp import GatedMLP
+from mstar.model.components.moe import (
+    _dispatch,
+    _down_proj_weight_loader,
+    _gate_up_weight_loader,
+    dispatch_experts_fused,
+)
+from mstar.model.ming_omni_flash.components.router import LingMoeRouter
+
+
+def _normalize_modality_mask(
+    mask: torch.Tensor | None, num_tokens: int, name: str,
+) -> torch.Tensor | None:
+    """Reshape a modality mask to ``(num_tokens, 1)`` bool, or pass through None."""
+    if mask is None:
+        return None
+    if mask.dim() == 1:
+        if mask.shape[0] != num_tokens:
+            raise ValueError(
+                f"{name} length {mask.shape[0]} != num_tokens={num_tokens}"
+            )
+        return mask.reshape(num_tokens, 1).bool()
+    if mask.dim() == 2:
+        if mask.numel() != num_tokens:
+            raise ValueError(
+                f"{name} shape {tuple(mask.shape)} has {mask.numel()} elements; "
+                f"expected num_tokens={num_tokens}"
+            )
+        return mask.reshape(num_tokens, 1).bool()
+    if mask.dim() == 3:
+        if mask.shape[-1] != 1 or mask.numel() != num_tokens:
+            raise ValueError(
+                f"{name} shape {tuple(mask.shape)} not compatible with "
+                f"num_tokens={num_tokens}"
+            )
+        return mask.reshape(num_tokens, 1).bool()
+    raise ValueError(
+        f"{name} must be 1D, 2D, or 3D; got shape {tuple(mask.shape)}"
+    )
+
+
+class LingMoeBlock(nn.Module):
+    """Ling-2.0 MoE FFN with text/image/audio gate selection per token.
+
+    Constructor takes the FULL ``moe_intermediate_size``; the per-rank
+    ``shard_inter`` is computed from ``comm_group.world_size``.
+
+    Args:
+        hidden_size: model hidden dim.
+        num_experts: total routed experts.
+        num_experts_per_tok: top-k experts per token.
+        moe_intermediate_size: per-expert intermediate dim (FULL —
+            sharding handled internally).
+        num_shared_experts: number of shared experts (1 on the released
+            ckpt). The shared expert is a ``ParallelGatedMLP`` of width
+            ``moe_intermediate_size * num_shared_experts``.
+        n_group, topk_group, routed_scaling_factor: passed to the
+            :class:`LingMoeRouter`s.
+        comm_group: TP comm group; defaults to single-rank trivial.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_experts: int,
+        num_experts_per_tok: int,
+        moe_intermediate_size: int,
+        num_shared_experts: int,
+        n_group: int,
+        topk_group: int,
+        routed_scaling_factor: float = 1.0,
+        comm_group: TPCommGroup | None = None,
+    ) -> None:
+        super().__init__()
+        if comm_group is None:
+            comm_group = TPCommGroup.trivial()
+        self.comm_group = comm_group
+        tp_size = comm_group.world_size
+        tp_rank = comm_group.rank
+
+        self.hidden_size = hidden_size
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_intermediate_size = moe_intermediate_size
+
+        router_kwargs = dict(
+            hidden_size=hidden_size,
+            num_experts=num_experts,
+            num_experts_per_tok=num_experts_per_tok,
+            n_group=n_group,
+            topk_group=topk_group,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+        # Routers — replicated. All ranks must agree on which experts a
+        # given token routes to, so gate weights are loaded identically
+        # per rank (default weight_loader, no shard_id).
+        self.gate = LingMoeRouter(**router_kwargs)
+        self.image_gate = LingMoeRouter(**router_kwargs)
+        self.audio_gate = LingMoeRouter(**router_kwargs)
+
+        # Fused expert tensors with per-rank intermediate shard.
+        shard_inter = divide(moe_intermediate_size, tp_size)
+        self.experts = nn.Module()
+        self.experts.gate_up_proj = nn.Parameter(
+            torch.empty(num_experts, 2 * shard_inter, hidden_size)
+        )
+        self.experts.down_proj = nn.Parameter(
+            torch.empty(num_experts, hidden_size, shard_inter)
+        )
+
+        # Shared expert: ParallelGatedMLP. Its down_proj all-reduces, so
+        # the shared output already lives on the full hidden state at
+        # every rank.
+        if num_shared_experts <= 0:
+            raise ValueError(
+                "LingMoeBlock requires num_shared_experts >= 1; released "
+                "Ming-flash-omni-2.0 has 1."
+            )
+        self.shared_expert = ParallelGatedMLP(
+            comm_group=comm_group,
+            hidden_size=hidden_size,
+            intermediate_size=moe_intermediate_size * num_shared_experts,
+            bias=False,
+        )
+
+        self._attach_weight_loaders(tp_rank, tp_size, moe_intermediate_size)
+
+    # ------------------------------------------------------------------
+    # Weight loader plumbing — mirrors ParallelSparseMoeBlock
+    # ------------------------------------------------------------------
+
+    def _attach_weight_loaders(
+        self, tp_rank: int, tp_size: int, full_inter: int,
+    ) -> None:
+        """Attach mstar's per-rank fused-expert weight loaders.
+
+        The loaders accept shard ids ``"gate:N"``, ``"up:N"``, ``"down:N"``
+        and slice along the intermediate dim per rank, then write into
+        the right expert slot. ``load_hf_weights`` dispatches based on
+        the ``StackedParamRule.shard_id`` we configure in the loader.
+        """
+        self.experts.gate_up_proj.weight_loader = partial(
+            _gate_up_weight_loader, tp_rank, tp_size, full_inter,
+        )
+        self.experts.down_proj.weight_loader = partial(
+            _down_proj_weight_loader, tp_rank, tp_size, full_inter,
+        )
+
+    def _apply(self, fn, recurse=True):
+        """Re-attach loaders after any ``to_empty`` / ``.to(...)`` since
+        those operations re-allocate Parameters and drop attached
+        attributes on the old objects."""
+        result = super()._apply(fn, recurse=recurse)
+        self._attach_weight_loaders(
+            self.comm_group.rank,
+            self.comm_group.world_size,
+            self.moe_intermediate_size,
+        )
+        return result
+
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Route + dispatch + add shared expert output.
+
+        TP=1 path uses the direct ``_dispatch`` helper (mstar's
+        triton-fused or naive loop depending on availability). TP>1
+        path uses the unreduced fused_experts call + manual all-reduce
+        + sum-reduce — mirrors :class:`ParallelSparseMoeBlock._dispatch_tp`.
+        """
+        input_shape = hidden_states.shape
+        flat = hidden_states.view(-1, hidden_states.shape[-1]).contiguous()
+        num_tokens = flat.shape[0]
+
+        # Text-gate baseline routing (always computed).
+        _, topk_weight, topk_idx = self.gate(flat)
+
+        image_mask = _normalize_modality_mask(image_mask, num_tokens, "image_mask")
+        audio_mask = _normalize_modality_mask(audio_mask, num_tokens, "audio_mask")
+
+        if image_mask is not None:
+            _, img_w, img_idx = self.image_gate(flat)
+            topk_idx = torch.where(image_mask, img_idx, topk_idx)
+            topk_weight = torch.where(image_mask, img_w, topk_weight)
+        if audio_mask is not None:
+            _, aud_w, aud_idx = self.audio_gate(flat)
+            topk_idx = torch.where(audio_mask, aud_idx, topk_idx)
+            topk_weight = torch.where(audio_mask, aud_w, topk_weight)
+
+        if self.comm_group.world_size == 1:
+            routed = _dispatch(
+                flat,
+                self.experts.gate_up_proj,
+                self.experts.down_proj,
+                self.num_experts,
+                topk_idx,
+                topk_weight,
+            )
+        else:
+            routed = self._dispatch_tp(flat, topk_weight, topk_idx)
+
+        shared = self.shared_expert(flat)
+        # Upstream sums routed + shared without an additional gate
+        # (BailingMoeV2SparseMoeBlock.forward:429). The
+        # routed_scaling_factor is baked into topk_weight via the router.
+        return (routed + shared).view(input_shape)
+
+    def _dispatch_tp(
+        self,
+        flat: torch.Tensor,
+        routing_weights: torch.Tensor,
+        selected_experts: torch.Tensor,
+    ) -> torch.Tensor:
+        """TP>1 expert dispatch.
+
+        Identical to :func:`ParallelSparseMoeBlock._dispatch_tp` — runs
+        fused_experts WITHOUT the final per-token reduce, all-reduces
+        the per-rank partial results across TP ranks, then sum-reduces
+        across top-k. Result is the full-precision routed output at
+        every rank.
+
+        Falls back to the naive per-expert loop in
+        :func:`dispatch_experts_fused` when ``sgl_kernel`` isn't loadable
+        (e.g. ABI-mismatched against the installed torch). The naive path
+        already returns ``(tokens, hidden)`` summed across top-k, so we
+        all-reduce that directly — math is equivalent because sum-over-TP
+        and sum-over-top-k commute.
+        """
+        from mstar.utils.fused_moe.align import has_sgl_kernel
+
+        if has_sgl_kernel():
+            from mstar.utils.fused_moe import fused_experts, moe_sum_reduce_triton
+
+            cache3 = fused_experts(
+                flat,
+                self.experts.gate_up_proj,
+                self.experts.down_proj,
+                routing_weights,
+                selected_experts,
+                reduce_results=False,
+            )
+            self.comm_group.all_reduce(cache3)
+            output = torch.empty_like(flat)
+            moe_sum_reduce_triton(cache3, output, routed_scaling_factor=1.0)
+            return output
+
+        partial = dispatch_experts_fused(
+            flat,
+            self.experts.gate_up_proj,
+            self.experts.down_proj,
+            self.experts.gate_up_proj.shape[0],
+            selected_experts,
+            routing_weights,
+        )
+        self.comm_group.all_reduce(partial)
+        return partial
+
+
+__all__ = ["LingMoeBlock", "GatedMLP"]  # GatedMLP is not used in this module; it is re-exported for back-compat
diff --git a/mstar/model/ming_omni_flash/components/positions.py b/mstar/model/ming_omni_flash/components/positions.py
new file mode 100644
index 00000000..5f14d221
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/positions.py
@@ -0,0 +1,209 @@
+"""3D MRoPE position-id helpers for Ming-flash-omni-2.0.
+
+Ming-flash-omni-2.0 uses partial 3D MRoPE
+(`mrope_section=[8, 12, 12]`, `partial_rotary_factor=0.5`) in the
+``video_rope`` layout. The cos/sin remap lives in
+:class:`mstar.model.ming_omni_flash.components.rope.LingPartialMRotaryEmbedding`;
+this module produces the *position-id* tensors that feed into it.
+
+Three helpers cover the modality-specific position layouts used by the
+Thinker prefill walks:
+
+  * :func:`get_rope_index_text`   — pure-text span (sentinels included).
+  * :func:`get_rope_index_audio`  — audio embeddings (treated as text
+    positions per ``modeling_bailing_moe_v2.get_rope_index``, which
+    only special-cases ``image_*`` / ``video_*`` tokens).
+  * :func:`get_rope_index_vision` — image (or video) embeddings with
+    grid-aware T/H/W position ids per
+    ``modeling_bailing_moe_v2.get_rope_index:592-647``.
+
+All three return ``(3, seq_len)`` tensors with rows ``[t, h, w]``;
+the rope module's ``video_rope`` remap will pick out H/W on even/odd
+spatial slots and T on the temporal tail (see
+``LingPartialMRotaryEmbedding._cos_sin_3d_video_rope`` for the layout).
+"""
+
+from __future__ import annotations
+
+import torch
+
+
+def get_rope_index_text(
+    seq_len: int,
+    start_pos: int | float,
+    device: torch.device | str | None = None,
+    dtype: torch.dtype = torch.long,
+) -> torch.Tensor:
+    """Build ``(3, seq_len)`` MRoPE position ids for a text-only span.
+
+    Text tokens carry no spatial structure, so the T, H and W rows are the
+    same run ``start_pos, start_pos + 1, ..., start_pos + seq_len - 1``
+    (pure-text branch of ``modeling_bailing_moe_v2.get_rope_index``,
+    `./modeling_bailing_moe_v2.py:658-675`).
+
+    Args:
+        seq_len: number of tokens in this span.
+        start_pos: position of the first token; truncated with ``int()``.
+        device:  device the result is created on.
+        dtype:   dtype of the position ids (``torch.long`` by default).
+
+    Returns:
+        Contiguous ``(3, seq_len)`` tensor whose three rows are identical.
+    """
+    offset = int(start_pos)
+    row = torch.arange(seq_len, dtype=dtype, device=device) + offset
+    return row.repeat(3, 1)
+
+
+def get_rope_index_audio(
+    num_audio_tokens: int,
+    start_pos: int | float,
+    device: torch.device | str | None = None,
+    dtype: torch.dtype = torch.long,
+) -> torch.Tensor:
+    """Build ``(3, num_audio_tokens)`` MRoPE position ids for an audio span.
+
+    Ming's `get_rope_index` has no audio special case: each audio embedding
+    takes one step of the same per-token counter text uses, so the T/H/W
+    rows are identical. Audio semantics live in the audio encoder +
+    projector (which already down-sample to one embedding per LLM-time-step).
+
+    Args:
+        num_audio_tokens: number of audio embeddings (after the
+            projector's conv1d down-sample).
+        start_pos: position offset for the first audio embedding.
+        device:  target device.
+        dtype:   integer dtype for position ids.
+
+    Returns:
+        ``(3, num_audio_tokens)`` tensor, identical rows.
+    """
+    text_like = get_rope_index_text(num_audio_tokens, start_pos, device, dtype)
+    return text_like
+
+
+def get_rope_index_vision(
+    grid_thw: torch.Tensor,
+    start_pos: int | float,
+    spatial_merge_size: int,
+    device: torch.device | str | None = None,
+    second_per_grid_t: float | None = None,
+    tokens_per_second: int = 2,
+    dtype: torch.dtype = torch.long,
+) -> torch.Tensor:
+    """3D MRoPE positions for a vision span (single image or video).
+
+    Mirrors `modeling_bailing_moe_v2.get_rope_index:625-647` for one image:
+
+    * Temporal: ``arange(grid_t)`` expanded across ``H*W``, optionally
+      scaled by ``second_per_grid_t * tokens_per_second`` (absolute time).
+    * Height:   ``arange(llm_grid_h)`` expanded across ``T * W``.
+    * Width:    ``arange(llm_grid_w)`` expanded across ``T * H``.
+
+    ``llm_grid_h = grid_h // spatial_merge_size`` (same for W). All
+    three components are offset by ``start_pos`` so the span fits into
+    the global position-id counter the caller is tracking.
+
+    For several images / video clips, call this helper once per item and
+    concatenate the results; :func:`vision_span_max_position` returns the
+    ``start_pos`` at which the following span resumes.
+
+    Args:
+        grid_thw: ``(3,)`` long tensor of (T, H, W) grid sizes.
+        start_pos: position offset for this image's first token.
+        spatial_merge_size: from `VisionEncoderConfig.spatial_merge_size`
+            (= 2 on the released ckpt); must be >= 1.
+        device:  target device.
+        second_per_grid_t: when set, multiply the temporal component by
+            ``second_per_grid_t * tokens_per_second`` for absolute video
+            timestamps. None ⇒ raw frame index. Image inputs always pass
+            None; video inputs pass the per-clip frame interval.
+        tokens_per_second: temporal-resolution multiplier
+            (= 2 on the released ckpt; mirrors ``config.tokens_per_second``).
+        dtype: integer dtype for position ids.
+
+    Returns:
+        ``(3, grid_t * (H/m) * (W/m))`` T/H/W positions offset by ``start_pos``.
+
+    Raises:
+        ValueError: bad ``grid_thw`` shape or ``spatial_merge_size`` (< 1, or not dividing H / W).
+    """
+    if grid_thw.dim() != 1 or grid_thw.numel() != 3:
+        raise ValueError(
+            f"grid_thw must be a 1-D tensor of length 3 (T, H, W); "
+            f"got shape {tuple(grid_thw.shape)}"
+        )
+    if spatial_merge_size < 1:
+        raise ValueError(f"spatial_merge_size must be >= 1, got {spatial_merge_size}")
+    # One device->host transfer for all three sizes instead of three ``.item()`` calls.
+    grid_t, grid_h, grid_w = (int(v) for v in grid_thw.tolist())
+    if grid_h % spatial_merge_size != 0 or grid_w % spatial_merge_size != 0:
+        raise ValueError(
+            f"grid_h={grid_h} / grid_w={grid_w} not divisible by "
+            f"spatial_merge_size={spatial_merge_size}."
+        )
+    llm_grid_h = grid_h // spatial_merge_size
+    llm_grid_w = grid_w // spatial_merge_size
+
+    # Temporal: arange(grid_t), expanded across H*W, optionally absolute time.
+    range_t = torch.arange(grid_t, dtype=dtype, device=device).view(-1, 1)
+    expanded_t = range_t.expand(-1, llm_grid_h * llm_grid_w)
+    if second_per_grid_t is not None:
+        # Float math then back to int (matches modeling_bailing_moe_v2 path).
+        t_index = (
+            expanded_t.float() * float(second_per_grid_t) * float(tokens_per_second)
+        ).to(dtype).flatten()
+    else:
+        t_index = expanded_t.flatten()
+
+    h_index = (
+        torch.arange(llm_grid_h, dtype=dtype, device=device)
+        .view(1, -1, 1)
+        .expand(grid_t, -1, llm_grid_w)
+        .flatten()
+    )
+    w_index = (
+        torch.arange(llm_grid_w, dtype=dtype, device=device)
+        .view(1, 1, -1)
+        .expand(grid_t, llm_grid_h, -1)
+        .flatten()
+    )
+    return torch.stack([t_index, h_index, w_index], dim=0) + int(start_pos)
+
+
+def vision_span_max_position(
+    grid_thw: torch.Tensor,
+    start_pos: int | float,
+    spatial_merge_size: int,
+    second_per_grid_t: float | None = None,
+    tokens_per_second: int = 2,
+) -> int:
+    """Compute one past the largest position id this vision span produces.
+
+    Useful for advancing the global ``start_pos`` counter past a vision
+    span so the next walk knows where text positions resume (mirrors
+    ``modeling_bailing_moe_v2.get_rope_index``'s
+    ``llm_pos_ids_list[-1].max() + 1`` accounting). Arguments mean the same
+    as in :func:`get_rope_index_vision`; the scaled temporal maximum uses
+    the same float32 arithmetic as that helper. No input validation here.
+    """
+    grid_t, grid_h, grid_w = (int(v) for v in grid_thw[:3].tolist())
+    max_h = grid_h // spatial_merge_size - 1
+    max_w = grid_w // spatial_merge_size - 1
+    if second_per_grid_t is None:
+        max_t = grid_t - 1
+    else:
+        # float32 tensor math + truncation, exactly as get_rope_index_vision
+        # does it; Python float64 math can land on a different integer.
+        scaled = torch.tensor([grid_t - 1], dtype=torch.float32)
+        scaled = scaled * float(second_per_grid_t) * float(tokens_per_second)
+        max_t = int(scaled.to(torch.long).item())
+    return int(start_pos) + max(max_t, max_h, max_w) + 1
+
+
+__all__ = [
+    "get_rope_index_text",
+    "get_rope_index_audio",
+    "get_rope_index_vision",
+    "vision_span_max_position",
+]
diff --git a/mstar/model/ming_omni_flash/components/projectors.py b/mstar/model/ming_omni_flash/components/projectors.py
new file mode 100644
index 00000000..6a02323f
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/projectors.py
@@ -0,0 +1,165 @@
+"""Vision + audio projectors for Ming-flash-omni-2.0.
+
+Ports the two ``nn.Sequential`` blocks built inline in
+``modeling_bailingmm2.py:BailingMM2NativeForConditionalGeneration.__init__``
+(lines 66-88 of the Ming source repo) into standalone modules that mstar
+can load weights into directly. The released checkpoint stores the
+weights under the top-level prefixes ``linear_proj.*`` (vision) and
+``linear_proj_audio.*`` (audio):
+
+  * Vision (mlp_depth=2):
+      linear_proj.0.{weight,bias}   -> Linear(vision_out_hidden, llm_hidden)
+      [GELU at index 1, no params]
+      linear_proj.2.{weight,bias}   -> Linear(llm_hidden, llm_hidden)
+
+  * Audio (mlp_depth=2):
+      linear_proj_audio.0.{weight,bias}   -> Conv1d(audio_d_model, llm_hidden, ds_kernel_size, ds_stride)
+      [Transpose at index 1, GELU at index 2, no params]
+      linear_proj_audio.3.{weight,bias}   -> Linear(llm_hidden, llm_hidden)
+      [Transpose at index 4, no params]
+
+We mirror the upstream layer ordering exactly so the
+``linear_proj.*`` / ``linear_proj_audio.*`` keys from the checkpoint land
+on the right ``nn.Module`` slot via plain index-based lookup.
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+
+
+class _Transpose(nn.Module):
+    """Dim swap usable inside ``nn.Sequential`` (modeling_utils.py:Transpose)."""
+
+    def __init__(self, dim0: int, dim1: int) -> None:
+        super().__init__()
+        # Plain ints only: the module holds no parameters or buffers.
+        self.dim0, self.dim1 = dim0, dim1
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.transpose(x, self.dim0, self.dim1)
+
+
+class MingVisionProjector(nn.Module):
+    """Vision-encoder features -> LLM hidden space, as a small MLP.
+
+    Args:
+        vision_dim: ``VisionEncoderConfig.out_hidden_size`` (4096 on the
+            released ckpt — the vision encoder already projects internally
+            via its ``PatchMerger``).
+        llm_dim:    ``ThinkerLLMConfig.hidden_size`` (4096).
+        mlp_depth:  ``MingFlashOmniModelConfig.mlp_depth`` (2 on the
+            released ckpt). A single Linear for depth=1; every further
+            level appends one GELU+Linear pair.
+    """
+
+    def __init__(self, vision_dim: int, llm_dim: int, mlp_depth: int = 2) -> None:
+        super().__init__()
+        if mlp_depth < 1:
+            raise ValueError(f"mlp_depth must be >= 1, got {mlp_depth}")
+        # Held under the stable attribute name ``proj`` so subclassing /
+        # surgery can rely on it; weight loading walks ``proj.<idx>.*``.
+        # Linears sit at the even indices, GELUs at the odd ones.
+        stack: list[nn.Module] = [nn.Linear(vision_dim, llm_dim)]
+        for _ in range(mlp_depth - 1):
+            stack.extend((nn.GELU(), nn.Linear(llm_dim, llm_dim)))
+        self.proj = nn.Sequential(*stack)
+
+    def forward(self, vision_embeds: torch.Tensor) -> torch.Tensor:
+        """Run the MLP over the trailing feature dim of the vision tokens.
+
+        Args:
+            vision_embeds: (N_tokens, vision_dim) or (B, N_tokens, vision_dim).
+
+        Returns:
+            Tensor of the same shape, except the last dim becomes ``llm_dim``.
+        """
+        return self.proj(vision_embeds)
+
+
+class MingAudioProjector(nn.Module):
+    """Whisper encoder output -> LLM hidden space: Conv1d down-sample, then MLP.
+
+    The layer order follows ``modeling_bailingmm2.py`` exactly, so the
+    released ckpt's ``linear_proj_audio.0`` / ``.3`` keys address the
+    Conv1d and the Linear by integer index.
+
+    Args:
+        audio_dim:     ``AudioEncoderConfig.d_model`` (= whisper n_state,
+                       1280 on the released ckpt).
+        llm_dim:       ``ThinkerLLMConfig.hidden_size``.
+        ds_kernel_size: temporal kernel for the down-sample conv (3 on
+                       the released ckpt).
+        ds_stride:     temporal stride (2 on the released ckpt).
+        mlp_depth:     ``MingFlashOmniModelConfig.mlp_depth`` (2 on the
+                       released ckpt; depth=N adds (N-1) GELU+Linear pairs
+                       after the conv).
+    """
+
+    def __init__(
+        self,
+        audio_dim: int,
+        llm_dim: int,
+        ds_kernel_size: int = 3,
+        ds_stride: int = 2,
+        mlp_depth: int = 2,
+    ) -> None:
+        super().__init__()
+        if mlp_depth < 1:
+            raise ValueError(f"mlp_depth must be >= 1, got {mlp_depth}")
+        self.audio_dim = audio_dim
+        self.llm_dim = llm_dim
+        self.ds_kernel_size = ds_kernel_size
+        self.ds_stride = ds_stride
+
+        # Index 0: temporal down-sample, (B, audio_dim, T) -> (B, llm_dim, T').
+        downsample = nn.Conv1d(
+            audio_dim,
+            llm_dim,
+            kernel_size=ds_kernel_size,
+            stride=ds_stride,
+            padding=ds_kernel_size // 2,
+        )
+        # Index 1: flip to channels-last (B, T', llm_dim) for the MLP part.
+        chain: list[nn.Module] = [downsample, _Transpose(-1, -2)]
+        # Then (mlp_depth - 1) GELU+Linear pairs, appended in order so the
+        # first Linear lands on index 3.
+        for _ in range(mlp_depth - 1):
+            chain.extend((nn.GELU(), nn.Linear(llm_dim, llm_dim)))
+        # Trailing transpose flips back to (B, llm_dim, T') — that's the
+        # shape upstream callers expect after the projector.
+        chain.append(_Transpose(-1, -2))
+        self.proj = nn.Sequential(*chain)
+
+    def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
+        """Down-sample and project channels-last Whisper features.
+
+        Args:
+            audio_embeds: (B, T, audio_dim) Whisper encoder output.
+
+        Returns:
+            (B, llm_dim, T') tensor, where
+            ``T' = (T - ds_kernel_size + 2*(ds_kernel_size//2)) // ds_stride + 1``.
+        """
+        # Conv1d consumes channels-first input, hence the swap to (B, audio_dim, T).
+        channels_first = torch.transpose(audio_embeds, -1, -2)
+        return self.proj(channels_first)
+
+    def compute_output_length(self, input_length: torch.Tensor) -> torch.Tensor:
+        """Sequence length left after the Whisper conv stems and this projector.
+
+        Mirrors :func:`projectors.AudioProjector.compute_output_length` from
+        vllm-omni: the Whisper encoder has two fixed Conv1d stems (kernel=3,
+        stride=2 then stride=1 -> see ``whisper_encoder``), followed by this
+        module's ``Conv1d(ds_kernel_size, ds_stride)``. The stem formula
+        ``(L - 3 + 2) // 2 + 1`` is applied once, then the projector conv
+        with ``pad = ds_kernel_size // 2``.
+        """
+        kernel, stride = self.ds_kernel_size, self.ds_stride
+        after_stem = (input_length - 3 + 2) // 2 + 1
+        after_proj = (after_stem - kernel + 2 * (kernel // 2)) // stride + 1
+        return after_proj
+
+
+__all__ = ["MingVisionProjector", "MingAudioProjector"]
diff --git a/mstar/model/ming_omni_flash/components/prompt_utils.py b/mstar/model/ming_omni_flash/components/prompt_utils.py
new file mode 100644
index 00000000..f54bfd0d
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/prompt_utils.py
@@ -0,0 +1,130 @@
+"""Ming-flash-omni-2.0 prompt utilities (step 8).
+
+Port of vllm-omni's ``ming_flash_omni/prompt_utils.py``. Two unrelated
+helper families share the file because both are tightly coupled to
+Ming-specific prompt conventions:
+
+1. **Image-gen query-token expansion** — string-level helpers that mark
+   the ``<image><imagePatch>*N</image>`` block the thinker substitutes
+   with learnable image-gen query embeddings during forward. Used by the
+   ImageGen path (step 9); included here so the constants live in one
+   place.
+
+2. **TTS / talker caption builder** — the JSON caption template + merge
+   helper for the standalone ``ming_flash_omni_tts`` talker-only deploy.
+   Lets the talker accept the same JSON caption shape vllm-omni speaks
+   (speaker / dialect / style / emotion / BGM controls).
+"""
+
+from __future__ import annotations
+
+import copy
+import json
+from typing import Any
+
+# ============================================================
+# Image-gen query-token block (thinker stage — used by step 9)
+# ============================================================
+
+_IMAGE_OPEN_TOKEN = "<image>"  # opens the image-gen query block
+_IMAGE_CLOSE_TOKEN = "</image>"  # closes the image-gen query block
+IMAGE_PATCH_TOKEN = "<imagePatch>"  # repeated once per query token inside the block
+
+# Default ``<imagePatch>`` count: ``ImageGenConfig(img_gen_scales=[16])`` →
+# 16*16 = 256 on the released inclusionAI/Ming-flash-omni-2.0 checkpoint.
+DEFAULT_NUM_QUERY_TOKENS = 256
+
+
+def maybe_expand_image_gen_prompt(
+    prompt: str,
+    num_query_tokens: int = DEFAULT_NUM_QUERY_TOKENS,
+) -> str:
+    """Append the ``<image><imagePatch>*N</image>`` suffix for text-to-image.
+
+    Image-generation requests must end with an N-wide run of
+    ``<imagePatch>`` tokens wrapped in ``<image>`` / ``</image>``; the
+    thinker substitutes learnable query embeddings at those positions
+    during forward.
+
+    The input is returned unchanged when ``prompt`` is not a non-empty
+    string, or when it already contains an ``<imagePatch>`` token (which
+    avoids double expansion).
+
+    Args:
+        prompt: raw user prompt text.
+        num_query_tokens: total query tokens to emit (default 256).
+    """
+    needs_suffix = isinstance(prompt, str) and prompt and IMAGE_PATCH_TOKEN not in prompt
+    if not needs_suffix:
+        return prompt
+    # One patch token per query slot, wrapped in the open/close markers.
+    patches = IMAGE_PATCH_TOKEN * num_query_tokens
+    return prompt + "".join((_IMAGE_OPEN_TOKEN, patches, _IMAGE_CLOSE_TOKEN))
+
+
+# ============================================================
+# TTS / talker caption builder (talker-only deploy)
+# ============================================================
+
+DEFAULT_PROMPT = "Please generate speech based on the following description.\n"  # exported only; not used in this module
+
+# Base caption schema the standalone talker understands; create_instruction
+# deep-copies it on every call. Keys are the Ming-native Chinese field names
+# (序号 = index, 说话人 = speaker, 方言 = dialect, 风格 = style, 语速 = speed, 基频 = pitch,
+# 音量 = volume, 情感 = emotion, BGM = background music block, IP = persona).
+BASE_CAPTION_TEMPLATE: dict[str, Any] = {
+    "audio_sequence": [
+        {
+            "序号": 1,
+            "说话人": "speaker_1",
+            "方言": None,
+            "风格": None,
+            "语速": None,
+            "基频": None,
+            "音量": None,
+            "情感": None,
+            "BGM": {  # create_instruction replaces this dict wholesale; it does not merge sub-keys
+                "Genre": None,
+                "Mood": None,
+                "Instrument": None,
+                "Theme": None,
+                "ENV": None,
+                "SNR": None,
+            },
+            "IP": None,
+        }
+    ]
+}
+
+
+def create_instruction(user_input: dict[str, Any]) -> str:
+    """Serialize the caption for ``audio_sequence[0]`` as a JSON string.
+
+    Starts from a deep copy of the base template and overrides only the
+    fields that already exist on it. Unknown keys are silently ignored so
+    the output schema stays stable (the talker's prompt parser keys off
+    the exact field set).
+
+    Args:
+        user_input: partial caption controls, e.g.
+            ``{"说话人": "speaker_2", "情感": "happy"}``.
+
+    Returns:
+        A UTF-8 JSON string (``ensure_ascii=False`` to keep the Chinese
+        field names readable, matching upstream).
+    """
+    caption = copy.deepcopy(BASE_CAPTION_TEMPLATE)
+    slot = caption["audio_sequence"][0]
+    # Overriding existing keys keeps the template's field order intact.
+    slot.update({name: val for name, val in user_input.items() if name in slot})
+    return json.dumps(caption, ensure_ascii=False)
+
+
+__all__ = [
+    "IMAGE_PATCH_TOKEN",
+    "DEFAULT_NUM_QUERY_TOKENS",
+    "maybe_expand_image_gen_prompt",
+    "DEFAULT_PROMPT",
+    "BASE_CAPTION_TEMPLATE",
+    "create_instruction",
+]
diff --git a/mstar/model/ming_omni_flash/components/rope.py b/mstar/model/ming_omni_flash/components/rope.py
new file mode 100644
index 00000000..64d9c11e
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/rope.py
@@ -0,0 +1,265 @@
+"""Ling-2.0 partial 3D rotary embeddings (``video_rope`` flavor).
+
+Ling-2.0's attention uses **partial rotary** (only the first
+``head_dim * partial_rotary_factor`` dims of each head are rotated; the rest
+pass through unchanged) with **3D MRoPE positions** (time / height / width
+each get their own position id) in the ``video_rope`` cos/sin layout.
+
+The cos/sin layout is the unusual bit. Standard MRoPE places contiguous
+frequency sections per axis:
+
+    [ T T ... T  H H ... H  W W ... W ]   (sizes mrope_section = [Nt, Nh, Nw])
+
+Ling's ``video_rope`` interleaves H and W element-wise in the spatial
+section and puts T at the end:
+
+    [ H W H W ... H W   T T ... T ]       (sizes hw_size = Nh + Nw,  Nt at tail)
+
+For pure-text positions (1D position_ids, no T/H/W split) the rotation
+degenerates to the standard 1D rotary on the first ``rotary_dim`` dims.
+
+References
+----------
+* Ming upstream ``apply_3d_rotary_pos_emb``
+  ``/tmp/ming_repo/modeling_bailing_moe_v2.py:226-313`` (video_rope branch
+  is the ``elif rope_type == "video_rope"`` block).
+* vllm-omni ``MingVideoRopeMRotaryEmbedding._remap_video_rope``
+  ``/tmp/vllm-omni/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py:79-110``
+  — same remap as ours; we port the math without depending on vllm.
+"""
+
+from __future__ import annotations
+
+import torch
+from torch import nn
+
+
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Neox-style half rotation of the last dim: ``[x1, x2] -> [-x2, x1]``."""
+    split = x.shape[-1] // 2
+    lower, upper = x[..., :split], x[..., split:]
+    return torch.cat((upper.neg(), lower), dim=-1)
+
+
+def _build_inv_freq(rotary_dim: int, theta: float) -> torch.Tensor:
+    """Rotary inverse frequencies ``theta ** (-2i / rotary_dim)``, i in [0, rotary_dim/2)."""
+    even_dims = torch.arange(0, rotary_dim, 2, dtype=torch.float32)
+    scaled = theta ** (even_dims / rotary_dim)
+    return 1.0 / scaled
+
+
+class LingPartialMRotaryEmbedding(nn.Module):
+    """Partial rotary + ``video_rope`` 3D MRoPE.
+
+    Args:
+        head_dim: full head dim of the attention layer.
+        partial_rotary_factor: fraction of head_dim that's actually rotated
+            (the rest is concatenated pass-through). The model uses 0.5;
+            head_dim=128 → rotary_dim=64.
+        mrope_section: per-axis cos/sin section sizes. Released ckpt:
+            ``[8, 12, 12]``. The first is Nt (time), the rest are Nh
+            (height) and Nw (width); Nh+Nw must equal rotary_dim/2 − Nt
+            (i.e. the section sums to rotary_dim/2 — see config invariant).
+        rope_theta: rotary base frequency. Released ckpt: ``2_400_000``.
+        max_position_embeddings: max sequence length; only stored (no table is built from it).
+
+    The forward expects ``position_ids`` of shape ``(3, num_tokens)`` for
+    3D positions or ``(num_tokens,)`` for plain 1D rope (degenerates to
+    standard rotary).
+    """
+
+    def __init__(
+        self,
+        head_dim: int,
+        partial_rotary_factor: float,
+        mrope_section: list[int],
+        rope_theta: float,
+        max_position_embeddings: int,
+    ) -> None:
+        super().__init__()
+        self.head_dim = head_dim
+        self.rotary_dim = int(head_dim * partial_rotary_factor)
+        if self.rotary_dim % 2 != 0:
+            raise ValueError(
+                f"rotary_dim must be even (got {self.rotary_dim}); check "
+                f"partial_rotary_factor."
+            )
+        self.mrope_section = list(mrope_section)
+        if sum(self.mrope_section) != self.rotary_dim // 2:
+            raise ValueError(
+                f"sum(mrope_section)={sum(self.mrope_section)} must equal "
+                f"rotary_dim//2={self.rotary_dim // 2}"
+            )
+        if len(self.mrope_section) != 3:
+            raise ValueError(
+                f"mrope_section must be length-3 [Nt, Nh, Nw]; got {self.mrope_section}"
+            )
+        self.hw_size = self.mrope_section[1] + self.mrope_section[2]
+
+        self.rope_theta = float(rope_theta)
+        self.max_position_embeddings = int(max_position_embeddings)
+
+        # Cache inv_freq once; cos/sin are recomputed from position_ids on
+        # every forward (so we don't pay for max_position_embeddings *
+        # rotary_dim storage on CPU for tests).
+        self.register_buffer(
+            "inv_freq",
+            _build_inv_freq(self.rotary_dim, self.rope_theta),
+            persistent=False,
+        )
+
+    # ------------------------------------------------------------------
+    # cos / sin computation
+    # ------------------------------------------------------------------
+
+    def _compute_cos_sin(
+        self, position_ids: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Compute cos/sin for ``position_ids``.
+
+        ``position_ids`` is ``(num_tokens,)`` or ``(3, num_tokens)``.
+        Returns ``cos, sin`` of shape ``(num_tokens, rotary_dim)`` in the
+        video_rope layout (H/W interleaved spatial + T tail).
+        """
+        if position_ids.dim() == 1:
+            return self._cos_sin_1d(position_ids)
+        if position_ids.dim() != 2 or position_ids.shape[0] != 3:
+            raise ValueError(
+                f"position_ids must be (num_tokens,) or (3, num_tokens); "
+                f"got shape {tuple(position_ids.shape)}"
+            )
+        return self._cos_sin_3d_video_rope(position_ids)
+
+    def _cos_sin_1d(
+        self, position_ids: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Standard 1D rotary cos/sin — used for pure-text positions."""
+        # (num_tokens, rotary_dim/2)
+        freqs = position_ids.float().unsqueeze(-1) * self.inv_freq.unsqueeze(0)
+        # (num_tokens, rotary_dim) — neox style: cat freqs with themselves
+        emb = torch.cat((freqs, freqs), dim=-1)
+        return emb.cos(), emb.sin()
+
+    def _cos_sin_3d_video_rope(
+        self, position_ids: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """3D positions → video_rope layout.
+
+        position_ids: ``(3, num_tokens)`` — row 0 = time, row 1 = height,
+        row 2 = width.
+
+        Steps:
+          1. Compute per-axis freqs: ``(3, num_tokens, rotary_dim/2)``.
+          2. Form (cos, sin) of shape ``(3, num_tokens, rotary_dim)`` neox-style.
+          3. Remap each rotary_dim/2 frequency-pair index ``i`` into:
+                - i < hw_size  →  H if i even, W if i odd
+                - i ≥ hw_size  →  T
+             Pairs ``(cos[i], cos[i + rotary_dim/2])`` correspond to the
+             same frequency, so the same row assignment applies to both
+             halves.
+        """
+        # (3, num_tokens, rotary_dim/2)
+        freqs = position_ids.float().unsqueeze(-1) * self.inv_freq.view(1, 1, -1)
+        # (3, num_tokens, rotary_dim) — neox cat
+        cos_3d = torch.cat((freqs, freqs), dim=-1).cos()
+        sin_3d = torch.cat((freqs, freqs), dim=-1).sin()
+        return self._remap_video_rope(cos_3d, sin_3d)
+
+    def _remap_video_rope(
+        self, cos_3d: torch.Tensor, sin_3d: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Remap per-axis cos/sin into the video_rope 2D layout.
+
+        cos_3d, sin_3d: ``(3, num_tokens, rotary_dim)``.
+        Returns: ``(num_tokens, rotary_dim)``.
+
+        Mirror of vllm-omni's ``_remap_video_rope`` with one difference:
+        we operate on the *full* rotary_dim tables (not the half-tables
+        chunked from the cos_sin cache), because we never built a cache —
+        we computed freqs in 1:1 correspondence with positions in the
+        forward path. The H/W alternation rule still picks the correct
+        index because each half of the neox-cat repeats the same
+        frequency.
+        """
+        # Both halves of the rotary_dim (the first and second halves
+        # contain the same frequencies after the neox cat) get the same
+        # axis-assignment. So a single index i in [0, rotary_dim/2) picks
+        # a frequency-pair that should come from one axis.
+        half = self.rotary_dim // 2
+
+        result_cos = torch.empty_like(cos_3d[0])
+        result_sin = torch.empty_like(sin_3d[0])
+
+        # Spatial half: H on even indices, W on odd indices, capped at hw_size.
+        # Then mirror to the second half (which holds the same freqs).
+        for offset in (0, half):
+            # H rows go on even positions [0, 2, 4, ...] up to hw_size
+            result_cos[:, offset : offset + self.hw_size : 2] = cos_3d[
+                1, :, offset : offset + self.hw_size : 2
+            ]
+            result_cos[:, offset + 1 : offset + self.hw_size : 2] = cos_3d[
+                2, :, offset + 1 : offset + self.hw_size : 2
+            ]
+            result_sin[:, offset : offset + self.hw_size : 2] = sin_3d[
+                1, :, offset : offset + self.hw_size : 2
+            ]
+            result_sin[:, offset + 1 : offset + self.hw_size : 2] = sin_3d[
+                2, :, offset + 1 : offset + self.hw_size : 2
+            ]
+            # Temporal tail
+            result_cos[:, offset + self.hw_size : offset + half] = cos_3d[
+                0, :, offset + self.hw_size : offset + half
+            ]
+            result_sin[:, offset + self.hw_size : offset + half] = sin_3d[
+                0, :, offset + self.hw_size : offset + half
+            ]
+        return result_cos, result_sin
+
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+
+    def forward(
+        self, q: torch.Tensor, k: torch.Tensor, position_ids: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Rotate the first ``rotary_dim`` dims of q and k (returns new tensors).
+
+        Args:
+            q, k: ``(..., num_tokens, head_dim)`` — the token axis must be
+                second-to-last. NOTE(review): a ``(num_tokens, num_heads,
+                head_dim)`` input would not broadcast against cos/sin — confirm callers.
+            position_ids: ``(num_tokens,)`` for 1D rope or
+                ``(3, num_tokens)`` for video_rope.
+
+        Returns:
+            ``(q, k)`` with rotation applied to the rotary half.
+        """
+        if q.shape[-1] != self.head_dim or k.shape[-1] != self.head_dim:
+            raise ValueError(
+                f"q/k last dim {q.shape[-1]}/{k.shape[-1]} != "
+                f"head_dim {self.head_dim}"
+            )
+
+        cos, sin = self._compute_cos_sin(position_ids)
+        # Broadcast cos/sin across the leading axes of q (typically a
+        # heads axis comes BEFORE the token axis: q is (..., heads, T,
+        # head_dim)). cos starts as (T, rotary_dim); we need to insert
+        # ones at every leading dim of q so the broadcast aligns
+        # (T at the second-to-last position, rotary_dim at the last).
+        while cos.dim() < q.dim():
+            cos = cos.unsqueeze(0)
+            sin = sin.unsqueeze(0)
+
+        q_rot, q_pass = q[..., : self.rotary_dim], q[..., self.rotary_dim :]
+        k_rot, k_pass = k[..., : self.rotary_dim], k[..., self.rotary_dim :]
+        cos_q = cos.to(q.dtype)
+        sin_q = sin.to(q.dtype)
+        cos_k = cos.to(k.dtype)
+        sin_k = sin.to(k.dtype)
+
+        q_rot = (q_rot * cos_q) + (_rotate_half(q_rot) * sin_q)
+        k_rot = (k_rot * cos_k) + (_rotate_half(k_rot) * sin_k)
+        return (
+            torch.cat([q_rot, q_pass], dim=-1),
+            torch.cat([k_rot, k_pass], dim=-1),
+        )
diff --git a/mstar/model/ming_omni_flash/components/router.py b/mstar/model/ming_omni_flash/components/router.py
new file mode 100644
index 00000000..ae6dff1f
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/router.py
@@ -0,0 +1,159 @@
+"""Ling-2.0 MoE router with grouped expert selection.
+
+Ling-2.0 (BailingMoeV2) uses ``router_type: "MultiRouter"``, which differs from
+mstar's standard :class:`mstar.model.components.moe.TopKRouter` in four ways:
+
+  * **Sigmoid** activation on the gate logits, not softmax.
+  * A learned per-expert bias added to the routing scores before top-k —
+    not gradient-trained on this checkpoint (stored as ``requires_grad=False``).
+  * **Group-limited top-k**: the ``num_experts`` are partitioned into
+    ``n_group`` groups; tokens may only route to experts within the
+    ``topk_group`` highest-scoring groups (group score = sum of top-2
+    expert scores in that group). This caps cross-group all-to-all
+    bandwidth at the cost of expressiveness.
+  * Weights are renormalised to sum to 1 across the chosen top-k and then
+    multiplied by ``routed_scaling_factor``.
+
+Returns the same 3-tuple as :class:`TopKRouter` (``logits, weights, indices``)
+so it can drop into mstar's existing :class:`SparseMoeBlockWithSharedExpert`
+and the fused-Triton dispatch path.
+
+Reference: vllm-omni's ``BailingMoeV2Gate``
+``/tmp/vllm-omni/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py:211-279``
+and Ming upstream ``modeling_bailing_moe_v2.py:696-765``.
+"""
+
+from __future__ import annotations
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class LingMoeRouter(nn.Module):
+    """Ling-2.0 ``MultiRouter`` (group-limited top-k with sigmoid + bias).
+
+    Args:
+        hidden_size: input hidden dimension.
+        num_experts: total routed experts. Must divide evenly by ``n_group``.
+        num_experts_per_tok: top-k experts selected per token. Must not
+            exceed ``topk_group * (num_experts // n_group)`` (``ValueError``).
+        n_group: expert groups; the experts are split contiguously by
+            ``num_experts // n_group``.
+        topk_group: how many groups a single token may route into.
+        routed_scaling_factor: post-renormalisation scale applied to the
+            top-k weights (matches upstream ``routed_scaling_factor``).
+
+    The gate ``nn.Linear`` weight is **replicated** across TP ranks in the
+    parallel build (router decisions must be identical across ranks); for
+    this step-3a unit-test scope we just expose a plain ``nn.Linear``.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_experts: int,
+        num_experts_per_tok: int,
+        n_group: int,
+        topk_group: int,
+        routed_scaling_factor: float = 1.0,
+    ) -> None:
+        super().__init__()
+        if num_experts % n_group != 0:
+            raise ValueError(
+                f"num_experts={num_experts} must be divisible by n_group={n_group}"
+            )
+        if topk_group > n_group:
+            raise ValueError(
+                f"topk_group={topk_group} cannot exceed n_group={n_group}"
+            )
+        # Only ``topk_group`` groups survive masking; asking for more experts
+        # than they contain would silently select masked (-inf) experts.
+        eligible = topk_group * (num_experts // n_group)
+        if num_experts_per_tok > eligible:
+            raise ValueError(
+                f"num_experts_per_tok={num_experts_per_tok} exceeds the "
+                f"{eligible} experts reachable through topk_group={topk_group}"
+            )
+        self.hidden_size = hidden_size
+        self.num_experts = num_experts
+        self.top_k = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.experts_per_group = num_experts // n_group
+        self.routed_scaling_factor = routed_scaling_factor
+
+        # Gate projection — replicated (no bias).
+        self.gate = nn.Linear(hidden_size, num_experts, bias=False)
+        # Expert bias — frozen, but a Parameter so state_dict loaders see it.
+        self.expert_bias = nn.Parameter(torch.zeros(num_experts), requires_grad=False)
+
+    def _group_limited_topk(self, scores: torch.Tensor) -> torch.Tensor:
+        """Pick the top-k experts under the ``topk_group``-best-groups constraint.
+
+        Args:
+            scores: ``(num_tokens, num_experts)``. Already sigmoid + bias.
+
+        Returns:
+            ``(num_tokens, top_k)`` int64 expert indices.
+
+        Per-group score = sum of that group's top-2 expert scores (top-1
+        when a group holds a single expert). The ``topk_group`` groups with
+        the highest per-group scores are kept; the rest are masked out
+        before the final top-k.
+        """
+        num_tokens = scores.size(0)
+        # (N, n_group, experts_per_group)
+        grouped = scores.view(num_tokens, self.n_group, self.experts_per_group)
+        # Upstream uses ``.topk(2, dim=-1)[0].sum(dim=-1)``; k is clamped so
+        # single-expert groups do not make ``topk`` raise.
+        group_k = min(2, self.experts_per_group)
+        group_scores = grouped.topk(group_k, dim=-1)[0].sum(dim=-1)
+        # Pick the topk_group best groups.
+        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
+        group_mask = torch.zeros_like(group_scores)
+        group_mask.scatter_(1, group_idx, 1.0)
+        # Broadcast group mask back across experts_per_group.
+        score_mask = (
+            group_mask.unsqueeze(-1)
+            .expand(num_tokens, self.n_group, self.experts_per_group)
+            .reshape(num_tokens, -1)
+        )
+        # Mask un-selected groups' experts to -inf so they can't be picked.
+        masked = scores.masked_fill(~score_mask.bool(), float("-inf"))
+        return torch.topk(masked, k=self.top_k, dim=-1, sorted=False)[1]
+
+    def forward(
+        self, hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Route tokens to experts.
+
+        Args:
+            hidden_states: ``(..., hidden_size)``. Flattened internally.
+
+        Returns:
+            Three tensors matching :class:`TopKRouter`'s shape:
+              - ``router_logits``: ``(N, num_experts)`` raw gate logits
+                (pre-sigmoid). Kept as float32 for stability and parity
+                with ``TopKRouter``.
+              - ``routing_weights``: ``(N, top_k)`` normalised + scaled
+                weights for the chosen experts.
+              - ``selected_experts``: ``(N, top_k)`` int64 expert indices.
+        """
+        hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+        # Linear is rank-replicated; the float() cast matches upstream's
+        # ``logits = logits.float()`` for numeric stability.
+        logits = F.linear(hidden_states, self.gate.weight).float()
+        # Per-expert sigmoid (NOT softmax). Bias is added AFTER sigmoid
+        # in the routing path; the gathered weights below pull from the
+        # un-biased sigmoid scores.
+        sigmoid_scores = torch.sigmoid(logits)
+        scored_for_routing = sigmoid_scores + self.expert_bias
+
+        selected_experts = self._group_limited_topk(scored_for_routing)
+        # Gather the un-biased sigmoid score for the chosen experts.
+        chosen_scores = torch.gather(sigmoid_scores, dim=1, index=selected_experts)
+        if self.top_k > 1:
+            chosen_scores = chosen_scores / (chosen_scores.sum(dim=-1, keepdim=True) + 1e-20)
+        routing_weights = chosen_scores * self.routed_scaling_factor
+        return logits, routing_weights, selected_experts
diff --git a/mstar/model/ming_omni_flash/components/t5_block_mapper.py b/mstar/model/ming_omni_flash/components/t5_block_mapper.py
new file mode 100644
index 00000000..25373e37
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/t5_block_mapper.py
@@ -0,0 +1,129 @@
+"""T5EncoderBlockByT5Mapper — Ming's per-block T5 stack mapping byt5 features
+onto the DiT condition space.
+
+Native mstar port of vllm-omni's ``t5_block_mapper.py``. The upstream version
+builds on vllm-omni's TP-fused ``T5Block`` (fused ``qkv_proj`` / ``wi``) and
+therefore needs a stacked-weight remap at load time. We instead build on
+HuggingFace's stock ``T5Block``, whose submodule layout (``SelfAttention.q/k/v/o``
++ ``DenseReluDense.wi_0/wi_1/wo``) is byte-for-byte what Ming's
+``byt5_mapper.pt`` ships — so the checkpoint loads with a plain
+``load_state_dict`` (no fused mapping). This keeps the port pure-torch + stock
+transformers, consistent with the rest of the mstar modeling tree.
+
+The mapper stacks ``num_layers`` encoder blocks on the byt5 features, RMSNorms,
+then projects ``d_model -> sdxl_channels`` (Ming's ``diffusion_c_input_dim``).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+
+import torch
+from torch import nn
+from transformers.models.t5.modeling_t5 import T5Block, T5LayerNorm
+
+
+class T5EncoderBlockByT5Mapper(nn.Module):
+    """Map byt5 features through HF T5 encoder blocks, then normalise and
+    (optionally) project them to ``sdxl_channels``.
+
+    Args:
+        byte5_config: an HF ``T5Config`` (``text_encoder.config`` from the
+            loaded byt5 backbone). ``d_model`` and ``layer_norm_epsilon`` are
+            read here; the config is also handed to every ``T5Block``.
+        num_layers: how many T5 encoder blocks to stack (0 ⇒ norm + project
+            only). Only block 0 is built with the relative-attention bias;
+            later blocks receive the ``position_bias`` emitted before them.
+        sdxl_channels: output projection width. ``None`` ⇒ no projection
+            (the ``d_model``-wide output of the first norm is returned);
+            otherwise the projected features get a second norm.
+    """
+
+    def __init__(self, byte5_config, num_layers: int, sdxl_channels: int | None = None) -> None:
+        """Build the block stack, the norm, and the optional projection head."""
+        super().__init__()
+        width = byte5_config.d_model
+        eps = byte5_config.layer_norm_epsilon
+        # Only the first block owns a relative-attention bias table.
+        stack = [
+            T5Block(byte5_config, has_relative_attention_bias=(idx == 0))
+            for idx in range(num_layers)
+        ]
+        self.blocks = nn.ModuleList(stack) if num_layers > 0 else None
+        self.layer_norm = T5LayerNorm(width, eps=eps)
+        # The projection head is optional; both attributes stay None without it.
+        self.channel_mapper = None
+        self.final_layer_norm = None
+        if sdxl_channels is not None:
+            self.channel_mapper = nn.Linear(width, sdxl_channels)
+            self.final_layer_norm = T5LayerNorm(sdxl_channels, eps=eps)
+
+    @staticmethod
+    def get_extended_attention_mask(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+        """Convert a {0,1} keep/pad mask into an additive attention bias.
+
+        A 2-D ``[B, S]`` mask becomes ``[B, 1, 1, S]`` and a 3-D ``[B, S, S]``
+        mask becomes ``[B, 1, S, S]``. The result is cast to ``dtype`` and
+        holds ``0`` where the mask is 1 and ``torch.finfo(dtype).min`` where
+        it is 0. Any other rank raises ``ValueError``.
+        """
+        rank = attention_mask.dim()
+        if rank not in (2, 3):
+            raise ValueError(f"Unexpected attention_mask shape {tuple(attention_mask.shape)}")
+        # Insert the heads axis (and, for 2-D input, the query axis).
+        index = (slice(None), None) if rank == 3 else (slice(None), None, None)
+        keep = attention_mask[index].to(dtype=dtype)
+        pad_value = torch.finfo(dtype).min
+        return (1.0 - keep) * pad_value
+
+    def forward(self, inputs_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        """Run the block stack, the norm, and the optional projection head.
+
+        ``attention_mask`` is the {0,1} pad mask accepted by
+        :meth:`get_extended_attention_mask`; it is converted to an additive
+        bias in ``inputs_embeds.dtype`` before being passed to the blocks.
+        """
+        bias_mask = self.get_extended_attention_mask(attention_mask, dtype=inputs_embeds.dtype)
+        hidden_states, position_bias = inputs_embeds, None
+        # Each block call is unpacked as (hidden_states, position_bias); the
+        # position_bias from one block is fed to the next.
+        for block in self.blocks if self.blocks is not None else ():
+            hidden_states, position_bias = block(
+                hidden_states,
+                attention_mask=bias_mask,
+                position_bias=position_bias,
+                use_cache=False,
+            )
+
+        hidden_states = self.layer_norm(hidden_states)
+        if self.channel_mapper is None:
+            return hidden_states
+        return self.final_layer_norm(self.channel_mapper(hidden_states))
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Copy matching ``(name, tensor)`` pairs into this module's parameters.
+
+        The module is built on stock HF ``T5Block`` so that Ming's
+        ``byt5_mapper.pt`` names line up without a stacked-param remap.
+        Checkpoint names that are not parameters of this module are skipped;
+        a shape disagreement raises ``ValueError``. The returned set holds the
+        names that were copied, so callers can assert full coverage.
+        """
+        own_params = dict(self.named_parameters())
+        copied: set[str] = set()
+        for name, tensor in weights:
+            target = own_params.get(name)
+            if target is None:
+                continue
+            if target.shape != tensor.shape:
+                raise ValueError(
+                    f"Shape mismatch loading byt5 mapper weight {name}: "
+                    f"param {tuple(target.shape)} vs checkpoint {tuple(tensor.shape)}"
+                )
+            with torch.no_grad():
+                target.copy_(tensor)
+            copied.add(name)
+        return copied
+
+
+__all__ = ["T5EncoderBlockByT5Mapper"]
diff --git a/mstar/model/ming_omni_flash/components/talker_dit.py b/mstar/model/ming_omni_flash/components/talker_dit.py
new file mode 100644
index 00000000..2c7fa316
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/talker_dit.py
@@ -0,0 +1,852 @@
+"""CFM + DiT building blocks for the Ming-flash-omni-2.0 Talker (step 6b).
+
+Ports the modeling primitives from vllm-omni's
+``ming_flash_omni/talker_module.py`` (lines 1–402: DiT modules + CFM)
+into mstar. Skips the vllm-only CFMGraphExecutor / Pool plumbing —
+mstar has its own batching surface.
+
+Upstream module layout (mirror the names so the loader can map
+``talker/model.safetensors`` keys 1:1):
+
+  flowmodel.x_embedder, .c_embedder, .t_embedder, .blocks.{N}.norm1,
+  .blocks.{N}.attn.to_{q,k,v}, .blocks.{N}.attn.to_out.{0,1}, ..., .final_layer
+
+Two external deps replaced with in-tree minimal ports to keep the
+runtime dep surface small:
+
+  * ``DiTTimestepEmbedding`` — SinusPositionEmbedding + 2-layer MLP.
+    Mirrors vllm-omni's ``timestep_embedding.DiTTimestepEmbedding``.
+  * ``RotaryEmbedding`` — non-xpos 1-D RoPE matching x_transformers'
+    ``RotaryEmbedding.forward_from_seq_len`` exactly so the same
+    apply pattern works. We port both classes (without the xpos
+    branch — the talker config doesn't enable it).
+"""
+
+from __future__ import annotations
+
+import math
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+# ===========================================================================
+# Sinusoidal timestep embedding (port of vllm-omni's DiTTimestepEmbedding)
+# ===========================================================================
+
+
+class _SinusPositionEmbedding(nn.Module):
+    """Sinusoidal embedding for scalar timesteps (DDPM / DiT convention).
+
+    Mirrors vllm-omni's ``SinusPositionEmbedding``: ``scale * x *
+    exp(-log(10000) * k / (half_dim - 1))`` for ``k in [0, half_dim)``, then
+    concat(sin, cos). ``dim`` must be even and >= 4 (``half_dim - 1 > 0``).
+    """
+
+    def __init__(self, dim: int) -> None:
+        super().__init__()
+        if dim % 2 != 0:
+            raise ValueError(f"freq_embed_dim must be even, got {dim}")
+        if dim < 4:
+            raise ValueError(f"freq_embed_dim must be >= 4, got {dim}")
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor, scale: float = 1000.0) -> torch.Tensor:
+        half = self.dim // 2
+        # Log-spaced inverse frequencies, computed in float32.
+        step = math.log(10000.0) / (half - 1)
+        freqs = torch.exp(torch.arange(half, device=x.device).float() * -step)
+        angles = scale * x.unsqueeze(1).float() * freqs.unsqueeze(0)
+        return torch.cat((angles.sin(), angles.cos()), dim=-1).to(x.dtype)
+
+
+class DiTTimestepEmbedding(nn.Module):
+    """Embed timesteps: sinusoidal features → Linear → SiLU → Linear (``dim`` wide)."""
+
+    def __init__(self, dim: int, freq_embed_dim: int = 256) -> None:
+        super().__init__()
+        # Parameter-free sinusoidal stage, ``freq_embed_dim`` wide.
+        self.time_embed = _SinusPositionEmbedding(freq_embed_dim)
+        # Indices 0 and 2 are the Linear layers; SiLU at index 1 has no params.
+        layers = [nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)]
+        self.time_mlp = nn.Sequential(*layers)
+
+    def forward(self, timestep: torch.Tensor) -> torch.Tensor:
+        """Return the MLP output for ``timestep``."""
+        # The sinusoidal features are kept in the timestep's dtype for the MLP.
+        features = self.time_embed(timestep).to(timestep.dtype)
+        return self.time_mlp(features)
+
+
+# ===========================================================================
+# RoPE — non-xpos 1-D variant (port of x_transformers.RotaryEmbedding)
+# ===========================================================================
+#
+# x_transformers uses an INTERLEAVED pair layout: freqs are stacked as
+# ``(f, f)`` per dim and then flattened, and ``rotate_half`` permutes
+# adjacent pairs as ``(x1, x2) -> (-x2, x1)`` rather than the neox-cat
+# split-by-halves convention used by Ling-2.0's thinker.
+# We must mirror this layout exactly because the released ckpt's
+# weights were trained against it.
+
+
+def _rotate_half_interleaved(x: torch.Tensor) -> torch.Tensor:
+    """Rotate adjacent pairs of the last dim: ``(x1, x2) -> (-x2, x1)``."""
+    pairs = x.unflatten(-1, (-1, 2))
+    first, second = pairs[..., 0], pairs[..., 1]
+    return torch.stack((-second, first), dim=-1).flatten(-2)
+
+
+def _apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    """Rotate the leading ``freqs.shape[-1]`` channels of ``t``; pass the rest.
+
+    Args:
+        t: ``(B, H, T, head_dim)`` queries or keys.
+        freqs: ``(1, T, head_dim)`` rotary frequency table (angles); only its
+            last ``t.shape[-2]`` positions are used.
+    """
+    rot_dim = freqs.shape[-1]
+    # Keep the trailing seq_len positions of the table.
+    freqs = freqs[:, -t.shape[-2]:, :]
+    if t.ndim == 4 and freqs.ndim == 3:
+        # (1, T, D) -> (1, 1, T, D) so it broadcasts over the heads axis.
+        freqs = freqs.unsqueeze(1)
+
+    head = t[..., :rot_dim]
+    tail = t[..., rot_dim:]
+    # cos/sin are cast to the dtype of ``t`` before the multiply.
+    cos = freqs.cos().to(head.dtype)
+    sin = freqs.sin().to(head.dtype)
+    head = (head * cos) + (_rotate_half_interleaved(head) * sin)
+    return torch.cat([head, tail], dim=-1)
+
+
+class RotaryEmbedding(nn.Module):
+    """Non-xpos 1-D rotary table in x_transformers' interleaved pair layout.
+
+    ``forward_from_seq_len(T)`` returns ``(freqs, None)``; ``freqs`` has shape
+    ``(1, T, dim)`` and the second slot is the always-``None`` xpos scale
+    (there is no xpos branch in this port).
+    """
+
+    def __init__(self, dim: int, base: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
+        inv_freq = 1.0 / (base ** exponents)
+        # Non-persistent: the buffer is kept out of the module's state_dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward_from_seq_len(
+        self, seq_len: int,
+    ) -> tuple[torch.Tensor, None]:
+        """Build the angle table for positions ``0 .. seq_len - 1``."""
+        inv_freq = self.inv_freq
+        positions = torch.arange(seq_len, device=inv_freq.device).type_as(inv_freq)
+        # Outer product of positions and inverse frequencies: (T, D//2),
+        # then a leading singleton axis -> (1, T, D//2).
+        angles = torch.einsum("i,j->ij", positions, inv_freq).unsqueeze(0)
+        # Duplicate every angle into an adjacent (f, f) pair so it lines up
+        # with ``_rotate_half_interleaved``'s (-x2, x1) layout: (1, T, D).
+        angles = angles.repeat_interleave(2, dim=-1)
+        return angles, None
+
+
+# ===========================================================================
+# DiT building blocks (RMSNorm, FeedForward, Attention, DiTBlock, FinalLayer,
+# CondEmbedder)
+# ===========================================================================
+
+
+class _RMSNorm(nn.Module):
+    """RMS normalisation over the last dim with a learnable per-channel scale."""
+
+    def __init__(self, dim: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        scale = self.weight
+        # Half-precision weights: bring the input to the same dtype first.
+        if scale.dtype in (torch.float16, torch.bfloat16):
+            x = x.to(scale.dtype)
+        return F.rms_norm(x, (x.shape[-1],), weight=scale, eps=self.eps)
+
+
+class _FeedForward(nn.Module):
+    """MLP: Linear → GELU → Dropout → Linear (port of upstream FeedForward).
+
+    State-dict layout under ``ff``: ``0.0`` is the input Linear, ``0.1`` the
+    GELU, ``1`` the Dropout and ``2`` the output Linear; only the two Linear
+    layers carry parameters.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: int | None = None,
+        mult: float = 4,
+        dropout: float = 0.0,
+        approximate: str = "none",
+    ) -> None:
+        super().__init__()
+        hidden = int(dim * mult)
+        if dim_out is None:
+            dim_out = dim
+        # The first Linear and its activation are nested in their own
+        # Sequential so parameter names come out as ``ff.0.0.*``.
+        self.ff = nn.Sequential(
+            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(approximate=approximate)),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, dim_out),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the MLP to the last dim of ``x``."""
+        return self.ff(x)
+
+
+class _Attention(nn.Module):
+    """Multi-head self-attention with optional QK-norm, RoPE and padding mask.
+
+    Submodule names (``to_q``, ``to_k``, ``to_v``, ``to_out.0``; ``to_out.1``
+    is a parameter-free Dropout) follow upstream so that checkpoint keys such
+    as ``blocks.N.attn.to_q.weight`` load by name. ``q_norm`` / ``k_norm``
+    are ``_RMSNorm`` modules when ``qk_norm="rms_norm"`` and plain ``None``
+    when ``qk_norm`` is ``None``; any other value raises ``ValueError``.
+
+    Mask semantics:
+      * ``mask`` is a ``(B, T)`` boolean key-padding mask, True = valid.
+      * With ``attn_mask_enabled=True`` the mask is expanded to
+        ``(B, H, Tq, Tk)`` and handed to SDPA as a boolean ``attn_mask``,
+        so padded keys take no part in the softmax.
+      * Whether or not ``attn_mask_enabled`` is set, output rows at
+        positions where ``mask`` is False are zeroed before returning.
+
+    The released flowmodel + aggregator configs are documented as setting
+    ``attn_mask_enabled=False``, in which case no SDPA mask is built.
+    ``dropout`` configures the ``to_out.1`` Dropout; the SDPA call itself
+    always runs with ``dropout_p=0.0``.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        qk_norm: str | None = None,
+        attn_mask_enabled: bool = True,
+    ) -> None:
+        super().__init__()
+        if qk_norm not in (None, "rms_norm"):
+            raise ValueError(f"Unimplemented qk_norm: {qk_norm!r}")
+        self.dim = dim
+        self.heads = heads
+        self.dim_head = dim_head
+        self.inner_dim = dim_head * heads
+        self.dropout = dropout
+        self.attn_mask_enabled = attn_mask_enabled
+
+        inner = self.inner_dim
+        self.to_q = nn.Linear(dim, inner)
+        self.to_k = nn.Linear(dim, inner)
+        self.to_v = nn.Linear(dim, inner)
+        use_norm = qk_norm == "rms_norm"
+        self.q_norm = _RMSNorm(dim_head) if use_norm else None
+        self.k_norm = _RMSNorm(dim_head) if use_norm else None
+
+        # [Linear, Dropout] in a ModuleList so the Linear's keys are
+        # ``to_out.0.weight`` / ``to_out.0.bias``, as upstream.
+        self.to_out = nn.ModuleList([
+            nn.Linear(inner, dim),
+            nn.Dropout(dropout),
+        ])
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor | None = None,
+        rope: tuple[torch.Tensor, torch.Tensor | None] | None = None,
+    ) -> torch.Tensor:
+        batch = x.shape[0]
+
+        def split_heads(proj: torch.Tensor) -> torch.Tensor:
+            # (B, T, H * Dh) -> (B, H, T, Dh)
+            return proj.view(batch, -1, self.heads, self.dim_head).transpose(1, 2)
+
+        q = split_heads(self.to_q(x))
+        k = split_heads(self.to_k(x))
+        v = split_heads(self.to_v(x))
+        if self.q_norm is not None:
+            q = self.q_norm(q)
+        if self.k_norm is not None:
+            k = self.k_norm(k)
+
+        if rope is not None:
+            # Only the frequency table is used; the xpos-scale slot is ignored.
+            freqs, _ = rope
+            q = _apply_rotary_pos_emb(q, freqs)
+            k = _apply_rotary_pos_emb(k, freqs)
+
+        # Boolean SDPA mask built from the (B, T) key-padding mask; it is
+        # only constructed when the module was configured to use it.
+        attn_mask = None
+        if mask is not None and self.attn_mask_enabled:
+            q_len, k_len = q.shape[-2], k.shape[-2]
+            attn_mask = mask[:, None, None, :].expand(batch, self.heads, q_len, k_len)
+
+        attended = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False,
+        )
+        # (B, H, T, Dh) -> (B, T, H * Dh), then Linear and Dropout.
+        merged = attended.transpose(1, 2).reshape(batch, -1, self.inner_dim)
+        for layer in self.to_out:
+            merged = layer(merged)
+
+        if mask is None:
+            return merged
+        # Zero the rows at padded positions, independent of attn_mask_enabled.
+        return merged.masked_fill(~mask[:, :, None], 0.0)
+
+
+class _DiTBlock(nn.Module):
+    """Transformer block: pre-norm attention and pre-norm FFN, each residual.
+
+    ``forward(x, mask, rope)`` forwards ``mask`` and ``rope`` to the
+    attention module unchanged; the FFN sees neither. The FFN uses the tanh
+    GELU approximation.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        dropout: float = 0.1,
+        qk_norm: str | None = None,
+        attn_mask_enabled: bool = True,
+    ) -> None:
+        super().__init__()
+        # Per-head width; integer division drops any remainder.
+        head_width = hidden_size // num_heads
+        self.norm1 = _RMSNorm(hidden_size)
+        self.attn = _Attention(
+            dim=hidden_size,
+            heads=num_heads,
+            dim_head=head_width,
+            dropout=dropout,
+            qk_norm=qk_norm,
+            attn_mask_enabled=attn_mask_enabled,
+        )
+        self.norm2 = _RMSNorm(hidden_size)
+        self.mlp = _FeedForward(dim=hidden_size, mult=mlp_ratio, dropout=dropout, approximate="tanh")
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor | None,
+        rope: tuple[torch.Tensor, torch.Tensor | None] | None,
+    ) -> torch.Tensor:
+        attn_out = self.attn(self.norm1(x), mask=mask, rope=rope)
+        x = x + attn_out
+        return x + self.mlp(self.norm2(x))
+
+
+class _FinalLayer(nn.Module):
+    """Output head: ``_RMSNorm`` over ``hidden_size``, then a biased Linear to ``out_channels``."""
+
+    def __init__(self, hidden_size: int, out_channels: int) -> None:
+        super().__init__()
+        self.norm_final = _RMSNorm(hidden_size)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(self.norm_final(x))
+
+
+class _CondEmbedder(nn.Module):
+    """Single Linear mapping ``input_feature_size``-wide cond features to ``hidden_size``."""
+
+    def __init__(self, input_feature_size: int, hidden_size: int) -> None:
+        super().__init__()
+        self.cond_embedder = nn.Linear(input_feature_size, hidden_size)
+
+    def forward(self, llm_cond: torch.Tensor) -> torch.Tensor:
+        return self.cond_embedder(llm_cond)
+
+
+# ===========================================================================
+# DiT (assembles N DiTBlocks + embedders + final layer)
+# ===========================================================================
+
+
+class DiT(nn.Module):
+    """Diffusion-transformer for audio-latent generation (port of upstream DiT).
+
+    Forward signature mirrors upstream so the calling code in ``CFM.sample``
+    (and `forward_with_cfg`) works unchanged. ``spk_embedder`` is only built
+    when ``spk_dim`` is given (documented as absent on the released ckpt).
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 64,
+        hidden_size: int = 1024,
+        depth: int = 8,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        llm_cond_dim: int = 896,
+        dropout: float = 0.0,
+        qk_norm: str | None = None,
+        spk_dim: int | None = None,
+        attn_mask_enabled: bool = False,
+    ) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+
+        self.t_embedder = DiTTimestepEmbedding(hidden_size)
+        self.x_embedder = nn.Linear(in_channels, hidden_size)
+        self.c_embedder = _CondEmbedder(llm_cond_dim, hidden_size)
+        self.spk_embedder = (
+            nn.Linear(spk_dim, hidden_size) if spk_dim is not None else None
+        )
+        self.rotary_embed = RotaryEmbedding(hidden_size // num_heads)
+        self.blocks = nn.ModuleList([
+            _DiTBlock(
+                hidden_size=hidden_size, num_heads=num_heads, mlp_ratio=mlp_ratio,
+                dropout=dropout, qk_norm=qk_norm, attn_mask_enabled=attn_mask_enabled,
+            )
+            for _ in range(depth)
+        ])
+        self.final_layer = _FinalLayer(hidden_size, self.out_channels)
+
+    def forward(
+        self,
+        x: torch.Tensor,                # (B, patch_size, in_channels)
+        t: torch.Tensor,                # (B,) or scalar
+        c: torch.Tensor,                # (B, 1, llm_cond_dim)
+        latent_history: torch.Tensor,   # (B, his_patch_size, in_channels)
+        spk_emb: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Returns hidden states of shape ``(B, prefix + T, out_channels)``.
+
+        ``prefix`` is 1 (t+c) plus 1 if ``spk_emb`` is given; the caller is
+        expected to take the last ``T`` rows where ``T`` is the sum of
+        ``latent_history`` and ``x`` lengths. A 0-dim ``t`` is broadcast over
+        the batch. Raises ``AssertionError`` when ``spk_emb`` is passed without
+        a ``spk_embedder`` or omitted although one was built.
+        """
+        x = torch.cat([latent_history, x], dim=1)
+        x = self.x_embedder(x)
+        if t.ndim == 0:
+            t = t.expand(x.shape[0])  # one timestep per batch row
+        t_h = self.t_embedder(t).unsqueeze(1)
+        c_h = self.c_embedder(c)
+        y = t_h + c_h
+        if spk_emb is None:
+            if self.spk_embedder is not None:
+                raise AssertionError(
+                    "DiT was built with spk_embedder but spk_emb was None at forward."
+                )
+            x = torch.cat([y, x], dim=1)
+        else:
+            if self.spk_embedder is None:
+                raise AssertionError("spk_emb provided but spk_embedder=None")  # survives -O
+            x = torch.cat([self.spk_embedder(spk_emb), y, x], dim=1)
+
+        rope = self.rotary_embed.forward_from_seq_len(x.shape[1])
+        for block in self.blocks:
+            # DiT path passes mask=None; only RoPE is supplied to the blocks.
+            x = block(x, None, rope)
+        return self.final_layer(x)
+
+    def forward_with_cfg(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        c: torch.Tensor,
+        latent_history: torch.Tensor,
+        spk_emb: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Classifier-free guidance: double the batch and pass null cond.
+
+        ``t`` may be 0-dim or ``(B,)``; either is tiled to the doubled batch.
+        Returns only the last ``x.shape[1]`` rows (the denoised x).
+        """
+        x_cat = torch.cat([x, x], dim=0)
+        lh_cat = torch.cat([latent_history, latent_history], dim=0)
+        c_cat = torch.cat([c, torch.zeros_like(c)], dim=0)
+        if t.ndim == 0:
+            t = t.repeat(x_cat.shape[0])
+        elif t.shape[0] == x.shape[0]:
+            t = torch.cat([t, t], dim=0)  # same t for the cond / null-cond halves
+        spk_cat = None if spk_emb is None else torch.cat([spk_emb, spk_emb], dim=0)
+        out = self.forward(x_cat, t, c_cat, lh_cat, spk_cat)
+        return out[:, -x.shape[1]:, :]
+
+
+# ===========================================================================
+# CFM (Conditional Flow Matching sampler)
+# ===========================================================================
+
+
+def get_epss_timesteps(
+    n: int, device: torch.device | str, dtype: torch.dtype,
+) -> torch.Tensor:
+    """Build the EPSS integration grid for ``n`` sampler steps.
+
+    For ``n`` in {5, 6, 7, 10, 12, 16} the grid is a fixed, non-uniform
+    set of multiples of 1/32 that is denser near ``t = 0``; any other
+    ``n`` yields a uniform ``linspace(0, 1, n + 1)``.
+
+    Returns:
+        A 1-D tensor of ``n + 1`` timesteps running from 0 to 1, created
+        on ``device`` with the requested ``dtype``.
+    """
+    # Grid points expressed in units of 1/32.
+    tables = {
+        5: [0, 2, 4, 8, 16, 32],
+        6: [0, 2, 4, 6, 8, 16, 32],
+        7: [0, 2, 4, 6, 8, 16, 24, 32],
+        10: [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32],
+        12: [0, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32],
+        16: [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 32],
+    }
+    ticks = tables.get(n)
+    if ticks:
+        return (1 / 32) * torch.tensor(ticks, device=device, dtype=dtype)
+    return torch.linspace(0, 1, n + 1, device=device, dtype=dtype)
+
+
+class CFM(nn.Module):
+    """Flow-matching sampler that integrates a wrapped velocity model.
+
+    The wrapped ``model`` must expose ``forward_with_cfg``. ``sample`` is
+    the only entry point: starting from a noise latent it takes
+    ``steps`` Euler steps along the guided velocity field and adds an
+    optional SDE noise term after each step.
+    """
+
+    def __init__(
+        self,
+        model: nn.Module,
+        steps: int = 10,
+        sway_sampling_coef: float | None = -1.0,
+    ) -> None:
+        super().__init__()
+        self.model = model
+        self.steps = steps
+        # ``None`` disables the sway warp of the time grid in ``sample``.
+        self.sway_sampling_coef = sway_sampling_coef
+
+    @torch.no_grad()
+    def sample(
+        self,
+        llm_cond: torch.Tensor,           # (B, 1, llm_cond_dim)
+        lat_cond: torch.Tensor,           # (B, his_patch_size, latent_dim)
+        y0: torch.Tensor,                 # (B, patch_size, latent_dim) — initial noise
+        t: torch.Tensor,                  # (steps + 1,) — from get_epss_timesteps
+        sde_args: torch.Tensor,           # (3,) — [cfg_strength, sigma, temperature]
+        sde_rnd: torch.Tensor,            # (steps, B, patch_size, latent_dim)
+    ) -> torch.Tensor:
+        """Integrate from ``y0`` and return the final latent.
+
+        Raises:
+            ValueError: if ``t`` does not hold ``steps + 1`` entries or
+                ``sde_rnd`` does not hold ``steps`` noise slices.
+        """
+        if t.shape[0] != self.steps + 1:
+            raise ValueError(
+                f"CFM.sample: expected t of length steps+1 = {self.steps + 1}, got {t.shape[0]}"
+            )
+        if sde_rnd.shape[0] != self.steps:
+            raise ValueError(
+                f"CFM.sample: expected sde_rnd[0] = {self.steps}, got {sde_rnd.shape[0]}"
+            )
+
+        coef = self.sway_sampling_coef
+        if coef is not None:
+            # Warp the time grid; the endpoints 0 and 1 stay (numerically) fixed.
+            t = t + coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+        for k in range(self.steps):
+            t_now = t[k]
+            dt = t[k + 1] - t_now
+            # One doubled-batch call yields the conditional half followed
+            # by the null-condition half.
+            both = self.model.forward_with_cfg(y0, t_now, llm_cond, lat_cond, None)
+            pred, null_pred = both.chunk(2, dim=0)
+            guided = pred + (pred - null_pred) * sde_args[0]
+            y0 = y0 + guided * dt
+            # SDE noise term: sigma * sqrt(temperature) * sqrt(|dt|) * eps
+            y0 = y0 + sde_args[1] * (sde_args[2] ** 0.5) * (dt.abs() ** 0.5) * sde_rnd[k]
+        return y0
+
+
+# ===========================================================================
+# Factory: build a DiT + CFM from TalkerConfig
+# ===========================================================================
+
+
+def build_talker_cfm(
+    talker_config,
+    llm_cond_dim: int | None = None,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+) -> CFM:
+    """Assemble the DiT velocity model and wrap it in a :class:`CFM`.
+
+    DiT hyper-parameters are read from ``talker_config.flowmodel`` and
+    the number of sampler steps from ``talker_config.steps``. When
+    ``llm_cond_dim`` is ``None`` it falls back to
+    ``talker_config.llm.hidden_size`` (the original notes give 896 and
+    ``in_channels=64, hidden_size=1024, depth=8, num_heads=16,
+    mlp_ratio=4`` with no spk_dim for the released checkpoint).
+
+    Returns:
+        The sampler, cast to ``dtype``, moved to ``device`` and put in
+        eval mode.
+    """
+    flow = talker_config.flowmodel
+    cond_dim = talker_config.llm.hidden_size if llm_cond_dim is None else llm_cond_dim
+    velocity_net = DiT(
+        in_channels=flow.in_channels,
+        hidden_size=flow.hidden_size,
+        depth=flow.depth,
+        num_heads=flow.num_heads,
+        mlp_ratio=flow.mlp_ratio,
+        llm_cond_dim=cond_dim,
+        dropout=flow.dropout,
+        qk_norm=flow.qk_norm,
+        attn_mask_enabled=flow.attn_mask_enabled,
+    )
+    sampler = CFM(model=velocity_net, steps=talker_config.steps).to(dtype=dtype, device=device)
+    sampler.eval()
+    return sampler
+
+
+# ===========================================================================
+# Aggregator (DiT-shaped, maps audio latents back to LLM cond space)
+# ===========================================================================
+
+
+class Aggregator(nn.Module):
+    """Pools a patch of audio latents into one LLM-space embedding.
+
+    Described in the original notes as a port of upstream
+    `talker_module.Aggregator`. Audio latents are projected with an
+    `nn.Linear`, a learnable [CLS]-style token (`word_embedder`) is put
+    in front of the sequence, the sequence runs through a stack of
+    `_DiTBlock`s, and only the [CLS] row — projected to `llm_input_dim`
+    by `final_layer` — is returned so it can be fed back to the talker
+    LLM as its next input embedding.
+
+    Constructor defaults: `depth=8, hidden_size=1024, num_heads=16,
+    mlp_ratio=4, in_channels=64, dropout=0.1, attn_mask_enabled=False`.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 64,
+        hidden_size: int = 1024,
+        depth: int = 8,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        llm_input_dim: int = 896,
+        dropout: float = 0.1,
+        qk_norm: str | None = None,
+        attn_mask_enabled: bool = False,
+    ) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+
+        # The [CLS] token is a one-row embedding table looked up at index 0.
+        self.word_embedder = nn.Embedding(1, hidden_size)
+        self.x_embedder = nn.Linear(in_channels, hidden_size)
+
+        head_dim = hidden_size // num_heads
+        self.rotary_embed = RotaryEmbedding(head_dim)
+        block_kwargs = {
+            "hidden_size": hidden_size,
+            "num_heads": num_heads,
+            "mlp_ratio": mlp_ratio,
+            "dropout": dropout,
+            "qk_norm": qk_norm,
+            "attn_mask_enabled": attn_mask_enabled,
+        }
+        self.blocks = nn.ModuleList([_DiTBlock(**block_kwargs) for _ in range(depth)])
+        self.final_layer = _FinalLayer(hidden_size, llm_input_dim)
+
+    def forward(
+        self,
+        x: torch.Tensor,                 # (B, T, in_channels) audio latents
+        mask: torch.Tensor | None = None,   # (B, T) key-padding mask, True = valid
+    ) -> torch.Tensor:
+        """Return the projected [CLS] row, shape ``(B, 1, llm_input_dim)``.
+
+        Steps: embed ``x``, prepend the [CLS] embedding, extend ``mask``
+        by one leading column, run every block, apply ``final_layer``
+        and slice out row 0.
+        """
+        tokens = self.x_embedder(x)
+        cls_index = torch.zeros((x.shape[0], 1), dtype=torch.long, device=tokens.device)
+        tokens = torch.cat([self.word_embedder(cls_index), tokens], dim=1)
+
+        rope = self.rotary_embed.forward_from_seq_len(tokens.shape[1])
+        if mask is not None:
+            # The [CLS] slot reuses the mask value of the first real frame,
+            # so it is only guaranteed valid when that frame is valid.
+            # NOTE(review): earlier comments called this a "True column";
+            # confirm no caller passes a mask whose first column is False.
+            mask = torch.cat([mask[:, :1].clone().detach(), mask], dim=-1)
+
+        for block in self.blocks:
+            tokens = block(tokens, mask, rope)
+        return self.final_layer(tokens)[:, :1, :]
+
+
+def build_aggregator(
+    talker_config,
+    llm_input_dim: int | None = None,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+) -> Aggregator:
+    """Create the :class:`Aggregator` described by ``talker_config``.
+
+    Hyper-parameters come from ``talker_config.aggregator``. When
+    ``llm_input_dim`` is ``None`` the output width falls back to
+    ``talker_config.llm.hidden_size`` (the original notes give 896 and
+    ``in_channels=64, hidden_size=1024, depth=8, num_heads=16,
+    mlp_ratio=4, dropout=0.1`` for the released checkpoint).
+
+    Returns:
+        The module, cast to ``dtype``, moved to ``device`` and put in
+        eval mode.
+    """
+    spec = talker_config.aggregator
+    out_dim = talker_config.llm.hidden_size if llm_input_dim is None else llm_input_dim
+    aggregator = Aggregator(
+        in_channels=spec.in_channels,
+        hidden_size=spec.hidden_size,
+        depth=spec.depth,
+        num_heads=spec.num_heads,
+        mlp_ratio=spec.mlp_ratio,
+        llm_input_dim=out_dim,
+        dropout=spec.dropout,
+        qk_norm=spec.qk_norm,
+        attn_mask_enabled=spec.attn_mask_enabled,
+    ).to(dtype=dtype, device=device)
+    aggregator.eval()
+    return aggregator
+
+
+# ===========================================================================
+# Talker LLM backbone (Qwen2)
+# ===========================================================================
+
+
+def build_talker_llm(
+    talker_llm_config,
+    attn_implementation: str = "sdpa",
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+):
+    """Instantiate the talker backbone as a Hugging Face `Qwen2Model`.
+
+    A `Qwen2Config` is filled field-by-field from ``talker_llm_config``
+    and handed to `Qwen2Model`; no weights are loaded here, so the
+    returned model carries freshly initialised parameters.
+
+    Args:
+        talker_llm_config: object exposing the Qwen2 hyper-parameters
+            listed in ``mirrored_fields`` below as attributes.
+        attn_implementation: forwarded to `Qwen2Config`. The default
+            ``"sdpa"`` follows the original notes, which say the
+            upstream vllm-omni runtime overrides the checkpoint's
+            `flash_attention_2` setting with sdpa.
+        dtype: dtype the model is cast to after construction.
+        device: device the model is moved to after construction.
+
+    Returns:
+        The `Qwen2Model` in eval mode.
+
+    Raises:
+        ImportError: if `transformers` does not provide `Qwen2Config` /
+            `Qwen2Model`.
+    """
+    try:
+        from transformers import Qwen2Config, Qwen2Model
+    except ImportError as e:
+        raise ImportError(
+            "build_talker_llm requires transformers >= 4.43 (Qwen2 support). "
+            f"Original error: {e}"
+        ) from e
+
+    # Attributes copied one-to-one from the talker config onto Qwen2Config.
+    mirrored_fields = (
+        "vocab_size",
+        "hidden_size",
+        "intermediate_size",
+        "num_hidden_layers",
+        "num_attention_heads",
+        "num_key_value_heads",
+        "hidden_act",
+        "max_position_embeddings",
+        "rms_norm_eps",
+        "rope_theta",
+        "use_sliding_window",
+        "sliding_window",
+        "max_window_layers",
+        "tie_word_embeddings",
+        "attention_dropout",
+        "use_cache",
+        "bos_token_id",
+        "eos_token_id",
+    )
+    config_kwargs = {name: getattr(talker_llm_config, name) for name in mirrored_fields}
+    backbone = Qwen2Model(
+        Qwen2Config(**config_kwargs, attn_implementation=attn_implementation)
+    )
+    backbone = backbone.to(dtype=dtype, device=device)
+    backbone.eval()
+    return backbone
+
+
+def build_talker_heads(
+    talker_config,
+    spk_embed_dim: int = 192,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+) -> dict[str, nn.Module]:
+    """Create the two small linear heads that accompany the talker LLM.
+
+      * ``stop_head`` — ``Linear(hidden_size, 2, bias=True)``; per the
+        original notes, a binary end-of-audio classifier used by the
+        generation loop.
+      * ``spk_head`` — ``Linear(spk_embed_dim, hidden_size, bias=True)``;
+        per the original notes, it projects a CAMPPlus speaker
+        embedding into the LLM hidden space as a voice-condition token.
+
+    ``hidden_size`` is ``talker_config.llm.hidden_size``. Both heads are
+    cast to ``dtype``, moved to ``device`` and put in eval mode.
+
+    Returns:
+        ``{"stop_head": ..., "spk_head": ...}`` — a plain dict, so
+        callers do not depend on a particular module tree.
+    """
+    width = talker_config.llm.hidden_size
+    # (name, in_features, out_features); stop_head is created first.
+    layout = (
+        ("stop_head", width, 2),
+        ("spk_head", spk_embed_dim, width),
+    )
+    heads = {
+        name: nn.Linear(fan_in, fan_out, bias=True)
+        for name, fan_in, fan_out in layout
+    }
+    for name, head in heads.items():
+        heads[name] = head.to(dtype=dtype, device=device)
+    for head in heads.values():
+        head.eval()
+    return {"stop_head": heads["stop_head"], "spk_head": heads["spk_head"]}
+
+
+# Names exported by ``from <this module> import *``.
+__all__ = [
+    "DiT",
+    "CFM",
+    "Aggregator",
+    "DiTTimestepEmbedding",
+    "RotaryEmbedding",
+    "get_epss_timesteps",
+    "build_talker_cfm",
+    "build_aggregator",
+    "build_talker_llm",
+    "build_talker_heads",
+]
diff --git a/mstar/model/ming_omni_flash/components/talker_generator.py b/mstar/model/ming_omni_flash/components/talker_generator.py
new file mode 100644
index 00000000..02a1eeb9
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/talker_generator.py
@@ -0,0 +1,543 @@
+"""TalkerGenerator: orchestrates Qwen2 + CFM + Aggregator + AudioVAE (step 6e-1).
+
+Port of vllm-omni's ``MingAudioGenerator`` (``talker_module.py:854-1146``)
+plus the streaming-decode utilities (`silence_holder`,
+`trim_trailing_silence`). Stateless across requests — one ``__init__``
+binds the model components, then each call to `generate_latents` runs a
+fresh per-request AR loop.
+
+Skipped from upstream:
+  * `CFMGraphExecutorPool` / `CFMGraphExecutor` — vllm-specific CUDA-graph
+    batching infrastructure. We always run `cfm_sample_step` through the
+    manual path; mstar's engine layer handles graph capture separately.
+  * `build_tts_input` / `_looks_like_music_prompt` — prompt-construction
+    helpers that go alongside the eventual `process_prompt` audio-out path.
+    They belong to step 8 (TTS caption template).
+
+The generator's outputs feed directly into the mstar graph wiring in
+step 6e-2:
+  * `generate_latents()` is what `TalkerSubmodule.forward` will call per
+    request, returning the list of CFM-generated latent patches.
+  * `decode_to_waveform()` is what the audio-output submodule will call
+    to produce the final waveform tensor for `EMIT_TO_CLIENT`.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+from torch import nn
+
+from mstar.model.ming_omni_flash.components.talker_dit import (
+    CFM,
+    Aggregator,
+    get_epss_timesteps,
+)
+
+if TYPE_CHECKING:
+    from transformers.models.qwen2.modeling_qwen2 import Qwen2Model
+
+    from mstar.model.ming_omni_flash.components.audio_vae import AudioVAE
+    from mstar.model.ming_omni_flash.config import TalkerConfig
+
+logger = logging.getLogger(__name__)
+
+
+# ===========================================================================
+# Streaming silence / trim utilities
+# ===========================================================================
+
+
+def trim_trailing_silence(
+    waveform: torch.Tensor,
+    sample_rate: int,
+    sil_th: float = 1e-3,
+    tail_silence_s: float = 0.3,
+) -> torch.Tensor:
+    """Cut trailing low-energy audio, leaving a short silent tail.
+
+    The signal is split into non-overlapping 100 ms frames; a frame is
+    silent when its mean absolute amplitude (averaged over the leading
+    dimensions) is ``<= sil_th``. Everything after the last non-silent
+    frame is dropped except ``tail_silence_s`` seconds. A fully silent
+    or shorter-than-one-frame input is cut to at most
+    ``tail_silence_s`` seconds.
+
+    Accepts ``(C, T)`` or ``(B, C, T)``. For 3-D input only channel 0
+    is analysed and the result is ``(B, 1, T')``. Empty tensors and any
+    other rank are returned unchanged rather than raising.
+    """
+    rank = waveform.dim()
+    if waveform.numel() == 0 or rank not in (2, 3):
+        return waveform
+
+    audio = waveform[:, 0, :] if rank == 3 else waveform
+    hop = int(sample_rate * 0.1)  # frame length == frame step
+    tail = int(tail_silence_s * sample_rate)
+
+    total = audio.shape[-1]
+    if total < hop:
+        kept = audio[..., : min(total, tail)]
+    else:
+        # Drop the incomplete trailing frame before scoring.
+        n_frames = (total - hop) // hop + 1
+        audio = audio[..., : (n_frames - 1) * hop + hop]
+        energy = audio.unfold(-1, hop, hop).abs().mean(dim=-1)
+        energy = energy.mean(dim=list(range(energy.dim() - 1)))
+        # Index of the last frame above the threshold, or -1 if none.
+        last_loud = next(
+            (i for i in reversed(range(int(energy.shape[0]))) if not energy[i] <= sil_th),
+            -1,
+        )
+        if last_loud < 0:
+            limit = min(audio.shape[-1], tail)
+        else:
+            limit = min(last_loud * hop + hop + tail, audio.shape[-1])
+        kept = audio[..., :limit]
+
+    return kept.unsqueeze(1) if rank == 3 else kept
+
+
+def silence_holder(
+    speech: torch.Tensor,
+    sample_rate: int,
+    sil_cache: dict | None = None,
+    last_chunk: bool = True,
+    sil_th: float = 1e-3,
+    last_sil: float = 0.3,
+) -> tuple[torch.Tensor, dict]:
+    """Withhold silent audio between chunks of a streaming decode.
+
+    State lives in ``sil_cache`` (created when ``None``, otherwise
+    mutated in place and returned):
+
+      * ``"buffer"`` — samples left over after splitting into whole
+        100 ms frames, glued onto the front of the next chunk.
+      * ``"holder"`` — silent audio not yet emitted; it is released in
+        front of the next non-silent audio.
+
+    A chunk with no frame above ``sil_th`` emits an empty tensor, or,
+    when ``last_chunk`` is set, everything held cut to ``last_sil``
+    seconds. Otherwise the held audio plus the chunk up to its last
+    non-silent frame is emitted (plus ``last_sil`` seconds on the last
+    chunk) and the silent remainder is held.
+
+    An empty ``speech`` is returned immediately without touching state.
+    """
+    if speech.numel() == 0:
+        return speech, sil_cache or {"holder": [], "buffer": []}
+
+    hop = int(sample_rate * 0.1)  # frame length == frame step
+    state = {"holder": [], "buffer": []} if sil_cache is None else sil_cache
+
+    def hold_back(chunk: torch.Tensor) -> tuple[torch.Tensor, dict]:
+        # Nothing audible: emit an empty tensor, or flush at stream end.
+        if last_chunk:
+            pending = torch.cat(state["holder"] + state["buffer"], dim=-1)
+            return pending[..., : int(last_sil * sample_rate)], state
+        nothing = torch.zeros((*chunk.shape[:-1], 0), device=chunk.device, dtype=chunk.dtype)
+        return nothing, state
+
+    if state["buffer"]:
+        speech = torch.cat([*state["buffer"], speech], dim=-1)
+        state["buffer"] = []
+
+    if speech.shape[-1] < hop:
+        state["buffer"].append(speech)
+        return hold_back(speech)
+
+    # Keep only whole frames; the remainder waits for the next chunk.
+    n_frames = (speech.shape[-1] - hop) // hop + 1
+    framed_len = (n_frames - 1) * hop + hop
+    if speech.shape[-1] > framed_len:
+        state["buffer"].append(speech[..., framed_len:])
+        speech = speech[..., :framed_len]
+
+    energy = speech.unfold(-1, hop, hop).abs().mean(dim=-1)
+    energy = energy.mean(dim=list(range(energy.dim() - 1)))
+    last_loud = next(
+        (i for i in reversed(range(int(energy.shape[0]))) if not energy[i] <= sil_th),
+        -1,
+    )
+    if last_loud < 0:
+        state["holder"].append(speech)
+        return hold_back(speech)
+
+    cut = last_loud * hop + hop
+    if last_chunk:
+        cut += int(last_sil * sample_rate)
+    cut = min(cut, speech.shape[-1])
+    emitted = torch.cat([*state["holder"], speech[..., :cut]], dim=-1)
+    state["holder"] = [speech[..., cut:]] if cut < speech.shape[-1] else []
+    return emitted, state
+
+
+# ===========================================================================
+# TalkerGenerator
+# ===========================================================================
+
+
+class TalkerGenerator:
+    """Drives prefill → AR decode → VAE decode for a single TTS request.
+
+    Stateless across requests: bind the model components once at
+    construction, then each `generate_latents` / `decode_to_waveform`
+    call runs a fresh per-request flow. The eventual `TalkerSubmodule`
+    (step 6e-2) instantiates one per worker and calls into it once per
+    request.
+
+    Field naming mirrors upstream `MingAudioGenerator.__init__` so the
+    eventual graph-walk wiring + tests can reference the same surface.
+    """
+
+    def __init__(
+        self,
+        talker_config: "TalkerConfig",
+        llm: "Qwen2Model",
+        cfm: CFM,
+        aggregator: Aggregator,
+        stop_head: nn.Module,
+        audio_vae: "AudioVAE | None" = None,
+        cfg_strength: float | None = None,
+    ) -> None:
+        """Bind the model components and cache the sizes the loop needs.
+
+        Args:
+            talker_config: supplies ``patch_size``, ``history_patch_size``,
+                ``vae.latent_dim`` and the default ``cfg_strength``.
+            llm: backbone called by ``llm_step``.
+            cfm: flow-matching sampler called by ``cfm_sample_step``.
+            aggregator: maps a generated latent patch to the next LLM
+                input embedding.
+            stop_head: classifier applied to the last hidden state.
+            audio_vae: optional decoder; ``None`` disables waveform
+                decoding and the duration cap.
+            cfg_strength: overrides ``talker_config.cfg_strength`` when
+                not ``None``.
+        """
+        self.config = talker_config
+        self.llm, self.cfm, self.aggregator = llm, cfm, aggregator
+        self.stop_head = stop_head
+        self.audio_vae = audio_vae
+
+        self.patch_size = talker_config.patch_size
+        self.his_patch_size = talker_config.history_patch_size
+        self.latent_dim = talker_config.vae.latent_dim
+
+        if cfg_strength is None:
+            cfg_strength = talker_config.cfg_strength
+        self.cfg_strength = cfg_strength
+
+        # Number of trailing latent frames prepended to each VAE-decode chunk.
+        # NOTE(review): the earlier comment tied this to the Qwen2 backbone
+        # and FA2 context; the consumer is not in this method — confirm.
+        self._vae_decode_pad_frames = 32
+
+    # ------------------------------------------------------------------
+    # Step entry points (mirror upstream MingAudioGenerator)
+    # ------------------------------------------------------------------
+
+    def llm_step(
+        self,
+        inputs_embeds: torch.Tensor,
+        *,
+        step: int,
+        past_key_values=None,
+        use_static_cache: bool,
+    ) -> torch.Tensor:
+        """Run the LLM once and return its final hidden-state row.
+
+        ``cache_position`` is only passed on steps after the first when
+        a static cache is in use; it then covers the slots directly
+        after ``past_key_values.get_seq_length()``. In every other case
+        the LLM is called without it.
+
+        Returns:
+            ``last_hidden_state[:, -1:, :]`` of the LLM output.
+        """
+        call_kwargs = {
+            "past_key_values": past_key_values,
+            "inputs_embeds": inputs_embeds,
+            "use_cache": True,
+        }
+        if use_static_cache and step != 0:
+            # Tell the cache where the new K/V entries belong.
+            offset = int(past_key_values.get_seq_length())
+            call_kwargs["cache_position"] = torch.arange(
+                offset,
+                offset + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+        return self.llm(**call_kwargs).last_hidden_state[:, -1:, :]
+
+    def cfm_sample_step(
+        self,
+        last_hidden_state: torch.Tensor,
+        his_lat: torch.Tensor,
+        *,
+        cfg: float | None = None,
+        sigma: float = 0.25,
+        temperature: float = 0.0,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Sample one latent patch and derive the next LLM input.
+
+        Fresh Gaussian noise (initial latent first, then the per-step
+        SDE noise) and the EPSS time grid are created on the device and
+        dtype of ``last_hidden_state`` before calling ``self.cfm.sample``.
+        ``cfg`` falls back to ``self.cfg_strength`` when ``None``.
+
+        Returns:
+            ``(gen_lat, next_inputs_embeds, stop_out)``:
+              * `gen_lat`: ``(B, patch_size, latent_dim)`` — new patch.
+              * `next_inputs_embeds`: the Aggregator output for
+                `gen_lat`, fed to the LLM on the next step.
+              * `stop_out`: softmax over the stop head's output for the
+                last row of ``last_hidden_state``.
+        """
+        guidance = self.cfg_strength if cfg is None else cfg
+        n_steps = self.config.steps
+        like = {"device": last_hidden_state.device, "dtype": last_hidden_state.dtype}
+
+        batch, _, z_dim = his_lat.shape
+        noise_shape = (batch, self.patch_size, z_dim)
+        init_noise = torch.randn(noise_shape, **like)
+        t = get_epss_timesteps(n_steps, **like)
+        sde_rnd = torch.randn((n_steps, *init_noise.shape), **like)
+        sde_args = torch.tensor([guidance, sigma, temperature], **like)
+
+        gen_lat = self.cfm.sample(last_hidden_state, his_lat, init_noise, t, sde_args, sde_rnd)
+        next_embeds = self.aggregator(gen_lat)
+        stop_logits = self.stop_head(last_hidden_state[:, -1, :])
+        return gen_lat, next_embeds, stop_logits.softmax(dim=-1)
+
+    # ------------------------------------------------------------------
+    # AR generation loop
+    # ------------------------------------------------------------------
+
+    @torch.no_grad()
+    def generate_latents(
+        self,
+        inputs_embeds: torch.Tensor,
+        *,
+        prompt_wav_lat: torch.Tensor | None = None,
+        min_new_token: int = 10,
+        max_steps: int = 1000,
+        cfg: float | None = None,
+        sigma: float = 0.25,
+        temperature: float = 0.0,
+        use_static_cache: bool = True,
+    ) -> list[torch.Tensor]:
+        """Autoregressively generate latent patches for one request.
+
+        Each iteration runs ``llm_step`` (the prompt on step 0, the
+        previous Aggregator output afterwards), samples one patch with
+        ``cfm_sample_step``, slides the history window and checks the
+        stop classifier of batch element 0. Generation ends once the
+        stop probability exceeds 0.5 on a step index greater than
+        ``min_new_token``, or when the step budget runs out. The budget
+        is ``max_steps`` capped by the cache length left after prefill.
+
+        Returns:
+            The generated patches in emission order, each
+            ``(B, patch_size, latent_dim)``. Concatenate along dim=1 for
+            a one-shot decode, or hand them on one at a time when
+            streaming.
+        """
+        guidance = self.cfg_strength if cfg is None else cfg
+        ref_param = next(self.llm.parameters())
+        device, dtype = ref_param.device, ref_param.dtype
+
+        history = self._init_his_lat(prompt_wav_lat, device, dtype)
+        kv_cache, cache_capacity = self._init_kv_cache(use_static_cache, device, dtype)
+
+        budget = max_steps
+        if cache_capacity:
+            budget = min(max_steps, cache_capacity - inputs_embeds.shape[1])
+
+        patches: list[torch.Tensor] = []
+        for step in range(budget):
+            hidden = self.llm_step(
+                inputs_embeds,
+                step=step,
+                past_key_values=kv_cache,
+                use_static_cache=use_static_cache,
+            )
+            patch, inputs_embeds, stop_out = self.cfm_sample_step(
+                hidden, history, cfg=guidance, sigma=sigma, temperature=temperature,
+            )
+            history = self._update_his_lat(history, patch)
+            patches.append(patch)
+
+            stop_prob = float(stop_out[0, 1].detach().cpu().item())
+            if stop_prob > 0.5 and step > min_new_token:
+                logger.debug("TalkerGenerator: stop at step=%d (prob=%.4f)", step, stop_prob)
+                break
+
+        return patches
+
+    # ------------------------------------------------------------------
+    # KV cache + history-latent bookkeeping
+    # ------------------------------------------------------------------
+
+    def _init_his_lat(
+        self,
+        prompt_wav_lat: torch.Tensor | None,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> torch.Tensor:
+        """Create the starting history window, ``(1, his_patch_size, latent_dim)``.
+
+        Without a prompt the window is all zeros. With a prompt the
+        latents are right-aligned in the window: a shorter prompt is
+        left-padded with zeros, a longer one keeps only its last
+        ``his_patch_size`` frames.
+        """
+        window = self.his_patch_size
+        his_lat = torch.zeros(1, window, self.latent_dim, device=device, dtype=dtype)
+        if prompt_wav_lat is None:
+            return his_lat
+
+        pad = window - prompt_wav_lat.size(1)
+        if pad >= 0:
+            his_lat[:, pad:, :] = prompt_wav_lat
+        else:
+            # Prompt longer than the window: keep its tail.
+            his_lat[:] = prompt_wav_lat[:, -pad:, :]
+        return his_lat
+
+    def _init_kv_cache(
+        self,
+        use_static_cache: bool,
+        device: torch.device,
+        dtype: torch.dtype,
+    ) -> tuple[object | None, int]:
+        """Create the per-request KV cache for the talker LLM.
+
+        Returns ``(cache, max_cache_len)`` with ``max_cache_len`` fixed
+        at 2048:
+
+          * ``use_static_cache=True`` — a pre-allocated `StaticCache`
+            for batch size 1, sized from ``self.config.llm``; this is
+            the path on which `llm_step` passes `cache_position`.
+          * ``use_static_cache=False`` — an empty `DynamicCache`.
+
+        A cache object is returned in both cases because `llm_step`
+        hands back only the hidden state: the key/values survive from
+        one decode step to the next only if the model writes them into
+        a cache the caller keeps. Returning ``None`` here would leave
+        every step after prefill without its history.
+        """
+        max_cache_len = 2048
+        if not use_static_cache:
+            from transformers import DynamicCache
+            # Grows on demand and is updated in place by the model.
+            return DynamicCache(), max_cache_len
+        from transformers import Qwen2Config, StaticCache
+        # Build a Qwen2Config from our TalkerLLMConfig dataclass so
+        # StaticCache can read the layer / head dims it needs.
+        llm_cfg = Qwen2Config(
+            hidden_size=self.config.llm.hidden_size,
+            num_hidden_layers=self.config.llm.num_hidden_layers,
+            num_attention_heads=self.config.llm.num_attention_heads,
+            num_key_value_heads=self.config.llm.num_key_value_heads,
+            vocab_size=self.config.llm.vocab_size,
+            max_position_embeddings=self.config.llm.max_position_embeddings,
+        )
+        cache = StaticCache(
+            config=llm_cfg,
+            max_batch_size=1,
+            max_cache_len=max_cache_len,
+            device=device,
+            dtype=dtype,
+        )
+        return cache, max_cache_len
+
+    def _update_his_lat(
+        self, his_lat: torch.Tensor, gen_lat: torch.Tensor,
+    ) -> torch.Tensor:
+        """Advance the history window by one generated patch.
+
+        The oldest ``patch_size`` frames are dropped and ``gen_lat`` is
+        appended; when the window is exactly one patch long, ``gen_lat``
+        itself is returned.
+
+        Raises:
+            NotImplementedError: if the window is shorter than a patch.
+        """
+        keep = self.his_patch_size - self.patch_size
+        if keep < 0:
+            raise NotImplementedError(
+                f"his_patch_size ({self.his_patch_size}) < patch_size ({self.patch_size})",
+            )
+        if keep == 0:
+            return gen_lat
+        return torch.cat([his_lat[:, -keep:], gen_lat], dim=1)
+
+    # ------------------------------------------------------------------
+    # Duration cap heuristic (port of upstream `duration_capped_steps`)
+    # ------------------------------------------------------------------
+
+    def duration_capped_steps(
+        self, text_len: int, requested_max_steps: int,
+    ) -> int:
+        """Limit ``requested_max_steps`` to a text-length-based audio budget.
+
+        One generation step is taken to produce
+        ``patch_size * vae_patch_size * hop_length / sample_rate`` seconds
+        of audio, and a turn may last at most
+        ``max(2.0, text_len * 5818 / 16000)`` seconds. The request is
+        returned unchanged when no audio VAE is attached or when the
+        per-step duration is not positive; the duration cap is at least 1.
+        """
+        vae = self.audio_vae
+        if vae is None:
+            return requested_max_steps
+        rate = float(vae.config.sample_rate)
+        samples = self.patch_size * float(vae.config.patch_size) * float(vae.decoder.hop_length)
+        step_seconds = samples / rate
+        if step_seconds <= 0:
+            return requested_max_steps
+        budget_seconds = max(2.0, float(text_len) * (5818.0 / 16000.0))
+        step_cap = max(1, int(budget_seconds / step_seconds))
+        return min(requested_max_steps, step_cap)
+
+    # ------------------------------------------------------------------
+    # Audio decode (one-shot + streaming)
+    # ------------------------------------------------------------------
+
+    def decode_to_waveform(
+        self, latents: list[torch.Tensor], stream_decode: bool = True,
+    ) -> torch.Tensor:
+        """Turn generated latents into a waveform with the audio VAE.
+
+        With ``stream_decode=True`` the latents go through
+        `_stream_decode` one at a time; otherwise they are concatenated
+        on dim 1 and decoded in one call. No latents -> ``(1, 1, 0)`` zeros.
+        """
+        if self.audio_vae is None:
+            raise RuntimeError("TalkerGenerator: audio_vae is None — cannot decode.")
+        if not latents:
+            # Nothing to decode: empty waveform on the LLM's device/dtype.
+            ref = next(self.llm.parameters())
+            return torch.zeros((1, 1, 0), device=ref.device, dtype=ref.dtype)
+
+        if not stream_decode:
+            merged = torch.cat(latents, dim=1)
+            waveform, _state, _extra = self.audio_vae.decode(
+                merged, use_cache=False, stream_state=(None, None, None), last_chunk=True,
+            )
+            return waveform
+        return self._stream_decode(latents)
+
+    def _stream_decode(self, latents: list[torch.Tensor]) -> torch.Tensor:
+        """Decode latents chunk by chunk, re-feeding trailing frames as context.
+
+        Each chunk is decoded together with up to `_vae_decode_pad_frames`
+        frames kept from the previous input; the audio belonging to that
+        prefix is dropped again before the chunk goes through
+        `silence_holder`. Returns the kept chunks concatenated on the last
+        dim, or ``(1, 1, 0)`` zeros when nothing was kept.
+        """
+        sr = int(self.audio_vae.config.sample_rate)
+        # A pad of 0 must disable the context: ``x[:, -0:]`` would select the
+        # whole tensor and make every chunk re-decode all earlier frames.
+        pad_len = self._vae_decode_pad_frames
+        decode_pad: torch.Tensor | None = None
+        sil_cache: dict | None = None
+        wav_chunks: list[torch.Tensor] = []
+
+        for i, lat in enumerate(latents):
+            last_chunk = (i == len(latents) - 1)
+            pad_frames = 0 if decode_pad is None else decode_pad.shape[1]
+            vae_input = lat if decode_pad is None else torch.cat([decode_pad, lat], dim=1)
+            speech, _, _ = self.audio_vae.decode(
+                vae_input, use_cache=False, stream_state=(None, None, None), last_chunk=True,
+            )
+            dcs = speech.shape[-1] // vae_input.shape[1]
+            speech_chunk = speech[:, :, pad_frames * dcs:][0].detach().float()
+            speech_chunk, sil_cache = silence_holder(
+                speech_chunk, sr, sil_cache=sil_cache, last_chunk=last_chunk,
+            )
+            if speech_chunk.numel() > 0:
+                wav_chunks.append(speech_chunk)
+            decode_pad = vae_input[:, -pad_len:, :].detach() if pad_len > 0 else None
+
+        if not wav_chunks:
+            ref = next(self.llm.parameters())
+            return torch.zeros((1, 1, 0), device=ref.device, dtype=ref.dtype)
+        return torch.cat(wav_chunks, dim=-1).unsqueeze(0)
+
+    def trim_trailing_silence(self, waveform: torch.Tensor) -> torch.Tensor:
+        """Apply module-level `trim_trailing_silence`; no-op without a VAE."""
+        vae = self.audio_vae
+        rate = None if vae is None else int(vae.config.sample_rate)
+        return waveform if rate is None else trim_trailing_silence(waveform, rate)
+
+
+__all__ = [
+    "TalkerGenerator",
+    "silence_holder",
+    "trim_trailing_silence",
+]
diff --git a/mstar/model/ming_omni_flash/components/vision_encoder.py b/mstar/model/ming_omni_flash/components/vision_encoder.py
new file mode 100644
index 00000000..08b69291
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/vision_encoder.py
@@ -0,0 +1,149 @@
+"""Vision encoder factory for Ming-flash-omni-2.0.
+
+The Ming-flash-omni-2.0 vision encoder is ``Qwen3MoeVisionTransformer``
+from the Ming source repo's ``qwen3_moe_vit.py`` (574 LOC). Rather than
+fork the file, we resolve it dynamically from the staged Ming source dir
+that ``MingFlashOmniModel.__init__`` already symlinks alongside the
+snapshot (see ``_prepare_tokenizer_dir``).
+
+The vllm-omni port (``vision_encoder.py:MingVisionEncoder``) wraps
+vLLM's ``Qwen3Omni_VisionTransformer`` because vLLM ships a TP/quant-
+aware re-implementation. mstar doesn't have vLLM as a dep, and the
+upstream encoder runs at full quality on a single GPU (~1 GB at bf16),
+so we use the reference implementation as-is. The encoder is built once
+per process and lives on the rank that owns the ``vision_encoder`` graph
+node (typically rank 0; see ``configs/ming_flash_omni.yaml``).
+
+The returned encoder's ``.forward(hidden_states, grid_thw)`` matches the
+upstream signature: it returns a single ``(N_tokens, out_hidden_size)``
+tensor when ``use_deepstack=False`` (the default for the released ckpt,
+since the LLM-side DeepStack splicing isn't enabled in step 4), or a
+``(hidden_states, deepstack_feature_lists)`` tuple when
+``use_deepstack=True``.
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+import sys
+from pathlib import Path
+
+import torch
+from torch import nn
+
+from mstar.model.ming_omni_flash.config import VisionEncoderConfig
+
+logger = logging.getLogger(__name__)
+
+
+def _import_ming_vit(local_dir: str | None = None) -> type[nn.Module]:
+    """Import ``qwen3_moe_vit`` and return its ``Qwen3MoeVisionTransformer``.
+
+    The module is looked up by bare name on ``sys.path``, so the staged
+    Ming source directory has to be importable. When ``local_dir`` is
+    given it is put at the front of ``sys.path`` (if absent); if
+    ``local_dir/qwen3_moe_vit.py`` is a symlink, the directory holding the
+    link target is put in front as well, so sibling modules of the real
+    source file resolve too.
+
+    Args:
+        local_dir: Optional snapshot dir to make importable first.
+
+    Raises:
+        ImportError: ``qwen3_moe_vit`` could not be imported.
+    """
+    search_dirs: list[str] = []
+    if local_dir is not None:
+        search_dirs.append(str(local_dir))
+        staged_vit = Path(local_dir) / "qwen3_moe_vit.py"
+        if staged_vit.is_symlink():
+            # The staged file is a symlink; also expose the directory of the
+            # file it points to.
+            search_dirs.append(str(staged_vit.resolve().parent))
+    for entry in search_dirs:
+        if entry not in sys.path:
+            sys.path.insert(0, entry)
+
+    try:
+        vit_module = importlib.import_module("qwen3_moe_vit")
+    except ImportError as e:
+        message = (
+            "Could not import qwen3_moe_vit. Ensure MingFlashOmniModel "
+            "was constructed (which stages the Ming source files), or "
+            "pass local_dir=<snapshot path> explicitly. See "
+            "PORTING_NOTES.md 'Ming source dependency' for setup."
+        )
+        raise ImportError(message) from e
+
+    return vit_module.Qwen3MoeVisionTransformer
+
+
+def build_vision_encoder(
+    config: VisionEncoderConfig,
+    use_deepstack: bool = False,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str | torch.device = "cpu",
+    attn_implementation: str = "flash_attention_2",
+    local_dir: str | None = None,
+) -> nn.Module:
+    """Instantiate Ming's ``Qwen3MoeVisionTransformer`` from an mstar config.
+
+    The fields of ``config`` are copied by name into the
+    ``Qwen3VLMoeVisionConfig`` found in the ``qwen3_moe_vit`` module; the
+    transformer is built from it, moved with ``.to(dtype=..., device=...)``
+    and switched to eval mode. This function loads no weights.
+
+    Args:
+        config:              Source of the architecture hyperparameters
+                             (depth, hidden sizes, patch sizes, ...).
+        use_deepstack:       Forwarded unchanged to the transformer
+                             constructor.
+        dtype:               Target dtype for the encoder.
+        device:              Target device for the encoder.
+        attn_implementation: Stored as ``_attn_implementation`` on the
+                             internal config before the encoder is built.
+                             The original notes call ``flash_attention_2``
+                             mandatory for video performance.
+        local_dir:           Forwarded to ``_import_ming_vit`` so the Ming
+                             source can be located if not yet importable.
+
+    Returns:
+        The constructed ``nn.Module`` in eval mode. Loading weights is the
+        caller's job (the original notes place them under the top-level
+        ``vision.*`` prefix of the released checkpoint).
+    """
+    vit_cls = _import_ming_vit(local_dir=local_dir)
+
+    # The matching config class is defined in the same module, which the
+    # import above has registered in ``sys.modules``.
+    vit_config_cls = sys.modules["qwen3_moe_vit"].Qwen3VLMoeVisionConfig
+    # These hyperparameters carry the same name on both config classes.
+    shared_fields = (
+        "depth",
+        "hidden_size",
+        "hidden_act",
+        "intermediate_size",
+        "num_heads",
+        "in_channels",
+        "patch_size",
+        "spatial_merge_size",
+        "temporal_patch_size",
+        "out_hidden_size",
+        "num_position_embeddings",
+    )
+    vit_kwargs = {name: getattr(config, name) for name in shared_fields}
+    # Copied into a fresh list so the internal config gets its own sequence.
+    vit_kwargs["deepstack_visual_indexes"] = list(config.deepstack_visual_indexes)
+    vit_config = vit_config_cls(**vit_kwargs)
+    # Also set on the config so a debug run can ask for another attention
+    # path (e.g. "sdpa" or "eager").
+    vit_config._attn_implementation = attn_implementation
+
+    encoder = vit_cls(vit_config, use_deepstack=use_deepstack)
+    encoder = encoder.to(dtype=dtype, device=device)
+    encoder.eval()
+    return encoder
+
+
+__all__ = ["build_vision_encoder"]
diff --git a/mstar/model/ming_omni_flash/components/zimage_transformer.py b/mstar/model/ming_omni_flash/components/zimage_transformer.py
new file mode 100644
index 00000000..1ada9a22
--- /dev/null
+++ b/mstar/model/ming_omni_flash/components/zimage_transformer.py
@@ -0,0 +1,654 @@
+"""ZImage DiT transformer for Ming-flash-omni-2.0 image generation (step 9b).
+
+Native mstar port of vllm-omni's ``z_image/z_image_transformer.py`` +
+Ming's ``ming_zimage_transformer.py`` subclass. The upstream module is built
+on vllm's tensor-parallel linears (``QKVParallelLinear`` / ``MergedColumn`` /
+``RowParallel``), a custom fused ``Attention``, vllm's ``RotaryEmbedding``,
+and ``CachedTransformer`` — none of which belong in the pure-torch mstar
+modeling tree. This reimplementation:
+
+  * uses plain ``nn.Linear`` with the **unfused** parameter names the released
+    checkpoint actually ships (``attention.to_q/to_k/to_v``,
+    ``feed_forward.w1/w3``), so the state dict loads with a direct ``copy_`` —
+    no stacked-param remap (same approach as the byt5 mapper port);
+  * reimplements the interleaved (GPT-J / ``is_neox_style=False``) RoPE that
+    vllm's ``RotaryEmbedding(is_neox_style=False)`` applies, the GLIDE/DiT
+    ``timestep_embedding``, and FP32 ``RMSNorm`` exactly;
+  * runs attention through ``F.scaled_dot_product_attention``.
+
+Architecture (released ckpt): dim=3840, 30 main layers + 2 noise-refiner + 2
+context-refiner blocks, 30 heads (head_dim=128), 16-channel latents, 3D axial
+RoPE with axes_dims=(32,48,48) summing to the 128-wide head. Caption features
+(byt5 + connector, 2560-dim) are embedded, refined, then concatenated with the
+patch-embedded image tokens into one unified sequence for the main blocks.
+
+NOTE — attention masking divergence: vllm-omni *computes* the pad mask but
+leaves it unapplied in attention ("we don't support multi prompts now"). This
+port applies it (additive ``-inf`` on padded keys) so padded cap/image tokens
+cannot leak into real positions. For the dominant batch-size-1 text-to-image
+path with sequences already a multiple of ``SEQ_MULTI_OF`` the two are
+numerically identical; they only diverge when caption padding is non-zero,
+where applying the mask is the correct behavior.
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence
+
+ADALN_EMBED_DIM = 256
+SEQ_MULTI_OF = 32
+
+
+# ============================================================
+# Primitives (native equivalents of the vllm-omni helpers)
+# ============================================================
+
+
+def timestep_embedding(t: torch.Tensor, dim: int, max_period: float = 10000.0) -> torch.Tensor:
+    """Sinusoidal embedding of ``t``: ``dim // 2`` cosines, then as many sines.
+
+    Frequencies are ``exp(-log(max_period) * k / (dim // 2))`` for
+    ``k = 0 .. dim // 2 - 1``; an odd ``dim`` gets one trailing zero column.
+    """
+    n_freqs = dim // 2
+    steps = torch.arange(start=0, end=n_freqs, dtype=torch.float32, device=t.device)
+    freqs = torch.exp(-math.log(max_period) * steps / n_freqs)
+    phase = t[:, None].float() * freqs[None]
+    parts = [torch.cos(phase), torch.sin(phase)]
+    if dim % 2:
+        # Odd width: pad with a single zero column to reach ``dim``.
+        parts.append(torch.zeros_like(phase[:, :1]))
+    return torch.cat(parts, dim=-1)
+
+
+class RMSNorm(nn.Module):
+    """RMS normalisation computed in float32 with a learnable per-channel gain."""
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalise the last dim of ``x``; the result keeps ``x``'s dtype."""
+        x32 = x.to(torch.float32)
+        mean_sq = x32.pow(2).mean(-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_sq + self.variance_epsilon)
+        gain = self.weight.to(torch.float32)
+        return (gain * (x32 * inv_rms)).to(x.dtype)
+
+
+def _rotate_half_interleaved(x: torch.Tensor) -> torch.Tensor:
+    """Map every adjacent pair ``(a, b)`` on the last dim to ``(-b, a)``."""
+    # Even positions hold ``a``, odd positions hold ``b``.
+    even, odd = x[..., 0::2], x[..., 1::2]
+    return torch.stack((odd.neg(), even), dim=-1).flatten(-2)
+
+
+def apply_rotary_emb_interleaved(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    """Rotate adjacent channel pairs of ``x`` by the angles behind ``cos``/``sin``.
+
+    ``cos``/``sin`` get a singleton axis inserted before their last dim and
+    each entry repeated twice, so entry ``k`` drives channels ``2k, 2k+1``.
+    Only the first ``2 * cos.shape[-1]`` channels rotate; the rest pass through.
+    """
+    rot_width = 2 * cos.shape[-1]
+    assert rot_width <= x.shape[-1]
+    cos_full = cos.unsqueeze(-2).repeat_interleave(2, dim=-1)
+    sin_full = sin.unsqueeze(-2).repeat_interleave(2, dim=-1)
+    head, tail = x[..., :rot_width], x[..., rot_width:]
+    # Pair partner for the rotation: (a, b) -> (-b, a).
+    partner = torch.stack((-head[..., 1::2], head[..., 0::2]), dim=-1).flatten(-2)
+    rotated = head * cos_full + partner * sin_full
+    # No pass-through channels: hand back the rotated tensor as is.
+    return rotated if tail.shape[-1] == 0 else torch.cat([rotated, tail], dim=-1)
+
+
+class RopeEmbedder:
+    """Per-axis RoPE cos/sin lookup.
+
+    Tables are built on first call, cached on the instance, and moved to the
+    device of the ids they are queried with.
+    """
+
+    def __init__(
+        self,
+        theta: float = 256.0,
+        axes_dims: tuple[int, ...] = (16, 56, 56),
+        axes_lens: tuple[int, ...] = (64, 128, 128),
+    ) -> None:
+        self.theta = theta
+        self.axes_dims = axes_dims
+        self.axes_lens = axes_lens
+        assert len(axes_dims) == len(axes_lens), "axes_dims and axes_lens must match"
+        # Filled on first call with one cos table and one sin table per axis.
+        self.cos_cached: list[torch.Tensor] | None = None
+        self.sin_cached: list[torch.Tensor] | None = None
+
+    @staticmethod
+    def precompute_freqs(dim, end, theta: float = 256.0):
+        """Return per-axis ``cos``/``sin`` tables of ``position * theta**(-k/d)``, ``k`` even."""
+        tables = []
+        for d, e in zip(dim, end, strict=True):
+            inv_freq = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=torch.float64) / d))
+            angles = torch.outer(torch.arange(e, dtype=torch.float64), inv_freq).float()
+            tables.append((torch.cos(angles), torch.sin(angles)))
+        return [c for c, _ in tables], [s for _, s in tables]
+
+    def __call__(self, ids: torch.Tensor):
+        """Look up tables for ``ids`` (one column per axis); concat on the last dim."""
+        assert ids.ndim == 2
+        assert ids.shape[-1] == len(self.axes_dims)
+        if self.cos_cached is None:
+            self.cos_cached, self.sin_cached = self.precompute_freqs(self.axes_dims, self.axes_lens, theta=self.theta)
+        if self.cos_cached[0].device != ids.device:
+            # Follow the ids to their device (covers both first use and later moves).
+            self.cos_cached = [c.to(ids.device) for c in self.cos_cached]
+            self.sin_cached = [s.to(ids.device) for s in self.sin_cached]
+
+        cos_parts = [table[ids[:, axis]] for axis, table in enumerate(self.cos_cached)]
+        sin_parts = [table[ids[:, axis]] for axis, table in enumerate(self.sin_cached)]
+        return torch.cat(cos_parts, dim=-1), torch.cat(sin_parts, dim=-1)
+
+
+# ============================================================
+# Modules
+# ============================================================
+
+
+class TimestepEmbedder(nn.Module):
+    """Sinusoidal timestep features followed by a Linear-SiLU-Linear MLP."""
+
+    def __init__(self, out_size: int, mid_size: int | None = None, frequency_embedding_size: int = 256) -> None:
+        super().__init__()
+        hidden = out_size if mid_size is None else mid_size
+        layers = [nn.Linear(frequency_embedding_size, hidden, bias=True), nn.SiLU()]
+        layers.append(nn.Linear(hidden, out_size, bias=True))
+        self.mlp = nn.Sequential(*layers)
+        self.frequency_embedding_size = frequency_embedding_size
+
+    def forward(self, t: torch.Tensor) -> torch.Tensor:
+        """Embed ``t`` and run the MLP in the dtype of the first layer's bias."""
+        features = timestep_embedding(t, self.frequency_embedding_size)
+        target = self.mlp[0].bias.dtype
+        # Only floating-point parameter dtypes are a valid cast target.
+        features = features.to(target) if target.is_floating_point else features
+        return self.mlp(features)
+
+
+class ZImageAttention(nn.Module):
+    """Self-attention with per-head RMS-normed, RoPE-rotated queries and keys."""
+
+    def __init__(self, dim: int, num_heads: int, num_kv_heads: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        head_dim = dim // num_heads
+        self.dim = dim
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = head_dim
+        # Separate (unfused) q/k/v projections; k/v use ``num_kv_heads`` heads.
+        self.to_q = nn.Linear(dim, num_heads * head_dim, bias=False)
+        self.to_k = nn.Linear(dim, num_kv_heads * head_dim, bias=False)
+        self.to_v = nn.Linear(dim, num_kv_heads * head_dim, bias=False)
+        self.norm_q = RMSNorm(head_dim, eps=eps)
+        self.norm_k = RMSNorm(head_dim, eps=eps)
+        # Kept as a one-element ModuleList, so the weight key is ``to_out.0.weight``.
+        self.to_out = nn.ModuleList([nn.Linear(dim, dim, bias=False)])
+        self.scale = 1.0 / (head_dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        bsz, seqlen, _ = hidden_states.shape
+        q_heads, kv_heads = (self.num_heads, self.head_dim), (self.num_kv_heads, self.head_dim)
+        query = self.norm_q(self.to_q(hidden_states).unflatten(-1, q_heads))
+        key = self.norm_k(self.to_k(hidden_states).unflatten(-1, kv_heads))
+        value = self.to_v(hidden_states).unflatten(-1, kv_heads)
+        query = apply_rotary_emb_interleaved(query, cos, sin)
+        key = apply_rotary_emb_interleaved(key, cos, sin)
+        dtype = query.dtype
+
+        # SDPA expects [B, H, S, D]; key/value follow the query dtype.
+        q = query.transpose(1, 2)
+        k = key.transpose(1, 2).to(dtype)
+        v = value.transpose(1, 2).to(dtype)
+
+        if attention_mask is None:
+            attn_bias = None
+        elif attention_mask.dtype != torch.bool:
+            # Non-bool masks are passed through as given (assumed additive — TODO confirm).
+            attn_bias = attention_mask
+        else:
+            # bool [B, S] keep-mask -> additive [B, 1, 1, S]: 0 keeps, -inf drops.
+            drop = ~attention_mask[:, None, None, :]
+            attn_bias = torch.zeros(bsz, 1, 1, seqlen, dtype=dtype, device=q.device)
+            attn_bias = attn_bias.masked_fill(drop, float("-inf"))
+        out = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=attn_bias, scale=self.scale, enable_gqa=self.num_kv_heads != self.num_heads
+        )
+        return self.to_out[0](out.transpose(1, 2).flatten(2, 3).to(dtype))
+
+
+class FeedForward(nn.Module):
+    """SwiGLU MLP: ``w2(silu(w1(x)) * w3(x))`` with three bias-free linears."""
+
+    def __init__(self, dim: int, hidden_dim: int) -> None:
+        super().__init__()
+        self.w1, self.w3 = nn.Linear(dim, hidden_dim, bias=False), nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(F.silu(self.w1(x)).mul(self.w3(x)))
+
+
+class ZImageTransformerBlock(nn.Module):
+    """Attention + SwiGLU block with sandwich RMSNorms and optional adaLN gates."""
+
+    def __init__(
+        self,
+        layer_id: int,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: int,
+        norm_eps: float,
+        modulation: bool = True,
+    ) -> None:
+        super().__init__()
+        self.dim, self.layer_id, self.modulation = dim, layer_id, modulation
+        self.attention = ZImageAttention(dim, n_heads, n_kv_heads, eps=1e-5)
+        self.feed_forward = FeedForward(dim, hidden_dim=int(dim / 3 * 8))
+        self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
+        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
+        self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
+        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
+        if modulation:
+            # One linear emits all four modulation vectors (2 scales, 2 gates).
+            self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True))
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_mask: torch.Tensor | None,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+        adaln_input: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Apply the block; ``adaln_input`` is required when modulation is on."""
+        if not self.modulation:
+            attn_out = self.attention(self.attention_norm1(x), attn_mask, cos, sin)
+            x = x + self.attention_norm2(attn_out)
+            return x + self.ffn_norm2(self.feed_forward(self.ffn_norm1(x)))
+
+        assert adaln_input is not None
+        mod = self.adaLN_modulation(adaln_input).unsqueeze(1)
+        scale_msa, gate_msa, scale_mlp, gate_mlp = mod.chunk(4, dim=2)
+        # Gates are squashed with tanh; scales are offsets around 1.
+        gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
+        scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
+
+        attn_out = self.attention(self.attention_norm1(x) * scale_msa, attn_mask, cos, sin)
+        x = x + gate_msa * self.attention_norm2(attn_out)
+        x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp))
+        return x
+
+
+class FinalLayer(nn.Module):
+    """Affine-free LayerNorm, adaLN scale from ``c``, then output projection."""
+
+    def __init__(self, hidden_size: int, out_channels: int) -> None:
+        super().__init__()
+        cond_dim = min(hidden_size, ADALN_EMBED_DIM)
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(cond_dim, hidden_size, bias=True))
+
+    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+        # ``c`` yields one scale per channel, broadcast over dim 1 of ``x``.
+        modulated = self.norm_final(x) * (1.0 + self.adaLN_modulation(c)).unsqueeze(1)
+        return self.linear(modulated)
+
+
+class ZImageTransformer2DModel(nn.Module):
+    """Native Z-Image DiT (pure torch). See module docstring for divergences."""
+
+    def __init__(
+        self,
+        all_patch_size: tuple[int, ...] = (2,),
+        all_f_patch_size: tuple[int, ...] = (1,),
+        in_channels: int = 16,
+        dim: int = 3840,
+        n_layers: int = 30,
+        n_refiner_layers: int = 2,
+        n_heads: int = 30,
+        n_kv_heads: int = 30,
+        norm_eps: float = 1e-5,
+        qk_norm: bool = True,
+        cap_feat_dim: int = 2560,
+        rope_theta: float = 256.0,
+        t_scale: float = 1000.0,
+        axes_dims: tuple[int, ...] = (32, 48, 48),
+        axes_lens: tuple[int, ...] = (1024, 512, 512),
+    ) -> None:
+        """Build the patch embedders, refiner stacks, main layers and RoPE table.
+
+        ``all_patch_size``/``all_f_patch_size`` are paired element-wise; each
+        pair gets its own input embedder and final layer, keyed
+        ``"{patch_size}-{f_patch_size}"``. ``qk_norm`` is accepted but not
+        read by this constructor.
+        The refiner stacks hold ``n_refiner_layers`` blocks each (noise refiner
+        modulated, context refiner unmodulated); the main stack holds
+        ``n_layers`` modulated blocks.
+        """
+        super().__init__()
+        assert len(all_patch_size) == len(all_f_patch_size)
+        self.in_channels = self.out_channels = in_channels
+        self.all_patch_size = tuple(all_patch_size)
+        self.all_f_patch_size = tuple(all_f_patch_size)
+        self.dim, self.n_heads = dim, n_heads
+        self.rope_theta, self.t_scale = rope_theta, t_scale
+
+        # All blocks share dim/head/eps settings; only id and modulation vary.
+        def make_block(layer_id: int, modulation: bool) -> ZImageTransformerBlock:
+            return ZImageTransformerBlock(layer_id, dim, n_heads, n_kv_heads, norm_eps, modulation=modulation)
+
+        x_embedders, final_layers = {}, {}
+        for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size, strict=True):
+            key = f"{patch_size}-{f_patch_size}"
+            patch_volume = f_patch_size * patch_size * patch_size
+            x_embedders[key] = nn.Linear(patch_volume * in_channels, dim, bias=True)
+            final_layers[key] = FinalLayer(dim, patch_volume * self.out_channels)
+        self.all_x_embedder = nn.ModuleDict(x_embedders)
+        self.all_final_layer = nn.ModuleDict(final_layers)
+
+        # Refiner layer ids: noise refiner starts at 1000, context refiner at 0.
+        self.noise_refiner = nn.ModuleList([make_block(1000 + i, True) for i in range(n_refiner_layers)])
+        self.context_refiner = nn.ModuleList([make_block(i, False) for i in range(n_refiner_layers)])
+        self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024)
+        # Caption features: RMSNorm over ``cap_feat_dim``, then projection to ``dim``.
+        self.cap_embedder = nn.Sequential(
+            RMSNorm(cap_feat_dim, eps=norm_eps),
+            nn.Linear(cap_feat_dim, dim, bias=True),
+        )
+        # Pad tokens are allocated uninitialised (``torch.empty``).
+        self.x_pad_token = nn.Parameter(torch.empty((1, dim)))
+        self.cap_pad_token = nn.Parameter(torch.empty((1, dim)))
+        self.layers = nn.ModuleList([make_block(i, True) for i in range(n_layers)])
+        self.axes_dims = tuple(axes_dims)
+        self.axes_lens = tuple(axes_lens)
+        # The embedder builds its cos/sin tables lazily, on its first call.
+        self.rope_embedder = RopeEmbedder(theta=rope_theta, axes_dims=self.axes_dims, axes_lens=self.axes_lens)
+
+    @property
+    def dtype(self) -> torch.dtype:  # dtype of the first parameter yielded by ``parameters()``
+        return next(self.parameters()).dtype  # ``next`` raises StopIteration if there are none
+
+    def unpatchify(self, x: list[torch.Tensor], size: list[tuple], patch_size: int, f_patch_size: int):
+        """Fold each token sequence in ``x`` back to a ``[C, F, H, W]`` tensor.
+
+        The list ``x`` is modified in place and also returned.
+        """
+        assert len(size) == len(x)
+        channels = self.out_channels
+        for idx, (frames, height, width) in enumerate(size):
+            gf, gh, gw = frames // f_patch_size, height // patch_size, width // patch_size
+            # Keep only the tokens that fill the patch grid.
+            tokens = x[idx][: gf * gh * gw]
+            patches = tokens.view(gf, gh, gw, f_patch_size, patch_size, patch_size, channels)
+            # Channel first, then each grid axis next to its in-patch axis.
+            x[idx] = patches.permute(6, 0, 3, 1, 4, 2, 5).reshape(channels, frames, height, width)
+        return x
+
+    @staticmethod
+    def create_coordinate_grid(size, start=None, device=None):
+        """Return an int32 grid of shape ``(*size, len(size))`` of per-axis coordinates."""
+        origins = tuple(0 for _ in size) if start is None else start
+        axes = []
+        for origin, extent in zip(origins, size, strict=True):
+            # Each axis counts ``extent`` steps upward from its start offset.
+            axes.append(torch.arange(origin, origin + extent, dtype=torch.int32, device=device))
+        mesh = torch.meshgrid(axes, indexing="ij")
+        return torch.stack(mesh, dim=-1)
+
+    def patchify_and_embed(
+        self,
+        all_image: list[torch.Tensor],
+        all_cap_feats: list[torch.Tensor],
+        patch_size: int,
+        f_patch_size: int,
+    ):
+        """Tokenise images and pad image/caption sequences to ``SEQ_MULTI_OF``.
+
+        Per sample, the ``[C, F, H, W]`` image is cut into
+        ``f_patch_size x patch_size x patch_size`` patches, one flattened
+        token per patch. Image and caption sequences are each extended to a
+        multiple of ``SEQ_MULTI_OF`` by repeating their last row. Caption
+        position ids run from 1 on the first axis; image ids continue after
+        the padded caption length; image padding uses id ``(0, 0, 0)``.
+
+        Returns:
+            Seven per-sample lists: image tokens, caption features,
+            ``(F, H, W)`` sizes, image position ids, caption position ids,
+            image pad masks, caption pad masks (masks are True on padding).
+        """
+        device = all_image[0].device
+
+        def pad_mask(real_len: int, pad_len: int) -> torch.Tensor:
+            # False for real rows, True for the appended padding rows.
+            mask = torch.zeros((real_len + pad_len,), dtype=torch.bool, device=device)
+            mask[real_len:] = True
+            return mask
+
+        def pad_rows(rows: torch.Tensor, pad_len: int) -> torch.Tensor:
+            # Padding content is the last row, repeated ``pad_len`` times.
+            return torch.cat([rows, rows[-1:].repeat(pad_len, 1)], dim=0)
+
+        image_tokens, image_sizes, image_pos_ids, image_pad_masks = [], [], [], []
+        cap_pos_ids, cap_pad_masks, cap_feats_padded = [], [], []
+
+        for image, cap_feat in zip(all_image, all_cap_feats, strict=True):
+            # ---- Caption
+            cap_len = len(cap_feat)
+            cap_pad = (-cap_len) % SEQ_MULTI_OF
+            cap_total = cap_len + cap_pad
+            cap_grid = self.create_coordinate_grid(size=(cap_total, 1, 1), start=(1, 0, 0), device=device)
+            cap_pos_ids.append(cap_grid.flatten(0, 2))
+            cap_pad_masks.append(pad_mask(cap_len, cap_pad))
+            cap_feats_padded.append(pad_rows(cap_feat, cap_pad))
+
+            # ---- Image
+            channels, frames, height, width = image.size()
+            image_sizes.append((frames, height, width))
+            gf, gh, gw = frames // f_patch_size, height // patch_size, width // patch_size
+            patches = image.view(channels, gf, f_patch_size, gh, patch_size, gw, patch_size)
+            tokens = patches.permute(1, 3, 5, 2, 4, 6, 0).reshape(gf * gh * gw, f_patch_size * patch_size * patch_size * channels)
+
+            token_len = len(tokens)
+            token_pad = (-token_len) % SEQ_MULTI_OF
+            real_ids = self.create_coordinate_grid(size=(gf, gh, gw), start=(cap_total + 1, 0, 0), device=device)
+            # Padding tokens all share the origin coordinate.
+            pad_ids = self.create_coordinate_grid(size=(1, 1, 1), start=(0, 0, 0), device=device).flatten(0, 2)
+            image_pos_ids.append(torch.cat([real_ids.flatten(0, 2), pad_ids.repeat(token_pad, 1)], dim=0))
+            image_pad_masks.append(pad_mask(token_len, token_pad))
+            image_tokens.append(pad_rows(tokens, token_pad))
+
+        return (
+            image_tokens,
+            cap_feats_padded,
+            image_sizes,
+            image_pos_ids,
+            cap_pos_ids,
+            image_pad_masks,
+            cap_pad_masks,
+        )
+
+    def _unified_prepare(self, x, x_cos, x_sin, cap_feats, cap_cos, cap_sin, x_item_seqlens, cap_item_seqlens):
+        """Concatenate each item's valid image tokens, then its caption tokens, and re-pad the batch.
+
+        Returns (tokens, cos, sin, attn_mask); attn_mask is True on each row's first x_len + cap_len slots.
+        """
+        batch, dev = x.shape[0], x.device
+        joined = {"tok": [], "cos": [], "sin": []}
+        for idx in range(batch):
+            n_img, n_cap = x_item_seqlens[idx], cap_item_seqlens[idx]
+            joined["tok"].append(torch.cat([x[idx][:n_img], cap_feats[idx][:n_cap]]))
+            joined["cos"].append(torch.cat([x_cos[idx][:n_img], cap_cos[idx][:n_cap]]))
+            joined["sin"].append(torch.cat([x_sin[idx][:n_img], cap_sin[idx][:n_cap]]))
+        total_lens = [n_cap + n_img for n_cap, n_img in zip(cap_item_seqlens, x_item_seqlens, strict=True)]
+        attn_mask = torch.zeros((batch, max(total_lens)), dtype=torch.bool, device=dev)
+        for row, valid in enumerate(total_lens):
+            attn_mask[row, :valid] = 1
+        padded = [pad_sequence(joined[k], batch_first=True, padding_value=0.0) for k in ("tok", "cos", "sin")]
+        return padded[0], padded[1], padded[2], attn_mask
+
+    def forward(
+        self,
+        x: list[torch.Tensor],  # one latent per batch item (list length defines bsz)
+        t: torch.Tensor,  # timestep input; scaled by self.t_scale before embedding
+        cap_feats: list[torch.Tensor],  # one caption-feature sequence per batch item
+        patch_size: int = 2,  # must be a member of self.all_patch_size
+        f_patch_size: int = 1,  # must be a member of self.all_f_patch_size
+    ):
+        assert patch_size in self.all_patch_size
+        assert f_patch_size in self.all_f_patch_size
+        bsz = len(x)
+        device = x[0].device
+        t = t * self.t_scale
+        t = self.t_embedder(t)
+
+        (
+            x,
+            cap_feats,
+            x_size,
+            x_pos_ids,
+            cap_pos_ids,
+            x_inner_pad_mask,
+            cap_inner_pad_mask,
+        ) = self.patchify_and_embed(x, cap_feats, patch_size, f_patch_size)
+
+        # ---- Image stream: embed patches, swap padded rows for x_pad_token, run noise_refiner
+        x_item_seqlens = [len(_) for _ in x]  # per-item token counts, padding rows included
+        assert all(_ % SEQ_MULTI_OF == 0 for _ in x_item_seqlens)
+        x_max = max(x_item_seqlens)
+        x = torch.cat(x, dim=0)  # flatten the batch so the embedder runs once over all tokens
+        x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x)
+        adaln_input = t.type_as(x)  # timestep embedding cast to the token dtype
+        x_pad_mask = torch.cat(x_inner_pad_mask)  # True on rows that patchify added as padding
+        x = torch.where(x_pad_mask.unsqueeze(1).expand_as(x), self.x_pad_token.expand(x.shape[0], -1), x)
+        x = list(x.split(x_item_seqlens, dim=0))  # back to one tensor per item
+        x_cos, x_sin = self.rope_embedder(torch.cat(x_pos_ids, dim=0))
+        x_cos = list(x_cos.split(x_item_seqlens, dim=0))
+        x_sin = list(x_sin.split(x_item_seqlens, dim=0))
+        x = pad_sequence(x, batch_first=True, padding_value=0.0)  # zero-pad to the longest item
+        x_cos = pad_sequence(x_cos, batch_first=True, padding_value=0.0)
+        x_sin = pad_sequence(x_sin, batch_first=True, padding_value=0.0)
+        x_attn_mask = torch.zeros((bsz, x_max), dtype=torch.bool, device=device)
+        for i, seq_len in enumerate(x_item_seqlens):
+            x_attn_mask[i, :seq_len] = 1  # True on each item's first seq_len positions
+        for layer in self.noise_refiner:
+            x = layer(x, x_attn_mask, x_cos, x_sin, adaln_input)
+
+        # ---- Caption stream: same steps with cap_embedder / cap_pad_token; context_refiner gets no adaln_input
+        cap_item_seqlens = [len(_) for _ in cap_feats]
+        assert all(_ % SEQ_MULTI_OF == 0 for _ in cap_item_seqlens)
+        cap_max = max(cap_item_seqlens)
+        cap_feats = torch.cat(cap_feats, dim=0)
+        cap_feats = self.cap_embedder(cap_feats)
+        cap_pad_mask = torch.cat(cap_inner_pad_mask)  # True on caption rows added as padding
+        cap_feats = torch.where(
+            cap_pad_mask.unsqueeze(1).expand_as(cap_feats),
+            self.cap_pad_token.expand(cap_feats.shape[0], -1),
+            cap_feats,
+        )
+        cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0))
+        cap_cos, cap_sin = self.rope_embedder(torch.cat(cap_pos_ids, dim=0))
+        cap_cos = list(cap_cos.split(cap_item_seqlens, dim=0))
+        cap_sin = list(cap_sin.split(cap_item_seqlens, dim=0))
+        cap_feats = pad_sequence(cap_feats, batch_first=True, padding_value=0.0)
+        cap_cos = pad_sequence(cap_cos, batch_first=True, padding_value=0.0)
+        cap_sin = pad_sequence(cap_sin, batch_first=True, padding_value=0.0)
+        cap_attn_mask = torch.zeros((bsz, cap_max), dtype=torch.bool, device=device)
+        for i, seq_len in enumerate(cap_item_seqlens):
+            cap_attn_mask[i, :seq_len] = 1
+        for layer in self.context_refiner:
+            cap_feats = layer(cap_feats, cap_attn_mask, cap_cos, cap_sin)
+
+        # ---- Joint stream: per item [image tokens, caption tokens], then the main layers
+        unified, unified_cos, unified_sin, unified_attn_mask = self._unified_prepare(
+            x, x_cos, x_sin, cap_feats, cap_cos, cap_sin, x_item_seqlens, cap_item_seqlens
+        )
+        for layer in self.layers:
+            unified = layer(unified, unified_attn_mask, unified_cos, unified_sin, adaln_input)
+
+        unified = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
+        unified = list(unified.unbind(dim=0))  # one tensor per batch item for unpatchify
+        return self.unpatchify(unified, x_size, patch_size, f_patch_size), {}  # (outputs, empty dict)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Copy checkpoint tensors into same-named parameters and report which names were filled.
+
+        This module keeps to_q/to_k/to_v and w1/w3 unfused (vllm-omni fuses them into
+        to_qkv / w13 and remaps), so the released DiT weights are copied without renaming.
+        Names that are not parameters of this module are ignored; a shape mismatch raises
+        ValueError. The returned set lets callers assert that every parameter was covered.
+        """
+        own_params = dict(self.named_parameters())
+        covered: set[str] = set()
+        for key, tensor in weights:
+            target = own_params.get(key)
+            if target is None:
+                continue
+            if target.shape != tensor.shape:
+                raise ValueError(
+                    f"Shape mismatch loading ZImage DiT weight {key}: "
+                    f"param {tuple(target.shape)} vs checkpoint {tuple(tensor.shape)}"
+                )
+            with torch.no_grad():
+                target.copy_(tensor)
+            covered.add(key)
+        return covered
+
+
+class MingZImageTransformer2DModel(ZImageTransformer2DModel):
+    """ZImage DiT with Ming's reference-latent (img2img) support.
+
+    ``forward`` appends a reference latent to every item along dim 1 (the frame
+    axis) and ``unpatchify`` then keeps only frame 0 of each output. NOTE(review):
+    only ``ref_latent[0]`` is used, for every batch item, and the choice is
+    carried in ``self._dropping_ref`` until the next ``forward`` call. Upstream
+    reads the latent from a global forward-context; mstar passes it explicitly.
+    """
+
+    def forward(
+        self,
+        x: list[torch.Tensor],
+        t: torch.Tensor,
+        cap_feats: list[torch.Tensor],
+        patch_size: int = 2,
+        f_patch_size: int = 1,
+        ref_latent: list[torch.Tensor] | None = None,  # None -> plain text-to-image path
+    ):
+        self._dropping_ref = ref_latent is not None  # read back by unpatchify() below
+        if ref_latent is not None:
+            per_item = ref_latent[0].unsqueeze(1).to(dtype=x[0].dtype, device=x[0].device)  # [C, 1, H, W]
+            x = [torch.cat([img, per_item], dim=1) for img in x]  # same reference appended to each item
+        return super().forward(x, t, cap_feats, patch_size=patch_size, f_patch_size=f_patch_size)
+
+    def unpatchify(self, x, size, patch_size, f_patch_size):
+        out = super().unpatchify(x, size, patch_size, f_patch_size)
+        if getattr(self, "_dropping_ref", False):  # getattr: flag is unset until forward() has run
+            # Keep only index 0 along dim 1; the appended reference frame sits after it.
+            return [t[:, :1, :, :] for t in out]
+        return out
+
+
+__all__ = ["ZImageTransformer2DModel", "MingZImageTransformer2DModel"]
diff --git a/mstar/model/ming_omni_flash/config.py b/mstar/model/ming_omni_flash/config.py
new file mode 100644
index 00000000..c0b1d035
--- /dev/null
+++ b/mstar/model/ming_omni_flash/config.py
@@ -0,0 +1,873 @@
+"""Configuration dataclass for Ming-flash-omni-2.0.
+
+Mirrors mstar's qwen3_omni pattern (pure ``@dataclass`` tree,
+``from_pretrained(local_dir)``, convenience ``@property``s) so the rest of
+the framework can read dims off the loaded config without going through
+``transformers.PretrainedConfig`` machinery.
+
+The released checkpoint (``inclusionAI/Ming-flash-omni-2.0``) does NOT match
+upstream vllm-omni's flat ``MingFlashOmniConfig`` nesting. On disk only the
+``BailingMM2Config`` shape lives at ``config.json``::
+
+    config.json                     # thinker: audio_config + llm_config + vision_config + scalars
+    talker/config.json              # talker top-level (BailingTalker2)
+    talker/llm/config.json          # talker LLM backbone (Qwen2)
+    talker/vae/config.json          # talker AudioVAE
+    transformer/config.json         # image-gen DiT (ZImageTransformer2DModel)
+    vae/config.json                 # image-gen VAE
+    scheduler/scheduler_config.json # image-gen diffusion scheduler
+    byt5/google__byt5-smal/config.json   # image-gen text encoder
+    connector/config.json           # image-gen connector
+    mlp/config.json                 # image-gen projector
+
+This loader follows the on-disk layout: it parses ``config.json`` for the
+thinker path and lazy-loads talker / image-gen from sibling subdirs when
+those exist. Talker and image-gen are SKELETON dataclasses today — exhaustive
+field semantics land with the talker port (step 6 of PORTING_NOTES.md) and
+the image-gen port (step 9).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Thinker LLM (Ling-2.0 sparse MoE — model_type "bailing_moe_v2")
+# ---------------------------------------------------------------------------
+
+@dataclass
+class ThinkerLLMConfig:
+    """Ling-2.0 sparse-MoE thinker (BailingMoeV2).
+
+    Field set is the union of what upstream
+    ``vllm_omni/transformers_utils/configs/ming_flash_omni.py:BailingMoeV2Config``
+    declares and what the released ``llm_config`` actually populates.
+    Defaults reflect the released ckpt, not the upstream class defaults
+    (which were trained for a smaller config).
+    """
+
+    # Dims
+    vocab_size: int = 157184
+    hidden_size: int = 4096
+    intermediate_size: int = 9216
+    num_hidden_layers: int = 32
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 4
+    head_dim: int | None = None  # computed in __post_init__
+
+    # Norm / activation
+    hidden_act: str = "silu"
+    rms_norm_eps: float = 1e-6
+    use_qk_norm: bool = True
+    use_qkv_bias: bool = False
+    use_bias: bool = False
+    tie_word_embeddings: bool = False
+
+    # Position / RoPE
+    max_position_embeddings: int = 32768
+    rope_theta: float = 2_400_000.0
+    rope_scaling: dict[str, Any] | None = None
+    partial_rotary_factor: float = 0.5
+
+    # MoE
+    num_experts: int = 256
+    num_shared_experts: int = 1
+    num_experts_per_tok: int = 8
+    moe_intermediate_size: int = 1024
+    first_k_dense_replace: int = 1
+    router_type: str = "MultiRouter"
+    n_group: int = 8
+    topk_group: int = 4
+    moe_router_topk_scaling_factor: float = 2.5
+    norm_topk_prob: bool = True
+    use_expert_bias: bool = True
+    output_router_logits: bool = False
+
+    # Misc
+    pad_token_id: int = 156892
+    eos_token_id: int = 156895
+    use_interleaved_frame_timestamp: bool = True
+
+    # Multimodal token IDs (used by the prefill processor / chat template).
+    # Defaults mirror the actual tokenizer (`tokenizer.json` added_tokens at
+    # the released ckpt; cross-checked against Jonathan1909's patched config
+    # and vllm-omni's BailingMoeV2Config defaults). Two gotchas the on-disk
+    # `config.json` of `inclusionAI/Ming-flash-omni-2.0` introduces:
+    #   * `video_start_token` is mislabeled as 157159 (= </image>) in the
+    #     ckpt config; the real `<video>` token is 157160. Jonathan1909's
+    #     patched config corrects this. `__post_init__` repairs the value
+    #     and warns loudly if a load picks up the bogus one.
+    #   * `audio_*` / `*_end` / `tokens_per_second` are not in the on-disk
+    #     llm_config at all; they're tokenizer-derived constants and are
+    #     hardcoded in vllm-omni. We mirror those defaults here so
+    #     vision/audio masking + MRoPE temporal-position math can read them
+    #     directly off `ThinkerLLMConfig`.
+    image_patch_token: int = 157157
+    video_patch_token: int = 157175
+    audio_patch_token: int = 157168
+    image_start_token: int = 157158
+    video_start_token: int = 157160
+    audio_start_token: int = 157169
+    image_end_token: int = 157159
+    video_end_token: int = 157161
+    audio_end_token: int = 157170
+    tokens_per_second: int = 2
+
+    def __post_init__(self) -> None:
+        if self.head_dim is None:
+            self.head_dim = self.hidden_size // self.num_attention_heads
+        # Released ckpt has hidden_size=4096, num_attention_heads=32 → head_dim=128.
+        # Warn (mirroring qwen3_omni's config.py:46-64) only when head_dim is neither
+        # consistent with hidden_size / num_attention_heads nor equal to 128.
+        if self.head_dim * self.num_attention_heads != self.hidden_size and self.head_dim != 128:
+            logger.warning(
+                "ThinkerLLMConfig: unusual head_dim=%d "
+                "(hidden_size=%d, num_attention_heads=%d). "
+                "Expected head_dim=128 for Ming-flash-omni-2.0. "
+                "Verify the checkpoint config.json contains 'head_dim': 128 "
+                "under llm_config.",
+                self.head_dim, self.hidden_size, self.num_attention_heads,
+            )
+        # The inclusionAI ckpt's llm_config.video_start_token is mislabeled
+        # (157159 = </image> per tokenizer; the real <video> token is 157160).
+        # If we picked up the bogus value, repair it and warn loudly — vision
+        # masking would otherwise key on </image> for video-start markers.
+        if self.video_start_token == 157159 and self.image_end_token == 157159:
+            logger.warning(
+                "ThinkerLLMConfig: ckpt-supplied video_start_token=157159 "
+                "matches image_end_token (= </image> per tokenizer). The "
+                "released inclusionAI/Ming-flash-omni-2.0 config.json "
+                "mislabels this field; correcting to 157160 (= <video>). "
+                "If this is intentional, set video_start_token explicitly "
+                "after construction."
+            )
+            self.video_start_token = 157160
+
+    @property
+    def mrope_section(self) -> list[int]:
+        """MRoPE section split. Upstream default [8, 12, 12] sums to 32 — half of
+        the 64 rotary dims (head_dim=128 * partial_rotary_factor=0.5)."""
+        return (self.rope_scaling or {}).get("mrope_section", [8, 12, 12])
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> ThinkerLLMConfig:
+        fnames = {f.name for f in cls.__dataclass_fields__.values()}  # declared field names
+        return cls(**{k: v for k, v in d.items() if k in fnames})  # other keys in d are dropped
+
+
+# ---------------------------------------------------------------------------
+# Vision encoder (Qwen3-MoE ViT — model_type "qwen3_moe_vit")
+# ---------------------------------------------------------------------------
+
+@dataclass
+class VisionEncoderConfig:
+    """Dimensions of the vision encoder (Qwen3-MoE ViT, model_type "qwen3_moe_vit")."""
+
+    depth: int = 27
+    hidden_size: int = 1152
+    intermediate_size: int = 4304
+    num_heads: int = 16
+    in_channels: int = 3
+    patch_size: int = 16
+    spatial_merge_size: int = 2
+    temporal_patch_size: int = 2
+    out_hidden_size: int = 4096
+    num_position_embeddings: int = 2304
+    deepstack_visual_indexes: tuple[int, ...] = (8, 16, 24)
+    hidden_act: str = "gelu_pytorch_tanh"
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> VisionEncoderConfig:
+        """Build from a raw config dict, ignoring keys that are not dataclass fields."""
+        kwargs = {k: v for k, v in d.items() if k in cls.__dataclass_fields__}
+        # HF/JSON configs have no tuple type, so the index list arrives as a
+        # ``list``; the field is declared as a tuple, so convert it.
+        indexes = kwargs.get("deepstack_visual_indexes")
+        if isinstance(indexes, list):
+            kwargs["deepstack_visual_indexes"] = tuple(indexes)
+        return cls(**kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Audio encoder (Whisper-style, with Ming-side knobs)
+# ---------------------------------------------------------------------------
+
+@dataclass
+class AudioEncoderConfig:
+    """Audio-encoder config mirroring the checkpoint's two-level layout.
+
+    The outer ``audio_config`` on disk carries the Ming-specific settings:
+    ``ds_kernel_size`` / ``ds_stride`` for the post-encoder downsampling
+    convolution, plus ``norm_query_embeds``. The Whisper dimensions stay
+    in the nested ``whisper_encoder_config`` dict under the keys ``n_ctx``,
+    ``n_head``, ``n_layer``, ``n_mels`` and ``n_state``; the properties
+    below expose them as ``d_model`` / ``encoder_layers`` /
+    ``encoder_attention_heads`` / ``n_mels``.
+    """
+
+    ds_kernel_size: int = 3
+    ds_stride: int = 2
+    norm_query_embeds: bool = True
+    whisper_encoder_config: dict[str, Any] = field(
+        default_factory=lambda: dict(
+            n_ctx=15000, n_head=20, n_layer=32, n_mels=128, n_state=1280,
+        )
+    )
+
+    @property
+    def d_model(self) -> int:
+        return int(self.whisper_encoder_config["n_state"])
+
+    @property
+    def encoder_layers(self) -> int:
+        return int(self.whisper_encoder_config["n_layer"])
+
+    @property
+    def encoder_attention_heads(self) -> int:
+        return int(self.whisper_encoder_config["n_head"])
+
+    @property
+    def n_mels(self) -> int:
+        return int(self.whisper_encoder_config["n_mels"])
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> AudioEncoderConfig:
+        known = {key: d[key] for key in d if key in cls.__dataclass_fields__}
+        return cls(**known)
+
+
+# ---------------------------------------------------------------------------
+# Talker — CFM (continuous flow matching) head + Qwen2 LLM backbone
+# ---------------------------------------------------------------------------
+#
+# Step 6a (this commit) lifts the on-disk talker config tree into proper
+# typed dataclasses so the downstream modeling code (CFM head, DiT
+# blocks, Aggregator, Qwen2 backbone) can read dims off `config.talker`
+# directly. The on-disk layout is::
+#
+#   talker/config.json        — top-level BailingTalker2 config (CFM steps,
+#                                patch sizes, flowmodel + aggregator blocks)
+#   talker/llm/config.json    — Qwen2 LLM backbone (896-dim, 24 layers)
+#   talker/vae/config.json    — AudioVAE (44.1 kHz output, latent_dim=64)
+#
+# The flowmodel / aggregator blocks share the same DiTBlockConfig shape
+# (depth, hidden_size, num_heads, mlp_ratio, dropout, in_channels) so we
+# reuse one dataclass for both.
+
+
+@dataclass
+class TalkerLLMConfig:
+    """Qwen2 backbone that runs inside the talker.
+
+    Separate from `ThinkerLLMConfig`: its own vocab, smaller dims, no MoE,
+    GQA with sliding-window settings. Fields follow the keys populated in
+    `talker/llm/config.json` of the released ckpt.
+    """
+
+    vocab_size: int = 151936
+    hidden_size: int = 896
+    intermediate_size: int = 4864
+    num_hidden_layers: int = 24
+    num_attention_heads: int = 14
+    num_key_value_heads: int = 2
+
+    # Normalisation and activation
+    hidden_act: str = "silu"
+    rms_norm_eps: float = 1e-6
+    tie_word_embeddings: bool = True
+
+    # Positional encoding (RoPE)
+    max_position_embeddings: int = 32768
+    rope_theta: float = 1_000_000.0
+
+    # Sliding-window attention. The released ckpt sets use_sliding_window=False
+    # yet still carries a sliding_window value, so both are kept for the
+    # attention implementation to decide on.
+    use_sliding_window: bool = False
+    sliding_window: int = 32768
+    max_window_layers: int = 21
+
+    # Remaining settings
+    bos_token_id: int = 151643
+    eos_token_id: int = 151645
+    attention_dropout: float = 0.0
+    use_cache: bool = True
+
+    @property
+    def head_dim(self) -> int:
+        return self.hidden_size // self.num_attention_heads
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> TalkerLLMConfig:
+        known = {key: d[key] for key in d if key in cls.__dataclass_fields__}
+        return cls(**known)
+
+
+@dataclass
+class DiTBlockConfig:
+    """One DiT stack of the talker; used for both `flowmodel` and `aggregator`.
+
+    The two blocks sit in the talker's top-level config with the same field
+    set; only dropout differs (flowmodel=0, aggregator=0.1). Each layer is
+    Attention + RMSNorm + FeedForward(GeGLU), stacked `depth` times.
+    `head_dim` and `intermediate_size` are derived from the stored fields.
+    """
+
+    attn_backend: str = "torch"
+    attn_mask_enabled: bool = False
+    depth: int = 8
+    dropout: float = 0.0
+    hidden_size: int = 1024
+    in_channels: int = 64
+    mlp_ratio: int = 4
+    num_heads: int = 16
+    pe_attn_head: Any | None = None
+    qk_norm: Any | None = None
+
+    @property
+    def head_dim(self) -> int:
+        return self.hidden_size // self.num_heads
+
+    @property
+    def intermediate_size(self) -> int:
+        return self.mlp_ratio * self.hidden_size
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> DiTBlockConfig:
+        known = {key: d[key] for key in d if key in cls.__dataclass_fields__}
+        return cls(**known)
+
+
+@dataclass
+class AudioVAEConfig:
+    """Audio VAE: encoder (waveform → latent) + decoder (latent → ISTFT mag/phase).
+
+    Both encoder and decoder are Qwen2-LLM-shaped sliding-window attention
+    backbones (`enc_kwargs.backbone` / `dec_kwargs.backbone` on disk —
+    nearly identical to `TalkerLLMConfig` but with vocab_size=1 since they
+    don't tokenize text). The encoder takes waveform features and outputs
+    `latent_dim` channels; the decoder consumes the latent and projects to
+    `output_dim` STFT bins (882 on the released ckpt) for an ISTFT head.
+
+    Discriminator + loss-weight fields (`hifi_gan_disc_kwargs`,
+    `lambda_*`) are training-time only and stored as raw dicts here —
+    inference doesn't read them.
+    """
+
+    sample_rate: int = 44100
+    patch_size: int = 4
+    init_method: str = "kaiming"
+
+    # Encoder / decoder dims pulled out of enc_kwargs / dec_kwargs.
+    # Defaults match the released ckpt's `talker/vae/config.json`:
+    # input_dim=882 (waveform frame size), hop_size=882 (one latent per
+    # 882-sample chunk → 50 latents/sec at 44.1 kHz), latent_dim=64,
+    # output_dim=882 (ISTFTHead hop_length; n_fft = output_dim * 4 = 3528).
+    latent_dim: int = 64
+    encoder_input_dim: int = 882
+    encoder_hop_size: int = 882
+    decoder_output_dim: int = 882
+
+    # The full Qwen2-shaped backbones for enc/dec are kept as raw dicts
+    # here; the modeling code (step 6d) will lift the relevant fields
+    # into its own block-builder.
+    enc_backbone: dict[str, Any] = field(default_factory=dict)
+    dec_backbone: dict[str, Any] = field(default_factory=dict)
+
+    # Training-time only; not consumed by inference. Stored verbatim so a
+    # full round-trip remains possible.
+    hifi_gan_disc_kwargs: dict[str, Any] = field(default_factory=dict)
+    spec_disc_kwargs: dict[str, Any] = field(default_factory=dict)
+    semantic_module_kwargs: dict[str, Any] | None = None
+    lambda_adv: float = 1.0
+    lambda_disc: float = 1.0
+    lambda_feat_match_loss: float = 1.0
+    lambda_mel_loss: float = 1.0
+    lambda_semantic: float = 2.0
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> AudioVAEConfig:
+        """Build from the raw VAE config dict; absent keys fall back to the dataclass field defaults."""
+        enc_kwargs, dec_kwargs = d.get("enc_kwargs") or {}, d.get("dec_kwargs") or {}
+        return cls(
+            sample_rate=int(d.get("sample_rate", 44100)),
+            patch_size=int(d.get("patch_size", 4)),
+            init_method=str(d.get("init_method", "kaiming")),
+            latent_dim=int(enc_kwargs.get("latent_dim", dec_kwargs.get("latent_dim", 64))),
+            encoder_input_dim=int(enc_kwargs.get("input_dim", 882)),  # same fallback as the field default
+            encoder_hop_size=int(enc_kwargs.get("hop_size", 882)),  # same fallback as the field default
+            decoder_output_dim=int(dec_kwargs.get("output_dim", 882)),
+            enc_backbone=dict(enc_kwargs.get("backbone") or {}),  # `or {}` tolerates an explicit null
+            dec_backbone=dict(dec_kwargs.get("backbone") or {}),
+            hifi_gan_disc_kwargs=dict(d.get("hifi_gan_disc_kwargs") or {}),
+            spec_disc_kwargs=dict(d.get("spec_disc_kwargs") or {}),
+            semantic_module_kwargs=d.get("semantic_module_kwargs"),
+            lambda_adv=float(d.get("lambda_adv", 1.0)),
+            lambda_disc=float(d.get("lambda_disc", 1.0)),
+            lambda_feat_match_loss=float(d.get("lambda_feat_match_loss", 1.0)),
+            lambda_mel_loss=float(d.get("lambda_mel_loss", 1.0)),
+            lambda_semantic=float(d.get("lambda_semantic", 2.0)),
+        )
+
+
+@dataclass
+class TalkerConfig:
+    """Ming-flash-omni-2.0 talker (BailingTalker2) — Qwen2 LLM + CFM head.
+
+    Sub-configs:
+      * ``llm`` — Qwen2-based AR LLM that consumes thinker hidden states
+        + voice prompt and emits CFM-driving latents.
+      * ``flowmodel`` / ``aggregator`` — DiT stacks (CFM + condition
+        aggregator); identical shape, different dropout.
+      * ``vae`` — AudioVAE for waveform synthesis from CFM-generated
+        latents.
+
+    Inference-time scalars (`steps`, `cfg_strength`, `patch_size`,
+    `history_patch_size`) drive the CFM sampling loop.
+    """
+
+    # Top-level scalars from talker/config.json
+    steps: int = 10
+    patch_size: int = 4
+    history_patch_size: int = 32
+    cfg_strength: float = 2.0
+
+    # Typed sub-configs (replaces step-1's raw-dict skeletons).
+    llm: TalkerLLMConfig = field(default_factory=TalkerLLMConfig)
+    flowmodel: DiTBlockConfig = field(default_factory=DiTBlockConfig)
+    aggregator: DiTBlockConfig = field(default_factory=DiTBlockConfig)
+    vae: AudioVAEConfig = field(default_factory=AudioVAEConfig)
+
+    # Convenience accessors used by Model.get_output_sample_rate (kept
+    # for backward compat with code that previously read `vae_sample_rate`
+    # / `vae_patch_size` directly off this dataclass).
+    @property
+    def vae_sample_rate(self) -> int:
+        return self.vae.sample_rate
+
+    @property
+    def vae_patch_size(self) -> int:
+        return self.vae.patch_size
+
+    @classmethod
+    def from_subdir(cls, talker_dir: str | os.PathLike[str]) -> TalkerConfig | None:
+        """Load the talker config tree from ``<local_dir>/talker/``.
+
+        Reads ``config.json`` for the top-level scalars and the ``flowmodel`` /
+        ``aggregator`` blocks, then ``llm/config.json`` and ``vae/config.json``
+        when present (a missing sub-config keeps its dataclass defaults).
+
+        Args:
+            talker_dir: Directory expected to contain ``config.json``.
+
+        Returns:
+            The populated config, or ``None`` if ``config.json`` is absent.
+        """
+        talker_dir = Path(talker_dir)
+
+        def read_json(path: Path) -> dict[str, Any] | None:
+            # JSON is UTF-8 by spec; don't depend on the platform's locale encoding.
+            if not path.exists():
+                return None
+            with open(path, encoding="utf-8") as f:
+                return json.load(f)
+
+        # No top-level config.json -> report the talker as absent.
+        raw = read_json(talker_dir / "config.json")
+        if raw is None:
+            return None
+
+        # Optional sub-configs: fall back to the dataclass defaults when the file is missing.
+        llm_raw = read_json(talker_dir / "llm" / "config.json")
+        llm = TalkerLLMConfig() if llm_raw is None else TalkerLLMConfig.from_dict(llm_raw)
+        vae_raw = read_json(talker_dir / "vae" / "config.json")
+        vae = AudioVAEConfig() if vae_raw is None else AudioVAEConfig.from_dict(vae_raw)
+
+        return cls(
+            steps=int(raw.get("steps", 10)),
+            patch_size=int(raw.get("patch_size", 4)),
+            history_patch_size=int(raw.get("history_patch_size", 32)),
+            cfg_strength=float(raw.get("cfg_strength", 2.0)),
+            llm=llm,
+            # ``or {}`` also covers an explicit JSON null for either block.
+            flowmodel=DiTBlockConfig.from_dict(raw.get("flowmodel") or {}),
+            aggregator=DiTBlockConfig.from_dict(raw.get("aggregator") or {}),
+            vae=vae,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Image generation (step 9a — typed sub-configs)
+# ---------------------------------------------------------------------------
+#
+# The released ckpt's imagegen components live in sibling subdirs:
+#   transformer/  — ZImageTransformer2DModel (the diffusion DiT)
+#   vae/          — AutoencoderKL (16-channel latent, scaling/shift)
+#   scheduler/    — FlowMatchEulerDiscreteScheduler
+#   connector/    — Qwen2ForCausalLM (caption-feature connector)
+#   byt5/         — ByT5 glyph encoder + T5-block mapper (text rendering)
+#   mlp/          — projector knobs (img_gen_scales, diffusion_c_input_dim)
+
+
+@dataclass
+class ZImageDiTConfig:
+    """Settings of the image-gen diffusion backbone (ZImageTransformer2DModel).
+
+    The field names follow the released ``transformer/config.json``, a diffusers
+    config whose ``_class_name`` is "ZImageTransformer2DModel". The model is a
+    flow-matching transformer over 16-channel VAE latents that uses 3D axial
+    RoPE, parameterised by ``axes_dims`` / ``axes_lens``.
+    """
+
+    dim: int = 3840
+    n_layers: int = 30
+    n_refiner_layers: int = 2
+    n_heads: int = 30
+    n_kv_heads: int = 30
+    in_channels: int = 16
+    cap_feat_dim: int = 2560
+    siglip_feat_dim: int | None = None
+    norm_eps: float = 1e-5
+    qk_norm: bool = True
+    rope_theta: float = 256.0
+    t_scale: float = 1000.0
+    axes_dims: tuple[int, ...] = (32, 48, 48)
+    axes_lens: tuple[int, ...] = (1536, 512, 512)
+    all_patch_size: tuple[int, ...] = (2,)
+    all_f_patch_size: tuple[int, ...] = (1,)
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> ZImageDiTConfig:
+        """Build from a raw config dict, ignoring keys that are not dataclass fields."""
+        # Tuple-typed fields are converted when the raw value is a list.
+        tuple_fields = ("axes_dims", "axes_lens", "all_patch_size", "all_f_patch_size")
+        kwargs = {k: v for k, v in d.items() if k in cls.__dataclass_fields__}
+        kwargs.update({k: tuple(kwargs[k]) for k in tuple_fields if isinstance(kwargs.get(k), list)})
+        return cls(**kwargs)
+
+
+@dataclass
+class ImageVAEConfig:
+    """Image-gen AutoencoderKL settings, read from vae/config.json.
+
+    The latent space has 16 channels; the diffusers-style ``scaling_factor``
+    and ``shift_factor`` are applied to latents before / after the DiT.
+    """
+
+    in_channels: int = 3
+    out_channels: int = 3
+    latent_channels: int = 16
+    layers_per_block: int = 2
+    norm_num_groups: int = 32
+    sample_size: int = 1024
+    scaling_factor: float = 0.3611
+    shift_factor: float = 0.1159
+    force_upcast: bool = True
+    act_fn: str = "silu"
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> ImageVAEConfig:
+        known = {key: d[key] for key in d if key in cls.__dataclass_fields__}
+        return cls(**known)
+
+
+@dataclass
+class ImageGenSchedulerConfig:
+    """Settings read from scheduler/scheduler_config.json (FlowMatchEulerDiscreteScheduler)."""
+
+    num_train_timesteps: int = 1000
+    shift: float = 3.0
+    use_dynamic_shifting: bool = False
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> ImageGenSchedulerConfig:
+        known = {key: d[key] for key in d if key in cls.__dataclass_fields__}
+        return cls(**known)
+
+
+@dataclass
+class ByT5MapperConfig:
+    """Glyph-text pathway config: ByT5 encoder + T5-block mapper (byt5/byt5.json).
+
+    A ByT5-small encoder feeds a ``T5EncoderBlockByT5Mapper``, which
+    projects glyph features into the DiT's caption-feature space
+    (``sdxl_channels``) for text rendering. The font / color special-token
+    annotation files live alongside in ``byt5/``.
+    """
+
+    byt5_mapper_type: str = "T5EncoderBlockByT5Mapper"
+    mapper_num_layers: int = 4
+    sdxl_channels: int = 2560
+    byt5_name: str = "google/byt5-small"
+    byt5_max_length: int = 256
+    multilingual: bool = True
+    special_token: bool = True
+    color_special_token: bool = True
+    font_special_token: bool = True
+
+    @classmethod
+    def from_json(cls, d: dict[str, Any]) -> ByT5MapperConfig:
+        mapper_cfg = d.get("byt5_mapper_config") or {}
+        encoder_cfg = d.get("byt5_config") or {}
+        return cls(
+            byt5_mapper_type=str(d.get("byt5_mapper_type", "T5EncoderBlockByT5Mapper")),
+            mapper_num_layers=int(mapper_cfg.get("num_layers", 4)),
+            sdxl_channels=int(mapper_cfg.get("sdxl_channels", 2560)),
+            byt5_name=str(encoder_cfg.get("byt5_name", "google/byt5-small")),
+            byt5_max_length=int(d.get("byt5_max_length", 256)),
+            multilingual=bool(encoder_cfg.get("multilingual", True)),
+            special_token=bool(encoder_cfg.get("special_token", True)),
+            color_special_token=bool(encoder_cfg.get("color_special_token", True)),
+            font_special_token=bool(encoder_cfg.get("font_special_token", True)),
+        )
+
+
+@dataclass
+class ImageGenConfig:
+    """Ming-flash-omni-2.0 image-generation pipeline (ZImage DiT + ByT5).
+
+    Typed sub-config tree parsed from the released ckpt's imagegen
+    subdirs. The DiT / VAE / scheduler / byt5 dataclasses carry the dims
+    the modeling code (step 9b+) reads; the connector is a Qwen2 backbone
+    kept as a raw dict (built via the talker's Qwen2 path when wired).
+    """
+
+    # Subfolder names (mirror upstream MingImageGenConfig)
+    transformer_subfolder: str = "transformer"
+    vae_subfolder: str = "vae"
+    scheduler_subfolder: str = "scheduler"
+    byt5_subfolder: str = "byt5"
+    connector_subfolder: str = "connector"
+    mlp_subfolder: str = "mlp"
+
+    # From mlp/config.json
+    img_gen_scales: list[int] = field(default_factory=lambda: [16])
+    diffusion_c_input_dim: int = 2560
+    text_encoder_norm: bool = True
+    use_identity_mlp: bool = True
+    dit_type: str = "zimage"
+
+    # Defaults for image-gen sampling (match upstream MingImageGenConfig)
+    num_inference_steps: int = 30
+    guidance_scale: float = 2.0
+    default_height: int = 1024
+    default_width: int = 1024
+
+    # Typed sub-configs (populated by from_subdirs when the subdir exists).
+    dit: ZImageDiTConfig = field(default_factory=ZImageDiTConfig)
+    vae: ImageVAEConfig = field(default_factory=ImageVAEConfig)
+    scheduler: ImageGenSchedulerConfig = field(default_factory=ImageGenSchedulerConfig)
+    byt5: ByT5MapperConfig = field(default_factory=ByT5MapperConfig)
+    # The connector is a Qwen2 LLM (1536-dim, 28L); kept as a raw dict
+    # because it's built via the shared Qwen2 path at model-construction
+    # time, not read field-by-field here.
+    connector: dict[str, Any] | None = None
+
+    @property
+    def num_query_tokens(self) -> int:
+        """Total learnable query tokens appended to the thinker for image-gen.
+
+        ``img_gen_scales=[16]`` ⇒ 256. Matches upstream
+        ``MingImageGenConfig.num_query_tokens`` and
+        ``vllm_omni/.../ming_flash_omni/prompt_utils.py:DEFAULT_NUM_QUERY_TOKENS``.
+        """
+        return sum(s * s for s in self.img_gen_scales)
+
+    @staticmethod
+    def _load_json(path: Path) -> Any:
+        """Parse ``path`` as JSON, decoding UTF-8 regardless of the locale."""
+        with open(path, encoding="utf-8") as f:
+            return json.load(f)
+
+    @classmethod
+    def from_subdirs(cls, local_dir: str | os.PathLike[str]) -> ImageGenConfig | None:
+        """Load the config tree from the imagegen subdirs of ``local_dir``.
+
+        Returns None when the DiT's ``config.json`` is absent (e.g. a
+        thinker-only checkpoint). Every other file is optional: a missing
+        one leaves the corresponding defaults in place.
+        """
+        root = Path(local_dir)
+        instance = cls()
+
+        # The DiT config is the load gate — it's the most expensive
+        # component and would fail loudly later anyway. Resolve it through
+        # ``transformer_subfolder`` like every other path below.
+        dit_path = root / instance.transformer_subfolder / "config.json"
+        if not dit_path.exists():
+            return None
+
+        # mlp/config.json overrides the imagegen knobs exposed at the top
+        # level; each present key is coerced to its declared type.
+        mlp_path = root / instance.mlp_subfolder / "config.json"
+        if mlp_path.exists():
+            mlp_raw = cls._load_json(mlp_path)
+            for key, coerce in (
+                ("img_gen_scales", list),
+                ("diffusion_c_input_dim", int),
+                ("text_encoder_norm", bool),
+                ("use_identity_mlp", bool),
+                ("dit_type", str),
+            ):
+                if key in mlp_raw:
+                    setattr(instance, key, coerce(mlp_raw[key]))
+
+        instance.dit = ZImageDiTConfig.from_dict(cls._load_json(dit_path))
+
+        vae_path = root / instance.vae_subfolder / "config.json"
+        if vae_path.exists():
+            instance.vae = ImageVAEConfig.from_dict(cls._load_json(vae_path))
+
+        sched_path = root / instance.scheduler_subfolder / "scheduler_config.json"
+        if sched_path.exists():
+            instance.scheduler = ImageGenSchedulerConfig.from_dict(cls._load_json(sched_path))
+
+        byt5_path = root / instance.byt5_subfolder / "byt5.json"
+        if byt5_path.exists():
+            instance.byt5 = ByT5MapperConfig.from_json(cls._load_json(byt5_path))
+
+        # connector/ (Qwen2) — keep the raw parsed dict.
+        conn_path = root / instance.connector_subfolder / "config.json"
+        if conn_path.exists():
+            instance.connector = cls._load_json(conn_path)
+
+        return instance
+
+
+# ---------------------------------------------------------------------------
+# Top-level
+# ---------------------------------------------------------------------------
+
+@dataclass
+class MingFlashOmniModelConfig:
+    """Unified config for Ming-flash-omni-2.0 loaded from a local HF checkpoint."""
+
+    local_dir: str = ""
+
+    # Top-level scalar from config.json (cross-modal connector MLP depth)
+    mlp_depth: int = 2
+
+    # Sub-configs
+    thinker_llm: ThinkerLLMConfig = field(default_factory=ThinkerLLMConfig)
+    vision: VisionEncoderConfig = field(default_factory=VisionEncoderConfig)
+    audio_encoder: AudioEncoderConfig = field(default_factory=AudioEncoderConfig)
+    talker: TalkerConfig | None = None
+    image_gen: ImageGenConfig | None = None
+
+    # ------------------------------------------------------------------
+    # Sanity checks
+    # ------------------------------------------------------------------
+
+    def __post_init__(self) -> None:
+        """Validate the thinker LLM config; raises ValueError on the first violation."""
+        llm = self.thinker_llm
+        assert llm.head_dim is not None  # set in ThinkerLLMConfig.__post_init__
+
+        # head_dim consistency. The upstream-default mismatch (head_dim=128
+        # alongside hidden_size // num_heads) is tolerated because Ming
+        # overrides it explicitly; fail only when nothing matches.
+        heads_fill_hidden = llm.head_dim * llm.num_attention_heads == llm.hidden_size
+        if not heads_fill_hidden and llm.head_dim != 128:
+            raise ValueError(
+                f"ThinkerLLMConfig: head_dim={llm.head_dim} inconsistent with "
+                f"hidden_size={llm.hidden_size} / num_attention_heads={llm.num_attention_heads}"
+            )
+
+        # MRoPE / partial-rotary invariant. Each head rotates
+        # ``head_dim * partial_rotary_factor`` dims as (cos, sin) pairs, so
+        # ``mrope_section`` must split half of them across the time / height
+        # / width axes. Qwen3-Omni: 128 * 1.0 / 2 = 64 = sum([16,24,24]);
+        # Ming-flash-omni: 128 * 0.5 / 2 = 32 = sum([8,12,12]).
+        rotary_pair_dims = int(llm.head_dim * llm.partial_rotary_factor) // 2
+        section_sum = sum(llm.mrope_section)
+        if section_sum != rotary_pair_dims:
+            raise ValueError(
+                f"MRoPE section {llm.mrope_section} sums to {section_sum} but "
+                f"(head_dim={llm.head_dim} * partial_rotary_factor="
+                f"{llm.partial_rotary_factor}) / 2 = {rotary_pair_dims}. "
+                f"Section must partition the cos/sin half of the rotary dims."
+            )
+
+        # Multimodal token IDs must be within vocab.
+        for name in (
+            "image_patch_token", "video_patch_token", "audio_patch_token",
+            "image_start_token", "video_start_token", "audio_start_token",
+            "image_end_token", "video_end_token", "audio_end_token",
+        ):
+            token_id = getattr(llm, name)
+            if not (0 <= token_id < llm.vocab_size):
+                raise ValueError(
+                    f"ThinkerLLMConfig.{name}={token_id} is out of range for "
+                    f"vocab_size={llm.vocab_size}"
+                )
+
+    # ------------------------------------------------------------------
+    # Convenience accessors (downstream code reads these — keep stable)
+    # ------------------------------------------------------------------
+
+    @property
+    def thinker_hidden_size(self) -> int:
+        return self.thinker_llm.hidden_size
+
+    @property
+    def thinker_num_layers(self) -> int:
+        return self.thinker_llm.num_hidden_layers
+
+    @property
+    def thinker_head_dim(self) -> int:
+        assert self.thinker_llm.head_dim is not None
+        return self.thinker_llm.head_dim
+
+    @property
+    def thinker_num_kv_heads(self) -> int:
+        return self.thinker_llm.num_key_value_heads
+
+    @property
+    def vocab_size(self) -> int:
+        return self.thinker_llm.vocab_size
+
+    # ------------------------------------------------------------------
+    # Construction
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def from_pretrained(cls, local_dir: str | os.PathLike[str]) -> MingFlashOmniModelConfig:
+        """Load configuration from a local HF checkpoint directory.
+
+        ``config.json`` supplies the thinker / vision / audio sections;
+        ``talker/`` and the imagegen subdirs go through their own loaders.
+        Raises FileNotFoundError when ``config.json`` is missing.
+        """
+        local_dir = str(local_dir)
+        config_path = Path(local_dir) / "config.json"
+        if not config_path.exists():
+            raise FileNotFoundError(f"config.json not found in {local_dir}")
+
+        # JSON is UTF-8 by spec — don't fall back to the locale encoding.
+        with open(config_path, encoding="utf-8") as f:
+            raw: dict[str, Any] = json.load(f)
+
+        thinker_llm = ThinkerLLMConfig.from_dict(raw.get("llm_config", {}))
+        vision = VisionEncoderConfig.from_dict(raw.get("vision_config", {}))
+        audio_encoder = AudioEncoderConfig.from_dict(raw.get("audio_config", {}))
+        mlp_depth = int(raw.get("mlp_depth", 2))
+        talker = TalkerConfig.from_subdir(Path(local_dir) / "talker")
+        image_gen = ImageGenConfig.from_subdirs(local_dir)
+
+        return cls(
+            local_dir=local_dir,
+            mlp_depth=mlp_depth,
+            thinker_llm=thinker_llm,
+            vision=vision,
+            audio_encoder=audio_encoder,
+            talker=talker,
+            image_gen=image_gen,
+        )
diff --git a/mstar/model/ming_omni_flash/loader.py b/mstar/model/ming_omni_flash/loader.py
new file mode 100644
index 00000000..9f5c7613
--- /dev/null
+++ b/mstar/model/ming_omni_flash/loader.py
@@ -0,0 +1,618 @@
+"""Weight loader for the Ling-2.0 thinker (TP-aware via load_hf_weights).
+
+Step 3e refactor: instead of a custom per-shard loop, we now stream
+the checkpoint through mstar's :func:`load_hf_weights` machinery.
+Per-rank slicing happens inside the parameter-attached
+``weight_loader`` callbacks of the TP-aware modules — same pattern as
+Qwen3-Omni's loader at
+``mstar/model/qwen3_omni/qwen3_omni_model.py:1242-1334``.
+
+## What this loader handles
+
+1. **Outer prefix strip**: ``model.X.Y`` → ``X.Y`` (the wrapper is
+   ``BailingMM2NativeForConditionalGeneration.model``).
+2. **Per-layer renames**: ``model.layers.{i}.attention.{query_key_value,
+   dense,q_norm,k_norm}.weight`` → ``layers.{i}.self_attn.{qkv_proj,
+   dense,q_norm,k_norm}.weight``; ``mlp.{gate,image_gate,audio_gate}.weight``
+   → ``mlp.{...}.gate.weight`` (extra nesting for the router's inner
+   nn.Linear); ``mlp.shared_experts.*`` → ``mlp.shared_expert.*``.
+3. **Packed QKV split**: ``attention.query_key_value.weight`` is one
+   `(Q+2K)*D x H` tensor in the checkpoint, but :class:`QKVParallelLinear`
+   wants three calls (one each with shard_id ``"q"``/``"k"``/``"v"``).
+   Done by ``_split_packed_qkv`` which intercepts QKV keys and emits
+   three synthetic stream entries.
+4. **Per-expert fusion**: 256 separate ``experts.N.gate_proj.weight``
+   keys per layer → packed ``experts.gate_up_proj`` tensor.
+   ``_remap_thinker_keys`` rewrites them to
+   ``experts.{gate,up,down}_proj.__expertN__.weight`` so
+   :class:`StackedParamRule.source_suffix` matching works; the per-rule
+   ``shard_id="gate:N"`` / ``"up:N"`` / ``"down:N"`` strings drive
+   mstar's per-rank ``_gate_up_weight_loader`` / ``_down_proj_weight_loader``
+   to write into the right expert slot per rank.
+
+Per-rank TP slicing happens automatically — every TP-aware module
+(``QKVParallelLinear``, ``RowParallelLinear``, ``ParallelGatedMLP``,
+``LingMoeBlock.experts``) attaches its own ``weight_loader`` callback
+that knows its ``tp_rank``/``tp_size`` and slices the loaded tensor
+accordingly.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from collections.abc import Iterable
+
+import torch
+
+from mstar.model.loader.base import StackedParamRule, load_hf_weights
+from mstar.model.loader.iterators import iter_safetensors_shards
+from mstar.model.ming_omni_flash.components.model import LingMoeModel
+
+logger = logging.getLogger(__name__)
+
+
+# Outer wrapper prefix in the checkpoint; removed before any other rename.
+_CKPT_THINKER_PREFIX = "model."
+
+
+# Static substring renames applied by ``_apply_substring_renames``. Expert
+# fusion and the packed-QKV split have their own handling.
+_SUBSTRING_RENAMES: list[tuple[str, str]] = [
+    # No entries for embed / norm / lm_head: ``lm_head.weight`` needs no
+    # rename, and ``_apply_substring_renames`` strips the inner ``model.``
+    # of word_embeddings / norm / layers itself, since that must apply
+    # only when ``model.`` is a prefix.
+
+    # Attention (any layer index). _split_packed_qkv replaces packed
+    # query_key_value.weight keys with synthetic q/k/v keys before
+    # renaming, so the first rule is not expected to fire for them; it is
+    # harmless and documents intent.
+    ("attention.query_key_value", "self_attn.qkv_proj"),
+    # Synthetic q/k/v keys from _split_packed_qkv; StackedParamRules route
+    # them into the fused self_attn.qkv_proj via shard_id "q"/"k"/"v".
+    ("attention.q_proj", "self_attn.q_proj"),
+    ("attention.k_proj", "self_attn.k_proj"),
+    ("attention.v_proj", "self_attn.v_proj"),
+    ("attention.dense", "self_attn.dense"),
+    ("attention.q_norm", "self_attn.q_norm"),
+    ("attention.k_norm", "self_attn.k_norm"),
+    # Routers (gate / image_gate / audio_gate): the weight sits one level
+    # deeper in mstar, under the router's inner nn.Linear named ``gate``.
+    ("mlp.gate.weight", "mlp.gate.gate.weight"),
+    ("mlp.image_gate.weight", "mlp.image_gate.gate.weight"),
+    ("mlp.audio_gate.weight", "mlp.audio_gate.gate.weight"),
+    # Shared expert: plural in the ckpt, singular in mstar.
+    ("mlp.shared_experts.", "mlp.shared_expert."),
+]
+
+
+# Per-expert MoE weights: captures (prefix, expert index, projection name).
+_EXPERT_KEY_RE = re.compile(
+    r"^(.*)\.mlp\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$"
+)
+
+
+def _strip_outer_model_prefix(key: str) -> str | None:
+    """Remove the checkpoint's outermost ``model.`` wrapper prefix from ``key``.
+
+    Keys without that prefix yield None; the original author expects these
+    to be non-thinker tensors (audio.*, vision.*, ...).
+
+    The remainder may still begin with an inner ``model.`` (layers / norm /
+    word_embeddings); that one is handled by ``_apply_substring_renames``.
+    """
+    outer = _CKPT_THINKER_PREFIX
+    return key[len(outer):] if key.startswith(outer) else None
+
+
+def _apply_substring_renames(key: str) -> str:
+    """Apply the ``_SUBSTRING_RENAMES`` rules, then rewrite a leading inner ``model.``."""
+    for old, new in _SUBSTRING_RENAMES:
+        key = key.replace(old, new)  # no-op when ``old`` is absent
+    # First matching head wins: word_embeddings → embed_tokens; norm / layers
+    # just lose their ``model.``. Anything else passes through unchanged.
+    head_rewrites = (
+        ("model.word_embeddings", "embed_tokens"),
+        ("model.norm", "norm"),
+        ("model.layers.", "layers."),
+    )
+    for old, new in head_rewrites:
+        if key.startswith(old):
+            return new + key[len(old):]
+    return key
+
+
+def _remap_thinker_keys(key: str) -> str | None:
+    """Translate a checkpoint key into its thinker-module parameter name.
+
+    Returns None (drop the key) when ``_strip_outer_model_prefix`` rejects it.
+    Per-expert MoE keys become ``<prefix>.mlp.experts.<proj>.__expertN__.weight``
+    — the form the per-expert StackedParamRules match on; all other keys go
+    through ``_apply_substring_renames``.
+    """
+    inner = _strip_outer_model_prefix(key)
+    if inner is None:
+        return None
+
+    expert = _EXPERT_KEY_RE.match(inner)
+    if expert is None:
+        return _apply_substring_renames(inner)
+
+    layer_prefix, expert_idx, proj = expert.groups()
+    # e.g. "model.layers.5" → "layers.5"
+    if layer_prefix.startswith("model.layers."):
+        layer_prefix = layer_prefix[len("model."):]
+    return f"{layer_prefix}.mlp.experts.{proj}.__expert{expert_idx}__.weight"
+
+
+def _build_thinker_stacked_params(num_experts: int) -> list[StackedParamRule]:
+    """Return the stacked-param rules: per-expert MoE, dense MLP, then QKV.
+
+    Order matters. Per-expert rules MUST precede the dense-MLP ones: the
+    original note warns that the dense ``.gate_proj`` / ``.up_proj``
+    suffixes would otherwise also match the remapped MoE keys, and
+    :func:`_apply_stacked` returns on the first match.
+    """
+    # (target suffix, projection name in the remapped key, shard_id tag)
+    expert_layout = (
+        (".experts.gate_up_proj", "gate_proj", "gate"),
+        (".experts.gate_up_proj", "up_proj", "up"),
+        (".experts.down_proj", "down_proj", "down"),
+    )
+    # Expert-major order: gate / up / down of expert 0, then expert 1, ...
+    rules: list[StackedParamRule] = [
+        StackedParamRule(
+            target_suffix=target,
+            source_suffix=f".experts.{proj}.__expert{idx}__.weight",
+            shard_id=f"{tag}:{idx}",
+        )
+        for idx in range(num_experts)
+        for target, proj, tag in expert_layout
+    ]
+    # Dense layer-0 MLP fusion (ParallelGatedMLP holds gate_up_proj).
+    rules += [
+        StackedParamRule(".gate_up_proj", ".gate_proj", 0),
+        StackedParamRule(".gate_up_proj", ".up_proj", 1),
+    ]
+    # Attention QKV fusion: the synthetic q/k/v keys from _split_packed_qkv
+    # are routed into the fused qkv_proj, selected by shard_id "q"/"k"/"v".
+    rules += [
+        StackedParamRule(".qkv_proj", f".{shard}_proj", shard) for shard in "qkv"
+    ]
+    return rules
+
+
+def _split_packed_qkv(
+    weights: Iterable[tuple[str, torch.Tensor]],
+    num_attention_heads: int,
+    num_kv_heads: int,
+    head_dim: int,
+) -> Iterable[tuple[str, torch.Tensor]]:
+    """Stream-transform that unpacks fused ``query_key_value`` weights.
+
+    Each entry whose key ends in ``attention.query_key_value.weight`` is
+    replaced by ``<prefix>attention.{q,k,v}_proj.weight`` entries (in that
+    order) holding row-slices of the packed tensor; every other entry
+    passes through untouched. Expected packing is
+    ``(num_attention_heads + 2 * num_kv_heads) * head_dim`` rows ordered
+    [Q rows, K rows, V rows].
+
+    Raises ValueError if a packed tensor's first dim differs from that total.
+    """
+    q_rows = num_attention_heads * head_dim
+    kv_rows = num_kv_heads * head_dim
+    total_rows = q_rows + 2 * kv_rows
+    # (shard letter, first row, one-past-last row) in ckpt packing order.
+    row_ranges = (
+        ("q", 0, q_rows),
+        ("k", q_rows, q_rows + kv_rows),
+        ("v", q_rows + kv_rows, total_rows),
+    )
+    packed_key = re.compile(r"^(.*attention\.)query_key_value\.weight$")
+
+    for name, tensor in weights:
+        hit = packed_key.match(name)
+        if hit is None:
+            yield name, tensor
+            continue
+        if tensor.shape[0] != total_rows:
+            raise ValueError(
+                f"{name}: expected first dim {total_rows} "
+                f"(num_heads={num_attention_heads}, num_kv_heads={num_kv_heads},"
+                f" head_dim={head_dim}); got {tensor.shape[0]}"
+            )
+        stem = hit.group(1)
+        for shard, start, stop in row_ranges:
+            yield f"{stem}{shard}_proj.weight", tensor[start:stop, :]
+
+
+def load_thinker_weights(
+    model: LingMoeModel,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> None:
+    """Stream the checkpoint into the TP-aware LingMoeModel.
+
+    Sequencing:
+      1. Iterate sharded safetensors via mstar's `iter_safetensors_shards`.
+      2. Pre-split packed QKV keys into synthetic q/k/v keys.
+      3. Pass through `load_hf_weights` with our `name_remapper` +
+         per-expert StackedParamRules + dense-MLP rules. mstar's
+         parameter-attached `weight_loader`s do per-rank slicing.
+
+    Args:
+        model: LingMoeModel constructed with the right comm_group; param
+            tensors must already be on `device`.
+        local_dir: path to the Ming snapshot.
+        device: where to materialise loaded tensors (`"cpu"` /
+            `"cuda"` / `"cuda:N"`).
+        strict: if True, raise when any key of ``model.state_dict()`` is
+            absent from the set of names reported by `load_hf_weights`.
+
+    Note:
+        ``state_dict()`` also contains persistent buffers, so under
+        ``strict=True`` those must be reported as loaded too.
+
+    Raises:
+        KeyError: with ``strict=True``, listing up to 10 (sorted) missing
+            keys and the total count.
+    """
+    # The QKV split needs the head layout; no config is passed in, so
+    # read it off the first layer's attention module.
+    first_attn = model.layers[0].self_attn
+    num_heads = first_attn.total_num_heads
+    num_kv = first_attn.total_num_kv_heads
+    head_dim = first_attn.head_dim
+
+    # Only keys under the thinker prefix are requested, so vision / audio
+    # tensors are not streamed. A layer's per-expert keys may live in
+    # different shards. NOTE(review): this relies on
+    # iter_safetensors_shards filtering by ``prefix`` across every shard
+    # of the snapshot — confirm against its implementation.
+    raw_weights = iter_safetensors_shards(
+        local_dir, device=device, prefix=_CKPT_THINKER_PREFIX,
+    )
+
+    # Split packed QKV first; load_hf_weights handles the rest (renaming
+    # via name_remapper, stacked rules, weight_loader dispatch).
+    split_weights = _split_packed_qkv(
+        raw_weights,
+        num_attention_heads=num_heads,
+        num_kv_heads=num_kv,
+        head_dim=head_dim,
+    )
+
+    # The last layer decides whether per-expert rules are needed: with no
+    # MoE layer (e.g. a tiny test model) none are generated.
+    last_layer = model.layers[-1]
+    num_experts = last_layer.mlp.num_experts if last_layer.is_moe else 0
+    stacked = _build_thinker_stacked_params(num_experts=num_experts)
+
+    loaded = load_hf_weights(
+        model, split_weights,
+        stacked_params=stacked,
+        name_remapper=_remap_thinker_keys,
+    )
+
+    if strict:
+        # A fused expert tensor is written many times (once per expert /
+        # shard rule) but `loaded` records target names, so a single
+        # membership test per state-dict key covers fused and plain
+        # entries alike — no special case is needed for
+        # ``.experts.gate_up_proj`` / ``.experts.down_proj``. This only
+        # proves each target was touched at least once, not that every
+        # expert slot was filled.
+        target_keys = model.state_dict().keys()
+        missing = sorted(k for k in target_keys if k not in loaded)
+        if missing:
+            raise KeyError(
+                f"Missing thinker parameters after load (strict=True). "
+                f"Sample missing keys: {missing[:10]} "
+                f"(total {len(missing)})"
+            )
+
+    logger.info(
+        "Loaded %d unique target params into LingMoeModel(num_hidden_layers=%d) "
+        "from %s (rank %d/%d).",
+        len(loaded), model.num_hidden_layers, local_dir,
+        model.comm_group.rank, model.comm_group.world_size,
+    )
+
+
+# ===========================================================================
+# Vision / audio encoder + projector loaders (step 4b)
+# ===========================================================================
+#
+# These modules aren't TP-aware (run on a single rank in the typical
+# topology — vision_encoder + audio_encoder colocate on rank 0 per
+# configs/ming_flash_omni.yaml). Loading is a plain prefix-strip +
+# load_state_dict path; no per-rank slicing or stacked-rule fusion.
+#
+# Released ckpt's relevant top-level prefixes:
+#   vision.*              -> MingVisionEncoder (Qwen3MoeVisionTransformer)
+#   audio.*               -> MingAudioEncoder  (Whisper)
+#   linear_proj.*         -> MingVisionProjector (nn.Sequential under .proj)
+#   linear_proj_audio.*   -> MingAudioProjector  (nn.Sequential under .proj)
+
+
+def _load_prefixed_state_dict(
+    module: torch.nn.Module,
+    local_dir: str,
+    prefix: str,
+    inner_prefix: str = "",
+    subdir: str = "",
+    device: str = "cpu",
+    strict: bool = True,
+    allow_missing: set[str] | None = None,
+    allow_unexpected: set[str] | None = None,
+) -> set[str]:
+    """Shared loader for the encoder / projector / talker / VAE weights.
+
+    Streams the safetensors shards under ``<local_dir>/<subdir>``, keeps
+    the keys starting with ``prefix``, strips that prefix, prepends
+    ``inner_prefix`` and feeds the result to ``module.load_state_dict``.
+
+    Args:
+        module:        target nn.Module.
+        local_dir:     snapshot dir with model.safetensors{,.index.json}.
+        prefix:        outer ckpt prefix to filter by and strip. ``""``
+                       loads every key unstripped — used by the AudioVAE
+                       loader, whose ``encoder.*`` / ``decoder.*`` keys
+                       are top-level siblings with no shared prefix.
+        inner_prefix:  prepended to the stripped key. Used by the
+                       projector loaders so ckpt's ``0.weight`` hits
+                       ``proj.0.weight`` on our module.
+        subdir:        subdirectory of ``local_dir`` holding the shard
+                       set (talker loaders: ``"talker"`` /
+                       ``"talker/vae"``); ``""`` means top-level.
+        device:        target device for loaded tensors.
+        strict:        if True, raise on any missing or unexpected key
+                       that is not covered by the two allow-lists.
+        allow_missing: module state_dict names (parameters or buffers)
+                       the ckpt is allowed to omit.
+        allow_unexpected: post-rename key names (module namespace, not
+                       the ckpt's) that may be present in the ckpt
+                       without a counterpart in the module.
+
+    Returns:
+        The set of post-rename keys that were handed to ``load_state_dict``.
+
+    Raises:
+        KeyError: if no checkpoint key matches ``prefix``, or — with
+            ``strict=True`` — on a missing / unexpected key outside the
+            allow-lists.
+    """
+    target_dir = f"{local_dir}/{subdir}" if subdir else local_dir
+    shard_stream = iter_safetensors_shards(
+        target_dir, device=device, prefix=prefix or None,
+    )
+    # Outer prefix off, inner prefix on. The startswith test is defensive:
+    # the iterator is expected to have filtered by ``prefix`` already.
+    state = {
+        f"{inner_prefix}{key[len(prefix):]}": tensor
+        for key, tensor in shard_stream
+        if key.startswith(prefix)
+    }
+    if not state:
+        raise KeyError(
+            f"No checkpoint keys matched prefix {prefix!r} under {target_dir}. "
+            f"Snapshot may be a thinker-only / talker-only variant."
+        )
+
+    # strict=False here: mismatches are judged below so that the allow-lists
+    # (expressed in the module's namespace) can be honoured.
+    missing, unexpected = module.load_state_dict(state, strict=False)
+    tolerated_missing = allow_missing or set()
+    tolerated_unexpected = allow_unexpected or set()
+    real_missing = [name for name in missing if name not in tolerated_missing]
+    real_unexpected = [name for name in unexpected if name not in tolerated_unexpected]
+    if strict and (real_missing or real_unexpected):
+        raise KeyError(
+            f"State-dict mismatch loading prefix {prefix!r} from {target_dir}: "
+            f"missing={real_missing[:10]} (total {len(real_missing)}); "
+            f"unexpected={real_unexpected[:10]} (total {len(real_unexpected)})."
+        )
+
+    logger.info(
+        "Loaded %d params (prefix=%r, subdir=%r) from %s (missing=%d, unexpected=%d).",
+        len(state), prefix, subdir, local_dir, len(missing), len(unexpected),
+    )
+    return set(state)
+
+
+def load_vision_encoder_weights(
+    encoder: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Load the snapshot's ``vision.*`` tensors into a Ming vision encoder.
+
+    Keys are used as-is once ``vision.`` is stripped; per the original
+    note they already match the state_dict of the module returned by
+    ``build_vision_encoder`` (``Qwen3MoeVisionTransformer``).
+    """
+    return _load_prefixed_state_dict(
+        module=encoder, local_dir=local_dir, prefix="vision.",
+        device=device, strict=strict,
+    )
+
+
+def load_audio_encoder_weights(
+    encoder: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Load the snapshot's ``audio.*`` tensors into MingAudioEncoder.
+
+    Nothing is special-cased: ``load_state_dict`` also restores buffers,
+    so a ``positional_embedding`` shipped in the ckpt (per the original
+    note, a trained one) replaces the sinusoidal init of :func:`_sinusoids`.
+    """
+    return _load_prefixed_state_dict(
+        module=encoder, local_dir=local_dir, prefix="audio.", device=device, strict=strict,
+    )
+
+
+def load_vision_projector_weights(
+    projector: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Populate MingVisionProjector from the snapshot's ``linear_proj.*`` keys.
+
+    Checkpoint keys look like ``linear_proj.{0,2}.{weight,bias}`` while the
+    module's state_dict uses ``proj.{0,2}.{weight,bias}``. The helper is
+    therefore asked to strip ``linear_proj.`` and re-root the remainder
+    under ``proj.``.
+
+    Returns:
+        The key set reported by ``_load_prefixed_state_dict``.
+    """
+    loaded_keys = _load_prefixed_state_dict(
+        projector,
+        local_dir,
+        prefix="linear_proj.",
+        inner_prefix="proj.",
+        device=device,
+        strict=strict,
+    )
+    return loaded_keys
+
+
+def load_audio_projector_weights(
+    projector: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Populate MingAudioProjector from ``linear_proj_audio.*`` keys.
+
+    Checkpoint keys look like ``linear_proj_audio.{0,3}.{weight,bias}``;
+    the module stores the same tensors under ``proj.{0,3}.{weight,bias}``,
+    hence the ``proj.`` inner prefix.
+
+    Returns:
+        The key set reported by ``_load_prefixed_state_dict``.
+    """
+    loaded_keys = _load_prefixed_state_dict(
+        projector,
+        local_dir,
+        prefix="linear_proj_audio.",
+        inner_prefix="proj.",
+        device=device,
+        strict=strict,
+    )
+    return loaded_keys
+
+
+# ===========================================================================
+# Talker + AudioVAE loaders (step 6f)
+# ===========================================================================
+#
+# The talker lives in two separate safetensors files under the snapshot:
+#
+#   talker/model.safetensors        — model.* (Qwen2 LLM backbone),
+#                                     cfm.*, aggregator.*, stop_head.*,
+#                                     spk_head.*
+#   talker/vae/model.safetensors    — encoder.* + decoder.* (AudioVAE)
+#
+# All loaders below are non-TP — the talker colocates on a single rank
+# in the typical topology and the snapshot's key layout matches the
+# upstream module tree 1:1, so a plain prefix-strip +
+# load_state_dict via _load_prefixed_state_dict is enough.
+
+
+def load_talker_llm_weights(
+    qwen2_model: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Populate a Qwen2Model from ``model.*`` in ``talker/model.safetensors``.
+
+    A bare Qwen2Model has no leading ``model.`` in its state_dict keys
+    (that prefix only appears when HF wraps it in Qwen2ForCausalLM).
+    Stripping the prefix turns ``model.embed_tokens.weight`` and
+    ``model.layers.N.*`` into ``embed_tokens.weight`` and ``layers.N.*``,
+    which is the naming Qwen2Model expects.
+
+    Returns:
+        The key set reported by ``_load_prefixed_state_dict``.
+    """
+    loaded_keys = _load_prefixed_state_dict(
+        qwen2_model,
+        local_dir,
+        prefix="model.",
+        subdir="talker",
+        device=device,
+        strict=strict,
+    )
+    return loaded_keys
+
+
+def load_talker_cfm_weights(
+    cfm: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Populate a `CFM` module from ``cfm.*`` in ``talker/model.safetensors``.
+
+    Because the module is built as ``CFM(model=DiT)``, its state_dict keys
+    start with ``model.<...>``. A checkpoint key such as
+    ``cfm.model.blocks.N.attn.to_q.weight`` becomes
+    ``model.blocks.N.attn.to_q.weight`` after the strip, which matches.
+
+    One key is tolerated as unexpected: the checkpoint contains
+    ``cfm.model.rotary_embed.inv_freq``, but our `RotaryEmbedding`
+    registers `inv_freq` with ``persistent=False``, so the buffer is not
+    part of our state_dict and load_state_dict reports it as unexpected.
+    The table is deterministic given ``head_dim`` and ``rope_theta``, so
+    the locally recomputed buffer is numerically equivalent.
+
+    Returns:
+        The key set reported by ``_load_prefixed_state_dict``.
+    """
+    tolerated_extras = {"model.rotary_embed.inv_freq"}
+    loaded_keys = _load_prefixed_state_dict(
+        cfm,
+        local_dir,
+        prefix="cfm.",
+        subdir="talker",
+        device=device,
+        strict=strict,
+        allow_unexpected=tolerated_extras,
+    )
+    return loaded_keys
+
+
+def load_talker_aggregator_weights(
+    aggregator: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Populate the aggregator from ``aggregator.*`` in ``talker/model.safetensors``.
+
+    As with the CFM loader, the checkpoint's `rotary_embed.inv_freq`
+    corresponds to a non-persistent buffer on our side, so that single
+    key is accepted as an expected "unexpected" entry.
+
+    Returns:
+        The key set reported by ``_load_prefixed_state_dict``.
+    """
+    tolerated_extras = {"rotary_embed.inv_freq"}
+    loaded_keys = _load_prefixed_state_dict(
+        aggregator,
+        local_dir,
+        prefix="aggregator.",
+        subdir="talker",
+        device=device,
+        strict=strict,
+        allow_unexpected=tolerated_extras,
+    )
+    return loaded_keys
+
+
+def load_talker_heads_weights(
+    heads: dict[str, torch.nn.Module],
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> dict[str, set[str]]:
+    """Load ``stop_head.*`` + ``spk_head.*`` into the matching heads dict.
+
+    Args:
+        heads: dict produced by `build_talker_heads`; must contain
+            both ``stop_head`` and ``spk_head`` keys.
+        local_dir: snapshot dir; the heads live in talker/model.safetensors.
+        device: forwarded unchanged to ``_load_prefixed_state_dict``.
+        strict: forwarded unchanged to ``_load_prefixed_state_dict``.
+
+    Returns:
+        ``{"stop_head": loaded_keys, "spk_head": loaded_keys}``.
+
+    Raises:
+        KeyError: if ``heads`` lacks one of the required entries. The check
+            runs before any weights are loaded, so a malformed dict never
+            leaves one head loaded and the other untouched.
+    """
+    required = ("stop_head", "spk_head")
+
+    # Fail fast: validate every required key up front, before the first
+    # load mutates any module.
+    for name in required:
+        if name not in heads:
+            raise KeyError(f"`heads` dict missing required key {name!r}")
+
+    return {
+        name: _load_prefixed_state_dict(
+            heads[name], local_dir, prefix=f"{name}.", subdir="talker",
+            device=device, strict=strict,
+        )
+        for name in required
+    }
+
+
+def load_talker_audio_vae_weights(
+    audio_vae: torch.nn.Module,
+    local_dir: str,
+    device: str = "cpu",
+    strict: bool = True,
+) -> set[str]:
+    """Populate `AudioVAE` from ``talker/vae/model.safetensors``.
+
+    The checkpoint's two top-level subtrees (`encoder.*` and `decoder.*`)
+    already mirror `AudioVAE`'s state_dict layout, so there is nothing to
+    strip. An empty ``prefix`` tells the helper to take every key.
+
+    Returns:
+        The key set reported by ``_load_prefixed_state_dict``.
+    """
+    loaded_keys = _load_prefixed_state_dict(
+        audio_vae,
+        local_dir,
+        prefix="",
+        subdir="talker/vae",
+        device=device,
+        strict=strict,
+    )
+    return loaded_keys
diff --git a/mstar/model/ming_omni_flash/ming_omni_flash_model.py b/mstar/model/ming_omni_flash/ming_omni_flash_model.py
new file mode 100644
index 00000000..f500a2f1
--- /dev/null
+++ b/mstar/model/ming_omni_flash/ming_omni_flash_model.py
@@ -0,0 +1,1682 @@
+"""MingFlashOmniModel: native mstar port of Ming-flash-omni-2.0.
+
+Step 3d: text-only thinker path is wired end-to-end. Vision / audio /
+talker / image-gen are step 4+.
+
+The released checkpoint (``inclusionAI/Ming-flash-omni-2.0``, 2026-02-11) is a
+Ling-2.0 sparse-MoE omni model: 100B total / 6B active params, ~238 GB / 42
+shards. The vllm-omni reference port (~6,500 LOC) lives at::
+
+    /sgl-workspace/vllm-omni/vllm_omni/model_executor/models/ming_flash_omni/
+
+That tree is the source of truth for the architecture; this scaffold mirrors
+mstar's class shape (``mstar/model/qwen3_omni/qwen3_omni_model.py``) and
+leaves each abstractmethod raising ``NotImplementedError`` with a pointer to
+the corresponding upstream file/symbol.
+
+Target partition layout (mirrors vllm-omni's deploy yamls):
+
+    Thinker   — Ling-2.0 MoE LLM + vision/audio encoders -> text out
+    Talker    — CFM head + small LLM -> audio waveform via AudioVAE
+    ImageGen  — ByT5 + ZImage DiT -> image out (separate deploy)
+
+Mapping to vllm-omni source (use these as the porting cribsheet):
+
+    Thinker       -> ming_flash_omni_thinker.py            (1,164 LOC)
+    Talker        -> ming_flash_omni_talker.py + talker_module.py
+    AudioVAE      -> audio_vae.py
+    AudioEncoder  -> audio_encoder.py
+    Vision        -> vision_encoder.py + projectors.py
+    Ling MoE LLM  -> modeling_bailing_moe_v2.py            (892 LOC)
+    ImageGen      -> /sgl-workspace/vllm-omni/vllm_omni/diffusion/models/ming_flash_omni/
+    Pipeline glue -> pipeline.py + ming_flash_omni.py
+    Prompt tokens -> prompt_utils.py (IMAGE_PATCH_TOKEN, BASE_CAPTION_TEMPLATE)
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import sys
+from pathlib import Path
+
+import torch
+
+from mstar.communication.tensors import NameToTensorList
+from mstar.conductor.request_info import (
+    CurrentForwardConductorMetadata,
+    PartitionDefinition,
+    StreamingConnectionState,
+)
+from mstar.engine.base import EngineType
+from mstar.engine.kv_store import KVCacheConfig
+from mstar.graph.base import (
+    GraphEdge,
+    GraphNode,
+    GraphSection,
+    Loop,
+    Sequential,
+    TensorPointerInfo,
+)
+from mstar.graph.special_destinations import EMIT_TO_CLIENT
+from mstar.model.base import ForwardPassArgs, Model
+from mstar.model.ming_omni_flash.components.model import LingMoeModel
+from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+from mstar.model.ming_omni_flash.loader import load_thinker_weights
+from mstar.model.ming_omni_flash.submodules import (
+    AudioEncoderSubmodule,
+    BailingMoeV2ThinkerSubmodule,
+    VisionEncoderSubmodule,
+)
+from mstar.streaming.chunk_policy import FixedChunkPolicy
+from mstar.streaming.topology import Connection, PartitionTopology, StreamingGraphEdge
+
+logger = logging.getLogger(__name__)
+
+
+# Human-readable explanation for code paths that are not natively ported
+# yet; it points users at the vllm-omni benchmark route and the reference
+# implementation.
+# NOTE(review): no use of this constant is visible in this part of the
+# file — confirm it is still referenced before relying on the wording.
+_NOT_PORTED = (
+    "MingFlashOmniModel is a scaffold; the native mstar port is incomplete. "
+    "Benchmark via `--inference-system vllm_omni` against a vllm-omni server "
+    "(see benchmark/vllm_omni_instructions.md) until this lands. Reference "
+    "implementation: /sgl-workspace/vllm-omni/vllm_omni/model_executor/models/ming_flash_omni/."
+)
+
+
+# Files in the Ming GitHub repo (https://github.com/inclusionAI/Ming) that
+# the HF AutoTokenizer / AutoProcessor for Ming-flash-omni-2.0 needs to find
+# adjacent to the snapshot's ``config.json``. The HF checkpoint ships only
+# weights + sub-dir configs; the modeling/processing/tokenization Python
+# modules live in the source repo. ``_prepare_tokenizer_dir`` symlinks these
+# alongside the snapshot when both are available.
+#
+# Entries are bare file names: ``_prepare_tokenizer_dir`` joins each one
+# onto both the source-repo dir and the snapshot dir, and silently skips
+# names that are missing from the source repo or already present in the
+# snapshot.
+_MING_CODE_FILES = (
+    # Python modules (configs, modeling, processing)
+    "configuration_audio.py",
+    "configuration_bailing_moe_v2.py",
+    "configuration_bailing_talker.py",
+    "configuration_bailingmm2.py",
+    "configuration_whisper_encoder.py",
+    "audio_processing_bailingmm2.py",
+    "bailingmm_utils.py",
+    "bailingmm_utils_video.py",
+    "chat_format.py",
+    "image_processing_bailingmm2.py",
+    "modeling_bailing_moe_v2.py",
+    "modeling_bailing_talker.py",
+    "modeling_bailingmm2.py",
+    "modeling_utils.py",
+    "modeling_whisper_encoder.py",
+    "processing_bailingmm2.py",
+    "qwen2_5_vit.py",
+    "qwen3_moe_vit.py",
+    "s3bpe_tokenizer.py",
+    "tokenization_bailing.py",
+    # JSON assets the processor / tokenizer load from disk
+    "preprocessor_config.json",
+    "processor_config.json",
+    "special_tokens_map.json",
+    "tokenizer_config.json",
+    "tokenizer.json",
+)
+
+
+def _resolve_local_hf_snapshot(repo_id: str, cache_dir: str | None = None) -> str:
+    """Turn a HF repo id into a local snapshot directory, fetching if needed.
+
+    Counterpart of
+    mstar/model/qwen3_omni/qwen3_omni_model.py:_resolve_local_hf_snapshot.
+
+    Any exception from ``snapshot_download`` is logged as a warning and the
+    original ``repo_id`` is handed back untouched, so an air-gapped setup
+    with a pre-populated cache (or a ``repo_id`` that is really a local
+    path) can still be resolved by downstream loaders.
+
+    Args:
+        repo_id: HF repo id (or local path) to resolve.
+        cache_dir: optional HF Hub cache location passed to the download.
+
+    Returns:
+        The snapshot directory as a string, or ``repo_id`` on failure.
+    """
+    from huggingface_hub import snapshot_download
+
+    try:
+        snapshot_path = snapshot_download(
+            repo_id=repo_id, cache_dir=cache_dir, local_files_only=False,
+        )
+    except Exception as err:
+        # Best-effort by design: fall back to the caller's identifier.
+        logger.warning("Error downloading from HuggingFace: %s", str(err))
+        return repo_id
+    else:
+        return str(Path(snapshot_path))
+
+
+def _find_ming_code_dir() -> str | None:
+    """Search the disk for a clone of https://github.com/inclusionAI/Ming.
+
+    A directory qualifies when it contains ``configuration_bailingmm2.py``.
+    Directories are probed in this order, first hit wins:
+
+      1. the ``MING_CODE_DIR`` environment variable (explicit override),
+      2. ``./Ming`` and ``/tmp/ming_repo`` (common dev locations),
+      3. every entry of ``sys.path``.
+
+    Returns:
+        The resolved absolute path of the first qualifying directory, or
+        ``None`` when nothing matches (the caller is expected to surface a
+        clear error/warning in that case).
+    """
+    marker = "configuration_bailingmm2.py"
+
+    env_dir = os.environ.get("MING_CODE_DIR")
+    search_roots: list[str] = [env_dir] if env_dir else []
+    search_roots += ["./Ming", "/tmp/ming_repo"]
+    search_roots += sys.path
+
+    # Empty entries (e.g. "" on sys.path) are ignored rather than probed.
+    hit = next(
+        (root for root in search_roots if root and (Path(root) / marker).exists()),
+        None,
+    )
+    if hit is None:
+        return None
+    return str(Path(hit).resolve())
+
+
+def _prepare_tokenizer_dir(snapshot_dir: str, ming_code_dir: str) -> None:
+    """Symlink Ming source files alongside the snapshot's ``config.json``.
+
+    ``transformers.AutoTokenizer.from_pretrained(snapshot, trust_remote_code=True)``
+    resolves ``auto_map`` references (e.g. ``configuration_bailingmm2.py``)
+    by file path adjacent to ``config.json`` — not via PYTHONPATH. We bridge
+    that by symlinking the files listed in ``_MING_CODE_FILES`` from
+    ``ming_code_dir`` into the snapshot dir. Idempotent: existing files (and
+    existing symlinks) are skipped, so re-running on a populated snapshot is
+    a no-op.
+
+    Args:
+        snapshot_dir: directory that holds the snapshot's ``config.json``.
+        ming_code_dir: clone of the Ming source repo; may be relative to the
+            current working directory.
+
+    Symlink failures (``OSError``) are logged at debug level and otherwise
+    ignored; files missing from ``ming_code_dir`` are skipped silently.
+    """
+    snap = Path(snapshot_dir)
+    # Resolve to an absolute path first. A symlink stores its target string
+    # verbatim and the OS interprets a relative target relative to the
+    # link's own directory (the snapshot), not the process cwd. A relative
+    # ``ming_code_dir`` would therefore create dangling links, which the
+    # ``is_symlink()`` skip below would then never repair.
+    src = Path(ming_code_dir).resolve()
+    for name in _MING_CODE_FILES:
+        target = snap / name
+        if target.exists() or target.is_symlink():
+            continue
+        source = src / name
+        if not source.exists():
+            continue
+        try:
+            target.symlink_to(source)
+        except OSError as e:
+            # Snapshot may be on a filesystem without symlink support, or
+            # may be read-only. Don't crash — the loader below will surface
+            # a clearer error if the file is still missing.
+            logger.debug("Failed to symlink %s -> %s: %s", target, source, e)
+
+
+def _patch_bailing_tokenizer_for_transformers5() -> None:
+    """Make BailingTokenizer load under transformers >= 5.0.
+
+    Two upstream incompatibilities, both in
+    ``tokenization_bailing.BailingTokenizer``:
+
+    (1) transformers 5.x removed ``PreTrainedTokenizerBase.verbose``, but
+    Ming's accessor properties (``gmask_token`` etc.) still reference
+    ``self.verbose`` in their not-set fallback paths.  Backport a class-level
+    default so ``check_special_tokens`` doesn't blow up.
+
+    (2) ``BailingTokenizer.__init__`` sets ``self.add_bos_token = ...``
+    BEFORE calling ``super().__init__()``.  In transformers 5.x the
+    ``PreTrainedTokenizerFast.add_bos_token`` setter immediately calls
+    ``update_post_processor()``, which dereferences ``self._tokenizer`` —
+    but that attribute is only created inside the deferred ``super``
+    call.  Wrap ``update_post_processor`` to no-op when ``_tokenizer``
+    isn't built yet; the deferred super call runs it for real.
+
+    The module is loaded dynamically by ``transformers``' trust_remote_code
+    machinery; look it up in ``sys.modules`` rather than importing it.
+
+    The function never raises for a missing hook: if ``transformers`` is not
+    importable, or ``PreTrainedTokenizerFast`` has no
+    ``update_post_processor`` attribute, step (2) is skipped. This matters
+    because the caller invokes us from inside an ``except AttributeError``
+    handler, where a fresh ``AttributeError`` would escape uncaught.
+    """
+    import functools
+    import sys as _sys
+
+    # (1) — backport the class-level ``verbose`` default. Snapshot the
+    # items first: sys.modules may change while we iterate.
+    for mod_name, mod in list(_sys.modules.items()):
+        if mod is None or not mod_name.endswith("tokenization_bailing"):
+            continue
+        cls = getattr(mod, "BailingTokenizer", None)
+        if cls is not None and not hasattr(cls, "verbose"):
+            cls.verbose = False
+
+    # (2) — patch update_post_processor on the parent fast-tokenizer class
+    # once. Guard against re-patching across multiple model instantiations.
+    try:
+        from transformers import PreTrainedTokenizerFast
+    except ImportError:
+        return
+    _orig_upp = getattr(PreTrainedTokenizerFast, "update_post_processor", None)
+    if _orig_upp is None or getattr(_orig_upp, "_mstar_patched", False):
+        # Nothing to wrap on this transformers version, or already wrapped.
+        return
+
+    @functools.wraps(_orig_upp)
+    def _safe_update_post_processor(self, *args, **kwargs):
+        # ``_tokenizer`` does not exist until the deferred super().__init__
+        # has run; skip quietly until then.
+        if getattr(self, "_tokenizer", None) is None:
+            return None
+        return _orig_upp(self, *args, **kwargs)
+
+    _safe_update_post_processor._mstar_patched = True
+    PreTrainedTokenizerFast.update_post_processor = _safe_update_post_processor
+
+
+class MingFlashOmniModel(Model):
+    """Thinker + Talker + ImageGen native port of Ming-flash-omni-2.0.
+
+    See module docstring for the target partition layout and a cribsheet
+    mapping each abstractmethod to the upstream vllm-omni reference file.
+    """
+
+    def __init__(
+        self,
+        model_path_hf: str = "inclusionAI/Ming-flash-omni-2.0",
+        cache_dir: str | None = None,
+        ming_code_dir: str | None = None,
+        **kwargs,
+    ):
+        """Load config + (best-effort) tokenizer + processor.
+
+        Args:
+            model_path_hf: HF repo id or local path to the Ming snapshot.
+            cache_dir: Override HF Hub cache for snapshot_download.
+            ming_code_dir: Path to a clone of github.com/inclusionAI/Ming
+                (must contain ``configuration_bailingmm2.py`` etc.). Required
+                for the tokenizer + processor — the HF checkpoint ships only
+                weights, the Python modules live in the source repo. Falls
+                back to MING_CODE_DIR env var, then to ``./Ming``,
+                ``/tmp/ming_repo``, and sys.path.
+            **kwargs: accepted and ignored by this constructor.
+
+        The constructor only stages config / tokenizer / processor and
+        initialises the lazy caches; no weights are loaded here. Tokenizer
+        and processor failures are logged as warnings, not raised, leaving
+        ``self.tokenizer`` / ``self._processor`` as ``None``.
+
+        Side effects: may create symlinks inside the snapshot dir and may
+        insert the snapshot dir at the front of ``sys.path``.
+        """
+        self.model_path_hf = model_path_hf
+        self.cache_dir = cache_dir
+
+        # Falls back to the raw repo id / path when the download fails.
+        local_dir = _resolve_local_hf_snapshot(model_path_hf, cache_dir=cache_dir)
+        self.local_dir = local_dir
+        self.config = MingFlashOmniModelConfig.from_pretrained(local_dir)
+
+        # Tokenizer + processor. The released checkpoint ships only weights
+        # and sub-dir configs — no top-level tokenizer.json / vocab.json, and
+        # none of the .py modules that AutoTokenizer / AutoProcessor's
+        # ``trust_remote_code`` path expects to find next to config.json.
+        # We resolve those from a separately-cloned Ming source repo and
+        # symlink them in. If neither is available, we warn loudly and
+        # leave self.tokenizer / self._processor as None — process_prompt
+        # (step 7) will raise a clearer error then.
+        code_dir = ming_code_dir or _find_ming_code_dir()
+        if code_dir is not None:
+            _prepare_tokenizer_dir(local_dir, code_dir)
+            # transformers' trust_remote_code loader resolves sibling imports
+            # (e.g. ``configuration_bailing_moe_v2``) via ``sys.path``, not by
+            # scanning the snapshot dir. Push the snapshot onto sys.path so
+            # those imports succeed during dynamic module loading.
+            if local_dir not in sys.path:
+                sys.path.insert(0, local_dir)
+        self.ming_code_dir = code_dir
+
+        self.tokenizer = None
+        self._processor = None
+        try:
+            from transformers import AutoTokenizer
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                local_dir, cache_dir=cache_dir, trust_remote_code=True,
+            )
+        except AttributeError as e:
+            # Two BailingTokenizer/transformers-5.x incompats — see
+            # _patch_bailing_tokenizer_for_transformers5 for the full story.
+            # Patch once and retry; surface only the second error.
+            # NOTE(review): the retry is keyed on the exception *message*
+            # containing "verbose" or "post_processor"; confirm that the
+            # ``_tokenizer`` failure described in the patch helper actually
+            # produces one of those substrings, otherwise it takes the
+            # plain-warning branch below without a retry.
+            if "verbose" in str(e) or "post_processor" in str(e):
+                _patch_bailing_tokenizer_for_transformers5()
+                try:
+                    self.tokenizer = AutoTokenizer.from_pretrained(
+                        local_dir, cache_dir=cache_dir, trust_remote_code=True,
+                    )
+                except Exception as e2:
+                    self._warn_tokenizer_unavailable("tokenizer", e2)
+            else:
+                self._warn_tokenizer_unavailable("tokenizer", e)
+        except Exception as e:
+            # Deliberately broad: tokenizer loading is best-effort.
+            self._warn_tokenizer_unavailable("tokenizer", e)
+
+        try:
+            from transformers import AutoProcessor
+            self._processor = AutoProcessor.from_pretrained(
+                local_dir, cache_dir=cache_dir, trust_remote_code=True,
+            )
+        except Exception as e:
+            # Same best-effort policy as the tokenizer; no patch-and-retry
+            # is attempted for the processor.
+            self._warn_tokenizer_unavailable("processor", e)
+
+        # Talker tokenizer (talker/llm/) — separate from the thinker
+        # tokenizer. The Thinker->Talker bridge (step 6e-3) decodes the
+        # thinker's text output and re-encodes it here. Loaded lazily on
+        # first use via `_get_talker_tokenizer` so thinker-only configs
+        # don't pay for it.
+        self._talker_tokenizer = None
+
+        # Lazy submodule cache — populated on first get_submodule call.
+        self._submodule_cache: dict[str, object] = {}
+
+    def _get_talker_tokenizer(self):
+        """Return the talker's own tokenizer from ``talker/llm/``, loading it once.
+
+        The talker re-tokenizes the thinker's detokenized text with this
+        tokenizer (vocab_size 151936, distinct from the thinker's
+        BailingTokenizer). A successful load is cached on the instance;
+        failures are not cached, so a later call tries again.
+
+        Returns:
+            The tokenizer, or ``None`` when ``talker/llm/tokenizer_config.json``
+            is absent or loading raises (the latter is logged as a warning).
+        """
+        cached = self._talker_tokenizer
+        if cached is not None:
+            return cached
+
+        llm_dir = Path(self.local_dir) / "talker" / "llm"
+        config_file = llm_dir / "tokenizer_config.json"
+        if not config_file.exists():
+            return None
+
+        try:
+            from transformers import AutoTokenizer
+            loaded = AutoTokenizer.from_pretrained(
+                str(llm_dir), trust_remote_code=True,
+            )
+        except Exception as err:
+            logger.warning("Talker tokenizer (talker/llm/) failed to load: %s", err)
+            return None
+
+        self._talker_tokenizer = loaded
+        return self._talker_tokenizer
+
+    def thinker_text_to_talker_inputs(self, thinker_token_ids) -> "torch.Tensor":
+        """Bridge: thinker output token ids -> talker_text_inputs token ids.
+
+        Ming's thinker->talker handoff passes detokenized TEXT, not
+        hidden states (see vllm-omni pipeline.py `thinker2talker`). We
+        decode the thinker's generated ids with the thinker tokenizer
+        (special tokens skipped), then re-encode with the talker's own
+        `talker/llm` tokenizer.
+
+        Args:
+            thinker_token_ids: a tensor (any shape; it is flattened) or any
+                iterable of token ids.
+
+        Returns:
+            A long tensor of talker token ids (row 0 of the talker
+            tokenizer's ``input_ids``). The dtype is forced to
+            ``torch.long`` even when the decoded text is empty.
+
+        Raises:
+            RuntimeError: if either tokenizer is unavailable.
+        """
+        if self.tokenizer is None:
+            raise RuntimeError(
+                "thinker_text_to_talker_inputs: thinker tokenizer not loaded."
+            )
+        talker_tok = self._get_talker_tokenizer()
+        if talker_tok is None:
+            raise RuntimeError(
+                "thinker_text_to_talker_inputs: talker tokenizer (talker/llm/) "
+                "not available — cannot bridge to the Talker partition."
+            )
+        if isinstance(thinker_token_ids, torch.Tensor):
+            ids_list = thinker_token_ids.flatten().tolist()
+        else:
+            ids_list = list(thinker_token_ids)
+        text = self.tokenizer.decode(ids_list, skip_special_tokens=True)
+        talker_ids = talker_tok(text, return_tensors="pt").input_ids[0]
+        # An empty encoding can come back as a float tensor, because tensor
+        # construction from an empty nested list defaults to float32. Pin
+        # the dtype so the documented long-tensor contract holds; this is a
+        # no-op for the usual non-empty case.
+        return talker_ids.to(dtype=torch.long)
+
+    @staticmethod
+    def _warn_tokenizer_unavailable(what: str, err: Exception) -> None:
+        """Log the one canonical "how to fix it" warning for a failed load.
+
+        The tokenizer and processor code lives in the Ming source repo, not
+        in the HF checkpoint, so the remedy is the same for both. Without
+        them ``process_prompt`` can't run; the rest of the model loads fine.
+
+        Args:
+            what: label for the component that failed (callers pass
+                ``"tokenizer"`` or ``"processor"``).
+            err: the exception that was caught; its type name and the first
+                200 characters of its message are included in the log line.
+        """
+        template = (
+            "Ming-flash-omni-2.0 %s could not be loaded (%s: %s). "
+            "To enable it: (1) git clone https://github.com/inclusionAI/Ming "
+            "(2) pip install opencv-python-headless openai-whisper "
+            "(3) set MING_CODE_DIR=<path/to/Ming>. The snapshot ships only "
+            "weights; the tokenizer/processor Python modules live in the "
+            "source repo."
+        )
+        err_kind = type(err).__name__
+        err_text = str(err)[:200]  # keep the log line bounded
+        logger.warning(template, what, err_kind, err_text)
+
+    # ------------------------------------------------------------------
+    # Model ABC: KV cache config (thinker only for step 3d)
+    # ------------------------------------------------------------------
+
+    def get_kv_cache_config(self) -> list[KVCacheConfig]:
+        """Describe the single KV cache, owned by the ``Thinker`` node.
+
+        All dimensions are read from ``self.config.thinker_llm``.
+        """
+        thinker_cfg = self.config.thinker_llm
+        thinker_cache = KVCacheConfig(
+            num_layers=thinker_cfg.num_hidden_layers,
+            num_kv_heads=thinker_cfg.num_key_value_heads,
+            head_dim=thinker_cfg.head_dim,
+            max_seq_len=thinker_cfg.max_position_embeddings,
+            num_qo_heads=thinker_cfg.num_attention_heads,
+            nodes=["Thinker"],
+        )
+        return [thinker_cache]
+
+    def get_node_engine_types(self) -> dict[str, EngineType]:
+        """Map every graph node name to the engine type that runs it.
+
+        ``Thinker`` (KV-cache engine) and the two encoders (stateless) are
+        always present in the result. ``Talker`` and ``ImageGen`` are added,
+        in that order, only when the corresponding sub-config exists.
+        """
+        # Step 5a: vision + audio encoders are stateless graph nodes that
+        # sit alongside the Thinker. They are registered here
+        # unconditionally; a thinker-only deployment
+        # (configs/ming_flash_omni_thinker_only.yaml) simply does not wire
+        # them into its yaml.
+        engine_types: dict[str, EngineType] = {"Thinker": EngineType.KV_CACHE}
+        for encoder_node in ("vision_encoder", "audio_encoder"):
+            engine_types[encoder_node] = EngineType.STATELESS
+
+        # Step 6e-2 / 9b: Talker and ImageGen each run their whole inner
+        # loop inside the node (talker: AR-decode + VAE-decode, with the CFM
+        # step count determined by stop_head; imagegen: denoise loop + VAE
+        # decode, with the step count determined by the scheduler). Neither
+        # needs a conductor decode loop, so one STATELESS node each
+        # suffices. Configs without the sub-tree leave the node off.
+        optional_nodes = (
+            ("Talker", self.config.talker),
+            ("ImageGen", self.config.image_gen),
+        )
+        for node_name, sub_config in optional_nodes:
+            if sub_config is not None:
+                engine_types[node_name] = EngineType.STATELESS
+        return engine_types
+
+    # ------------------------------------------------------------------
+    # Graph walks: text + audio + vision/video prefill + AR decode (step 5c)
+    # ------------------------------------------------------------------
+
+    def get_graph_walk_graphs(self) -> dict[str, GraphSection]:
+        """Five graph walks covering all modality inputs + autoregressive decode.
+
+        * ``prefill_text`` — Thinker only; text tokens → first sampled
+          token (also the legacy ``prefill`` walk in step 3f).
+        * ``prefill_audio`` — ``audio_encoder`` → Thinker. Audio encoder
+          emits ``audio_embeds`` that the Thinker splices between
+          ``audio_start``/``audio_end`` sentinels (step 5b).
+        * ``prefill_vision`` — ``vision_encoder`` → Thinker. Image
+          inputs; the Thinker splices between ``image_start``/``image_end``.
+        * ``prefill_video`` — ``vision_encoder`` → Thinker. Video inputs
+          (same encoder; the Thinker dispatch reads
+          ``video_second_per_grid`` and switches to video sentinels).
+        * ``thinker_decode`` — single-step AR loop (also the legacy
+          ``decode`` walk in step 3f).
+
+        Each prefill walk's final Thinker node emits the first sampled
+        token to the client (``EMIT_TO_CLIENT`` + ``output_modality="text"``)
+        and the decode loop emits + loops each subsequent token, exactly
+        like step 3f's text-only path.
+        """
+        max_decode = self.get_max_output_tokens()
+
+        imagegen_enabled = self.config.image_gen is not None
+
+        def _thinker_prefill_node(input_names: list[str]) -> GraphNode:
+            outputs = [GraphEdge(
+                next_node=EMIT_TO_CLIENT,
+                name="new_token",
+                output_modality="text",
+                persist=True,
+            )]
+            if imagegen_enabled:
+                # Image-generation requests carry an <imagePatch> query-token
+                # block in the prompt; the thinker computes hidden states at
+                # those positions during prefill and streams them to the
+                # ImageGen partition. The submodule only populates this output
+                # when the request actually asked for an image (otherwise the
+                # edge carries nothing and the FixedChunkPolicy keeps the
+                # consumer idle until producer-done → request_done).
+                outputs.append(
+                    StreamingGraphEdge(
+                        next_node="ImageGen",
+                        name="thinker_hidden_states",
+                        target_partition="ImageGen",
+                    )
+                )
+            return GraphNode(
+                name="Thinker",
+                input_names=input_names,
+                outputs=outputs,
+            )
+
+        prefill_text = _thinker_prefill_node(["text_inputs"])
+
+        # Audio prefill: encoder consumes (audio_features, audio_seqlens)
+        # and emits ``audio_embeds`` → Thinker. The Thinker submodule's
+        # prefill_audio dispatch wraps that with audio_start/audio_end
+        # sentinel embeds and builds text-like 3D MRoPE positions.
+        prefill_audio = Sequential([
+            GraphNode(
+                name="audio_encoder",
+                input_names=["audio_features", "audio_seqlens"],
+                outputs=[GraphEdge(next_node="Thinker", name="audio_embeds")],
+            ),
+            _thinker_prefill_node(["audio_embeds"]),
+        ])
+
+        # Vision prefill (image): encoder takes (pixel_values,
+        # image_grid_thw) and emits ``vision_embeds``. The Thinker still
+        # needs the grid for its 3D MRoPE math, so route grid_thw
+        # straight into the Thinker via a parallel edge from the
+        # conductor's initial inputs (see _get_thinker_prefill_inputs).
+        prefill_vision = Sequential([
+            GraphNode(
+                name="vision_encoder",
+                input_names=["pixel_values", "image_grid_thw"],
+                outputs=[GraphEdge(next_node="Thinker", name="vision_embeds")],
+            ),
+            _thinker_prefill_node(["vision_embeds", "image_grid_thw"]),
+        ])
+
+        # Video prefill: same encoder, plus video_second_per_grid for the
+        # timestamp-scaled temporal positions. The Thinker dispatches on
+        # walk name (prefill_video) so it picks video_start/video_end
+        # sentinels instead of image_*.
+        prefill_video = Sequential([
+            GraphNode(
+                name="vision_encoder",
+                input_names=["pixel_values", "image_grid_thw"],
+                outputs=[GraphEdge(next_node="Thinker", name="vision_embeds")],
+            ),
+            _thinker_prefill_node([
+                "vision_embeds", "image_grid_thw", "video_second_per_grid",
+            ]),
+        ])
+
+        # Thinker decode loop — same shape as step 3f's `decode` walk,
+        # renamed for symmetry with the prefill walks. When the talker is
+        # available, each decoded token additionally streams to the Talker
+        # partition as ``thinker_tokens`` (the Talker accumulates the full
+        # text then re-tokenizes + generates audio in one shot — Ming's
+        # bridge passes detokenized text, not hidden states).
+        talker_enabled = self.config.talker is not None
+        decode_outputs = [
+            GraphEdge(
+                next_node=EMIT_TO_CLIENT,
+                name="new_token",
+                output_modality="text",
+            ),
+            GraphEdge(
+                next_node="Thinker",
+                name="text_inputs",
+                output_modality="text",
+            ),
+        ]
+        if talker_enabled:
+            decode_outputs.append(
+                StreamingGraphEdge(
+                    next_node="Talker",
+                    name="thinker_tokens",
+                    target_partition="Talker",
+                )
+            )
+        thinker_decode = Loop(
+            name="thinker_decode_loop",
+            section=GraphNode(
+                name="Thinker",
+                input_names=["text_inputs"],
+                outputs=decode_outputs,
+            ),
+            max_iters=max_decode,
+            outputs=[],
+        )
+        walks: dict[str, GraphSection] = {
+            "prefill_text": prefill_text,
+            "prefill_audio": prefill_audio,
+            "prefill_vision": prefill_vision,
+            "prefill_video": prefill_video,
+            "thinker_decode": thinker_decode,
+        }
+        if talker_enabled:
+            # Single Talker node: consume the streamed thinker tokens,
+            # run the full AR-decode + VAE-decode internally, emit one
+            # audio chunk to the client.
+            walks["talker"] = GraphNode(
+                name="Talker",
+                input_names=["thinker_tokens"],
+                outputs=[
+                    GraphEdge(
+                        next_node=EMIT_TO_CLIENT,
+                        name="audio_chunk",
+                        output_modality="audio",
+                    ),
+                ],
+            )
+        if self.config.image_gen is not None:
+            # Single ImageGen node: consume the thinker hidden states at the
+            # <imagePatch> query-token positions (streamed from the Thinker as
+            # ``thinker_hidden_states``), run the full diffusion denoise + VAE
+            # decode internally, emit one image to the client. Like the Talker,
+            # the per-request work is one shot (scheduler-determined step
+            # count), so no conductor decode loop is needed.
+            walks["imagegen"] = GraphNode(
+                name="ImageGen",
+                input_names=["thinker_hidden_states"],
+                outputs=[
+                    GraphEdge(
+                        next_node=EMIT_TO_CLIENT,
+                        name="image",
+                        output_modality="image",
+                    ),
+                ],
+            )
+        return walks
+
+    def get_partition_topology(self) -> PartitionTopology:
+        partitions = ["Thinker"]
+        connections = []
+        if self.config.talker is not None:
+            partitions.append("Talker")
+            connections.append(
+                Connection(
+                    from_partition="Thinker",
+                    to_partition="Talker",
+                    edge_name="thinker_tokens",
+                    # The talker needs the FULL text before it generates.
+                    # continue_after_done=True keeps the Talker partition
+                    # alive past the Thinker's text-EOS so it can fire its
+                    # single generation once all tokens have arrived.
+                    chunk_policy_factory=lambda: FixedChunkPolicy(
+                        chunk_size=1, continue_after_done=True,
+                    ),
+                )
+            )
+        if self.config.image_gen is not None:
+            partitions.append("ImageGen")
+            connections.append(
+                Connection(
+                    from_partition="Thinker",
+                    to_partition="ImageGen",
+                    edge_name="thinker_hidden_states",
+                    # The imagegen node fires once, after the thinker has
+                    # produced the query-token hidden states. continue_after_done
+                    # keeps the partition alive until that single handoff lands.
+                    chunk_policy_factory=lambda: FixedChunkPolicy(
+                        chunk_size=1, continue_after_done=True,
+                    ),
+                )
+            )
+        if not connections:
+            return PartitionTopology(partitions=["Thinker"], connections=[])
+        return PartitionTopology(partitions=partitions, connections=connections)
+
+    def get_partitions(self) -> list[PartitionDefinition]:
+        thinker = PartitionDefinition(
+            name="Thinker",
+            graph_walks={
+                "prefill_text", "prefill_audio",
+                "prefill_vision", "prefill_video",
+                "thinker_decode",
+            },
+            initial_walk="prefill_text",
+            producer_partitions=[],
+        )
+        partitions = [thinker]
+        if self.config.talker is not None:
+            partitions.append(
+                PartitionDefinition(
+                    name="Talker",
+                    graph_walks={"talker"},
+                    initial_walk=None,
+                    producer_partitions=["Thinker"],
+                )
+            )
+        if self.config.image_gen is not None:
+            partitions.append(
+                PartitionDefinition(
+                    name="ImageGen",
+                    graph_walks={"imagegen"},
+                    initial_walk=None,
+                    producer_partitions=["Thinker"],
+                )
+            )
+        return partitions
+
+    def get_output_sample_rate(self, modality: str = "audio") -> int:
+        """Talker AudioVAE sample rate (44.1 kHz on the released ckpt)."""
+        if modality == "audio" and self.config.talker is not None:
+            return self.config.talker.vae_sample_rate
+        return super().get_output_sample_rate(modality)
+
+    # ------------------------------------------------------------------
+    # Prefill scheduling — mirrors qwen3_omni's _build_thinker_prefill_schedule
+    # ------------------------------------------------------------------
+
+    def _build_thinker_prefill_schedule(
+        self,
+        input_modalities: list[str],
+        input_signals: dict[str, list[TensorPointerInfo]],
+    ) -> list[tuple[str, dict[str, TensorPointerInfo]]]:
+        """Walk-name + per-input tensor map per modality, in input order.
+
+        Mirrors qwen3_omni's helper: each ``input_modalities`` entry
+        yields one schedule step. The audio walk needs
+        ``audio_features`` (+ optional ``audio_seqlens``); image / video
+        walks need ``pixel_values`` + ``image_grid_thw``; video walks
+        also take ``video_second_per_grid``. Steps the conductor's
+        ``input_signals`` does not actually have (e.g. ``audio`` listed
+        but no ``audio_features`` provided) are silently skipped.
+        """
+        texts = input_signals.get("text_inputs", [])
+        audio_features = input_signals.get("audio_features", [])
+        audio_seqlens = input_signals.get("audio_seqlens", [])
+        pixel_values = input_signals.get("pixel_values", [])
+        image_grid_thws = input_signals.get("image_grid_thw", [])
+        # Video uses pixel_values_videos in HF; accept both keys
+        # for parity with qwen3_omni's helper.
+        pixel_values_videos = input_signals.get("pixel_values_videos", [])
+        video_grid_thws = input_signals.get("video_grid_thw", [])
+        video_second_per_grid = input_signals.get("video_second_per_grid", [])
+
+        schedule: list[tuple[str, dict[str, TensorPointerInfo]]] = []
+        text_idx = audio_idx = vision_idx = video_idx = 0
+        for mod in input_modalities:
+            if mod == "text":
+                if text_idx < len(texts):
+                    schedule.append((
+                        "prefill_text",
+                        {"text_inputs": texts[text_idx]},
+                    ))
+                    text_idx += 1
+            elif mod == "audio":
+                if audio_idx < len(audio_features):
+                    entry: dict[str, TensorPointerInfo] = {
+                        "audio_features": audio_features[audio_idx],
+                    }
+                    if audio_idx < len(audio_seqlens):
+                        entry["audio_seqlens"] = audio_seqlens[audio_idx]
+                    schedule.append(("prefill_audio", entry))
+                    audio_idx += 1
+            elif mod == "image":
+                if vision_idx < len(pixel_values):
+                    entry = {"pixel_values": pixel_values[vision_idx]}
+                    if vision_idx < len(image_grid_thws):
+                        entry["image_grid_thw"] = image_grid_thws[vision_idx]
+                    schedule.append(("prefill_vision", entry))
+                    vision_idx += 1
+            elif mod == "video":
+                if video_idx < len(pixel_values_videos):
+                    entry = {"pixel_values": pixel_values_videos[video_idx]}
+                    if video_idx < len(video_grid_thws):
+                        entry["image_grid_thw"] = video_grid_thws[video_idx]
+                    if video_idx < len(video_second_per_grid):
+                        entry["video_second_per_grid"] = video_second_per_grid[video_idx]
+                    schedule.append(("prefill_video", entry))
+                    video_idx += 1
+        return schedule
+
+    def _get_thinker_prefill_inputs(
+        self,
+        metadata: CurrentForwardConductorMetadata,
+        input_signals: dict[str, list[TensorPointerInfo]],
+    ) -> list[GraphEdge]:
+        """Build the GraphEdges for the current prefill step.
+
+        For audio/vision/video walks the encoder is the first graph
+        node, so each ``input_name`` from the schedule entry routes
+        to that encoder; ``image_grid_thw`` and ``video_second_per_grid``
+        also need to reach the Thinker (for the 3D MRoPE math) and
+        get their own parallel edges to ``Thinker``.
+        """
+        schedule = metadata.kwargs["prefill_schedule"]
+        step = metadata.kwargs["prefill_step"]
+        walk_name, tensor_dict = schedule[step]
+
+        if walk_name == "prefill_text":
+            target_node = "Thinker"
+        elif walk_name == "prefill_audio":
+            target_node = "audio_encoder"
+        elif walk_name in ("prefill_vision", "prefill_video"):
+            target_node = "vision_encoder"
+        else:
+            raise ValueError(f"Unrecognized prefill walk: {walk_name!r}")
+
+        edges: list[GraphEdge] = []
+        for input_name, tensor_info in tensor_dict.items():
+            if input_name in ("image_grid_thw", "video_second_per_grid"):
+                # These go to the Thinker, not the encoder — handled below.
+                continue
+            edge = GraphEdge(next_node=target_node, name=input_name)
+            edge.tensor_info = [tensor_info]
+            edges.append(edge)
+
+        if walk_name in ("prefill_vision", "prefill_video"):
+            # Vision encoder needs image_grid_thw, AND the Thinker needs
+            # it for 3D position math. Emit a duplicate edge to each.
+            if "image_grid_thw" in tensor_dict:
+                enc_edge = GraphEdge(next_node="vision_encoder", name="image_grid_thw")
+                enc_edge.tensor_info = [tensor_dict["image_grid_thw"]]
+                edges.append(enc_edge)
+                thinker_edge = GraphEdge(next_node="Thinker", name="image_grid_thw")
+                thinker_edge.tensor_info = [tensor_dict["image_grid_thw"]]
+                edges.append(thinker_edge)
+            if walk_name == "prefill_video" and "video_second_per_grid" in tensor_dict:
+                vspg_edge = GraphEdge(next_node="Thinker", name="video_second_per_grid")
+                vspg_edge.tensor_info = [tensor_dict["video_second_per_grid"]]
+                edges.append(vspg_edge)
+
+        return edges
+
+    # ------------------------------------------------------------------
+    # Forward-pass arg builders — multimodal prefill scheduling (step 5c)
+    # ------------------------------------------------------------------
+
+    def get_initial_forward_pass_args(
+        self,
+        partition_name: str,
+        input_modalities: list[str],
+        output_modalities: list[str],
+        input_signals: dict[str, list[TensorPointerInfo]],
+        model_kwargs: dict | None = None,
+    ) -> ForwardPassArgs:
+        if partition_name == "Talker":
+            # Talker is a consumer partition: it has no initial inputs of
+            # its own — it self-triggers when the Thinker's streamed
+            # ``thinker_tokens`` arrive. Audio output only.
+            audio_output = "audio" in output_modalities
+            full_metadata = CurrentForwardConductorMetadata(
+                input_modalities=input_modalities,
+                output_modalities=output_modalities,
+                graph_walk="talker",
+                is_prefill=False,
+            )
+            return ForwardPassArgs(
+                full_metadata=full_metadata,
+                inputs=[],
+                unpersist_tensors=[],
+                request_done=not audio_output,
+            )
+        if partition_name == "ImageGen":
+            # ImageGen is a consumer partition: it self-triggers when the
+            # Thinker's streamed ``thinker_hidden_states`` arrive. Image output
+            # only.
+            image_output = "image" in output_modalities
+            full_metadata = CurrentForwardConductorMetadata(
+                input_modalities=input_modalities,
+                output_modalities=output_modalities,
+                graph_walk="imagegen",
+                is_prefill=False,
+            )
+            return ForwardPassArgs(
+                full_metadata=full_metadata,
+                inputs=[],
+                unpersist_tensors=[],
+                request_done=not image_output,
+            )
+        if partition_name != "Thinker":
+            raise ValueError(f"Unknown partition: {partition_name!r}")
+        schedule = self._build_thinker_prefill_schedule(
+            input_modalities, input_signals,
+        )
+        if not schedule:
+            # No modalities provided — fall through to decode immediately.
+            # The conductor will report request_done after the first decode
+            # step returns nothing. Useful for empty-prompt smoke tests.
+            full_metadata = CurrentForwardConductorMetadata(
+                input_modalities=input_modalities,
+                output_modalities=output_modalities,
+                graph_walk="thinker_decode",
+                is_prefill=False,
+            )
+            return ForwardPassArgs(
+                full_metadata=full_metadata,
+                inputs=[],
+                unpersist_tensors=[],
+                request_done=True,
+            )
+
+        first_walk, _ = schedule[0]
+        full_metadata = CurrentForwardConductorMetadata(
+            input_modalities=input_modalities,
+            output_modalities=output_modalities,
+            graph_walk=first_walk,
+            is_prefill=True,
+            kwargs={
+                "prefill_schedule": schedule,
+                "prefill_step": 0,
+            },
+        )
+        inputs = self._get_thinker_prefill_inputs(full_metadata, input_signals)
+        unpersist_tensors = sum(
+            (inp.tensor_info for inp in inputs), start=[],
+        )
+        return ForwardPassArgs(
+            full_metadata=full_metadata,
+            inputs=inputs,
+            unpersist_tensors=unpersist_tensors,
+            step_metadata={
+                "is_prefill": True,
+                "is_last_prefill": len(schedule) == 1,
+            },
+        )
+
+    def get_partition_forward_pass_args(
+        self,
+        partition_name: str,
+        partition_metadata: CurrentForwardConductorMetadata,
+        persist_signals: dict[str, list[TensorPointerInfo]],
+        incoming_connections: list[StreamingConnectionState] | None = None,
+    ) -> ForwardPassArgs:
+        """Advance a partition's state machine and build its next step.
+
+        Thinker: walk schedule → thinker_decode → done. Each call made
+        while ``is_prefill`` is set advances
+        ``metadata.kwargs["prefill_step"]`` to the next schedule entry.
+        When all prefill steps are done we transition to
+        ``thinker_decode``; when this method is called again with the
+        metadata already on ``thinker_decode`` (the decode loop unwound —
+        its max_iters or check_stop fired) we return ``request_done=True``.
+
+        The Talker and ImageGen partitions are delegated to
+        ``_get_talker_forward`` / ``_get_imagegen_forward``; each runs its
+        single walk once and is then done.
+
+        Thinker shape mirrors ``mstar/model/qwen3_omni/qwen3_omni_model.py:765+``.
+
+        Args:
+            partition_name: ``"Thinker"``, ``"Talker"`` or ``"ImageGen"``.
+            partition_metadata: per-request conductor metadata; mutated in
+                place (``graph_walk``, ``is_prefill``, ``kwargs``).
+            persist_signals: persisted tensors by name; supplies the
+                prefill inputs and the ``new_token`` fed into decode.
+            incoming_connections: streaming connection state, forwarded to
+                the consumer-partition handlers only.
+
+        Raises:
+            ValueError: if ``partition_name`` is not a known partition.
+        """
+        if partition_name == "Talker":
+            return self._get_talker_forward(partition_metadata, incoming_connections)
+        if partition_name == "ImageGen":
+            return self._get_imagegen_forward(partition_metadata, incoming_connections)
+        if partition_name != "Thinker":
+            raise ValueError(f"Unknown partition: {partition_name!r}")
+
+        # Phase 1 — state transition. The previous step has completed, so
+        # move to the next prefill entry, or leave prefill for decode.
+        if partition_metadata.is_prefill:
+            step = partition_metadata.kwargs["prefill_step"] + 1
+            schedule = partition_metadata.kwargs["prefill_schedule"]
+            if step < len(schedule):
+                partition_metadata.kwargs["prefill_step"] = step
+                partition_metadata.graph_walk = schedule[step][0]
+            else:
+                partition_metadata.is_prefill = False
+                partition_metadata.graph_walk = "thinker_decode"
+        elif partition_metadata.graph_walk == "thinker_decode":
+            # Decode loop unwound — Thinker is fully done with this request.
+            return ForwardPassArgs(
+                full_metadata=partition_metadata,
+                inputs=[],
+                unpersist_tensors=[],
+                request_done=True,
+            )
+
+        # Phase 2 — build inputs for the (possibly just updated) state.
+        # ``is_prefill`` is re-read because phase 1 may have cleared it.
+        if partition_metadata.is_prefill:
+            schedule = partition_metadata.kwargs["prefill_schedule"]
+            step = partition_metadata.kwargs["prefill_step"]
+            is_last_prefill = step == len(schedule) - 1
+            inputs = self._get_thinker_prefill_inputs(
+                partition_metadata, persist_signals,
+            )
+        else:
+            # First decode step: feed the persisted ``new_token`` back in
+            # as the decode walk's ``text_inputs``.
+            is_last_prefill = False
+            edge = GraphEdge(next_node="Thinker", name="text_inputs")
+            edge.tensor_info = persist_signals.get("new_token", [])
+            inputs = [edge]
+
+        # Flatten every edge's tensor_info list; all tensors handed to this
+        # step are listed for unpersisting.
+        unpersist_tensors = sum(
+            (inp.tensor_info for inp in inputs), start=[],
+        )
+        return ForwardPassArgs(
+            full_metadata=partition_metadata,
+            inputs=inputs,
+            unpersist_tensors=unpersist_tensors,
+            step_metadata={
+                "is_prefill": partition_metadata.is_prefill,
+                "is_last_prefill": is_last_prefill,
+            },
+        )
+
+    def _get_talker_forward(
+        self,
+        metadata: CurrentForwardConductorMetadata,
+        incoming_connections: list[StreamingConnectionState] | None,
+    ) -> ForwardPassArgs:
+        """Talker partition state machine — runs once, then done.
+
+        The Talker is a single stateless node: it consumes the full
+        stream of ``thinker_tokens`` (gated by the FixedChunkPolicy with
+        continue_after_done=True so it stays alive past the Thinker's
+        text EOS), re-tokenizes the accumulated text, and generates one
+        audio chunk inside ``TalkerSubmodule.forward``. We fire that walk
+        once the producer (Thinker) is done, then report request_done.
+        """
+        conn = incoming_connections[0] if incoming_connections else None
+        producer_done = conn.producer_done if conn else True
+
+        # Wait until the Thinker has finished emitting all its tokens — the
+        # talker needs the FULL text before it can generate. Until then,
+        # return an empty no-op step (the conductor re-invokes us as more
+        # tokens stream in).
+        if not producer_done:
+            return ForwardPassArgs(
+                full_metadata=metadata,
+                inputs=[],
+                unpersist_tensors=[],
+            )
+
+        if metadata.kwargs.get("talker_fired"):
+            # Already generated — the request is complete for this partition.
+            return ForwardPassArgs(
+                full_metadata=metadata,
+                inputs=[],
+                unpersist_tensors=[],
+                request_done=True,
+            )
+
+        metadata.kwargs["talker_fired"] = True
+        metadata.graph_walk = "talker"
+        edge = GraphEdge(next_node="Talker", name="thinker_tokens")
+        return ForwardPassArgs(
+            full_metadata=metadata,
+            inputs=[edge],
+            unpersist_tensors=[],
+        )
+
+    def _get_imagegen_forward(
+        self,
+        metadata: CurrentForwardConductorMetadata,
+        incoming_connections: list[StreamingConnectionState] | None,
+    ) -> ForwardPassArgs:
+        """ImageGen partition state machine — runs once, then done.
+
+        Mirrors :meth:`_get_talker_forward`: the ImageGen node is a single
+        stateless diffusion node consuming the Thinker's streamed
+        ``thinker_hidden_states`` (the query-token hidden states sliced at the
+        ``<imagePatch>`` positions). The FixedChunkPolicy with
+        continue_after_done=True keeps the partition alive until the Thinker has
+        emitted them; we then fire the ``imagegen`` walk once (full denoise +
+        VAE decode happen inside ``ImageGenSubmodule.forward``) and report
+        request_done.
+        """
+        conn = incoming_connections[0] if incoming_connections else None
+        producer_done = conn.producer_done if conn else True
+
+        # Wait until the Thinker has produced the query-token hidden states.
+        if not producer_done:
+            return ForwardPassArgs(
+                full_metadata=metadata,
+                inputs=[],
+                unpersist_tensors=[],
+            )
+
+        if metadata.kwargs.get("imagegen_fired"):
+            return ForwardPassArgs(
+                full_metadata=metadata,
+                inputs=[],
+                unpersist_tensors=[],
+                request_done=True,
+            )
+
+        metadata.kwargs["imagegen_fired"] = True
+        metadata.graph_walk = "imagegen"
+        edge = GraphEdge(next_node="ImageGen", name="thinker_hidden_states")
+        return ForwardPassArgs(
+            full_metadata=metadata,
+            inputs=[edge],
+            unpersist_tensors=[],
+        )
+
+    # ------------------------------------------------------------------
+    # Prompt / output handling
+    # ------------------------------------------------------------------
+
+    def process_prompt(
+        self,
+        prompt: str | None,
+        input_modalities: list[str],
+        output_modalities: list[str],
+        tensors: NameToTensorList | None = None,
+        **kwargs,
+    ) -> NameToTensorList:
+        """Build text_inputs + modality tensors for the prefill schedule.
+
+        Strategy mirrors qwen3_omni's process_prompt (step 7 of porting
+        notes): apply the chat template to TEXT-ONLY messages (so the
+        tokenizer doesn't insert placeholder tokens we'd later have to
+        strip), then run the image / audio sub-processors separately
+        on each modality input.
+
+        The Ming chat template (`tokenizer.apply_chat_template`) is the
+        jinja path that accepts OpenAI roles (user / assistant /
+        system) and rewrites them to Ming's HUMAN / ASSISTANT / SYSTEM.
+        The processor's Python `apply_chat_template` (`BailingMM2Processor`)
+        is stricter and asserts on lowercase roles — see PORTING_NOTES
+        "Role-handling nuance". Using the tokenizer path keeps the
+        interface OpenAI-compatible.
+
+        Input shape (`tensors`):
+
+          * ``image_inputs`` — list of CHW float32 [0, 1] tensors (one
+            per image). Converted to HWC uint8 [0, 255] before the
+            image processor (the upstream BailingMM2ImageProcessor
+            assumes uint8; double-rescaling near-zeros the tensor).
+          * ``audio_inputs`` — list of ``(waveform, sampling_rate)``
+            tuples OR list of 1-D float tensors (sample rate inferred
+            from the processor's default — 16 kHz on the released ckpt).
+          * ``video_inputs`` — list of 4-D (T, C, H, W) float tensors.
+            Currently treated like a stack of images via the image
+            processor's video path; per-frame timestamp scaffolding
+            (``video_second_per_grid``) defaults to 1.0 unless an
+            ``input_metadata["video"][i]["second_per_grid"]`` override
+            is supplied via ``**kwargs``.
+
+        Output shape — keys consumed by
+        ``_build_thinker_prefill_schedule`` in step 5c:
+
+          * ``text_inputs`` — list of 1-D long tensors.
+          * ``pixel_values``, ``image_grid_thw`` — one entry per image.
+          * ``pixel_values_videos``, ``video_grid_thw``,
+            ``video_second_per_grid`` — one entry per video clip.
+          * ``audio_features``, ``audio_seqlens`` — one entry per
+            audio clip; ``audio_features`` is (n_mels, T) and
+            ``audio_seqlens`` is a length-1 int tensor.
+        """
+        if self.tokenizer is None:
+            raise RuntimeError(
+                "MingFlashOmniModel.process_prompt called but tokenizer "
+                "is not loaded. See _warn_tokenizer_unavailable for setup."
+            )
+
+        result: NameToTensorList = {
+            "text_inputs": [],
+            "pixel_values": [],
+            "image_grid_thw": [],
+            "pixel_values_videos": [],
+            "video_grid_thw": [],
+            "video_second_per_grid": [],
+            "audio_features": [],
+            "audio_seqlens": [],
+        }
+
+        # ----- Text path (always present, even for image-/audio-only
+        # turns since the chat template emits role markers + an
+        # assistant-prompt suffix the model needs to start decoding).
+        if prompt is not None:
+            # Image-generation requests append the learnable query-token
+            # block (<image><imagePatch>*N</image>) to the prompt — the
+            # thinker substitutes its image-gen query embeddings at those
+            # positions during forward (step 9). Only when the deploy
+            # actually ships an imagegen sub-config and the caller asked
+            # for an image output.
+            prompt_for_template = prompt
+            if "image" in output_modalities and self.config.image_gen is not None:
+                from mstar.model.ming_omni_flash.components.prompt_utils import (
+                    maybe_expand_image_gen_prompt,
+                )
+                prompt_for_template = maybe_expand_image_gen_prompt(
+                    prompt, num_query_tokens=self.config.image_gen.num_query_tokens,
+                )
+            messages = [{"role": "user", "content": prompt_for_template}]
+            text = self.tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True,
+            )
+            input_ids = self.tokenizer(text, return_tensors="pt").input_ids[0]
+            result["text_inputs"].append(input_ids)
+
+        if tensors is None:
+            return result
+
+        # ----- Image path
+        raw_images = tensors.get("image_inputs", []) or []
+        if raw_images:
+            self._process_image_inputs(raw_images, result)
+
+        # ----- Video path
+        raw_videos = tensors.get("video_inputs", []) or []
+        if raw_videos:
+            video_metadata = kwargs.get("input_metadata", {}).get("video", [])
+            self._process_video_inputs(raw_videos, video_metadata, result)
+
+        # ----- Audio path
+        raw_audios = tensors.get("audio_inputs", []) or []
+        if raw_audios:
+            self._process_audio_inputs(raw_audios, result)
+
+        return result
+
+    # ------------------------------------------------------------------
+    # Per-modality helpers (split out so process_prompt stays readable)
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _image_to_processor_input(img: "torch.Tensor"):
+        """Convert a CHW float [0,1] tensor to HWC uint8 numpy for HF.
+
+        BailingMM2ImageProcessor (and most HF image processors)
+        assume PIL/uint8 inputs with ``do_rescale=True`` by default.
+        Passing a float [0,1] tensor would double-rescale it to
+        near-zero. Mirror qwen3_omni's conversion (qwen3_omni_model.py:
+        1027-1039).
+        """
+        import numpy as np
+        x = img
+        if x.dtype.is_floating_point:
+            x = (x * 255.0).clamp(0, 255).to(torch.uint8)
+        if x.dim() == 3 and x.shape[0] in (1, 3):
+            x = x.permute(1, 2, 0)  # CHW -> HWC
+        arr = x.cpu().contiguous().numpy()
+        if arr.shape[-1] == 1:
+            arr = np.repeat(arr, 3, axis=-1)
+        return arr
+
+    def _process_image_inputs(
+        self,
+        raw_images: list["torch.Tensor"],
+        result: NameToTensorList,
+    ) -> None:
+        if self._processor is None:
+            raise RuntimeError(
+                "process_prompt: image inputs supplied but processor is None. "
+                "See PORTING_NOTES 'Ming source dependency' for setup."
+            )
+        img_proc = self._processor.image_processor
+        for img in raw_images:
+            arr = self._image_to_processor_input(img)
+            out = img_proc(images=[arr], return_tensors="pt")
+            # ``pixel_values`` is (n_patches, C, ph, pw); the encoder
+            # consumes it directly. ``image_grid_thw`` is (1, 3).
+            result["pixel_values"].append(out["pixel_values"])
+            grid = out["image_grid_thw"]
+            if not isinstance(grid, torch.Tensor):
+                grid = torch.as_tensor(grid)
+            result["image_grid_thw"].append(grid[0])
+
+    def _process_video_inputs(
+        self,
+        raw_videos: list["torch.Tensor"],
+        video_metadata: list[dict],
+        result: NameToTensorList,
+    ) -> None:
+        if self._processor is None:
+            raise RuntimeError(
+                "process_prompt: video inputs supplied but processor is None."
+            )
+        img_proc = self._processor.image_processor
+        # Per-frame timestamp override; default 1.0 second/frame so the
+        # Thinker's temporal positions advance once per grid step
+        # (matches modeling_bailing_moe_v2.get_rope_index's `else: 1.0`).
+        for i, video in enumerate(raw_videos):
+            # Convert (T, C, H, W) float [0,1] to (T, H, W, C) uint8.
+            frames = []
+            for t in range(video.shape[0]):
+                frames.append(self._image_to_processor_input(video[t]))
+            out = img_proc(
+                images=None,
+                videos=[frames],
+                **({} if not video_metadata else {}),
+            )
+            result["pixel_values_videos"].append(out["pixel_values_videos"])
+            grid = out["video_grid_thw"]
+            if not isinstance(grid, torch.Tensor):
+                grid = torch.as_tensor(grid)
+            result["video_grid_thw"].append(grid[0])
+            spg = 1.0
+            if i < len(video_metadata):
+                spg = float(video_metadata[i].get("second_per_grid", 1.0))
+            result["video_second_per_grid"].append(torch.tensor(spg))
+
+    def _process_audio_inputs(
+        self,
+        raw_audios: list,
+        result: NameToTensorList,
+    ) -> None:
+        """Feed each audio clip through the audio processor, one clip per call.
+
+        Each element of ``raw_audios`` is either a bare waveform (the
+        processor's ``sampling_rate`` attribute, or 16000 when absent, is
+        assumed) or a ``(waveform, sampling_rate)`` 2-tuple. Torch waveforms
+        are converted to NumPy before the processor call.
+
+        For every clip this appends to ``result``:
+
+        * ``audio_features`` — ``audio_feats[0]`` with its two dims swapped
+          and made contiguous.
+        * ``audio_seqlens`` — ``audio_feats_lengths`` cast to ``torch.long``.
+
+        Raises:
+            RuntimeError: If ``self._processor`` is ``None``.
+        """
+        processor = self._processor
+        if processor is None:
+            raise RuntimeError(
+                "process_prompt: audio inputs supplied but processor is None."
+            )
+        audio_proc = processor.audio_processor
+        fallback_sr = getattr(audio_proc, "sampling_rate", 16000)
+
+        def _as_tensor(value):
+            # Processor outputs may already be tensors; only wrap when needed.
+            return value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
+
+        for clip in raw_audios:
+            has_rate = isinstance(clip, tuple) and len(clip) == 2
+            samples, rate = clip if has_rate else (clip, fallback_sr)
+            if isinstance(samples, torch.Tensor):
+                samples = samples.detach().cpu().numpy()
+            processed = audio_proc([(samples, rate)])
+            # The processor is called with a single clip, so index 0 is it.
+            # Swapping the two remaining dims yields the per-clip layout the
+            # audio encoder node consumes (assumed (n_mels, T) — TODO confirm).
+            clip_mel = _as_tensor(processed["audio_feats"])[0].transpose(0, 1).contiguous()
+            clip_len = _as_tensor(processed["audio_feats_lengths"])
+            result["audio_features"].append(clip_mel)
+            result["audio_seqlens"].append(clip_len.to(torch.long))
+
+    def postprocess(self, output: torch.Tensor, modality: str, **kwargs) -> bytes:
+        """Serialise a finished output tensor into bytes for the client.
+
+        Behaviour per ``modality``:
+
+        * ``"text"``  — decode the ids with ``self.tokenizer`` (special tokens
+          skipped) and UTF-8 encode; empty bytes when no tokenizer is set.
+        * ``"audio"`` — the tensor moved to CPU as float32, dumped as raw
+          bytes via ``numpy().tobytes()``.
+        * ``"image"`` — PNG bytes from :meth:`_encode_image_png`.
+
+        A ``None`` or zero-element ``output`` yields ``b""`` for any modality.
+
+        Raises:
+            ValueError: For any other ``modality`` (on non-empty output).
+        """
+        if output is None or output.numel() == 0:
+            return b""
+        if modality == "image":
+            return self._encode_image_png(output)
+        if modality == "audio":
+            pcm = output.detach().to("cpu", dtype=torch.float32)
+            return pcm.numpy().tobytes()
+        if modality == "text":
+            tokenizer = self.tokenizer
+            if tokenizer is None:
+                return b""
+            decoded = tokenizer.decode(output.tolist(), skip_special_tokens=True)
+            return decoded.encode("utf-8")
+        raise ValueError(
+            f"Unsupported modality for Ming-flash-omni-2.0: {modality!r}. "
+            f"Supported: text, audio, image."
+        )
+
+    @staticmethod
+    def _encode_image_png(output: torch.Tensor) -> bytes:
+        """Encode a ``[-1, 1]`` image tensor as PNG bytes.
+
+        Accepts ``[C, H, W]`` or ``[B, C, H, W]`` with ``C`` in ``{1, 3}``;
+        for a batch only the first image is encoded. Values are clamped to
+        ``[-1, 1]``, mapped linearly to ``[0, 255]`` and rounded. A single
+        channel is replicated to three channels.
+
+        Raises:
+            ValueError: If the tensor (after dropping the batch dim) is not
+                rank 3 with 1 or 3 channels.
+        """
+        import io
+
+        from PIL import Image
+
+        chw = output.detach().to("cpu", dtype=torch.float32)
+        if chw.dim() == 4:
+            chw = chw[0]  # keep only the first image of the batch
+        shape_ok = chw.dim() == 3 and chw.shape[0] in (1, 3)
+        if not shape_ok:
+            raise ValueError(
+                f"ImageGen postprocess expected [3,H,W] (or [B,3,H,W]); got {tuple(output.shape)}"
+            )
+        # Shift [-1, 1] onto [0, 255], quantise, then go channel-last for PIL.
+        scaled = (chw.clamp(-1, 1) + 1.0) * 127.5
+        hwc = scaled.round().to(torch.uint8).permute(1, 2, 0).numpy()
+        if hwc.shape[2] == 1:
+            hwc = hwc.repeat(3, axis=2)
+        sink = io.BytesIO()
+        Image.fromarray(hwc).save(sink, format="PNG")
+        return sink.getvalue()
+
+    # ------------------------------------------------------------------
+    # Submodule construction
+    # ------------------------------------------------------------------
+
+    def get_default_sharding_config(self):
+        """Return the default sharding layout: only ``"Thinker"`` is TP-enabled.
+
+        No explicit groups or shard dims are declared here; per the original
+        note, the engine's worker maps ``tp_size`` from the yaml's node_group
+        onto each rank's comm_group.
+        """
+        from mstar.distributed.base import ShardingConfig
+
+        tp_nodes = {"Thinker"}
+        return ShardingConfig(groups=[], tp_enabled_nodes=tp_nodes, shard_dim={})
+
+    def get_submodule(self, node_name: str, device="cpu", tp_group=None):
+        """Build (once) and return the engine submodule for ``node_name``.
+
+        Results are memoised in ``self._submodule_cache``. The encoder,
+        Talker and ImageGen nodes delegate to their ``_create_*`` factories;
+        ``"Thinker"`` is assembled inline.
+
+        Args:
+            node_name: One of ``'Thinker'``, ``'vision_encoder'``,
+                ``'audio_encoder'``, ``'Talker'``, ``'ImageGen'``.
+            device: Target device for the built modules.
+            tp_group: Passed to the Thinker model as ``comm_group``; unused by
+                the other nodes.
+
+        Raises:
+            ValueError: If ``node_name`` is not a registered node.
+        """
+        cache = self._submodule_cache
+        if node_name in cache:
+            return cache[node_name]
+
+        # Nodes whose construction is fully delegated to a factory method.
+        delegated = {
+            "vision_encoder": self._create_vision_encoder_submodule,
+            "audio_encoder": self._create_audio_encoder_submodule,
+            "Talker": self._create_talker_submodule,
+            "ImageGen": self._create_imagegen_submodule,
+        }
+        factory = delegated.get(node_name)
+        if factory is not None:
+            built = factory(device)
+            cache[node_name] = built
+            return built
+        if node_name != "Thinker":
+            raise ValueError(
+                f"Unknown node: {node_name!r}. Registers "
+                f"'Thinker', 'vision_encoder', 'audio_encoder', 'Talker', "
+                f"'ImageGen'."
+            )
+
+        # Construct on the meta device so the constructor allocates nothing
+        # real; ``to_empty`` below gives each Parameter storage on ``device``
+        # and the loader then fills it.
+        cfg = self.config.thinker_llm
+        mrope_section = cfg.mrope_section
+        with torch.device("meta"):
+            thinker = LingMoeModel(
+                vocab_size=cfg.vocab_size,
+                hidden_size=cfg.hidden_size,
+                intermediate_size=cfg.intermediate_size,
+                moe_intermediate_size=cfg.moe_intermediate_size,
+                num_hidden_layers=cfg.num_hidden_layers,
+                num_attention_heads=cfg.num_attention_heads,
+                num_kv_heads=cfg.num_key_value_heads,
+                head_dim=cfg.head_dim,
+                rms_norm_eps=cfg.rms_norm_eps,
+                rope_theta=cfg.rope_theta,
+                max_position_embeddings=cfg.max_position_embeddings,
+                partial_rotary_factor=cfg.partial_rotary_factor,
+                mrope_section=mrope_section,
+                num_experts=cfg.num_experts,
+                num_experts_per_tok=cfg.num_experts_per_tok,
+                num_shared_experts=cfg.num_shared_experts,
+                n_group=cfg.n_group,
+                topk_group=cfg.topk_group,
+                routed_scaling_factor=cfg.moe_router_topk_scaling_factor,
+                first_k_dense_replace=cfg.first_k_dense_replace,
+                tie_word_embeddings=cfg.tie_word_embeddings,
+                use_qkv_bias=cfg.use_qkv_bias,
+                use_bias=cfg.use_bias,
+                comm_group=tp_group,
+            )
+        # Order matters: change dtype while still on meta (metadata-only),
+        # THEN materialise. Doing it the other way round would allocate every
+        # parameter at the constructor's default dtype first and only then
+        # down-cast, raising peak memory (the original note reports an OOM at
+        # TP=4 from exactly that).
+        thinker.to(self.get_autocast_dtype())
+        thinker.to_empty(device=device)
+
+        load_thinker_weights(thinker, self.local_dir, device=device, strict=True)
+        thinker.eval()
+
+        built = BailingMoeV2ThinkerSubmodule(
+            model=thinker,
+            config=self.config,
+            eos_token_id=cfg.eos_token_id,
+        )
+        cache[node_name] = built
+        return built
+
+    # ------------------------------------------------------------------
+    # Encoder construction helpers (step 5a)
+    # ------------------------------------------------------------------
+
+    def _create_vision_encoder_submodule(self, device: str):
+        """Assemble the vision encoder and projector, load both, wrap them.
+
+        Both modules use ``self.get_autocast_dtype()`` and are placed on
+        ``device``. The attention implementation is read from the
+        ``MING_VISION_ATTN_IMPL`` environment variable and defaults to
+        ``"flash_attention_2"``. Weights are loaded strictly from
+        ``self.local_dir``.
+
+        Returns:
+            A ``VisionEncoderSubmodule`` holding the encoder and projector.
+        """
+        from mstar.model.ming_omni_flash.components.projectors import (
+            MingVisionProjector,
+        )
+        from mstar.model.ming_omni_flash.components.vision_encoder import (
+            build_vision_encoder,
+        )
+        from mstar.model.ming_omni_flash.loader import (
+            load_vision_encoder_weights,
+            load_vision_projector_weights,
+        )
+
+        compute_dtype = self.get_autocast_dtype()
+        attn_impl = os.environ.get("MING_VISION_ATTN_IMPL", "flash_attention_2")
+
+        # --- encoder -----------------------------------------------------
+        encoder = build_vision_encoder(
+            config=self.config.vision,
+            dtype=compute_dtype,
+            device=device,
+            attn_implementation=attn_impl,
+            local_dir=self.local_dir,
+        )
+        load_vision_encoder_weights(encoder, self.local_dir, device=device, strict=True)
+
+        # --- projector (vision width -> Thinker hidden width) -------------
+        projector = MingVisionProjector(
+            vision_dim=self.config.vision.out_hidden_size,
+            llm_dim=self.config.thinker_llm.hidden_size,
+            mlp_depth=self.config.mlp_depth,
+        ).to(dtype=compute_dtype, device=device)
+        load_vision_projector_weights(projector, self.local_dir, device=device, strict=True)
+        projector.eval()
+
+        return VisionEncoderSubmodule(
+            vision_encoder=encoder,
+            vision_projector=projector,
+            config=self.config,
+        )
+
+    def _create_audio_encoder_submodule(self, device: str):
+        """Assemble the audio encoder and projector, load both, wrap them.
+
+        Both modules use ``self.get_autocast_dtype()`` and are placed on
+        ``device``. The encoder is built with ``use_flash_attn=True``.
+        Weights are loaded strictly from ``self.local_dir``.
+
+        Returns:
+            An ``AudioEncoderSubmodule`` holding the encoder and projector.
+        """
+        from mstar.model.ming_omni_flash.components.audio_encoder import (
+            build_audio_encoder,
+        )
+        from mstar.model.ming_omni_flash.components.projectors import (
+            MingAudioProjector,
+        )
+        from mstar.model.ming_omni_flash.loader import (
+            load_audio_encoder_weights,
+            load_audio_projector_weights,
+        )
+
+        compute_dtype = self.get_autocast_dtype()
+
+        # --- encoder -----------------------------------------------------
+        encoder = build_audio_encoder(
+            audio_config=self.config.audio_encoder,
+            dtype=compute_dtype,
+            device=device,
+            use_flash_attn=True,
+        )
+        load_audio_encoder_weights(encoder, self.local_dir, device=device, strict=True)
+
+        # --- projector (audio width -> Thinker hidden width) --------------
+        projector = MingAudioProjector(
+            audio_dim=self.config.audio_encoder.d_model,
+            llm_dim=self.config.thinker_llm.hidden_size,
+            ds_kernel_size=self.config.audio_encoder.ds_kernel_size,
+            ds_stride=self.config.audio_encoder.ds_stride,
+            mlp_depth=self.config.mlp_depth,
+        ).to(dtype=compute_dtype, device=device)
+        load_audio_projector_weights(projector, self.local_dir, device=device, strict=True)
+        projector.eval()
+
+        return AudioEncoderSubmodule(
+            audio_encoder=encoder,
+            audio_projector=projector,
+            config=self.config,
+        )
+
+    def _create_talker_submodule(self, device: str):
+        """Assemble the talker stack, load its weights, and wrap it.
+
+        Builds, in order, the talker LLM, CFM, aggregator, heads and audio
+        VAE. Each is created at ``self.get_autocast_dtype()`` on ``device``
+        and has its weights loaded strictly from ``self.local_dir`` right
+        after construction. The parts are combined into a ``TalkerGenerator``
+        (only ``heads["stop_head"]`` is passed on) which is wrapped in a
+        ``TalkerSubmodule``.
+
+        Raises:
+            RuntimeError: If ``self.config.talker`` is ``None``.
+        """
+        if self.config.talker is None:
+            raise RuntimeError(
+                "MingFlashOmniModel: 'Talker' node requested but the snapshot "
+                "has no talker/ subdir (thinker-only checkpoint)."
+            )
+        from mstar.model.ming_omni_flash.components.audio_vae import (
+            build_audio_vae,
+        )
+        from mstar.model.ming_omni_flash.components.talker_dit import (
+            build_aggregator,
+            build_talker_cfm,
+            build_talker_heads,
+            build_talker_llm,
+        )
+        from mstar.model.ming_omni_flash.components.talker_generator import (
+            TalkerGenerator,
+        )
+        from mstar.model.ming_omni_flash.loader import (
+            load_talker_aggregator_weights,
+            load_talker_audio_vae_weights,
+            load_talker_cfm_weights,
+            load_talker_heads_weights,
+            load_talker_llm_weights,
+        )
+        from mstar.model.ming_omni_flash.submodules import TalkerSubmodule
+
+        talker_cfg = self.config.talker
+        compute_dtype = self.get_autocast_dtype()
+
+        def _build_and_load(build, load, part_cfg):
+            # Construct one component, then immediately stream its weights in.
+            part = build(part_cfg, dtype=compute_dtype, device=device)
+            load(part, self.local_dir, device=device, strict=True)
+            return part
+
+        talker_llm = _build_and_load(
+            build_talker_llm, load_talker_llm_weights, talker_cfg.llm,
+        )
+        talker_cfm = _build_and_load(
+            build_talker_cfm, load_talker_cfm_weights, talker_cfg,
+        )
+        talker_aggregator = _build_and_load(
+            build_aggregator, load_talker_aggregator_weights, talker_cfg,
+        )
+        talker_heads = _build_and_load(
+            build_talker_heads, load_talker_heads_weights, talker_cfg,
+        )
+        talker_vae = _build_and_load(
+            build_audio_vae, load_talker_audio_vae_weights, talker_cfg.vae,
+        )
+
+        return TalkerSubmodule(
+            generator=TalkerGenerator(
+                talker_config=talker_cfg,
+                llm=talker_llm,
+                cfm=talker_cfm,
+                aggregator=talker_aggregator,
+                stop_head=talker_heads["stop_head"],
+                audio_vae=talker_vae,
+            ),
+            config=self.config,
+            text_bridge=self.thinker_text_to_talker_inputs,
+        )
+
+    def _create_imagegen_submodule(self, device: str):
+        """Load the image-generation pipeline and wrap it in a submodule.
+
+        The pipeline comes from ``MingImagePipeline.from_checkpoint``, given
+        ``self.local_dir``, ``self.config.image_gen``, the target ``device``
+        and ``self.get_autocast_dtype()``. Per the original note,
+        ``from_checkpoint`` imports diffusers lazily, so that dependency is
+        only needed when this node is actually requested.
+
+        Raises:
+            RuntimeError: If ``self.config.image_gen`` is ``None``.
+        """
+        if self.config.image_gen is None:
+            raise RuntimeError(
+                "MingFlashOmniModel: 'ImageGen' node requested but the snapshot "
+                "has no imagegen tree (no transformer/ + vae/ + connector/)."
+            )
+        from mstar.model.ming_omni_flash.components.imagegen_pipeline import (
+            MingImagePipeline,
+        )
+        from mstar.model.ming_omni_flash.submodules import ImageGenSubmodule
+
+        checkpoint_dir = self.local_dir
+        image_gen_cfg = self.config.image_gen
+        image_pipeline = MingImagePipeline.from_checkpoint(
+            checkpoint_dir,
+            image_gen_cfg,
+            device=device,
+            dtype=self.get_autocast_dtype(),
+        )
+        return ImageGenSubmodule(pipeline=image_pipeline, config=self.config)
diff --git a/mstar/model/ming_omni_flash/submodules.py b/mstar/model/ming_omni_flash/submodules.py
new file mode 100644
index 00000000..b8a9acd3
--- /dev/null
+++ b/mstar/model/ming_omni_flash/submodules.py
@@ -0,0 +1,1028 @@
+"""mstar engine submodules for Ming-flash-omni-2.0.
+
+Three submodules covering the multimodal-understanding side of the model:
+
+  * ``VisionEncoderSubmodule`` (enc_dec / stateless) — runs Ming's
+    Qwen3MoeVisionTransformer + MingVisionProjector, returns
+    LLM-space vision embeddings for the Thinker to splice in.
+
+  * ``AudioEncoderSubmodule`` (enc_dec / stateless) — runs
+    MingAudioEncoder + MingAudioProjector, returns LLM-space audio
+    embeddings (packed across clips).
+
+  * ``BailingMoeV2ThinkerSubmodule`` (AR / KV-cache) — the Ling-2.0
+    MoE LLM. Text-only paths are wired today (step 3d–3f); the
+    vision/audio prefill paths grow in via this submodule's
+    ``prepare_inputs`` dispatch in step 5b.
+
+Reference: mstar's :class:`OrpheusLLMSubmodule`
+(`mstar/model/orpheus/submodules.py:20-176`) is the cleanest text-LLM
+template; Qwen3-Omni's submodules
+(`mstar/model/qwen3_omni/submodules.py`) show the multimodal extensions
+and graph-walk dispatch we mirror here.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import torch
+from torch import nn
+
+from mstar.communication.tensors import NameToTensorList
+from mstar.conductor.request_info import CurrentForwardPassInfo
+from mstar.engine.kv_store import PositionInfo
+from mstar.model.ming_omni_flash.components.model import LingMoeModel
+from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+from mstar.model.submodule_base import (
+    ARNodeInputs,
+    ARNodeSubmodule,
+    ModelInputsFromEngine,
+    NodeInputs,
+    NodeSubmodule,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ===================================================================
+# 1. VisionEncoderSubmodule (stateless enc_dec engine)
+# ===================================================================
+
+
+class VisionEncoderSubmodule(NodeSubmodule):
+    """Stateless vision node: vision encoder -> projector -> L2 normalisation.
+
+    Consumes ``pixel_values`` plus a grid descriptor (``image_grid_thw`` or
+    ``grid_thw``) and emits ``vision_embeds`` that are already projected and
+    L2-normalised along the last dim, so the Thinker can splice them in
+    without a further linear layer (mirrors
+    ``modeling_bailingmm2.extract_image_feature`` per the original notes).
+
+    ``deepstack`` features are not plumbed: when the wrapped encoder returns
+    a tuple, only its first element is used.
+
+    NOTE(review): ``prepare_inputs`` consumes only the first entry of each
+    input list — confirm how multi-image requests are routed to this node.
+    """
+
+    def __init__(
+        self,
+        vision_encoder: nn.Module,
+        vision_projector: nn.Module,
+        config: MingFlashOmniModelConfig,
+    ) -> None:
+        """Store the already-built (and weight-loaded) encoder and projector."""
+        super().__init__()
+        self.vision_encoder = vision_encoder
+        self.vision_projector = vision_projector
+        self.config = config
+
+    def prepare_inputs(
+        self,
+        graph_walk: str,
+        fwd_info: CurrentForwardPassInfo,
+        inputs: NameToTensorList,
+        **kwargs,
+    ) -> NodeInputs:
+        """Select ``pixel_values`` and the grid descriptor from ``inputs``.
+
+        The grid is read from ``image_grid_thw``, falling back to
+        ``grid_thw`` when that key is absent or empty. A 1-D grid is promoted
+        to a single row via ``unsqueeze(0)``.
+
+        Raises:
+            ValueError: If ``pixel_values`` is missing/empty, or no grid
+                tensor is available (missing, empty list, or ``None`` entry).
+        """
+        if "pixel_values" not in inputs or not inputs["pixel_values"]:
+            raise ValueError(
+                "VisionEncoderSubmodule: missing 'pixel_values' input. "
+                "process_prompt must produce this from the image processor."
+            )
+        pixel_values = inputs["pixel_values"][0]
+        # An empty list used to fall through to ``[][0]`` and raise a bare
+        # IndexError; treat it like a missing grid so the descriptive error
+        # below is raised instead.
+        grid_entries = inputs.get("image_grid_thw") or inputs.get("grid_thw")
+        grid_thw = grid_entries[0] if grid_entries else None
+        if grid_thw is None:
+            raise ValueError(
+                "VisionEncoderSubmodule: 'image_grid_thw' is None. "
+                "Make sure process_prompt forwarded image_grid_thw from "
+                "the HF image processor (BailingMM2Processor)."
+            )
+        if grid_thw.dim() == 1:
+            grid_thw = grid_thw.unsqueeze(0)  # promote to a single-row grid
+
+        return NodeInputs(
+            tensor_inputs={
+                "pixel_values": pixel_values,
+                "grid_thw": grid_thw,
+            }
+        )
+
+    def forward(
+        self,
+        graph_walk: str,
+        engine_inputs: ModelInputsFromEngine,
+        pixel_values: torch.Tensor,
+        grid_thw: torch.Tensor,
+        **kwargs,
+    ) -> NameToTensorList:
+        """Run encoder -> projector -> ``F.normalize(dim=-1)`` without grad.
+
+        Returns:
+            ``{"vision_embeds": [projected]}`` where ``projected`` is the
+            L2-normalised projector output.
+        """
+        logger.debug(
+            "VisionEncoder: pixel_values=%s grid_thw=%s",
+            tuple(pixel_values.shape), tuple(grid_thw.shape),
+        )
+        with torch.no_grad():
+            # ``pixel_values`` defines the working device; only the grid may
+            # need to be moved to join it.
+            vision_embeds = self.vision_encoder(
+                pixel_values, grid_thw=grid_thw.to(pixel_values.device),
+            )
+            if isinstance(vision_embeds, tuple):
+                # Defensive: an encoder built with deepstack enabled returns
+                # extra features; keep only the main embeddings.
+                vision_embeds = vision_embeds[0]
+            projected = self.vision_projector(vision_embeds)
+            projected = torch.nn.functional.normalize(projected, dim=-1)
+        return {"vision_embeds": [projected]}
+
+
+# ===================================================================
+# 2. AudioEncoderSubmodule (stateless enc_dec engine)
+# ===================================================================
+
+
+class AudioEncoderSubmodule(NodeSubmodule):
+    """Stateless audio node: audio encoder -> projector -> optional L2 norm.
+
+    Consumes mel features for one clip (rank 2) or a batch of clips (rank 3)
+    and emits a single ``audio_embeds`` tensor with all clips' projected
+    frames concatenated along dim 0, ready for the Thinker to splice.
+
+    The wrapped encoder is called with a list of per-clip mels and is
+    expected to return ``(packed, cu_seqlens)``; ``cu_seqlens`` is only used
+    to cut ``packed`` back into per-clip segments for the projector.
+
+    NOTE(review): ``prepare_inputs`` consumes only the first entry of each
+    input list — confirm how multi-clip requests are routed to this node.
+    """
+
+    def __init__(
+        self,
+        audio_encoder: nn.Module,
+        audio_projector: nn.Module,
+        config: MingFlashOmniModelConfig,
+    ) -> None:
+        """Store the already-built (and weight-loaded) encoder and projector."""
+        super().__init__()
+        self.audio_encoder = audio_encoder
+        self.audio_projector = audio_projector
+        self.config = config
+
+    def prepare_inputs(
+        self,
+        graph_walk: str,
+        fwd_info: CurrentForwardPassInfo,
+        inputs: NameToTensorList,
+        **kwargs,
+    ) -> NodeInputs:
+        """Select ``audio_features`` and (optionally) ``audio_seqlens``.
+
+        ``audio_features`` is required; ``audio_seqlens`` defaults to
+        ``None`` when the key is absent.
+
+        Raises:
+            ValueError: If ``audio_features`` is missing or empty.
+        """
+        if "audio_features" not in inputs or not inputs["audio_features"]:
+            raise ValueError(
+                "AudioEncoderSubmodule: missing 'audio_features' input. "
+                "process_prompt must produce this from the audio processor."
+            )
+        audio_features = inputs["audio_features"][0]
+        audio_seqlens = inputs.get("audio_seqlens", [None])[0]
+        return NodeInputs(
+            tensor_inputs={
+                "audio_features": audio_features,
+                "audio_seqlens": audio_seqlens,
+            }
+        )
+
+    def forward(
+        self,
+        graph_walk: str,
+        engine_inputs: ModelInputsFromEngine,
+        audio_features: torch.Tensor,
+        audio_seqlens: torch.Tensor | None = None,
+        **kwargs,
+    ) -> NameToTensorList:
+        """Encode, project and (optionally) L2-normalise the audio clips.
+
+        Args:
+            audio_features: Rank-2 tensor for one clip or rank-3 tensor whose
+                dim 0 indexes clips.
+            audio_seqlens: Optional per-clip lengths used to trim the last
+                dim of each clip before encoding. A scalar (0-d) value is
+                accepted for the single-clip case.
+
+        Returns:
+            ``{"audio_embeds": [embeds]}`` with ``embeds`` cast to
+            ``audio_features.dtype``.
+
+        Raises:
+            ValueError: If ``audio_features`` is not rank 2 or 3, or fewer
+                lengths than clips are supplied.
+        """
+        # Accept a single clip (rank 2) or a batch of clips (rank 3).
+        if audio_features.dim() == 2:
+            mel_list: list[torch.Tensor] = [audio_features]
+        elif audio_features.dim() == 3:
+            mel_list = list(audio_features.unbind(0))
+        else:
+            raise ValueError(
+                f"AudioEncoderSubmodule: expected audio_features of rank 2 or 3, "
+                f"got rank {audio_features.dim()} with shape {tuple(audio_features.shape)}"
+            )
+        # Trim each clip's padded tail so the encoder only sees real frames.
+        # NOTE(review): this assumes ``audio_seqlens`` counts frames along the
+        # mel time axis — confirm against the audio processor's output.
+        if audio_seqlens is not None:
+            # Flatten first so a 0-d scalar length works, and pull all
+            # lengths to the host in one transfer instead of one per clip.
+            lengths = torch.as_tensor(audio_seqlens).reshape(-1).tolist()
+            if len(lengths) < len(mel_list):
+                raise ValueError(
+                    f"AudioEncoderSubmodule: got {len(lengths)} audio_seqlens "
+                    f"for {len(mel_list)} clip(s)"
+                )
+            mel_list = [m[..., : int(n)] for m, n in zip(mel_list, lengths)]
+
+        logger.debug(
+            "AudioEncoder: %d clip(s), per-clip mel T=%s",
+            len(mel_list), [int(m.shape[-1]) for m in mel_list],
+        )
+        with torch.no_grad():
+            # The encoder packs all clips along dim 0; ``cu_seqlens`` holds
+            # the cumulative segment boundaries.
+            packed, cu_seqlens = self.audio_encoder(mel_list)
+            bounds = cu_seqlens.tolist()
+            # The projector is fed one clip at a time with a leading batch
+            # dim of 1; its output is squeezed and transposed so frames sit
+            # on dim 0 before concatenation.
+            projected_chunks: list[torch.Tensor] = [
+                self.audio_projector(packed[start:stop].unsqueeze(0))
+                .squeeze(0)
+                .transpose(0, 1)
+                for start, stop in zip(bounds, bounds[1:])
+            ]
+            audio_embeds = torch.cat(projected_chunks, dim=0)
+
+            if self.config.audio_encoder.norm_query_embeds:
+                audio_embeds = torch.nn.functional.normalize(audio_embeds, dim=-1)
+
+        return {"audio_embeds": [audio_embeds.to(audio_features.dtype)]}
+
+
+class BailingMoeV2ThinkerSubmodule(ARNodeSubmodule):
+    """Thinker submodule for Ming-flash-omni-2.0.
+
+    Graph walks the dispatch handles:
+      * ``prefill`` / ``prefill_text``: embed text token ids, fill KV
+        cache, sample first token's logits. (``prefill`` is the legacy
+        text-only name kept for backward compat with step 3f; step 5c
+        renames the walk to ``prefill_text``.)
+      * ``prefill_audio``: splice precomputed audio embeddings between
+        ``audio_start`` / ``audio_end`` sentinel embeddings; build
+        text-like 3D MRoPE positions for the span; fill KV cache;
+        sample first token's logits.
+      * ``prefill_vision`` / ``prefill_video``: splice precomputed
+        vision embeddings between ``image_start`` / ``image_end``
+        (or ``video_start`` / ``video_end``) sentinel embeddings;
+        build grid-aware 3D MRoPE positions per
+        ``modeling_bailing_moe_v2.get_rope_index:625-647``; fill KV
+        cache; sample first token's logits.
+      * ``decode`` / ``thinker_decode``: embed the previous token,
+        single-step forward, sample next-token logits.
+
+    The submodule does NOT use ``cache_handle.apply_rope`` — Ling-2.0's
+    partial 3D ``video_rope`` is applied inline by
+    :class:`LingAttention` using the explicit ``position_ids`` argument.
+    """
+
+    # Walk names treated as text-only prefill (no embed splicing).
+    _TEXT_PREFILL_WALKS = ("prefill", "prefill_text")
+    # Walk names treated as autoregressive decode (one-token step).
+    _DECODE_WALKS = ("decode", "thinker_decode")
+
+    def __init__(
+        self,
+        model: LingMoeModel,
+        config: MingFlashOmniModelConfig | None = None,
+        eos_token_id: int = 156895,
+    ) -> None:
+        """Wrap an already-built Thinker model.
+
+        Args:
+            model: The LLM; its ``embed_tokens`` and ``lm_head`` are also
+                exposed directly on this submodule.
+            config: Full model config. May be ``None``, but the sentinel
+                embedding helpers raise ``RuntimeError`` without it.
+            eos_token_id: End-of-sequence token id (the default is presumably
+                the released tokenizer's EOS id — TODO confirm).
+        """
+        super().__init__()
+        self.model = model
+        self.config = config
+        self.eos_token_id = eos_token_id
+        # Stash the embed_tokens / lm_head as direct attributes so the
+        # engine's CUDA-graph captures don't reach through .model.
+        self.embed_tokens = model.embed_tokens
+        self.lm_head = model.lm_head
+
+        # Lazily-cached sentinel token embeddings (1, hidden_size each).
+        # Recomputed on first use per device; allocated lazily so CPU
+        # tests don't materialise the embed table at import time.
+        # Each start/end pair is filled together by its ``_get_*_bos_eos``
+        # helper and refreshed when the requested device changes.
+        self._image_start_embed: torch.Tensor | None = None
+        self._image_end_embed: torch.Tensor | None = None
+        self._video_start_embed: torch.Tensor | None = None
+        self._video_end_embed: torch.Tensor | None = None
+        self._audio_start_embed: torch.Tensor | None = None
+        self._audio_end_embed: torch.Tensor | None = None
+
+    # ------------------------------------------------------------------
+    # Sentinel embedding helpers
+    # ------------------------------------------------------------------
+
+    def _sentinel_embed(self, token_id: int, device: torch.device) -> torch.Tensor:
+        """Look up the embedding row for one token id.
+
+        A one-element ``long`` index tensor is created on ``device`` and
+        passed through ``self.embed_tokens``.
+        """
+        index = torch.full((1,), int(token_id), dtype=torch.long, device=device)
+        return self.embed_tokens(index)  # one row: (1, hidden_size)
+
+    def _get_vision_bos_eos(
+        self, device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Return the cached ``(image_start, image_end)`` sentinel embeddings.
+
+        Both embeddings are (re)computed together whenever nothing is cached
+        yet or the cached start embedding lives on a different device.
+
+        Raises:
+            RuntimeError: If ``self.config`` is ``None``.
+        """
+        if self.config is None:
+            raise RuntimeError(
+                "BailingMoeV2ThinkerSubmodule.config is None — required for "
+                "vision sentinel embeddings. Pass config=... at construction "
+                "(step 5b)."
+            )
+        token_cfg = self.config.thinker_llm
+        cached = self._image_start_embed
+        stale = cached is None or cached.device != device
+        if stale:
+            self._image_start_embed = self._sentinel_embed(token_cfg.image_start_token, device)
+            self._image_end_embed = self._sentinel_embed(token_cfg.image_end_token, device)
+        return self._image_start_embed, self._image_end_embed
+
+    def _get_video_bos_eos(
+        self, device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.config is None:
+            raise RuntimeError("config required for video sentinels.")
+        llm = self.config.thinker_llm
+        if self._video_start_embed is None or self._video_start_embed.device != device:
+            self._video_start_embed = self._sentinel_embed(llm.video_start_token, device)
+            self._video_end_embed = self._sentinel_embed(llm.video_end_token, device)
+        return self._video_start_embed, self._video_end_embed
+
+    def _get_audio_bos_eos(
+        self, device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.config is None:
+            raise RuntimeError("config required for audio sentinels.")
+        llm = self.config.thinker_llm
+        if self._audio_start_embed is None or self._audio_start_embed.device != device:
+            self._audio_start_embed = self._sentinel_embed(llm.audio_start_token, device)
+            self._audio_end_embed = self._sentinel_embed(llm.audio_end_token, device)
+        return self._audio_start_embed, self._audio_end_embed
+
+    # ------------------------------------------------------------------
+    # Image-gen producer-side hidden-state extraction (step 9b)
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def extract_image_gen_hidden_states(
+        hidden_states: torch.Tensor,
+        token_ids: torch.Tensor,
+        image_patch_token: int,
+    ) -> torch.Tensor:
+        """Slice the post-norm hidden states at the ``<imagePatch>`` positions.
+
+        For an image-generation request the prompt carries an
+        ``<image><imagePatch>*N</image>`` block (appended by
+        :func:`maybe_expand_image_gen_prompt`, step 8b). The DiT condition
+        encoder (step 9b) consumes the thinker's hidden states *at those N
+        query-token positions* — not the sampled token. This helper pulls them
+        out so the Thinker→ImageGen streaming edge can carry
+        ``thinker_hidden_states``.
+
+        Args:
+            hidden_states: ``(T, H)`` post-norm thinker hidden states (from
+                ``LingMoeModel.forward(..., return_hidden_states=True)``).
+            token_ids: ``(T,)`` the input token ids for the same forward pass
+                (used to locate the patch positions).
+            image_patch_token: the ``<imagePatch>`` token id
+                (``config.thinker_llm.image_patch_token``, 157157 on the
+                released ckpt).
+
+        Returns:
+            ``(N, H)`` hidden states at the patch positions, in order.
+
+        Raises:
+            ValueError: if shapes disagree or no patch tokens are present.
+        """
+        if hidden_states.dim() != 2:
+            raise ValueError(f"expected (T, H) hidden_states, got {tuple(hidden_states.shape)}")
+        if token_ids.dim() != 1:
+            token_ids = token_ids.reshape(-1)
+        if token_ids.shape[0] != hidden_states.shape[0]:
+            raise ValueError(
+                f"token_ids length {token_ids.shape[0]} != hidden_states T "
+                f"{hidden_states.shape[0]}"
+            )
+        mask = token_ids.to(hidden_states.device) == int(image_patch_token)
+        if not bool(mask.any()):
+            raise ValueError(
+                f"no <imagePatch> token ({image_patch_token}) found in token_ids; "
+                "process_prompt must append the query-token block for image output."
+            )
+        return hidden_states[mask]
+
+    # ------------------------------------------------------------------
+    # ARNodeSubmodule contract
+    # ------------------------------------------------------------------
+
+    def prepare_inputs(
+        self,
+        graph_walk: str,
+        fwd_info: CurrentForwardPassInfo,
+        inputs: NameToTensorList,
+        pos_info: dict[str, PositionInfo] = {},
+        **kwargs,
+    ) -> ARNodeInputs:
+        """Dispatch on graph_walk to build per-request ARNodeInputs.
+
+        ``**kwargs`` absorbs engine-passed extras (e.g. ``seen_token_mask``
+        from the KV-cache engine's sampler) that this submodule doesn't use,
+        mirroring the peer models so the engine→submodule contract stays
+        forward-compatible.
+
+        Text-only walks return ``input_ids`` (LingMoeModel embeds them
+        inline). Multimodal walks return precomputed ``input_embeds``
+        + ``custom_pos_ids`` so the position counter stays in sync
+        with the sentinel + modality span structure
+        ``modeling_bailing_moe_v2.get_rope_index`` would have produced.
+        """
+        device = self.get_device()
+        start_pos = int(
+            pos_info.get("main", PositionInfo()).position_id_start
+        )
+
+        if graph_walk in self._DECODE_WALKS or graph_walk in self._TEXT_PREFILL_WALKS:
+            token_ids = inputs["text_inputs"][0].to(device)
+            return ARNodeInputs(
+                input_ids=token_ids,
+                input_seq_len=token_ids.shape[0],
+            )
+
+        if graph_walk == "prefill_audio":
+            return self._prepare_prefill_audio(inputs, device, start_pos)
+
+        if graph_walk in ("prefill_vision", "prefill_video"):
+            return self._prepare_prefill_vision(
+                inputs, device, start_pos, video=(graph_walk == "prefill_video"),
+            )
+
+        raise ValueError(
+            f"BailingMoeV2ThinkerSubmodule: unknown graph_walk {graph_walk!r}. "
+            f"Supported: prefill / prefill_text / prefill_audio / prefill_vision "
+            f"/ prefill_video / decode / thinker_decode."
+        )
+
+    def _prepare_prefill_audio(
+        self,
+        inputs: NameToTensorList,
+        device: torch.device,
+        start_pos: int,
+    ) -> ARNodeInputs:
+        """Audio prefill: splice ``[bos, audio_embeds, eos]``, text positions."""
+        # Local import to keep the components/positions module a leaf in
+        # the dependency graph (avoids a circular import at module load).
+        from mstar.model.ming_omni_flash.components.positions import (
+            get_rope_index_text,
+        )
+        if "audio_embeds" not in inputs or not inputs["audio_embeds"]:
+            raise ValueError(
+                "prefill_audio walk: missing 'audio_embeds' input. "
+                "Make sure the prefill graph routes the AudioEncoder "
+                "output edge into the Thinker."
+            )
+        audio_embeds = inputs["audio_embeds"][0].to(device)
+        bos, eos = self._get_audio_bos_eos(device)
+        # Match dtype between sentinel embeds and audio embeds. The
+        # encoder's projector returns the LLM's autocast dtype while
+        # the embed_tokens table lives in the model's stored dtype —
+        # cast sentinels to the audio dtype so the cat is consistent.
+        bos = bos.to(audio_embeds.dtype)
+        eos = eos.to(audio_embeds.dtype)
+        embeds = torch.cat([bos, audio_embeds, eos], dim=0)
+        seq_len = embeds.shape[0]
+        pos_ids = get_rope_index_text(seq_len, start_pos, device=device)
+        return ARNodeInputs(
+            input_seq_len=seq_len,
+            input_embeds=embeds,
+            custom_pos_ids=pos_ids,
+        )
+
+    def _prepare_prefill_vision(
+        self,
+        inputs: NameToTensorList,
+        device: torch.device,
+        start_pos: int,
+        video: bool,
+    ) -> ARNodeInputs:
+        """Vision prefill: splice ``[bos, vision_embeds, eos]`` + grid positions."""
+        from mstar.model.ming_omni_flash.components.positions import (
+            get_rope_index_text,
+            get_rope_index_vision,
+        )
+        if "vision_embeds" not in inputs or not inputs["vision_embeds"]:
+            raise ValueError(
+                "prefill_vision walk: missing 'vision_embeds' input. "
+                "Make sure the prefill graph routes the VisionEncoder "
+                "output edge into the Thinker."
+            )
+        vision_embeds = inputs["vision_embeds"][0].to(device)
+        grid_thw = inputs.get(
+            "image_grid_thw", inputs.get("video_grid_thw", inputs.get("grid_thw", [None])),
+        )[0]
+        if grid_thw is None:
+            raise ValueError(
+                "prefill_vision walk: missing 'image_grid_thw' input. "
+                "process_prompt must forward this from the image processor."
+            )
+        grid_thw = grid_thw.to(device)
+        if grid_thw.dim() == 1:
+            grid = grid_thw
+        else:
+            # Multi-image / multi-clip support is step 5c (the graph
+            # router will sequence one Sequential per image). For 5b
+            # we restrict to a single image / clip per request.
+            if grid_thw.shape[0] > 1:
+                raise NotImplementedError(
+                    "prefill_vision: multi-image grid_thw not supported in "
+                    "step 5b; one image / clip per request only."
+                )
+            grid = grid_thw[0]
+
+        # Video walks honor a per-frame timestamp via
+        # ``video_second_per_grid``; image walks pass None (one frame).
+        seconds_per_grid: float | None = None
+        if video:
+            spg = inputs.get("video_second_per_grid", [None])[0]
+            if spg is not None:
+                seconds_per_grid = float(
+                    spg.item() if isinstance(spg, torch.Tensor) else spg
+                )
+            else:
+                seconds_per_grid = 1.0  # mirrors the upstream default
+
+        bos, eos = (
+            self._get_video_bos_eos(device) if video
+            else self._get_vision_bos_eos(device)
+        )
+        bos = bos.to(vision_embeds.dtype)
+        eos = eos.to(vision_embeds.dtype)
+        embeds = torch.cat([bos, vision_embeds, eos], dim=0)
+        seq_len = embeds.shape[0]
+
+        if self.config is None:
+            raise RuntimeError("config required for prefill_vision (spatial_merge_size).")
+        spatial_merge = self.config.vision.spatial_merge_size
+        bos_pos = get_rope_index_text(1, start_pos, device=device)
+        vision_pos = get_rope_index_vision(
+            grid_thw=grid,
+            start_pos=start_pos + 1,
+            spatial_merge_size=spatial_merge,
+            device=device,
+            second_per_grid_t=seconds_per_grid,
+            tokens_per_second=self.config.thinker_llm.tokens_per_second,
+        )
+        # eos goes one past the largest vision position so the next walk's
+        # text positions don't collide with the vision span's T/H/W ranges.
+        eos_pos_start = int(vision_pos.max().item()) + 1
+        eos_pos = get_rope_index_text(1, eos_pos_start, device=device)
+        pos_ids = torch.cat([bos_pos, vision_pos, eos_pos], dim=1)
+        if pos_ids.shape != (3, seq_len):
+            raise AssertionError(
+                f"prefill_vision: position_ids shape {tuple(pos_ids.shape)} "
+                f"does not match seq_len={seq_len} (3, T) expectation."
+            )
+        return ARNodeInputs(
+            input_seq_len=seq_len,
+            input_embeds=embeds,
+            custom_pos_ids=pos_ids,
+        )
+
+    def preprocess(
+        self,
+        graph_walk: str,
+        engine_inputs: ModelInputsFromEngine,
+        inputs: list[ARNodeInputs],
+    ) -> dict[str, torch.Tensor | Any]:
+        """Plan attention; pack inputs for forward.
+
+        Single-request only in step 3d; batched preprocess folds in
+        step 3e+ via ``can_batch`` + ``forward_batched``. The text and
+        multimodal paths use mutually exclusive keys downstream so the
+        forward can branch on which one is set: ``text_inputs`` for
+        the input-ids path, ``input_embeds`` + ``position_ids`` for
+        the embeds path.
+        """
+        if len(inputs) > 1:
+            raise NotImplementedError(
+                f"BailingMoeV2ThinkerSubmodule: multi-request batching is "
+                f"step-3e scope; got {len(inputs)} requests"
+            )
+        cache_manager = engine_inputs.cache_manager
+        seq_lens = [inp.input_seq_len for inp in inputs]
+
+        cache_manager.set_active_label("main")
+        cache_manager.plan_attention(
+            seq_lens=seq_lens, is_causal=True, label="main",
+        )
+        # We don't call ``cache_manager.apply_rope`` in attention (we
+        # have our own partial 3D rope), but mstar's plan_rope also
+        # advances internal position-id state used by ``advance_seq_lens``
+        # — keep this call for parity with Orpheus.
+        cache_manager.plan_rope(seq_lens=seq_lens, pos_ids=None, label="main")
+
+        inp = inputs[0]
+        if inp.input_embeds is not None:
+            # Multimodal path: forward gets embeds + explicit positions.
+            return {
+                "input_embeds": inp.input_embeds,
+                "position_ids": inp.custom_pos_ids,
+            }
+        return {
+            "text_inputs": torch.cat([inp.input_ids for inp in inputs]),
+        }
+
+    def forward(
+        self,
+        graph_walk: str,
+        engine_inputs: ModelInputsFromEngine,
+        text_inputs: torch.Tensor | None = None,
+        input_embeds: torch.Tensor | None = None,
+        position_ids: torch.Tensor | None = None,
+        **kwargs,
+    ) -> NameToTensorList:
+        cache_handle = engine_inputs.cache_manager
+        request_info = engine_inputs.single_request_info
+
+        # Image-gen prefill carries an <imagePatch> query-token block; when
+        # those token ids are present we additionally capture the post-norm
+        # hidden states at those positions and stream them to the ImageGen
+        # partition. Only meaningful on the text-input path (the block lives
+        # in the tokenized prompt); embeds-path multimodal prefills don't
+        # carry it.
+        want_image_gen = (
+            self.config is not None
+            and self.config.image_gen is not None
+            and text_inputs is not None
+            and bool((text_inputs == self.config.thinker_llm.image_patch_token).any())
+        )
+
+        if input_embeds is not None:
+            if position_ids is None:
+                raise ValueError(
+                    "BailingMoeV2ThinkerSubmodule.forward: input_embeds "
+                    "provided but position_ids is None. prepare_inputs "
+                    "must emit custom_pos_ids alongside embeds."
+                )
+            model_out = self.model(
+                cache_handle,
+                input_embeds=input_embeds,
+                position_ids=position_ids,
+            )
+        else:
+            if text_inputs is None:
+                raise ValueError(
+                    "BailingMoeV2ThinkerSubmodule.forward: neither "
+                    "text_inputs nor input_embeds provided."
+                )
+            # Text-only path: build 1D positions from the request's
+            # position counter (same as step 3f).
+            start_pos = 0
+            try:
+                start_pos = (
+                    request_info.position_info.get("main", PositionInfo())
+                    .position_id_start
+                )
+            except AttributeError:
+                # ARNodeSubmodule contract may not always provide
+                # position_info; fall back to 0.
+                pass
+
+            num_tokens = text_inputs.shape[0]
+            position_ids_1d = torch.arange(
+                start_pos, start_pos + num_tokens,
+                dtype=torch.long, device=text_inputs.device,
+            )
+            model_out = self.model(
+                cache_handle,
+                input_ids=text_inputs,
+                position_ids=position_ids_1d,
+                return_hidden_states=want_image_gen,
+            )
+
+        if want_image_gen:
+            logits, hidden_states = model_out
+        else:
+            logits = model_out
+
+        # Advance the cache's sequence lengths so the next decode step
+        # knows where to read/write. This is the standard post-forward
+        # call that mstar's KV cache uses to track positions.
+        cache_handle.advance_seq_lens()
+
+        # Sample only the last position's logits (next-token sampling).
+        last_logits = logits[-1:, :]
+        outputs: NameToTensorList = {"logits": [last_logits]}
+        if want_image_gen:
+            patch_hidden = self.extract_image_gen_hidden_states(
+                hidden_states, text_inputs, self.config.thinker_llm.image_patch_token,
+            )
+            outputs["thinker_hidden_states"] = [patch_hidden]
+        return outputs
+
+    def postprocess(
+        self,
+        request_id: str,
+        request_info: CurrentForwardPassInfo,
+        outputs: dict[str, list[torch.Tensor]],
+        **kwargs,
+    ) -> None:
+        """Rebind ``new_token`` → ``text_inputs`` for the decode loop.
+
+        The decode walk's output edge is named ``text_inputs`` so the loop
+        feeds the previous sampled token back into the next iteration.
+        ``submodule.forward`` returns ``{"logits": [...]}``; the KV-cache
+        engine samples that into ``{"new_token": [...]}``; this hook then
+        publishes the same tensor under the ``text_inputs`` key so the
+        graph router finds an output to attach to the loop edge.
+
+        Mirrors :meth:`OrpheusLLMSubmodule.postprocess`.
+        """
+        if "new_token" not in outputs:
+            return
+        outputs["text_inputs"] = outputs["new_token"]
+
+    # ------------------------------------------------------------------
+    # Stop conditions
+    # ------------------------------------------------------------------
+
+    def check_stop(
+        self,
+        request_id: str,
+        request_info: CurrentForwardPassInfo,
+        outputs: dict[str, list[torch.Tensor]],
+    ) -> set[str]:
+        """Stop the ``thinker_decode_loop`` when the sampled token is the EOS
+        (``<|role_end|>`` for Ming, token id 156895).
+
+        The returned name MUST match the ``Loop(name=...)`` declared in
+        ``get_graph_walk_graphs`` (``thinker_decode_loop``). A mismatch makes
+        the worker's dynamic-loop registry raise ``KeyError(NodeAndGraphWalk(
+        node='decode_loop', ...))`` on the EOS step and crash the rank.
+        """
+        new_tokens = outputs.get("new_token") or []
+        if not new_tokens:
+            return set()
+        last = new_tokens[-1]
+        if isinstance(last, torch.Tensor):
+            tok = int(last.flatten()[0].item())
+        else:
+            tok = int(last)
+        if tok == self.eos_token_id:
+            return {"thinker_decode_loop"}
+        return set()
+
+    def can_batch(self, batch, model_inputs) -> bool:
+        # Never batch: step 3d is single-request (preprocess rejects >1); step 3e adds batching.
+        return False
+
+
+# ===================================================================
+# 4. TalkerSubmodule (stateless TTS — text tokens -> waveform)
+# ===================================================================
+
+
+class TalkerSubmodule(NodeSubmodule):
+    """Stateless TTS node: talker text token ids -> audio waveform.
+
+    Ming's thinker->talker bridge passes DETOKENIZED TEXT, not streaming
+    hidden states (see vllm-omni's pipeline.py: ``thinker2talker`` re-encodes
+    the text with the talker's own ``talker/llm`` tokenizer). That makes the
+    talker a near-standalone TTS node — much simpler than qwen3_omni's
+    streaming-codec handoff. We model it as a single stateless node whose
+    forward runs the full AR loop + VAE decode via :class:`TalkerGenerator`.
+
+    The whole per-request generation (LLM prefill + CFM AR decode + AudioVAE
+    decode) happens inside one ``forward`` call rather than being unrolled
+    into a conductor-driven decode loop, because the CFM step count is
+    self-determined by the stop_head (not a token-by-token graph loop).
+    This keeps the graph wiring (step 6e-3) trivial: one Talker node,
+    one ``EMIT_TO_CLIENT`` audio edge.
+
+    Engine type: STATELESS (no KV cache managed by mstar — the talker LLM
+    manages its own ``StaticCache`` internally inside generate_latents).
+    """
+
+    def __init__(
+        self,
+        generator: "Any",  # TalkerGenerator (avoid import cycle at module load)
+        config: MingFlashOmniModelConfig,
+        max_steps: int = 1000,
+        min_new_token: int = 10,
+        text_bridge: "Any" = None,
+    ) -> None:
+        super().__init__()
+        self.generator = generator
+        self.config = config
+        self.max_steps = max_steps
+        self.min_new_token = min_new_token
+        # Optional Thinker->Talker text bridge: a callable that maps
+        # thinker output token ids -> talker_text_inputs token ids
+        # (detokenize with the thinker tokenizer, re-encode with the
+        # talker/llm tokenizer). When the streaming edge delivers raw
+        # thinker tokens, prepare_inputs runs this first. When None
+        # (unit-test path / pre-bridged inputs), the inputs are assumed
+        # to already be talker-tokenizer ids.
+        self.text_bridge = text_bridge
+        # Stash embed_tokens so prepare_inputs can map talker text ids ->
+        # inputs_embeds without reaching through the generator each time.
+        self.embed_tokens = generator.llm.embed_tokens
+
+    def get_stateless_flavor(self) -> str:
+        # The talker runs in bf16 with autocast off; mirror the audio_codec
+        # flavor (no torch.compile, no autocast) since the CFM ODE loop +
+        # AudioVAE ISTFT are numerically sensitive.
+        return "audio_codec"
+
+    def prepare_inputs(
+        self,
+        graph_walk: str,
+        fwd_info: CurrentForwardPassInfo,
+        inputs: NameToTensorList,
+        **kwargs,
+    ) -> NodeInputs:
+        """Embed the talker text token ids into the LLM's input space.
+
+        ``talker_text_inputs`` is the token-id tensor produced by the
+        Thinker->Talker text bridge (step 6e-3) — already encoded with
+        the talker's own ``talker/llm`` tokenizer. We embed it here so
+        forward gets ready-to-run ``inputs_embeds``.
+        """
+        # Two input shapes accepted:
+        #   * ``talker_text_inputs`` — already talker-tokenized ids
+        #     (unit-test path / pre-bridged).
+        #   * ``thinker_tokens`` — raw thinker output ids streamed from
+        #     the Thinker partition; run text_bridge to re-tokenize.
+        if "talker_text_inputs" in inputs and inputs["talker_text_inputs"]:
+            token_ids = inputs["talker_text_inputs"][0]
+        elif "thinker_tokens" in inputs and inputs["thinker_tokens"]:
+            if self.text_bridge is None:
+                raise RuntimeError(
+                    "TalkerSubmodule: received 'thinker_tokens' but no "
+                    "text_bridge is configured to re-tokenize them."
+                )
+            token_ids = self.text_bridge(inputs["thinker_tokens"][0])
+        else:
+            raise ValueError(
+                "TalkerSubmodule: missing 'talker_text_inputs' / "
+                "'thinker_tokens'. The Thinker->Talker bridge (step 6e-3) "
+                "must supply the text ids."
+            )
+        device = self.embed_tokens.weight.device
+        if token_ids.dim() == 1:
+            token_ids = token_ids.unsqueeze(0)  # (1, T)
+        token_ids = token_ids.to(device)
+        inputs_embeds = self.embed_tokens(token_ids)
+
+        # Optional voice-prompt latent (zero-shot cloning); carried
+        # through as a tensor input when present.
+        prompt_wav_lat = inputs.get("prompt_wav_lat", [None])[0]
+        return NodeInputs(
+            tensor_inputs={
+                "inputs_embeds": inputs_embeds,
+                "prompt_wav_lat": prompt_wav_lat,
+            }
+        )
+
+    def forward(
+        self,
+        graph_walk: str,
+        engine_inputs: ModelInputsFromEngine,
+        inputs_embeds: torch.Tensor,
+        prompt_wav_lat: torch.Tensor | None = None,
+        **kwargs,
+    ) -> NameToTensorList:
+        """Run the full talker generation: AR latents -> VAE waveform.
+
+        Returns ``{"audio_chunk": [waveform]}`` where waveform is
+        ``(1, 1, num_samples)`` at the AudioVAE's sample rate. The
+        text-length duration cap is applied to ``max_steps``.
+        """
+        text_len = inputs_embeds.shape[1]
+        max_steps = self.generator.duration_capped_steps(text_len, self.max_steps)
+        with torch.no_grad():
+            latents = self.generator.generate_latents(
+                inputs_embeds,
+                prompt_wav_lat=prompt_wav_lat,
+                min_new_token=self.min_new_token,
+                max_steps=max_steps,
+            )
+            waveform = self.generator.decode_to_waveform(latents, stream_decode=True)
+            waveform = self.generator.trim_trailing_silence(waveform)
+        return {"audio_chunk": [waveform]}
+
+
+# ===================================================================
+# 5. ImageGenSubmodule (stateless diffusion — thinker hidden -> image)
+# ===================================================================
+
+
+class ImageGenSubmodule(NodeSubmodule):
+    """Stateless image-generation node: thinker hidden states -> RGB image.
+
+    Ming's thinker->imagegen bridge passes the thinker's final hidden states
+    sliced at the learnable ``<imagePatch>`` query-token positions (the block
+    appended by ``maybe_expand_image_gen_prompt`` in ``process_prompt``, step
+    8b). The condition encoder turns those into the DiT's ``cap_feats``, and the
+    diffusion pipeline runs the full flow-matching denoise + VAE decode in one
+    ``forward`` call — like the Talker, the step count is internal
+    (scheduler-determined), not a conductor decode loop. So a single STATELESS
+    node with one ``EMIT_TO_CLIENT`` image edge suffices.
+
+    The whole stack (condition encoder + DiT + VAE + optional ByT5) is owned by
+    a :class:`MingImagePipeline`; this submodule only marshals inputs and calls
+    ``pipeline.generate``.
+    """
+
+    def __init__(
+        self,
+        pipeline: "Any",  # MingImagePipeline (avoid import cycle at module load)
+        config: MingFlashOmniModelConfig,
+        default_params: "Any" = None,  # MingImageGenSamplingParams
+    ) -> None:
+        super().__init__()
+        self.pipeline = pipeline
+        self.config = config
+        if default_params is None:
+            from mstar.model.ming_omni_flash.components.imagegen_pipeline import (
+                MingImageGenSamplingParams,
+            )
+
+            ig = config.image_gen
+            default_params = MingImageGenSamplingParams(
+                height=ig.default_height if ig is not None else 1024,
+                width=ig.default_width if ig is not None else 1024,
+                num_inference_steps=ig.num_inference_steps if ig is not None else 50,
+                guidance_scale=ig.guidance_scale if ig is not None else 2.0,
+            )
+        self.default_params = default_params
+
+    def get_stateless_flavor(self) -> str:
+        # The DiT + VAE denoise loop is numerically sensitive (flow-matching
+        # ODE); mirror the talker/audio_codec flavor (no torch.compile, no
+        # autocast surprises).
+        return "audio_codec"
+
+    def prepare_inputs(
+        self,
+        graph_walk: str,
+        fwd_info: CurrentForwardPassInfo,
+        inputs: NameToTensorList,
+        **kwargs,
+    ) -> NodeInputs:
+        """Pull the thinker hidden states at the query-token positions.
+
+        Accepts either ``thinker_hidden_states`` (already sliced [N, H] or
+        [1, N, H] by the thinker->imagegen bridge) or, in the unit-test path,
+        a pre-built tensor. An optional ``negative_thinker_hidden_states``
+        enables real (non-zero) CFG negatives.
+        """
+        if "thinker_hidden_states" in inputs and inputs["thinker_hidden_states"]:
+            hidden = inputs["thinker_hidden_states"][0]
+        else:
+            raise ValueError(
+                "ImageGenSubmodule: missing 'thinker_hidden_states'. The "
+                "Thinker->ImageGen bridge must supply the query-token hidden "
+                "states."
+            )
+        negative = inputs.get("negative_thinker_hidden_states", [None])[0]
+        return NodeInputs(
+            tensor_inputs={
+                "thinker_hidden_states": hidden,
+                "negative_thinker_hidden_states": negative,
+            }
+        )
+
+    def forward(
+        self,
+        graph_walk: str,
+        engine_inputs: ModelInputsFromEngine,
+        thinker_hidden_states: torch.Tensor,
+        negative_thinker_hidden_states: torch.Tensor | None = None,
+        **kwargs,
+    ) -> NameToTensorList:
+        """Run condition-encode -> denoise -> VAE decode, emit one image.
+
+        Returns ``{"image": [img]}`` where ``img`` is a ``[B, 3, H, W]`` tensor
+        in ``[-1, 1]`` (Z-Image VAE convention); the diffusion output adapter
+        converts it to PIL/base64 downstream.
+        """
+        with torch.no_grad():
+            image = self.pipeline.generate(
+                thinker_hidden_states,
+                self.default_params,
+                negative_hidden=negative_thinker_hidden_states,
+            )
+        return {"image": [image]}
diff --git a/mstar/model/registry.py b/mstar/model/registry.py
index fab97010..4a33da5b 100644
--- a/mstar/model/registry.py
+++ b/mstar/model/registry.py
@@ -1,5 +1,6 @@
 from mstar.model.bagel.bagel_model import BagelModel
 from mstar.model.base import Model
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
 from mstar.model.orpheus.orpheus_model import OrpheusModel
 from mstar.model.pi05.pi05_model import Pi05Model
 from mstar.model.qwen3_omni.qwen3_omni_model import Qwen3OmniModel
@@ -7,6 +8,7 @@
 
 MODEL_REGISTRY: dict[str, type[Model]] = {
     "bagel": BagelModel,
+    "ming_flash_omni": MingFlashOmniModel,
     "orpheus": OrpheusModel,
     "pi05": Pi05Model,
     "qwen3_omni": Qwen3OmniModel,
@@ -16,6 +18,9 @@
 
 HF_MODELS: dict[str, dict] = {
     "bagel": {"model_path_hf": "ByteDance-Seed/BAGEL-7B-MoT"},
+    # Ming-flash-omni-2.0 — Ling-2.0 sparse MoE (100B total / 6B active),
+    # ~238 GB / 42 safetensors shards.
+    "ming_flash_omni": {"model_path_hf": "inclusionAI/Ming-flash-omni-2.0"},
     "orpheus": {"model_path_hf": "canopylabs/orpheus-3b-0.1-ft"},
     # Pi0.5 PyTorch port published by lerobot — single safetensors blob
     # (~14 GB). mstar/model/pi05/weight_loader.py handles the lerobot->mstar
diff --git a/mstar/utils/sampling.py b/mstar/utils/sampling.py
index fe31b236..71e9cc24 100644
--- a/mstar/utils/sampling.py
+++ b/mstar/utils/sampling.py
@@ -418,6 +418,35 @@ def sample(
         return tokens
 
 
+def _flashinfer_rng_scalars(
+    seed: torch.Tensor | int | None,
+    rand_offset: torch.Tensor | int | None,
+) -> tuple[int, int]:
+    """Coerce per-request seed/offset to scalar Python ints for flashinfer.
+
+    The installed flashinfer (0.6.2) sampler FFI takes scalar ``int`` seed and
+    offset; newer builds (>=0.6.6) take per-request tensors. mstar's sampler
+    constructs them as ``[batch]`` tensors (the 0.6.6 form), which crashes the
+    0.6.2 kernel with "Mismatched type on argument #7 ... Expected int but got
+    ffi.Tensor". We collapse to a single scalar here.
+
+    Correctness note: only element 0 of a tensor is kept, so one (seed, offset)
+    applies to the whole batch and other rows' per-request values are dropped.
+    Greedy decode (temperature=0) ignores RNG entirely, so this is exact for
+    the greedy path; for stochastic sampling it only affects per-request
+    seeding (reproducibility), not the sampling distribution. ``.item()`` is a
+    host sync; the caller ``sample_tokens`` is ``@torch.compiler.disable``.
+    """
+    def _scalar(v, default=0):
+        if v is None:
+            return default
+        if isinstance(v, torch.Tensor):
+            return int(v.reshape(-1)[0].item()) if v.numel() else default
+        return int(v)
+
+    return _scalar(seed), _scalar(rand_offset)
+
+
 @torch.compiler.disable
 def sample_tokens(
     logits: torch.Tensor,
@@ -464,6 +493,10 @@ def sample_tokens(
 
     import flashinfer
 
+    # Coerce per-request seed/offset tensors to scalar ints for the installed
+    # flashinfer sampler FFI (0.6.2). See _flashinfer_rng_scalars.
+    fi_seed, fi_offset = _flashinfer_rng_scalars(seed, rand_offset)
+
     # Pin the Triton prep kernel (writes probs) and the FlashInfer sampler
     # (reads probs) to the same device/stream so the write-before-read is
     # ordered without an explicit sync. Otherwise FlashInfer runs on the
@@ -484,7 +517,7 @@ def sample_tokens(
             result = flashinfer.sampling.top_p_sampling_from_probs(
                 probs, top_p,
                 deterministic=True,
-                seed=seed, offset=rand_offset,
+                seed=fi_seed, offset=fi_offset,
             )
             return result[0] if isinstance(result, tuple) else result
 
@@ -497,7 +530,7 @@ def sample_tokens(
         result = flashinfer.sampling.top_k_top_p_sampling_from_probs(
             probs, top_k, top_p,
             deterministic=True,
-            seed=seed, offset=rand_offset
+            seed=fi_seed, offset=fi_offset,
         )
         return result[0] if isinstance(result, tuple) else result
 
diff --git a/results/ming_accuracy/ACCURACY.md b/results/ming_accuracy/ACCURACY.md
new file mode 100644
index 00000000..28f8160b
--- /dev/null
+++ b/results/ming_accuracy/ACCURACY.md
@@ -0,0 +1,96 @@
+# Ming-flash-omni-2.0 task-accuracy spot checks — 4×H100
+
+Both runs were made against the same `vllm-omni 0.19.0` server + hybrid snapshot
+(inclusionAI thinker + Jonathan1909 metadata/talker) used for the T2T
+scaling sweep. Sample sizes are small — these are directional spot checks,
+not publishable numbers. Dated 2026-06-06.
+
+## Headline
+
+| Suite | Items | Accuracy | Parse rate | Wall (s) | req/s |
+|-------|-------|----------|------------|----------|-------|
+| MMLU (0-shot, ~5/subject) | 285 | **78.9%** | 99.3% | 12.6 | 22.7 |
+| VideoMME (chunk1 subset, stratified) | 51 | **56.9%** | 100.0% | 576.1 | 0.09 |
+
+## MMLU breakdown
+
+Sample: 285 items (cais/mmlu test, ~5 per subject across all 57 subjects). 0-shot.
+Prompt: `<question>\n\nA. ...\nB. ...\nC. ...\nD. ...\n\nAnswer with just the letter (A, B, C, or D):`
+
+### Per-subject (sorted by accuracy, worst first)
+
+| Subject | Correct/Total | Accuracy |
+|---------|--------------|----------|
+| econometrics | 1/5 | 20% |
+| philosophy | 2/5 | 40% |
+| global_facts | 2/5 | 40% |
+| virology | 2/5 | 40% |
+| international_law | 3/5 | 60% |
+| high_school_mathematics | 3/5 | 60% |
+| electrical_engineering | 3/5 | 60% |
+| conceptual_physics | 3/5 | 60% |
+| business_ethics | 3/5 | 60% |
+| high_school_chemistry | 3/5 | 60% |
+| ... | ... | ... |
+| professional_accounting | 5/5 | 100% |
+| high_school_psychology | 5/5 | 100% |
+| human_sexuality | 5/5 | 100% |
+| high_school_computer_science | 5/5 | 100% |
+| miscellaneous | 5/5 | 100% |
+| high_school_government_and_politics | 5/5 | 100% |
+| high_school_us_history | 5/5 | 100% |
+| logical_fallacies | 5/5 | 100% |
+| prehistory | 5/5 | 100% |
+| high_school_european_history | 5/5 | 100% |
+
+## VideoMME breakdown
+
+Sample: 51 items from chunk1 (videos_chunked_01.zip, 30 videos), stratified evenly across short/medium/long durations.
+Prompt: `<question>\n\nA. <opt>\nB. <opt>\nC. <opt>\nD. <opt>\n\nAnswer with just the letter (A, B, C, or D):`
+Video sent as base64-inlined `data:video/mp4` content part on `/v1/chat/completions`.
+
+### By duration
+
+| Duration | Correct/Total | Accuracy |
+|----------|--------------|----------|
+| short | 13/17 | 76.5% |
+| medium | 5/17 | 29.4% |
+| long | 11/17 | 64.7% |
+
+### By task type
+
+| Task type | Correct/Total | Accuracy |
+|-----------|--------------|----------|
+| Temporal Reasoning | 0/3 | 0% |
+| Counting Problem | 1/6 | 17% |
+| OCR Problems | 1/4 | 25% |
+| Attribute Perception | 1/4 | 25% |
+| Action Recognition | 3/5 | 60% |
+| Object Reasoning | 4/6 | 67% |
+| Temporal Perception | 2/3 | 67% |
+| Object Recognition | 6/8 | 75% |
+| Information Synopsis | 5/6 | 83% |
+| Spatial Reasoning | 1/1 | 100% |
+| Action Reasoning | 2/2 | 100% |
+| Spatial Perception | 3/3 | 100% |
+
+## Caveats
+
+- **Small N** — MMLU 5/subject and VideoMME ~17/duration are not enough
+  for headline-quality numbers, especially the per-bucket breakdowns
+  (e.g. VideoMME medium=29% is suspicious vs short=77% / long=65% and
+  could be sample variance).
+- **VideoMME videos limited to chunk1** — only 1 of the 20 dataset
+  zip chunks was extracted (4.9 GB on `/dev/shm`). The full VideoMME is
+  ~30 GB and would need extra disk to land in this container's overlay.
+- **0-shot** for both — no in-context examples. Published Ming numbers
+  may use chain-of-thought / few-shot for higher scores.
+- **Greedy decoding** (`temperature=0`) on the thinker; matches the
+  benchmark wiring used everywhere else in this branch.
+
+## How to reproduce
+
+Server: see [`benchmark/vllm_omni_instructions.md`](../../benchmark/vllm_omni_instructions.md) for the launch recipe.
+Eval scripts were scratch (not committed) — both ~80 LOC, sending
+`/v1/chat/completions` requests in a loop with the standard OpenAI
+shape. Per-item details ship as JSON output next to this file.
\ No newline at end of file
diff --git a/results/ming_t2t_sweep/SUMMARY.md b/results/ming_t2t_sweep/SUMMARY.md
new file mode 100644
index 00000000..cc1281c6
--- /dev/null
+++ b/results/ming_t2t_sweep/SUMMARY.md
@@ -0,0 +1,34 @@
+# Ming-flash-omni-2.0 T2T scaling sweep — 4×H100 80GB
+
+Run via vllm-omni 0.19.0, hybrid snapshot (inclusionAI thinker + Jonathan1909 metadata/talker),
+stage config `ming_flash_omni.yaml` (TP=4 thinker + colocated talker on GPU 3).
+Prompts from `benchmark/assets/simple_text_queries.txt` (general-knowledge English).
+Dated 2026-06-06.
+
+| mode | concurrency | reqs | wall (s) | E2E p50 (ms) | E2E p95 (ms) | req/s | tok/s |
+|------|-------------|------|----------|--------------|--------------|-------|-------|
+| OFFLINE     |           1 |   50 |   69.14  |        1444  |        2310  |  0.72 | 109.6 |
+| CLOSED_LOOP |           2 |   80 |   61.57  |        1436  |        2536  |  1.30 | 198.9 |
+| CLOSED_LOOP |           4 |   80 |   33.94  |        1588  |        2846  |  2.36 | 355.7 |
+| CLOSED_LOOP |           8 |   80 |   21.54  |        1899  |        3396  |  3.71 | 573.4 |
+| CLOSED_LOOP |          16 |   80 |   13.78  |        2144  |        4175  |  5.81 | 887.9 |
+| CLOSED_LOOP |          32 |   80 |   11.50  |        3728  |        7384  |  6.96 | 1060.5 |
+
+## Observations
+
+- **Single-stream baseline** is ~110 tok/s — bounded by TP=4 all-reduce on each
+  decode step. TTFT is uniformly 28-91 ms — the 32-layer MoE prefills fast.
+- **Steady, sub-linear scaling to c=8** (5.2× single-stream tok/s). Beyond that
+  the curve bends: c=16 → 8.1×, c=32 → 9.7×. The knee is between c=16 and c=32.
+- **Tail latency** scales as expected with batch size — E2E p95 goes 2.3 → 7.4 s
+  from c=1 to c=32 while p50 grows ~2.6× (1.4 → 3.7 s). The tail is dominated by
+  request-mix variance (token counts span 25-380), not server saturation.
+- **All 450 requests succeeded** across the sweep, no errors or timeouts.
+
+## Reproduce
+
+Server launch + benchmark recipe in
+[`benchmark/vllm_omni_instructions.md`](../../benchmark/vllm_omni_instructions.md).
+Sweep driver was a ~50 LOC scratch script that wraps `benchmark.runner.Benchmark`
+with iterated `BenchmarkConfig` (one per concurrency point); per-run outputs are
+in the `results.json` files alongside this summary.
diff --git a/test/modular/test_ming_flash_omni_audio_vae.py b/test/modular/test_ming_flash_omni_audio_vae.py
new file mode 100644
index 00000000..b1ec9c56
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_audio_vae.py
@@ -0,0 +1,402 @@
+"""Tests for the AudioVAE port (step 6d).
+
+Covers the building blocks in ``components/audio_vae.py``:
+
+  * ISTFT padding validation + smoke runs (center + same padding modes).
+  * StreamingLinearUpsample chunked-vs-single-shot equivalence.
+  * ISTFTHead, Encoder, Decoder shape contracts.
+  * AudioVAE construction from real config + encode/decode round-trip
+    on a tiny synthetic config (CPU, no snapshot).
+  * Snapshot-gated structural assertions against the real
+    ``talker/vae/model.safetensors`` (no weight load — that's 6f).
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.audio_vae import (
+    _ISTFT,
+    AudioVAE,
+    _Decoder,
+    _Encoder,
+    _ISTFTHead,
+    _oobleck_sample,
+    _StreamingLinearUpsample,
+    build_audio_vae,
+)
+from mstar.model.ming_omni_flash.config import AudioVAEConfig
+
+# ---------------------------------------------------------------------------
+# Snapshot discovery
+# ---------------------------------------------------------------------------
+
+
+def _find_local_snapshot() -> str | None:
+    def _has(p: Path) -> bool:
+        return (
+            (p / "talker" / "vae" / "config.json").exists()
+            and (p / "talker" / "vae" / "model.safetensors").exists()
+        )
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and _has(Path(override)):
+        return override
+    hybrid = Path("/dev/shm/ming-hybrid")
+    if _has(hybrid):
+        return str(hybrid)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Tiny Qwen2 backbone dict (keeps tests fast; matches released layout)
+# ---------------------------------------------------------------------------
+
+
+def _tiny_qwen2_backbone(hidden_size: int = 32, num_layers: int = 1) -> dict:
+    return {
+        "hidden_size": hidden_size,
+        "intermediate_size": hidden_size * 2,
+        "num_hidden_layers": num_layers,
+        "num_attention_heads": 4,
+        "num_key_value_heads": 2,
+        "max_position_embeddings": 256,
+        "vocab_size": 1,
+        "use_sliding_window": True,
+        "sliding_window": 32,
+        "max_window_layers": 0,
+        "rope_theta": 1_000_000.0,
+        "rms_norm_eps": 1e-6,
+        "hidden_act": "silu",
+    }
+
+
+# ---------------------------------------------------------------------------
+# Oobleck sampler
+# ---------------------------------------------------------------------------
+
+
+def test_oobleck_sample_split_mean_scale() -> None:
+    """`_oobleck_sample` chunks parameters along dim=1; scale is softplus+eps.
+
+    Verify the chunk split is on the right axis and shape collapses
+    from (B, 2*L, T) to (B, L, T).
+    """
+    params = torch.zeros(2, 8, 5)   # 2*latent_dim=8 → latent_dim=4
+    out = _oobleck_sample(params)
+    assert out.shape == (2, 4, 5)
+
+
+def test_oobleck_sample_returns_mean_when_scale_is_very_negative() -> None:
+    """With scale_raw = -inf-ish, softplus → 0, so sample → mean."""
+    B, L, T = 1, 2, 3
+    mean = torch.full((B, L, T), 7.0)
+    scale_raw = torch.full((B, L, T), -1000.0)
+    params = torch.cat([mean, scale_raw], dim=1)
+    out = _oobleck_sample(params)
+    # softplus(-1000) + 1e-4 ≈ 1e-4 → sample ≈ mean; atol=1e-2 leaves ample headroom.
+    torch.testing.assert_close(out, mean, atol=1e-2, rtol=0)
+
+
+# ---------------------------------------------------------------------------
+# ISTFT
+# ---------------------------------------------------------------------------
+
+
+def test_istft_rejects_invalid_padding() -> None:
+    with pytest.raises(ValueError, match="Padding must be"):
+        _ISTFT(n_fft=8, hop_length=2, win_length=8, padding="left")
+
+
+def test_istft_center_mode_uses_torch_istft() -> None:
+    """`center` mode is a thin torch.istft wrapper; check it runs end-to-end."""
+    n_fft, hop, win = 16, 4, 16
+    istft = _ISTFT(n_fft, hop, win, padding="center")
+    # 4 frames → original waveform length T = (4-1)*hop + win (center=True
+    # internally trims by win/2 each side, but the wrapper passes
+    # center=True so the upstream choice stands).
+    spec = torch.complex(torch.randn(1, n_fft // 2 + 1, 4), torch.randn(1, n_fft // 2 + 1, 4))
+    y, ab, wb = istft(spec)
+    assert y.dim() == 2
+    assert torch.isfinite(y).all()
+    assert ab is None and wb is None
+
+
+def test_istft_same_mode_runs_non_streaming() -> None:
+    """`same` mode path is the streaming-able variant; non-streaming usage trims `pad` from both ends."""
+    n_fft, hop, win = 8, 2, 8
+    istft = _ISTFT(n_fft, hop, win, padding="same")
+    # Choose enough frames that `output_size - 2*pad` is positive.
+    spec = torch.complex(torch.randn(1, n_fft // 2 + 1, 8), torch.randn(1, n_fft // 2 + 1, 8))
+    y, ab, wb = istft(spec, streaming=False)
+    assert y.dim() == 2
+    assert torch.isfinite(y).all()
+
+
+# ---------------------------------------------------------------------------
+# StreamingLinearUpsample
+# ---------------------------------------------------------------------------
+
+
+def test_streaming_upsample_single_shot_path_returns_upscaled() -> None:
+    """``is_first=True, is_last=True`` → straight upsample, no state."""
+    up = _StreamingLinearUpsample(scale_factor=4)
+    x = torch.randn(1, 3, 5)
+    out, state = up(x, state=None, is_last=True)
+    assert state is None
+    assert out.shape == (1, 12, 5)  # 3 * 4 = 12
+
+
+def test_streaming_upsample_first_non_last_defers() -> None:
+    """First chunk with more to come → return None, populate prev_chunk."""
+    up = _StreamingLinearUpsample(scale_factor=4)
+    x = torch.randn(1, 2, 5)
+    out, state = up(x, state=None, is_last=False)
+    assert out is None
+    assert state["prev_chunk"] is x
+    assert state["is_first"] is False
+
+
+def test_streaming_upsample_two_chunk_equivalent_to_single_shot() -> None:
+    """Concatenating two chunked outputs matches a single-shot upsample.
+
+    This is the key correctness property: chunked streaming must not
+    introduce boundary artefacts (the upsampler's left/right lookahead
+    + history_last bookkeeping is exactly what makes this hold).
+    """
+    up = _StreamingLinearUpsample(scale_factor=4)
+    a = torch.randn(1, 3, 5)
+    b = torch.randn(1, 4, 5)
+
+    # Chunked path: first(a), then last(b).
+    out_a, state = up(a, state=None, is_last=False)
+    assert out_a is None
+    out_b, state = up(b, state=state, is_last=True)
+    assert state is None
+    chunked = out_b
+
+    # Single-shot path: concat(a, b) → one upsample.
+    full = torch.cat([a, b], dim=1)
+    single, _ = up(full, state=None, is_last=True)
+
+    torch.testing.assert_close(chunked, single, atol=1e-5, rtol=1e-5)
+
+
+# ---------------------------------------------------------------------------
+# ISTFTHead
+# ---------------------------------------------------------------------------
+
+
+def test_istft_head_output_shape() -> None:
+    """ISTFTHead returns (audio: (B, 1, T), x_pred: (B, n_fft+2, T_frames), bufs...)."""
+    head = _ISTFTHead(dim=16, n_fft=16, hop_length=4)
+    x = torch.randn(1, 8, 16)
+    audio, x_pred, ab, wb = head(x)
+    assert audio.dim() == 3
+    assert audio.shape[0] == 1 and audio.shape[1] == 1
+    assert x_pred.shape == (1, 16 + 2, 8)
+
+
+# ---------------------------------------------------------------------------
+# Encoder / Decoder shape contracts
+# ---------------------------------------------------------------------------
+
+
+def test_encoder_get_frames_pads_right_edge() -> None:
+    """get_frames windows the waveform with input_dim/hop_size stride."""
+    enc = _Encoder(
+        encoder_args=_tiny_qwen2_backbone(hidden_size=32, num_layers=1),
+        input_dim=16, hop_size=16, latent_dim=4, patch_size=-1,
+        attn_implementation="sdpa",
+    )
+    enc = enc.float()
+    # 50-sample waveform with input_dim=16, hop=16 → ceil(50/16) = 4 frames.
+    # Formula: (50 + 16 - 1) // 16 = 65 // 16 = 4.  After padding to
+    # (4-1)*16 + 16 = 64 samples, unfold(size=16, step=16) yields 4 windows.
+    waveform = torch.randn(1, 50)
+    frames = enc.get_frames(waveform)
+    assert frames.shape[0] == 1
+    assert frames.shape[1] == 4      # frames
+    assert frames.shape[2] == 16     # input_dim
+
+
+def test_encoder_forward_emits_latent_params_no_patching() -> None:
+    """patch_size=-1 → skip aggregator path; output `(B, T, 2*latent_dim)`."""
+    enc = _Encoder(
+        encoder_args=_tiny_qwen2_backbone(hidden_size=32),
+        input_dim=16, hop_size=16, latent_dim=4, patch_size=-1,
+        attn_implementation="sdpa",
+    )
+    enc = enc.float().eval()
+    waveform = torch.randn(1, 64)
+    with torch.no_grad():
+        params, y = enc(waveform)
+    # T_frames = (64 + 16 - 1) // 16 = 4, i.e. ceil(64 / 16). 2*latent_dim = 8.
+    assert params.shape == (1, 4, 8)
+    assert y.shape == (1, 1, 64)
+
+
+def test_encoder_forward_with_patching_emits_per_patch_latents() -> None:
+    """patch_size > 0 → aggregator output keeps the [CLS] row per patch."""
+    enc = _Encoder(
+        encoder_args=_tiny_qwen2_backbone(hidden_size=32),
+        input_dim=16, hop_size=16, latent_dim=4, patch_size=2,
+        attn_implementation="sdpa",
+    )
+    enc = enc.float().eval()
+    waveform = torch.randn(1, 64)
+    with torch.no_grad():
+        params, _ = enc(waveform)
+    # T_frames=4, patch_size=2 → 2 patches → 2 latent rows.
+    assert params.shape == (1, 2, 8)
+
+
+def test_decoder_low_level_reconstruct_non_streaming_shape() -> None:
+    """Non-streaming decode produces a waveform tensor of the right rank."""
+    dec = _Decoder(
+        decoder_args=_tiny_qwen2_backbone(hidden_size=32),
+        output_dim=16, latent_dim=4, patch_size=-1,
+        attn_implementation="sdpa",
+    )
+    dec = dec.float().eval()
+    # latent_dim=4, T_frames=3 → after upsampler... patch_size=-1 so no
+    # upsampler. fc1 maps to hidden_size=32, then Qwen2 backbone, then
+    # ISTFTHead emits 1 audio channel.
+    latent = torch.randn(1, 3, 4)
+    with torch.no_grad():
+        out, state, pkv = dec.low_level_reconstruct(latent, use_cache=False)
+    assert out.dim() == 3
+    assert out.shape[0] == 1 and out.shape[1] == 1
+    assert torch.isfinite(out).all()
+    assert state == (None, None, None)
+
+
+def test_decoder_with_patching_upsamples_before_backbone() -> None:
+    """patch_size != -1 enables the streaming upsampler before the Qwen2 backbone."""
+    dec = _Decoder(
+        decoder_args=_tiny_qwen2_backbone(hidden_size=32),
+        output_dim=16, latent_dim=4, patch_size=4,
+        attn_implementation="sdpa",
+    )
+    dec = dec.float().eval()
+    latent = torch.randn(1, 2, 4)
+    with torch.no_grad():
+        out, _, _ = dec.low_level_reconstruct(latent, use_cache=False)
+    # Output waveform exists and is finite.
+    assert out.dim() == 3
+    assert torch.isfinite(out).all()
+
+
+# ---------------------------------------------------------------------------
+# AudioVAE wrapper construction + end-to-end on tiny config
+# ---------------------------------------------------------------------------
+
+
+def _tiny_audio_vae_config() -> AudioVAEConfig:
+    backbone = _tiny_qwen2_backbone(hidden_size=32, num_layers=1)
+    return AudioVAEConfig(
+        sample_rate=8000,
+        patch_size=-1,                # disable patching to keep tests fast
+        latent_dim=4,
+        encoder_input_dim=16,
+        encoder_hop_size=16,
+        decoder_output_dim=16,
+        enc_backbone=dict(backbone),
+        dec_backbone=dict(backbone),
+    )
+
+
+def test_build_audio_vae_constructs_encoder_and_decoder() -> None:
+    cfg = _tiny_audio_vae_config()
+    vae = build_audio_vae(cfg, dtype=torch.float32, device="cpu")
+    assert isinstance(vae, AudioVAE)
+    assert vae.sample_rate == 8000
+    assert vae.encoder.input_dim == 16
+    assert vae.decoder.hop_length == 16
+    # patch_size=-1 → no aggregator / upsampler.
+    assert not hasattr(vae.encoder, "aggregator") or vae.encoder.patch_size != -1
+    assert not hasattr(vae.decoder, "upsampling")
+
+
+def test_audio_vae_encode_latent_returns_correct_shape() -> None:
+    cfg = _tiny_audio_vae_config()
+    vae = build_audio_vae(cfg, dtype=torch.float32, device="cpu")
+    waveform = torch.randn(2, 64)
+    waveform_length = torch.tensor([64, 48])
+    with torch.no_grad():
+        latent, frame_num = vae.encode_latent(waveform, waveform_length)
+    # input_dim=16 → frame_num[0] = ceil(64/16) = 4, frame_num[1] = ceil(48/16) = 3.
+    assert frame_num.tolist() == [4, 3]
+    # Latent dimensions: (B, T_latents, latent_dim) after transpose.
+    # T_latents = encoder T_frames = (64 + 15) // 16 = 4 (same for both
+    # since the waveform was padded to the max length before the encoder).
+    assert latent.shape[0] == 2
+    assert latent.shape[2] == 4   # latent_dim
+    assert torch.isfinite(latent).all()
+
+
+def test_audio_vae_decode_runs_end_to_end() -> None:
+    cfg = _tiny_audio_vae_config()
+    vae = build_audio_vae(cfg, dtype=torch.float32, device="cpu")
+    latent = torch.randn(1, 5, 4)
+    with torch.no_grad():
+        waveform, state, pkv = vae.decode(latent, use_cache=False)
+    assert waveform.dim() == 3
+    assert waveform.shape[0] == 1 and waveform.shape[1] == 1
+    assert torch.isfinite(waveform).all()
+
+
+# ---------------------------------------------------------------------------
+# Snapshot-gated structure asserts (key parity only — no weight load)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(
+    _find_local_snapshot() is None,
+    reason="Need Ming-flash-omni-2.0 snapshot with talker/vae/.",
+)
+def test_audio_vae_module_keys_match_snapshot_state_dict() -> None:
+    """A representative key set exists in both the built VAE and the ckpt.
+
+    Smoke test for the eventual loader (step 6f): construct an AudioVAE
+    from the real config, list its state_dict, and verify a hand-picked
+    set of keys is present both there and in
+    `talker/vae/model.safetensors`. We only check one key per structural
+    bucket (encoder fc1/fc2/fc3, norm, cls_embed, embed_tokens of each
+    backbone, decoder head) — full parameter coverage is the loader's job.
+    """
+    from safetensors import safe_open
+
+    from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+
+    snap = _find_local_snapshot()
+    config = MingFlashOmniModelConfig.from_pretrained(snap)
+    assert config.talker is not None
+    vae = build_audio_vae(config.talker.vae, dtype=torch.float32, device="cpu")
+    module_keys = set(vae.state_dict().keys())
+
+    with safe_open(
+        f"{snap}/talker/vae/model.safetensors", framework="pt"
+    ) as f:
+        ckpt_keys = set(f.keys())
+
+    representative = {
+        "encoder.fc1.weight",
+        "encoder.fc2.weight",
+        "encoder.fc3.weight",
+        "encoder.norm.weight",
+        "encoder.cls_embed",
+        "encoder.encoder.embed_tokens.weight",
+        "encoder.aggregator.embed_tokens.weight",
+        "decoder.fc1.weight",
+        "decoder.head.out.weight",
+        "decoder.head.istft.window",
+        "decoder.decoder.embed_tokens.weight",
+    }
+    missing_in_module = representative - module_keys
+    assert not missing_in_module, f"Built VAE missing keys present in ckpt: {missing_in_module}"
+    missing_in_ckpt = representative - ckpt_keys
+    assert not missing_in_ckpt, f"Ckpt missing keys expected by VAE: {missing_in_ckpt}"
diff --git a/test/modular/test_ming_flash_omni_byt5_mapper.py b/test/modular/test_ming_flash_omni_byt5_mapper.py
new file mode 100644
index 00000000..b81ac5dc
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_byt5_mapper.py
@@ -0,0 +1,203 @@
+"""Tests for the ByT5 glyph mapper (step 9b).
+
+Two layers:
+
+  * Pure-Python structure/shape tests for ``T5EncoderBlockByT5Mapper`` using a
+    tiny HF ``T5Config`` — verify block stacking, position-bias reuse across
+    layers, the d_model→sdxl_channels projection, pad-mask handling, and that
+    Ming's unfused ``byt5_mapper.pt`` name layout loads with a plain
+    ``load_weights`` (no fused remap needed). Run on CPU, no snapshot.
+
+  * Snapshot-gated build of the full ``MingByT5Encoder`` from the real
+    checkpoint's ``byt5`` dir (skipped when absent).
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+from transformers import T5Config
+
+from mstar.model.ming_omni_flash.components.byte5_encoder import MingByT5Encoder
+from mstar.model.ming_omni_flash.components.t5_block_mapper import (
+    T5EncoderBlockByT5Mapper,
+)
+
+
+def _tiny_t5_config() -> T5Config:
+    return T5Config(
+        d_model=32,
+        d_kv=8,
+        d_ff=64,
+        num_layers=2,
+        num_heads=4,
+        vocab_size=384,
+        relative_attention_num_buckets=32,
+        relative_attention_max_distance=128,
+        layer_norm_epsilon=1e-6,
+        is_encoder_decoder=False,
+        is_decoder=False,
+        dropout_rate=0.0,
+        # Ming's byt5 is gated (wi_0/wi_1), matching the released checkpoint —
+        # the default "relu" would build a single fused wi and break the
+        # unfused-name load path we exercise below.
+        feed_forward_proj="gated-gelu",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Mapper structure / forward
+# ---------------------------------------------------------------------------
+
+
+def test_mapper_projects_to_sdxl_channels() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=2, sdxl_channels=48).eval()
+    x = torch.randn(3, 7, cfg.d_model)
+    mask = torch.ones(3, 7)
+    out = mapper(x, mask)
+    assert out.shape == (3, 7, 48)
+
+
+def test_mapper_no_projection_keeps_d_model() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=1, sdxl_channels=None).eval()
+    assert mapper.channel_mapper is None and mapper.final_layer_norm is None
+    out = mapper(torch.randn(2, 5, cfg.d_model), torch.ones(2, 5))
+    assert out.shape == (2, 5, cfg.d_model)
+
+
+def test_mapper_zero_layers_is_norm_plus_project() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=0, sdxl_channels=16).eval()
+    assert mapper.blocks is None
+    out = mapper(torch.randn(1, 4, cfg.d_model), torch.ones(1, 4))
+    assert out.shape == (1, 4, 16)
+
+
+def test_mapper_only_first_block_has_relative_bias() -> None:
+    """T5 weight-sharing convention: relative_attention_bias lives on block 0."""
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=3, sdxl_channels=None)
+    has_bias = [
+        any("relative_attention_bias" in n for n, _ in blk.named_parameters())
+        for blk in mapper.blocks
+    ]
+    assert has_bias == [True, False, False]
+
+
+def test_mapper_pad_mask_changes_output() -> None:
+    """Masking out the tail should change the kept positions' representation."""
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=2, sdxl_channels=None).eval()
+    x = torch.randn(1, 6, cfg.d_model)
+    full = torch.ones(1, 6)
+    half = torch.tensor([[1.0, 1.0, 1.0, 0.0, 0.0, 0.0]])
+    with torch.no_grad():
+        out_full = mapper(x, full)
+        out_half = mapper(x, half)
+    # The first (kept) token attends to fewer keys under the half mask.
+    assert not torch.allclose(out_full[:, 0], out_half[:, 0], atol=1e-5)
+
+
+def test_extended_attention_mask_additive_form() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=1, sdxl_channels=None)
+    mask = torch.tensor([[1.0, 0.0, 1.0]])
+    ext = mapper.get_extended_attention_mask(mask, dtype=torch.float32)
+    assert ext.shape == (1, 1, 1, 3)
+    assert ext[0, 0, 0, 0].item() == 0.0
+    assert ext[0, 0, 0, 1].item() == torch.finfo(torch.float32).min
+    assert ext[0, 0, 0, 2].item() == 0.0
+
+
+def test_extended_attention_mask_rejects_bad_rank() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=0, sdxl_channels=None)
+    with pytest.raises(ValueError, match="Unexpected attention_mask shape"):
+        mapper.get_extended_attention_mask(torch.ones(2, 3, 4, 5), dtype=torch.float32)
+
+
+# ---------------------------------------------------------------------------
+# load_weights: Ming's unfused byt5_mapper.pt name layout loads directly
+# ---------------------------------------------------------------------------
+
+
+def test_load_weights_roundtrips_unfused_layout() -> None:
+    cfg = _tiny_t5_config()
+    src = T5EncoderBlockByT5Mapper(cfg, num_layers=2, sdxl_channels=24)
+    # Randomize so a successful load is observable (not the init values).
+    with torch.no_grad():
+        for p in src.parameters():
+            p.normal_()
+    dst = T5EncoderBlockByT5Mapper(cfg, num_layers=2, sdxl_channels=24)
+    loaded = dst.load_weights(src.state_dict().items())
+    # Every dst parameter should have been covered by the source state dict.
+    assert loaded == set(dict(dst.named_parameters()).keys())
+    for name, p in dst.named_parameters():
+        assert torch.allclose(p, dict(src.named_parameters())[name])
+
+
+def test_load_weights_source_names_match_ming_unfused_format() -> None:
+    """Sanity: the param names we expose are exactly Ming's checkpoint keys
+    (unfused q/k/v/o + wi_0/wi_1/wo), so no stacked-param remap is needed."""
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=1, sdxl_channels=8)
+    names = set(dict(mapper.named_parameters()).keys())
+    assert "blocks.0.layer.0.SelfAttention.q.weight" in names
+    assert "blocks.0.layer.0.SelfAttention.k.weight" in names
+    assert "blocks.0.layer.0.SelfAttention.v.weight" in names
+    assert "blocks.0.layer.0.SelfAttention.o.weight" in names
+    assert "blocks.0.layer.1.DenseReluDense.wi_0.weight" in names
+    assert "blocks.0.layer.1.DenseReluDense.wi_1.weight" in names
+    assert "blocks.0.layer.1.DenseReluDense.wo.weight" in names
+    assert "channel_mapper.weight" in names and "channel_mapper.bias" in names
+
+
+def test_load_weights_shape_mismatch_raises() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=0, sdxl_channels=8)
+    bad = {"channel_mapper.weight": torch.zeros(8, 999)}
+    with pytest.raises(ValueError, match="Shape mismatch"):
+        mapper.load_weights(bad.items())
+
+
+def test_load_weights_ignores_unknown_keys() -> None:
+    cfg = _tiny_t5_config()
+    mapper = T5EncoderBlockByT5Mapper(cfg, num_layers=0, sdxl_channels=8)
+    loaded = mapper.load_weights({"not.a.real.param": torch.zeros(3)}.items())
+    assert loaded == set()
+
+
+# ---------------------------------------------------------------------------
+# Snapshot-gated full encoder build
+# ---------------------------------------------------------------------------
+
+
+def _find_byt5_dir() -> str | None:
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    candidates = []
+    if override:
+        candidates.append(Path(override) / "byt5")
+    candidates.append(Path("/dev/shm/ming-hybrid") / "byt5")
+    for c in candidates:
+        # Require the actual weight dirs, not just the config jsons — some
+        # snapshots ship byt5.json + tokenizer stubs without the trained
+        # backbone, which would fail mid-load rather than skip cleanly.
+        if (c / "byt5.json").exists() and (c / "byt5_model" / "byt5_model.pt").exists():
+            return str(c)
+    return None
+
+
+@pytest.mark.skipif(_find_byt5_dir() is None, reason="Need Ming byt5 checkpoint dir.")
+def test_byt5_encoder_builds_and_runs_from_checkpoint() -> None:
+    byt5_dir = _find_byt5_dir()
+    enc = MingByT5Encoder.from_checkpoint(
+        Path(byt5_dir), device=torch.device("cpu"), dtype=torch.float32
+    )
+    feats = enc.forward(["hello world", "draw a red mug"])
+    assert feats.dim() == 3 and feats.shape[0] == 2
+    assert feats.shape[1] == enc.max_length
diff --git a/test/modular/test_ming_flash_omni_components.py b/test/modular/test_ming_flash_omni_components.py
new file mode 100644
index 00000000..9b053509
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_components.py
@@ -0,0 +1,501 @@
+"""Unit tests for Ling-2.0 architecture-novel components.
+
+CPU-only, small-dim, no model weights — these validate the math we ported
+in step 3a of ``mstar/model/ming_omni_flash/PORTING_NOTES.md``.
+
+Two tests cross-check against vllm-omni and skip when it isn't importable:
+``test_ling_router_matches_vllm_omni`` (vs ``BailingMoeV2Gate``, the strongest
+guard against subtle routing bugs — group_limited_topk has several easy
+off-by-one traps) and ``test_partial_mrope_video_rope_matches_vllm_omni``.
+"""
+
+from __future__ import annotations
+
+import importlib
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from mstar.model.ming_omni_flash.components.attention import LingAttention
+from mstar.model.ming_omni_flash.components.rope import (
+    LingPartialMRotaryEmbedding,
+)
+from mstar.model.ming_omni_flash.components.router import LingMoeRouter
+
+torch.manual_seed(2026)
+
+
+class _MockCacheHandle:
+    """Stand-in for :class:`BatchedCacheManager` in unit tests.
+
+    Implements just ``set_layer_idx`` + ``run_attention`` — the two
+    methods :class:`LingAttention` and :class:`LingMoeModel` call. The
+    ``run_attention`` runs standard causal SDPA, matching what the
+    inline path did before the cache_handle refactor. No KV cache state
+    is preserved across calls (single-shot per layer is enough for unit
+    tests; the real engine handles paging).
+    """
+
+    def __init__(self) -> None:
+        self.layer_idx = 0
+
+    def set_layer_idx(self, layer_idx: int) -> None:
+        self.layer_idx = layer_idx
+
+    def run_attention(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+    ) -> torch.Tensor:
+        """Plain causal SDPA. ``q``/``k``/``v``:
+        ``(num_tokens, num_heads_or_kv, head_dim)``. Returns
+        ``(num_tokens, num_heads, head_dim)``.
+        """
+        num_heads = q.shape[1]
+        num_kv = k.shape[1]
+        kv_groups = num_heads // num_kv
+        if kv_groups > 1:
+            k = k.repeat_interleave(kv_groups, dim=1)
+            v = v.repeat_interleave(kv_groups, dim=1)
+        # SDPA expects (B, num_heads, T, head_dim); we have
+        # (T, num_heads, head_dim). Unsqueeze a batch + transpose.
+        q4 = q.transpose(0, 1).unsqueeze(0)
+        k4 = k.transpose(0, 1).unsqueeze(0)
+        v4 = v.transpose(0, 1).unsqueeze(0)
+        scale = q.shape[-1] ** -0.5
+        out = F.scaled_dot_product_attention(q4, k4, v4, is_causal=True, scale=scale)
+        return out.squeeze(0).transpose(0, 1).contiguous()
+
+
+# ---------------------------------------------------------------------------
+# Router
+# ---------------------------------------------------------------------------
+
+
+def test_ling_router_shapes_and_scaling() -> None:
+    """Forward returns the (logits, weights, indices) 3-tuple with the
+    expected shapes; weights sum to ~routed_scaling_factor per row."""
+    router = LingMoeRouter(
+        hidden_size=64, num_experts=16,
+        num_experts_per_tok=4,
+        n_group=4, topk_group=2,
+        routed_scaling_factor=2.5,
+    )
+    x = torch.randn(8, 64)
+    logits, weights, indices = router(x)
+    assert logits.shape == (8, 16)
+    assert weights.shape == (8, 4)
+    assert indices.shape == (8, 4)
+    assert indices.dtype == torch.int64
+    # Renormalised weights sum to 1, then × routed_scaling_factor → 2.5.
+    row_sums = weights.float().sum(dim=-1)
+    assert torch.allclose(row_sums, torch.full((8,), 2.5), atol=1e-5), row_sums
+
+
+def test_ling_router_group_limited() -> None:
+    """If only group 0's experts score high (all other gate weights are
+    zeroed), every selected index must fall inside group 0's expert range."""
+    router = LingMoeRouter(
+        hidden_size=8, num_experts=12,
+        num_experts_per_tok=3,
+        n_group=3, topk_group=1,
+    )
+    with torch.no_grad():
+        router.gate.weight.zero_()
+        # Boost group 0 (experts 0..3): a single boosted input dim hits
+        # those experts strongly.
+        router.gate.weight[0:4, 0] = 10.0
+    x = torch.zeros(4, 8)
+    x[:, 0] = 1.0  # activate the input dim that lights up group 0
+    _, _, indices = router(x)
+    # All chosen experts must be in [0, 4) since topk_group=1 means only
+    # group 0 (experts 0..3) is eligible.
+    assert (indices >= 0).all() and (indices < 4).all(), indices
+
+
+def test_ling_router_expert_bias_shifts_routing() -> None:
+    """A large positive bias on expert E forces it to be picked even when
+    the gate logits favour another expert."""
+    router = LingMoeRouter(
+        hidden_size=4, num_experts=8,
+        num_experts_per_tok=2,
+        n_group=2, topk_group=2,
+    )
+    with torch.no_grad():
+        router.gate.weight.zero_()
+        router.gate.weight[1, 0] = 5.0  # gate prefers expert 1
+    x = torch.zeros(3, 4)
+    x[:, 0] = 1.0
+    _, _, baseline = router(x)
+    assert (baseline[:, 0] == 1).all()  # expert 1 picked first
+
+    with torch.no_grad():
+        router.expert_bias[6] = 5.0  # boost expert 6 via bias
+    _, _, after = router(x)
+    # Expert 6 should now appear in every row's top-2.
+    assert (after == 6).any(dim=-1).all(), after
+
+
+def test_ling_router_rejects_bad_group_split() -> None:
+    """num_experts must divide evenly by n_group; otherwise the
+    constructor must raise."""
+    with pytest.raises(ValueError, match="divisible"):
+        LingMoeRouter(
+            hidden_size=4, num_experts=10,
+            num_experts_per_tok=2,
+            n_group=3, topk_group=1,
+        )
+    with pytest.raises(ValueError, match="topk_group"):
+        LingMoeRouter(
+            hidden_size=4, num_experts=8,
+            num_experts_per_tok=2,
+            n_group=2, topk_group=3,
+        )
+
+
+def test_ling_router_matches_vllm_omni() -> None:
+    """Cross-check vs vllm-omni's ``BailingMoeV2Gate`` on the same inputs.
+
+    Same hidden_size / num_experts / etc., same gate weight, same
+    expert_bias — each row's set of chosen indices must match exactly
+    (order within a row may differ). Returned weights differ because the
+    upstream Gate returns the gathered scores pre-renormalisation, so we
+    compare only the indices, which is what matters for downstream dispatch.
+    """
+    try:
+        importlib.import_module("vllm_omni")
+        from vllm_omni.model_executor.models.ming_flash_omni.modeling_bailing_moe_v2 import (
+            BailingMoeV2Gate,
+        )
+        from vllm_omni.transformers_utils.configs.ming_flash_omni import (
+            BailingMoeV2Config,
+        )
+    except Exception as e:  # noqa: BLE001 — broad on purpose; any import path failure ⇒ skip
+        pytest.skip(f"vllm-omni not importable: {e}")
+
+    # vllm-omni's Gate calls get_tensor_model_parallel_world_size() — we
+    # need to be in a TP-initialised state for that. Set up a single-rank
+    # group manually.
+    try:
+        from vllm.distributed import init_distributed_environment, initialize_model_parallel
+        if not torch.distributed.is_initialized():
+            init_distributed_environment(
+                world_size=1, rank=0, distributed_init_method="tcp://127.0.0.1:25555",
+                local_rank=0, backend="gloo",
+            )
+        initialize_model_parallel(tensor_model_parallel_size=1)
+    except Exception as e:  # noqa: BLE001
+        pytest.skip(f"vllm distributed init not available: {e}")
+
+    config = BailingMoeV2Config(
+        hidden_size=32, num_experts=16, num_experts_per_tok=4,
+        n_group=4, topk_group=2, routed_scaling_factor=2.5,
+    )
+    upstream = BailingMoeV2Gate(config)
+
+    ours = LingMoeRouter(
+        hidden_size=32, num_experts=16, num_experts_per_tok=4,
+        n_group=4, topk_group=2, routed_scaling_factor=2.5,
+    )
+    # Copy gate weights + bias for an apples-to-apples comparison.
+    with torch.no_grad():
+        ours.gate.weight.copy_(upstream.gate.weight.data)
+        ours.expert_bias.copy_(upstream.expert_bias.data)
+        # Give expert_bias something non-trivial so the bias path is exercised.
+        ours.expert_bias.normal_(std=0.01)
+        upstream.expert_bias.data.copy_(ours.expert_bias.data)
+
+    x = torch.randn(6, 32)
+    _, _, ours_indices = ours(x)
+    up_indices, up_weights, _ = upstream(x)
+
+    # Compare as sets per row — top-k order isn't guaranteed to match by
+    # construction (both use ``sorted=False`` in their final topk).
+    for r in range(x.shape[0]):
+        assert set(ours_indices[r].tolist()) == set(up_indices[r].tolist()), (
+            f"row {r}: ours={sorted(ours_indices[r].tolist())} vs "
+            f"upstream={sorted(up_indices[r].tolist())}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Partial MRoPE
+# ---------------------------------------------------------------------------
+
+
+def _make_rope(head_dim: int = 128) -> LingPartialMRotaryEmbedding:
+    return LingPartialMRotaryEmbedding(
+        head_dim=head_dim,
+        partial_rotary_factor=0.5,
+        mrope_section=[8, 12, 12],
+        rope_theta=2_400_000.0,
+        max_position_embeddings=32768,
+    )
+
+
+def test_partial_mrope_shapes_and_pass_through() -> None:
+    """Output shape unchanged; pass-through half is byte-identical.
+
+    head_dim=128, partial=0.5 → rotary_dim=64. Indices 64..128 are
+    untouched.
+    """
+    rope = _make_rope()  # head_dim=128, mrope_section=[8,12,12] sums to 32 = 64//2  ✓
+    T = 7
+    q = torch.randn(2, T, 128)  # (num_heads, T, head_dim)
+    k = torch.randn(2, T, 128)
+    positions = torch.arange(T)
+    q_out, k_out = rope(q, k, positions)
+    assert q_out.shape == q.shape == k_out.shape
+    # The second half of head_dim must be untouched (rotary_dim=64).
+    assert torch.equal(q_out[..., 64:], q[..., 64:])
+    assert torch.equal(k_out[..., 64:], k[..., 64:])
+
+
+def test_partial_mrope_1d_matches_standard_rotary() -> None:
+    """With 1D position_ids, rotation reduces to plain rotary on the
+    first 64 dims — invariant: identical inputs at identical positions
+    produce identical rotations regardless of axis layout."""
+    rope = _make_rope()
+    q = torch.randn(1, 1, 128)
+    k = torch.zeros(1, 1, 128)
+    pos = torch.tensor([5])
+    # Same q rotated at position 5 twice → identical.
+    out1, _ = rope(q.clone(), k.clone(), pos)
+    out2, _ = rope(q.clone(), k.clone(), pos)
+    assert torch.equal(out1, out2)
+
+
+def test_partial_mrope_video_rope_layout() -> None:
+    """``video_rope`` axis assignment: spatial half uses H/W alternating,
+    temporal tail uses T.
+
+    Test by zeroing two of the three position rows and checking the
+    rotation only touches the dims the surviving axis was assigned to.
+    """
+    rope = _make_rope()
+    T = 1
+    # Identity-friendly q: ones in the rotary half so rotation is observable.
+    q = torch.zeros(1, T, 128)
+    q[..., :64] = 1.0
+    k = q.clone()
+
+    # All time positions = 5, H = W = 0  → time should be the only
+    # axis with nonzero effect. video_rope places T at indices [hw_size:half]
+    # which is [24:32] in each of the two halves.
+    positions = torch.zeros(3, T, dtype=torch.long)
+    positions[0] = 5
+    q_t, _ = rope(q.clone(), k.clone(), positions)
+
+    # Time is expected at indices [24:32] and [56:64] (the temporal tail of
+    # each half of rotary_dim=64). For H=W=0, cos=1 sin=0 everywhere,
+    # so spatial dims should remain == 1.0 (no rotation).
+    rotary_first = q_t[..., :64]
+    # Spatial dims: [0:24] in each half — for H=W=0 the angle is 0, so cos=1,
+    # sin=0 → rotation leaves value at 1.0.
+    assert torch.allclose(rotary_first[..., :24], torch.ones_like(rotary_first[..., :24])), \
+        "spatial dims rotated under H=W=0 — wrong axis assignment"
+    assert torch.allclose(rotary_first[..., 32:32 + 24], torch.ones_like(rotary_first[..., 32:32 + 24])), \
+        "spatial dims (second half) rotated under H=W=0"
+    # Temporal dims [24:32] and [56:64]: position 5 with theta=2.4M and
+    # rotary_dim=64 produces a measurable but small rotation (we don't
+    # check exact value; just that it diverged from 1.0).
+    assert not torch.allclose(rotary_first[..., 24:32], torch.ones_like(rotary_first[..., 24:32])), \
+        "temporal dims unrotated when T=5 — time axis not applied"
+
+
+def test_partial_mrope_rejects_inconsistent_section() -> None:
+    """sum(mrope_section) must equal rotary_dim // 2."""
+    with pytest.raises(ValueError, match="rotary_dim"):
+        LingPartialMRotaryEmbedding(
+            head_dim=128, partial_rotary_factor=0.5,
+            mrope_section=[8, 16, 16],   # sums to 40, expected 32
+            rope_theta=10000.0, max_position_embeddings=1024,
+        )
+
+
+@pytest.mark.parametrize(
+    "mrope_section,head_dim,num_tokens",
+    [
+        # Released ckpt geometry (head_dim=128, rotary_dim=64, half=32).
+        ([8, 12, 12], 128, 1),
+        ([8, 12, 12], 128, 7),
+        ([8, 12, 12], 128, 64),
+        # hw_size == half (no temporal tail) — edge case for the
+        # ``offset+hw_size:offset+half`` slice.
+        ([0, 8, 8], 64, 5),
+        # hw_size < half by a wide margin.
+        ([14, 1, 1], 64, 5),
+        # Asymmetric Nh / Nw split.
+        ([2, 5, 1], 32, 11),
+    ],
+)
+def test_partial_mrope_video_rope_matches_vllm_omni(
+    mrope_section: list[int], head_dim: int, num_tokens: int,
+) -> None:
+    """Numeric parity vs vllm-omni's ``_remap_video_rope``.
+
+    The two implementations operate on differently-shaped inputs:
+
+    * mstar consumes the *full* ``(3, T, rotary_dim)`` neox-cat table and
+      writes both halves in a single ``for offset in (0, half)`` loop.
+    * vllm-omni consumes the *half* ``(3, T, rotary_dim/2)`` table — the
+      same one that ``cos_sin_cache.chunk(2)`` returns — and writes just
+      one half.
+
+    Since the neox cat duplicates each frequency into both halves, the
+    expected invariant is::
+
+        mstar_full[:, :half]  == vllm_half
+        mstar_full[:, half:]  == vllm_half  (identical, because both halves
+                                             carry the same freqs)
+
+    The ``offset+hw_size:offset+half`` slice in mstar is the bit most
+    likely to misalign for unusual ``mrope_section`` shapes — this
+    parametrisation exercises the edges.
+    """
+    try:
+        importlib.import_module("vllm_omni")
+        from vllm_omni.model_executor.models.ming_flash_omni.modeling_bailing_moe_v2 import (
+            MingVideoRopeMRotaryEmbedding,
+        )
+    except Exception as e:  # noqa: BLE001
+        pytest.skip(f"vllm-omni not importable: {e}")
+
+    # vllm's ``_remap_video_rope`` only reads ``self.mrope_section``; build
+    # the thinnest possible stand-in so we can call it as an unbound method
+    # without constructing the full MRotaryEmbedding (which pulls in
+    # vllm's CUDA cache machinery).
+    import types
+    stub = types.SimpleNamespace(mrope_section=list(mrope_section))
+
+    rotary_dim = head_dim // 2  # partial_rotary_factor=0.5
+    half = rotary_dim // 2
+
+    # Synthesise per-axis half-tables with values drawn from a wide range so
+    # any wrong-axis pick shows up loudly.
+    torch.manual_seed(20260609)
+    cos_half = torch.randn(3, num_tokens, half, dtype=torch.float64) * 3.0
+    sin_half = torch.randn(3, num_tokens, half, dtype=torch.float64) * 3.0
+
+    # Reference (vllm-omni) — operates on half-tables.
+    ref_cos_half, ref_sin_half = MingVideoRopeMRotaryEmbedding._remap_video_rope(
+        stub, cos_half, sin_half,
+    )
+    assert ref_cos_half.shape == (num_tokens, half)
+    assert ref_sin_half.shape == (num_tokens, half)
+
+    # Ours (mstar) — operates on full neox-cat tables.
+    rope = LingPartialMRotaryEmbedding(
+        head_dim=head_dim, partial_rotary_factor=0.5,
+        mrope_section=list(mrope_section),
+        rope_theta=10000.0,
+        max_position_embeddings=128,
+    )
+    cos_full = torch.cat((cos_half, cos_half), dim=-1)
+    sin_full = torch.cat((sin_half, sin_half), dim=-1)
+    full_cos, full_sin = rope._remap_video_rope(cos_full, sin_full)
+    assert full_cos.shape == (num_tokens, rotary_dim)
+    assert full_sin.shape == (num_tokens, rotary_dim)
+
+    # Both halves of the full output must equal vllm's half output exactly
+    # (we used float64 to dodge fp32 quantisation noise).
+    assert torch.equal(full_cos[:, :half], ref_cos_half), (
+        f"mrope_section={mrope_section}, head_dim={head_dim}, T={num_tokens}: "
+        f"first half of cos diverges from vllm reference"
+    )
+    assert torch.equal(full_cos[:, half:], ref_cos_half), (
+        f"mrope_section={mrope_section}, head_dim={head_dim}, T={num_tokens}: "
+        f"second half of cos diverges from vllm reference"
+    )
+    assert torch.equal(full_sin[:, :half], ref_sin_half), (
+        f"mrope_section={mrope_section}, head_dim={head_dim}, T={num_tokens}: "
+        f"first half of sin diverges from vllm reference"
+    )
+    assert torch.equal(full_sin[:, half:], ref_sin_half), (
+        f"mrope_section={mrope_section}, head_dim={head_dim}, T={num_tokens}: "
+        f"second half of sin diverges from vllm reference"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Attention (QK-norm + partial MRoPE composition)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="mstar RMSNorm uses flashinfer's CUDA-only rmsnorm")
+def test_ling_attention_forward_runs_with_qk_norm() -> None:
+    """End-to-end forward at small dim — main goal is that the QK-norm +
+    rope composition doesn't crash and produces finite output."""
+    head_dim = 32
+    # rotary_dim=16, rotary_dim//2=8 — section sum must be 8.
+    rope = LingPartialMRotaryEmbedding(
+        head_dim=head_dim,
+        partial_rotary_factor=0.5,
+        mrope_section=[2, 3, 3],
+        rope_theta=10000.0,
+        max_position_embeddings=128,
+    ).cuda()
+    attn = LingAttention(
+        hidden_size=64, num_heads=4, num_kv_heads=2,
+        head_dim=head_dim, rms_norm_eps=1e-6, rotary=rope,
+    ).cuda()
+    T = 5
+    x = torch.randn(T, 64, device="cuda")
+    pos = torch.arange(T, device="cuda")
+    out = attn(x, _MockCacheHandle(), pos)
+    assert out.shape == x.shape
+    assert torch.isfinite(out).all()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="mstar RMSNorm uses flashinfer's CUDA-only rmsnorm")
+def test_ling_attention_qk_norm_actually_normalises() -> None:
+    """Verify the q_norm / k_norm layers are RMSNorm-shaped — sanity guard
+    that the right module is plumbed in, via the ``head_norm_check`` helper."""
+    head_dim = 16
+    # rotary_dim=8, rotary_dim//2=4 — section sum must be 4.
+    rope = LingPartialMRotaryEmbedding(
+        head_dim=head_dim, partial_rotary_factor=0.5,
+        mrope_section=[1, 1, 2], rope_theta=10000.0,
+        max_position_embeddings=64,
+    ).cuda()
+    attn = LingAttention(
+        hidden_size=32, num_heads=2, num_kv_heads=2,
+        head_dim=head_dim, rms_norm_eps=1e-6, rotary=rope,
+    ).cuda()
+    # Feed a heavily-scaled input — RMSNorm should bring per-head RMS to 1.
+    q_big = torch.randn(3, 4, head_dim, device="cuda") * 100.0   # (T, H, head_dim)
+    out = attn.q_norm(q_big)
+    max_dev = LingAttention.head_norm_check(out)
+    # 5e-3 tolerance accommodates bf16 RMSNorm; the load-bearing claim is
+    # that q_norm reshapes per-head and applies normalisation, not that
+    # the RMS is precisely 1.0 to 4 decimals in reduced precision.
+    assert max_dev < 5e-3, f"q_norm did not produce unit-RMS output: dev={max_dev}"
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="mstar RMSNorm uses flashinfer's CUDA-only rmsnorm")
+def test_ling_attention_causal_mask() -> None:
+    """Sanity: appending a later token shouldn't change the output of
+    earlier positions (proves causal masking is on)."""
+    head_dim = 32
+    # rotary_dim=16, rotary_dim//2=8 — section sum must be 8.
+    rope = LingPartialMRotaryEmbedding(
+        head_dim=head_dim, partial_rotary_factor=0.5,
+        mrope_section=[2, 3, 3], rope_theta=10000.0,
+        max_position_embeddings=128,
+    ).cuda()
+    attn = LingAttention(
+        hidden_size=64, num_heads=4, num_kv_heads=4,
+        head_dim=head_dim, rms_norm_eps=1e-6, rotary=rope,
+    ).cuda().eval()
+    x = torch.randn(3, 64, device="cuda")
+    pos = torch.arange(3, device="cuda")
+    out_a = attn(x, _MockCacheHandle(), pos)
+
+    # Append a 4th token; first 3 outputs MUST equal out_a (causal).
+    x4 = torch.cat([x, torch.randn(1, 64, device="cuda")], dim=0)
+    pos4 = torch.arange(4, device="cuda")
+    out_b = attn(x4, _MockCacheHandle(), pos4)
+    assert torch.allclose(out_a, out_b[:3], atol=1e-4), \
+        "causal mask leaked — adding a later token changed earlier outputs"
diff --git a/test/modular/test_ming_flash_omni_config.py b/test/modular/test_ming_flash_omni_config.py
new file mode 100644
index 00000000..edaec30a
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_config.py
@@ -0,0 +1,561 @@
+"""Smoke tests for Ming-flash-omni-2.0 config loading.
+
+These tests run against the released checkpoint
+(``inclusionAI/Ming-flash-omni-2.0``). They skip cleanly when no local
+snapshot is available, so CI / dev machines without the 222 GB download
+still pass.
+
+Snapshot discovery order:
+  1. ``MING_FLASH_OMNI_DIR`` env var (explicit override)
+  2. The default HF Hub cache layout under ``~/.cache/huggingface/hub/``
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    ImageGenConfig,
+    MingFlashOmniModelConfig,
+    TalkerConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+
+
+def _find_local_snapshot() -> str | None:
+    """Locate a Ming-flash-omni-2.0 snapshot on disk, or None."""
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and (Path(override) / "config.json").exists():
+        return override
+
+    hub_root = Path.home() / ".cache" / "huggingface" / "hub"
+    repo_dir = hub_root / "models--inclusionAI--Ming-flash-omni-2.0" / "snapshots"
+    if not repo_dir.exists():
+        return None
+    # Pick the first snapshot dir that has a config.json (HF stores one per
+    # commit revision; usually there's only one).
+    for snap in sorted(repo_dir.iterdir()):
+        if (snap / "config.json").exists():
+            return str(snap)
+    return None
+
+
+@pytest.fixture(scope="module")
+def snapshot_dir() -> str:
+    snap = _find_local_snapshot()
+    if snap is None:
+        pytest.skip(
+            "Ming-flash-omni-2.0 snapshot not found. Set MING_FLASH_OMNI_DIR "
+            "or download with `huggingface-cli download "
+            "inclusionAI/Ming-flash-omni-2.0`."
+        )
+    return snap
+
+
+@pytest.fixture(scope="module")
+def config(snapshot_dir: str) -> MingFlashOmniModelConfig:
+    return MingFlashOmniModelConfig.from_pretrained(snapshot_dir)
+
+
+def test_from_pretrained_loads_thinker_dims(config: MingFlashOmniModelConfig) -> None:
+    """Released ckpt: Ling-2.0 32L, 4096-hidden, 256-expert MoE, head_dim=128."""
+    llm = config.thinker_llm
+    assert llm.vocab_size == 157184
+    assert llm.hidden_size == 4096
+    assert llm.intermediate_size == 9216
+    assert llm.num_hidden_layers == 32
+    assert llm.num_attention_heads == 32
+    assert llm.num_key_value_heads == 4
+    assert llm.head_dim == 128
+    assert llm.rope_theta == 2_400_000.0
+    assert llm.num_experts == 256
+    assert llm.num_experts_per_tok == 8
+    assert llm.moe_intermediate_size == 1024
+    assert llm.first_k_dense_replace == 1
+    assert llm.router_type == "MultiRouter"
+    assert llm.use_qk_norm is True
+
+    # Convenience accessors used by the rest of mstar
+    assert config.thinker_hidden_size == 4096
+    assert config.thinker_num_layers == 32
+    assert config.thinker_head_dim == 128
+    assert config.thinker_num_kv_heads == 4
+    assert config.vocab_size == 157184
+
+
+def test_from_pretrained_loads_vision_audio(config: MingFlashOmniModelConfig) -> None:
+    """Released ckpt: Qwen3-MoE ViT (27L, out_hidden=4096) + Whisper-style audio."""
+    assert config.vision.depth == 27
+    assert config.vision.hidden_size == 1152
+    assert config.vision.out_hidden_size == 4096
+    assert config.vision.deepstack_visual_indexes == (8, 16, 24)
+    assert config.vision.spatial_merge_size == 2
+    assert config.vision.patch_size == 16
+    assert config.vision.hidden_act == "gelu_pytorch_tanh"
+
+    audio = config.audio_encoder
+    assert audio.encoder_layers == 32
+    assert audio.d_model == 1280
+    assert audio.encoder_attention_heads == 20
+    assert audio.n_mels == 128
+    assert audio.ds_kernel_size == 3
+    assert audio.ds_stride == 2
+    assert audio.norm_query_embeds is True
+
+
+def test_mrope_section_sums_to_half_rotary_dims(config: MingFlashOmniModelConfig) -> None:
+    """Regression guard on the MRoPE arithmetic.
+
+    sum(mrope_section) must equal (head_dim * partial_rotary_factor) / 2 —
+    the rotary subset of each head is paired (cos, sin), so the section
+    partitions one half. For Ming-flash-omni-2.0: 128 * 0.5 / 2 = 32, and
+    the released ckpt sets mrope_section = [8, 12, 12].
+    """
+    llm = config.thinker_llm
+    assert llm.head_dim is not None
+    rotary_pair_dims = int(llm.head_dim * llm.partial_rotary_factor) // 2
+    assert sum(llm.mrope_section) == rotary_pair_dims, (
+        f"mrope_section {llm.mrope_section} sums to {sum(llm.mrope_section)}, "
+        f"expected {rotary_pair_dims}"
+    )
+
+
+def test_subdir_configs_load_when_present(config: MingFlashOmniModelConfig) -> None:
+    """talker/ and the imagegen subdir family populate when present."""
+    assert config.talker is not None, "talker/config.json should have populated"
+    assert config.talker.vae_sample_rate == 44100
+    assert config.talker.patch_size == 4
+    assert config.talker.history_patch_size == 32
+    # Step 6a: llm + vae are typed dataclasses (used to be raw dicts).
+    assert config.talker.llm.hidden_size == 896
+    assert config.talker.llm.num_hidden_layers == 24
+    assert config.talker.llm.num_key_value_heads == 2
+    assert config.talker.vae.sample_rate == 44100
+    assert config.talker.vae.latent_dim == 64
+    # flowmodel + aggregator share shape; only dropout differs (0 vs 0.1).
+    assert config.talker.flowmodel.depth == 8
+    assert config.talker.flowmodel.hidden_size == 1024
+    assert config.talker.aggregator.dropout == pytest.approx(0.1)
+
+    assert config.image_gen is not None, "imagegen subdirs should have populated"
+    assert config.image_gen.num_query_tokens == 256  # img_gen_scales=[16] => 16*16
+    assert config.image_gen.diffusion_c_input_dim == 2560
+    assert config.image_gen.text_encoder_norm is True
+    # Step 9a: typed sub-configs parsed from the imagegen subdir tree.
+    assert config.image_gen.dit.dim == 3840
+    assert config.image_gen.dit.n_layers == 30
+    assert config.image_gen.dit.in_channels == 16
+    assert config.image_gen.dit.axes_dims == (32, 48, 48)
+    assert config.image_gen.vae.latent_channels == 16
+    assert config.image_gen.vae.scaling_factor == pytest.approx(0.3611)
+    assert config.image_gen.scheduler.shift == pytest.approx(3.0)
+    assert config.image_gen.byt5.sdxl_channels == 2560
+    assert config.image_gen.byt5.byt5_name == "google/byt5-small"
+    # Connector is a Qwen2 LLM kept as a raw dict.
+    assert config.image_gen.connector is not None
+    assert config.image_gen.connector.get("model_type") == "qwen2"
+
+
+def test_subdir_configs_absent_returns_none() -> None:
+    """A snapshot dir with only a stripped-down config.json yields
+    talker=None and image_gen=None."""
+    minimal = {
+        "llm_config": {"hidden_size": 4096, "num_attention_heads": 32, "vocab_size": 157184},
+        "vision_config": {"depth": 27, "out_hidden_size": 4096},
+        "audio_config": {
+            "ds_kernel_size": 3, "ds_stride": 2, "norm_query_embeds": True,
+            "whisper_encoder_config": {
+                "n_ctx": 15000, "n_head": 20, "n_layer": 32, "n_mels": 128, "n_state": 1280,
+            },
+        },
+        "mlp_depth": 2,
+    }
+    with tempfile.TemporaryDirectory() as tmp:
+        (Path(tmp) / "config.json").write_text(json.dumps(minimal))
+        c = MingFlashOmniModelConfig.from_pretrained(tmp)
+    assert c.talker is None
+    assert c.image_gen is None
+
+
+def test_sub_config_from_dict_filters_unknown_keys() -> None:
+    """from_dict should silently drop keys the dataclass doesn't declare,
+    so checkpoints that add new fields don't break loading."""
+    # Released ThinkerLLMConfig doesn't carry e.g. ``some_future_field``; that
+    # key must be silently dropped, not raise.
+    cfg = ThinkerLLMConfig.from_dict({
+        "hidden_size": 4096,
+        "num_attention_heads": 32,
+        "some_future_field": "ignored",
+    })
+    assert cfg.hidden_size == 4096
+    assert not hasattr(cfg, "some_future_field")
+
+    vis = VisionEncoderConfig.from_dict({"depth": 27, "deepstack_visual_indexes": [1, 2, 3]})
+    assert vis.deepstack_visual_indexes == (1, 2, 3)
+
+    aud = AudioEncoderConfig.from_dict({"ds_stride": 4, "irrelevant": True})
+    assert aud.ds_stride == 4
+
+
+def test_invariant_check_rejects_out_of_vocab_multimodal_tokens() -> None:
+    """__post_init__ should refuse a config whose multimodal token IDs
+    are outside the vocabulary range — that pattern silently causes a
+    CUDA device-side assert at embedding-lookup time."""
+    bad = ThinkerLLMConfig(
+        vocab_size=1000,
+        image_patch_token=2000,  # > vocab_size
+    )
+    with pytest.raises(ValueError, match="image_patch_token"):
+        MingFlashOmniModelConfig(thinker_llm=bad)
+
+
+def test_invariant_check_covers_audio_and_end_tokens() -> None:
+    """The vocab-bounds check must cover every multimodal token field,
+    not just the four the ckpt ships. Regression for the audio + *_end
+    tokens added alongside the vision/audio encoder port."""
+    for field, bad_value in [
+        ("audio_patch_token", 200_000),
+        ("audio_start_token", 200_000),
+        ("audio_end_token", 200_000),
+        ("image_end_token", 200_000),
+        ("video_end_token", 200_000),
+    ]:
+        bad = ThinkerLLMConfig(vocab_size=160_000, **{field: bad_value})
+        with pytest.raises(ValueError, match=field):
+            MingFlashOmniModelConfig(thinker_llm=bad)
+
+
+def test_video_start_token_mislabel_auto_repaired(caplog: pytest.LogCaptureFixture) -> None:
+    """The inclusionAI ckpt's llm_config.video_start_token=157159 is
+    actually `</image>` per the tokenizer; the real `<video>` token is
+    157160. ThinkerLLMConfig.__post_init__ must repair the bogus value
+    AND emit a warning so the user sees what happened.
+    """
+    import logging
+    with caplog.at_level(logging.WARNING):
+        cfg = ThinkerLLMConfig.from_dict({
+            # Mimic the on-disk inclusionAI llm_config (minus head_dim noise).
+            "hidden_size": 4096, "num_attention_heads": 32, "vocab_size": 160_000,
+            "image_start_token": 157158,
+            "video_start_token": 157159,  # bogus per ckpt
+        })
+    # Repaired in place to the tokenizer-truth value.
+    assert cfg.video_start_token == 157160, (
+        f"video_start_token should auto-repair from 157159 to 157160; got {cfg.video_start_token}"
+    )
+    assert any("video_start_token=157159" in rec.message for rec in caplog.records), \
+        "expected a warning about the ckpt mislabel"
+
+    # If video_start_token is set to anything else (whether the corrected
+    # 157160 or a custom value), the repair must NOT fire and the value
+    # must pass through untouched.
+    cfg_ok = ThinkerLLMConfig(video_start_token=157160)
+    assert cfg_ok.video_start_token == 157160
+    cfg_custom = ThinkerLLMConfig(video_start_token=99_999, image_end_token=42)
+    assert cfg_custom.video_start_token == 99_999
+
+
+def test_invariant_check_rejects_bad_mrope_section() -> None:
+    """Wrong mrope_section partition is exactly the kind of silent miswire
+    we want loud failure on."""
+    bad_llm = ThinkerLLMConfig(
+        rope_scaling={"type": "video_rope", "mrope_section": [16, 16, 16]},  # sums to 48, expected 32
+    )
+    with pytest.raises(ValueError, match="MRoPE section"):
+        MingFlashOmniModelConfig(thinker_llm=bad_llm)
+
+
+def test_imagegen_skeleton_defaults() -> None:
+    """The image-gen skeleton should produce a usable instance even before
+    any subdir reads (downstream code may want to read default subfolder
+    names / sampling defaults without touching disk)."""
+    ig = ImageGenConfig()
+    assert ig.num_query_tokens == 256
+    assert ig.transformer_subfolder == "transformer"
+    assert ig.byt5_subfolder == "byt5"
+    assert ig.num_inference_steps == 30
+    assert ig.guidance_scale == 2.0
+
+
+def test_talker_from_subdir_returns_none_for_missing_dir() -> None:
+    """Missing talker/ subdir must return None, not raise."""
+    with tempfile.TemporaryDirectory() as tmp:
+        assert TalkerConfig.from_subdir(Path(tmp) / "talker") is None
+
+
+# ---------------------------------------------------------------------------
+# Step 6a: TalkerLLMConfig / DiTBlockConfig / AudioVAEConfig
+# ---------------------------------------------------------------------------
+
+
+def test_talker_llm_config_defaults_match_released_ckpt() -> None:
+    """Defaults track the released talker/llm/config.json values."""
+    from mstar.model.ming_omni_flash.config import TalkerLLMConfig
+    llm = TalkerLLMConfig()
+    assert llm.vocab_size == 151936
+    assert llm.hidden_size == 896
+    assert llm.intermediate_size == 4864
+    assert llm.num_hidden_layers == 24
+    assert llm.num_attention_heads == 14
+    assert llm.num_key_value_heads == 2
+    assert llm.head_dim == 64  # 896 / 14
+    assert llm.rope_theta == 1_000_000.0
+    assert llm.tie_word_embeddings is True
+
+
+def test_talker_llm_config_from_dict_filters_unknown_keys() -> None:
+    """Released config has fields we don't model (`transformers_version` etc.)
+    — `from_dict` must silently ignore them."""
+    from mstar.model.ming_omni_flash.config import TalkerLLMConfig
+    llm = TalkerLLMConfig.from_dict({
+        "hidden_size": 1024,
+        "transformers_version": "4.43.1",
+        "_attn_implementation": "flash_attention_2",
+    })
+    assert llm.hidden_size == 1024
+    assert llm.num_hidden_layers == 24  # default preserved
+
+
+def test_dit_block_config_intermediate_size_and_head_dim() -> None:
+    from mstar.model.ming_omni_flash.config import DiTBlockConfig
+    blk = DiTBlockConfig()
+    assert blk.intermediate_size == 1024 * 4
+    assert blk.head_dim == 1024 // 16
+
+
+def test_audio_vae_config_lifts_latent_and_output_dims_from_kwargs() -> None:
+    """`enc_kwargs.latent_dim` and `dec_kwargs.output_dim` get pulled out."""
+    from mstar.model.ming_omni_flash.config import AudioVAEConfig
+    cfg = AudioVAEConfig.from_dict({
+        "sample_rate": 44100,
+        "patch_size": 4,
+        "enc_kwargs": {
+            "latent_dim": 64,
+            "input_dim": 80,
+            "hop_size": 320,
+            "backbone": {"hidden_size": 896, "num_hidden_layers": 24},
+        },
+        "dec_kwargs": {
+            "latent_dim": 64,
+            "output_dim": 882,
+            "backbone": {"hidden_size": 896, "num_hidden_layers": 24},
+        },
+        "init_method": "kaiming",
+        "lambda_mel_loss": 1.0,
+    })
+    assert cfg.sample_rate == 44100
+    assert cfg.latent_dim == 64
+    assert cfg.encoder_input_dim == 80
+    assert cfg.encoder_hop_size == 320
+    assert cfg.decoder_output_dim == 882
+    assert cfg.enc_backbone["hidden_size"] == 896
+    assert cfg.dec_backbone["num_hidden_layers"] == 24
+    assert cfg.lambda_mel_loss == 1.0
+
+
+def test_audio_vae_config_falls_back_when_enc_kwargs_missing_latent() -> None:
+    """If enc_kwargs has no latent_dim, fall back to dec_kwargs.latent_dim."""
+    from mstar.model.ming_omni_flash.config import AudioVAEConfig
+    cfg = AudioVAEConfig.from_dict({
+        "enc_kwargs": {"input_dim": 80, "hop_size": 320},
+        "dec_kwargs": {"latent_dim": 128, "output_dim": 512},
+    })
+    assert cfg.latent_dim == 128
+
+
+def test_talker_config_from_subdir_typed_subfields() -> None:
+    """from_subdir produces typed TalkerLLMConfig / DiTBlockConfig / AudioVAEConfig."""
+    from mstar.model.ming_omni_flash.config import (
+        AudioVAEConfig,
+        DiTBlockConfig,
+        TalkerConfig,
+        TalkerLLMConfig,
+    )
+    with tempfile.TemporaryDirectory() as tmp:
+        talker_dir = Path(tmp) / "talker"
+        talker_dir.mkdir()
+        # Minimal valid config.json (top-level scalars + flowmodel + aggregator).
+        (talker_dir / "config.json").write_text(json.dumps({
+            "steps": 12,
+            "patch_size": 8,
+            "history_patch_size": 64,
+            "cfg_strength": 1.5,
+            "flowmodel": {"depth": 4, "hidden_size": 512, "num_heads": 8, "dropout": 0.0},
+            "aggregator": {"depth": 4, "hidden_size": 512, "num_heads": 8, "dropout": 0.1},
+        }))
+        (talker_dir / "llm").mkdir()
+        (talker_dir / "llm" / "config.json").write_text(json.dumps({
+            "hidden_size": 512, "num_hidden_layers": 12,
+        }))
+        (talker_dir / "vae").mkdir()
+        (talker_dir / "vae" / "config.json").write_text(json.dumps({
+            "sample_rate": 22050,
+            "patch_size": 2,
+            "enc_kwargs": {"latent_dim": 32, "input_dim": 80, "hop_size": 256},
+            "dec_kwargs": {"latent_dim": 32, "output_dim": 401},
+        }))
+
+        cfg = TalkerConfig.from_subdir(talker_dir)
+        assert cfg is not None
+        assert cfg.steps == 12
+        assert isinstance(cfg.llm, TalkerLLMConfig)
+        assert cfg.llm.hidden_size == 512
+        assert isinstance(cfg.flowmodel, DiTBlockConfig)
+        assert cfg.flowmodel.depth == 4
+        assert cfg.aggregator.dropout == pytest.approx(0.1)
+        assert isinstance(cfg.vae, AudioVAEConfig)
+        assert cfg.vae.sample_rate == 22050
+        assert cfg.vae.latent_dim == 32
+        # Convenience accessor still works.
+        assert cfg.vae_sample_rate == 22050
+
+
+def test_talker_config_default_factories_yield_real_dataclasses() -> None:
+    """``TalkerConfig()`` with no args still produces typed sub-configs."""
+    from mstar.model.ming_omni_flash.config import (
+        AudioVAEConfig,
+        DiTBlockConfig,
+        TalkerConfig,
+        TalkerLLMConfig,
+    )
+    t = TalkerConfig()
+    assert isinstance(t.llm, TalkerLLMConfig)
+    assert isinstance(t.flowmodel, DiTBlockConfig)
+    assert isinstance(t.aggregator, DiTBlockConfig)
+    assert isinstance(t.vae, AudioVAEConfig)
+    assert t.vae_sample_rate == 44100   # convenience property
+
+
+# ---------------------------------------------------------------------------
+# Step 9a: ImageGen typed sub-configs (pure-Python)
+# ---------------------------------------------------------------------------
+
+
+def test_zimage_dit_config_from_dict_coerces_tuples_and_filters() -> None:
+    from mstar.model.ming_omni_flash.config import ZImageDiTConfig
+    dit = ZImageDiTConfig.from_dict({
+        "dim": 3840, "n_layers": 30, "in_channels": 16,
+        "axes_dims": [32, 48, 48], "axes_lens": [1536, 512, 512],
+        "all_patch_size": [2], "_class_name": "ZImageTransformer2DModel",
+    })
+    assert dit.dim == 3840
+    assert dit.axes_dims == (32, 48, 48)
+    assert dit.axes_lens == (1536, 512, 512)
+    assert dit.all_patch_size == (2,)
+    assert not hasattr(dit, "_class_name")
+
+
+def test_image_vae_config_defaults_and_from_dict() -> None:
+    from mstar.model.ming_omni_flash.config import ImageVAEConfig
+    vae = ImageVAEConfig.from_dict({
+        "latent_channels": 16, "scaling_factor": 0.3611, "shift_factor": 0.1159,
+        "act_fn": "silu", "ignored": 1,
+    })
+    assert vae.latent_channels == 16
+    assert vae.scaling_factor == pytest.approx(0.3611)
+    assert vae.shift_factor == pytest.approx(0.1159)
+    assert not hasattr(vae, "ignored")
+
+
+def test_imagegen_scheduler_config_from_dict() -> None:
+    from mstar.model.ming_omni_flash.config import ImageGenSchedulerConfig
+    s = ImageGenSchedulerConfig.from_dict({
+        "num_train_timesteps": 1000, "shift": 3.0, "use_dynamic_shifting": False,
+        "_class_name": "FlowMatchEulerDiscreteScheduler",
+    })
+    assert s.num_train_timesteps == 1000
+    assert s.shift == pytest.approx(3.0)
+    assert s.use_dynamic_shifting is False
+
+
+def test_byt5_mapper_config_from_nested_json() -> None:
+    from mstar.model.ming_omni_flash.config import ByT5MapperConfig
+    b = ByT5MapperConfig.from_json({
+        "byt5_mapper_type": "T5EncoderBlockByT5Mapper",
+        "byt5_mapper_config": {"num_layers": 4, "sdxl_channels": 2560},
+        "byt5_config": {"byt5_name": "google/byt5-small", "multilingual": True},
+        "byt5_max_length": 256,
+    })
+    assert b.byt5_mapper_type == "T5EncoderBlockByT5Mapper"
+    assert b.mapper_num_layers == 4
+    assert b.sdxl_channels == 2560
+    assert b.byt5_name == "google/byt5-small"
+    assert b.byt5_max_length == 256
+    assert b.multilingual is True
+
+
+def test_imagegen_config_default_factories_yield_typed_subconfigs() -> None:
+    from mstar.model.ming_omni_flash.config import (
+        ByT5MapperConfig,
+        ImageGenConfig,
+        ImageGenSchedulerConfig,
+        ImageVAEConfig,
+        ZImageDiTConfig,
+    )
+    ig = ImageGenConfig()
+    assert isinstance(ig.dit, ZImageDiTConfig)
+    assert isinstance(ig.vae, ImageVAEConfig)
+    assert isinstance(ig.scheduler, ImageGenSchedulerConfig)
+    assert isinstance(ig.byt5, ByT5MapperConfig)
+    assert ig.connector is None  # only populated by from_subdirs
+    assert ig.use_identity_mlp is True
+    assert ig.dit_type == "zimage"
+
+
+def test_imagegen_from_subdirs_returns_none_without_transformer() -> None:
+    """No transformer/ subdir → None (thinker-only / talker-only ckpt)."""
+    from mstar.model.ming_omni_flash.config import ImageGenConfig
+    with tempfile.TemporaryDirectory() as tmp:
+        assert ImageGenConfig.from_subdirs(Path(tmp)) is None
+
+
+def test_imagegen_from_subdirs_parses_synthetic_tree() -> None:
+    """from_subdirs reads each subdir's config into the typed fields."""
+    from mstar.model.ming_omni_flash.config import ImageGenConfig
+    with tempfile.TemporaryDirectory() as tmp:
+        root = Path(tmp)
+        (root / "transformer").mkdir()
+        (root / "transformer" / "config.json").write_text(json.dumps({
+            "_class_name": "ZImageTransformer2DModel",
+            "dim": 1024, "n_layers": 4, "in_channels": 16,
+            "axes_dims": [8, 12, 12], "axes_lens": [128, 64, 64],
+        }))
+        (root / "vae").mkdir()
+        (root / "vae" / "config.json").write_text(json.dumps({
+            "latent_channels": 16, "scaling_factor": 0.5, "shift_factor": 0.1,
+        }))
+        (root / "scheduler").mkdir()
+        (root / "scheduler" / "scheduler_config.json").write_text(json.dumps({
+            "num_train_timesteps": 1000, "shift": 2.5,
+        }))
+        (root / "byt5").mkdir()
+        (root / "byt5" / "byt5.json").write_text(json.dumps({
+            "byt5_mapper_config": {"num_layers": 2, "sdxl_channels": 1024},
+            "byt5_config": {"byt5_name": "google/byt5-small"},
+        }))
+        (root / "connector").mkdir()
+        (root / "connector" / "config.json").write_text(json.dumps({
+            "model_type": "qwen2", "hidden_size": 1536,
+        }))
+        (root / "mlp").mkdir()
+        (root / "mlp" / "config.json").write_text(json.dumps({
+            "img_gen_scales": [16], "diffusion_c_input_dim": 2560,
+            "use_identity_mlp": True, "dit_type": "zimage",
+        }))
+
+        ig = ImageGenConfig.from_subdirs(root)
+        assert ig is not None
+        assert ig.dit.dim == 1024
+        assert ig.dit.axes_dims == (8, 12, 12)
+        assert ig.vae.scaling_factor == pytest.approx(0.5)
+        assert ig.scheduler.shift == pytest.approx(2.5)
+        assert ig.byt5.mapper_num_layers == 2
+        assert ig.connector["hidden_size"] == 1536
+        assert ig.num_query_tokens == 256
diff --git a/test/modular/test_ming_flash_omni_encoders.py b/test/modular/test_ming_flash_omni_encoders.py
new file mode 100644
index 00000000..49367908
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_encoders.py
@@ -0,0 +1,522 @@
+"""Smoke tests for Ming-flash-omni-2.0 vision/audio encoders + projectors.
+
+Two layers of coverage:
+
+  * Pure-Python tests on the projector wrappers — shape / layer-index
+    parity with the released checkpoint's ``linear_proj.*`` and
+    ``linear_proj_audio.*`` weight keys. Run on CPU, no snapshot needed.
+
+  * Snapshot-gated tests on the vision encoder factory — construct from
+    the real ``VisionEncoderConfig`` and run a tiny forward. Skip when
+    no Ming snapshot or Ming source repo is available.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.projectors import (
+    MingAudioProjector,
+    MingVisionProjector,
+)
+
+# ---------------------------------------------------------------------------
+# Snapshot / Ming source discovery (mirrors test_ming_flash_omni_config.py)
+# ---------------------------------------------------------------------------
+
+
+def _find_local_snapshot() -> str | None:
+    """Locate a Ming-flash-omni-2.0 snapshot dir with shards reachable.
+
+    We need the shards (``model-00001-of-00042.safetensors`` etc.) to
+    live next to the index — the HF-Hub snapshot dir only carries the
+    index json symlink, with shards pulled out separately on this box.
+    Check the env override first, then ``/dev/shm/ming-hybrid`` (the
+    local merged layout this dev machine uses), then the HF cache hubs.
+    """
+    def _has_shards(path: Path) -> bool:
+        return (
+            (path / "config.json").exists()
+            and (path / "model.safetensors.index.json").exists()
+            and (path / "model-00001-of-00042.safetensors").exists()
+        )
+
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and _has_shards(Path(override)):
+        return override
+
+    # The dev box's merged layout: shards + index colocate in /dev/shm.
+    hybrid = Path("/dev/shm/ming-hybrid")
+    if _has_shards(hybrid):
+        return str(hybrid)
+
+    # Fall back to the HF cache hub layout — accept it only if the
+    # snapshot dir also has the shards (not just the index symlink).
+    hub_roots = [
+        Path.home() / ".cache" / "huggingface" / "hub",
+        Path("/dev/shm/hf-cache"),
+    ]
+    repo_dirs = [
+        "models--inclusionAI--Ming-flash-omni-2.0",
+        "models--Jonathan1909--Ming-flash-omni-2.0",
+    ]
+    for hub_root in hub_roots:
+        for repo in repo_dirs:
+            snap_root = hub_root / repo / "snapshots"
+            if not snap_root.exists():
+                continue
+            for snap in sorted(snap_root.iterdir()):
+                if _has_shards(snap):
+                    return str(snap)
+    return None
+
+
+def _find_ming_code_dir() -> str | None:
+    """Mirror MingFlashOmniModel._find_ming_code_dir's search order."""
+    env = os.environ.get("MING_CODE_DIR")
+    if env and (Path(env) / "qwen3_moe_vit.py").exists():
+        return env
+    for candidate in (Path("./Ming"), Path("/tmp/ming_repo")):
+        if (candidate / "qwen3_moe_vit.py").exists():
+            return str(candidate)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# MingVisionProjector — pure Python
+# ---------------------------------------------------------------------------
+
+
+def test_vision_projector_default_depth_2_layer_indices() -> None:
+    """``linear_proj.0`` -> first Linear; ``linear_proj.2`` -> second Linear.
+
+    The released ckpt has ``mlp_depth=2`` so the projector is
+    Linear → GELU → Linear, and the weight loader keys hit indices 0 and 2.
+    """
+    p = MingVisionProjector(vision_dim=4096, llm_dim=4096, mlp_depth=2)
+    assert isinstance(p.proj[0], torch.nn.Linear)
+    assert isinstance(p.proj[1], torch.nn.GELU)
+    assert isinstance(p.proj[2], torch.nn.Linear)
+    assert p.proj[0].weight.shape == (4096, 4096)
+    assert p.proj[2].weight.shape == (4096, 4096)
+
+
+def test_vision_projector_depth_1_single_linear() -> None:
+    p = MingVisionProjector(vision_dim=4096, llm_dim=2048, mlp_depth=1)
+    assert len(p.proj) == 1
+    assert isinstance(p.proj[0], torch.nn.Linear)
+    assert p.proj[0].weight.shape == (2048, 4096)
+
+
+def test_vision_projector_rejects_depth_zero() -> None:
+    with pytest.raises(ValueError, match="mlp_depth must be >= 1"):
+        MingVisionProjector(vision_dim=4096, llm_dim=4096, mlp_depth=0)
+
+
+def test_vision_projector_forward_shape() -> None:
+    p = MingVisionProjector(vision_dim=8, llm_dim=16, mlp_depth=2)
+    x = torch.randn(5, 8)
+    out = p(x)
+    assert out.shape == (5, 16)
+    assert torch.isfinite(out).all()
+
+
+def test_vision_projector_forward_shape_batched() -> None:
+    p = MingVisionProjector(vision_dim=8, llm_dim=16, mlp_depth=2)
+    x = torch.randn(2, 5, 8)
+    out = p(x)
+    assert out.shape == (2, 5, 16)
+
+
+def test_vision_projector_checkpoint_keys_loadable() -> None:
+    """``linear_proj.0.weight`` style keys load via load_state_dict.
+
+    The Ming checkpoint stores the projector weights as flat
+    ``linear_proj.<idx>.weight`` / ``.bias`` — we expose the same
+    structure under our own ``proj.<idx>.<param>`` namespace, so the
+    upstream key prefix needs trimming. Verify the trim is sufficient.
+    """
+    p = MingVisionProjector(vision_dim=8, llm_dim=16, mlp_depth=2)
+    # Simulate the checkpoint state-dict shape (already trimmed of
+    # the "linear_proj." outer prefix by the caller).
+    fake_state = {
+        "proj.0.weight": torch.randn(16, 8),
+        "proj.0.bias": torch.randn(16),
+        "proj.2.weight": torch.randn(16, 16),
+        "proj.2.bias": torch.randn(16),
+    }
+    missing, unexpected = p.load_state_dict(fake_state)
+    assert not missing
+    assert not unexpected
+
+
+# ---------------------------------------------------------------------------
+# MingAudioProjector — pure Python
+# ---------------------------------------------------------------------------
+
+
+def test_audio_projector_default_depth_2_layer_indices() -> None:
+    """``linear_proj_audio.0`` -> Conv1d; ``linear_proj_audio.3`` -> Linear.
+
+    Layer order on disk: Conv1d (0), Transpose (1, no params), GELU (2,
+    no params), Linear (3), Transpose (4, no params). Indices 0 and 3
+    are the only ones with params.
+    """
+    p = MingAudioProjector(audio_dim=1280, llm_dim=4096, ds_kernel_size=3, ds_stride=2, mlp_depth=2)
+    assert isinstance(p.proj[0], torch.nn.Conv1d)
+    assert isinstance(p.proj[3], torch.nn.Linear)
+    assert p.proj[0].weight.shape == (4096, 1280, 3)
+    assert p.proj[3].weight.shape == (4096, 4096)
+
+
+def test_audio_projector_depth_1_no_mlp() -> None:
+    """depth=1 yields Conv1d + 2 transposes; no MLP. Only the Conv1d has params."""
+    p = MingAudioProjector(audio_dim=8, llm_dim=16, mlp_depth=1)
+    # Layers: Conv1d(0), Transpose(1), Transpose(2).
+    assert len(p.proj) == 3
+    assert isinstance(p.proj[0], torch.nn.Conv1d)
+
+
+def test_audio_projector_rejects_depth_zero() -> None:
+    with pytest.raises(ValueError, match="mlp_depth must be >= 1"):
+        MingAudioProjector(audio_dim=8, llm_dim=16, mlp_depth=0)
+
+
+def test_audio_projector_forward_shape() -> None:
+    """Output is (B, llm_dim, T'), T' being the projector conv's output length."""
+    p = MingAudioProjector(audio_dim=8, llm_dim=16, ds_kernel_size=3, ds_stride=2, mlp_depth=2)
+    # 11-frame input fed straight to the projector as (B, T, audio_dim).
+    # The Whisper stem is not part of this module, so only the projector
+    # conv (kernel=3, stride=2, 2*1 padding term) shortens the sequence:
+    # T' = (11 - 3 + 2)//2 + 1 = 6.
+    x = torch.randn(2, 11, 8)
+    out = p(x)
+    assert out.shape == (2, 16, 6)
+    assert torch.isfinite(out).all()
+
+
+def test_audio_projector_compute_output_length_matches_two_conv_chain() -> None:
+    """Length math composes the Whisper stem with the projector conv."""
+    p = MingAudioProjector(audio_dim=8, llm_dim=16, ds_kernel_size=3, ds_stride=2)
+    # Whisper stem: (23-3+2*1)//2+1 = 22//2+1 = 12.
+    # Projector conv: (12-3+2*1)//2+1 = 11//2+1 = 6.
+    assert p.compute_output_length(torch.tensor([23])).tolist() == [6]
+
+
+# ---------------------------------------------------------------------------
+# Vision encoder — snapshot-gated
+# ---------------------------------------------------------------------------
+
+
+def _try_load_snapshot_and_code() -> tuple[str, str] | None:
+    snap = _find_local_snapshot()
+    if snap is None:
+        return None
+    code_dir = _find_ming_code_dir()
+    if code_dir is None:
+        return None
+    return snap, code_dir
+
+
+@pytest.fixture(scope="module")
+def staged_snapshot() -> tuple[str, str]:
+    """Skip if no snapshot or no Ming source repo is available.
+
+    Side effect: stages the Ming source files into the snapshot dir
+    (the same thing MingFlashOmniModel.__init__ does), so the dynamic
+    import inside build_vision_encoder resolves.
+    """
+    pair = _try_load_snapshot_and_code()
+    if pair is None:
+        pytest.skip(
+            "Need both a Ming-flash-omni-2.0 snapshot and a Ming source repo. "
+            "Set MING_FLASH_OMNI_DIR + MING_CODE_DIR."
+        )
+    snap, code_dir = pair
+    from mstar.model.ming_omni_flash.ming_omni_flash_model import _prepare_tokenizer_dir
+    _prepare_tokenizer_dir(snap, code_dir)
+    return snap, code_dir
+
+
+def test_vision_encoder_builds_from_config(staged_snapshot: tuple[str, str]) -> None:
+    """``build_vision_encoder`` returns a module with the expected dims.
+
+    Tiny config (depth=2) to keep the test fast; otherwise the full
+    27-layer encoder takes a few seconds to instantiate.
+    """
+    from mstar.model.ming_omni_flash.components.vision_encoder import build_vision_encoder
+    from mstar.model.ming_omni_flash.config import VisionEncoderConfig
+
+    snap, _ = staged_snapshot
+    cfg = VisionEncoderConfig(depth=2)  # rest default to the released ckpt's values
+    enc = build_vision_encoder(
+        config=cfg,
+        dtype=torch.float32,  # avoid bf16-on-CPU complaints
+        device="cpu",
+        local_dir=snap,
+        attn_implementation="eager",  # don't require FA2 on CPU
+    )
+    # Spot-check structural attributes that downstream code reads.
+    assert enc.image_emb_dim == cfg.out_hidden_size
+    assert enc.spatial_merge_size == cfg.spatial_merge_size
+    assert len(enc.blocks) == cfg.depth
+    assert enc.patch_embed.in_channels == cfg.in_channels
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="needs CUDA")
+def test_vision_encoder_forward_runs_smoke(staged_snapshot: tuple[str, str]) -> None:
+    """Construct a tiny encoder, run a single grid_thw=(1,2,2) image.
+
+    Uses the eager attention path so it runs without flash-attn installed.
+    """
+    from mstar.model.ming_omni_flash.components.vision_encoder import build_vision_encoder
+    from mstar.model.ming_omni_flash.config import VisionEncoderConfig
+
+    snap, _ = staged_snapshot
+    cfg = VisionEncoderConfig(depth=2)
+    enc = build_vision_encoder(
+        config=cfg,
+        dtype=torch.float32,
+        device="cuda",
+        local_dir=snap,
+        attn_implementation="eager",
+    )
+    # 1 image, grid (1 temporal, 2x2 spatial), patch_size=16, temporal_patch=2.
+    # Per Qwen3VLMoeVisionPatchEmbed: in_dim = patch_size**2 * temporal_patch * in_channels.
+    patch_in = cfg.patch_size * cfg.patch_size * cfg.temporal_patch_size * cfg.in_channels
+    n_patches = 1 * 2 * 2  # T*H*W
+    pixels = torch.randn(n_patches, patch_in, device="cuda")
+    grid_thw = torch.tensor([[1, 2, 2]], device="cuda")
+    try:
+        with torch.no_grad():
+            out = enc(pixels, grid_thw=grid_thw)
+    except RuntimeError as e:
+        # The upstream encoder uses inductor-compiled reductions which need
+        # nvrtc + libnvrtc-builtins matching the installed CUDA toolkit. On
+        # boxes where the toolkit/torch versions are mismatched, the kernel
+        # build fails with "failed to open libnvrtc-builtins.so.*". Skip
+        # cleanly so the rest of this file keeps green on under-provisioned
+        # test boxes — the forward-correctness path will be re-verified by
+        # the snapshot smoke once step 5 wires it into the prefill walk.
+        if "nvrtc" in str(e):  # substring also matches "libnvrtc" messages
+            pytest.skip(f"nvrtc / CUDA toolkit unavailable on this box: {e}")
+        raise
+    # After spatial_merge_size=2 merge: 4 / 2**2 = 1 token per image, out_hidden_size dim.
+    assert out.shape == (1, cfg.out_hidden_size)
+    assert torch.isfinite(out).all()
+
+
+# ---------------------------------------------------------------------------
+# MingAudioEncoder — pure Python (no snapshot needed; weights are random)
+# ---------------------------------------------------------------------------
+
+
+def test_audio_encoder_constructs_with_defaults() -> None:
+    """Default kwargs match the released ckpt's whisper_encoder_config."""
+    from mstar.model.ming_omni_flash.components.audio_encoder import MingAudioEncoder
+
+    enc = MingAudioEncoder()  # defaults: n_mels=128, n_ctx=15000, n_state=1280, n_head=20, n_layer=32
+    assert enc.audio_emb_dim == 1280
+    assert len(enc.blocks) == 32
+    assert enc.positional_embedding.shape == (15000, 1280)
+
+
+def test_audio_encoder_constructs_with_overrides() -> None:
+    from mstar.model.ming_omni_flash.components.audio_encoder import MingAudioEncoder
+
+    enc = MingAudioEncoder(n_mels=80, n_ctx=500, n_state=64, n_head=4, n_layer=2)
+    assert enc.audio_emb_dim == 64
+    assert len(enc.blocks) == 2
+    assert enc.positional_embedding.shape == (500, 64)
+
+
+def test_audio_encoder_weight_keys_match_whisper_layout() -> None:
+    """Param names follow OpenAI Whisper's convention (query/key/value/out, mlp.0/.2).
+
+    The released Ming ckpt stores audio weights under the ``audio.*``
+    top-level prefix; loader strips that prefix and load_state_dict
+    must find the rest. Spot-check a representative set of keys.
+    """
+    from mstar.model.ming_omni_flash.components.audio_encoder import MingAudioEncoder
+
+    enc = MingAudioEncoder(n_mels=8, n_ctx=64, n_state=16, n_head=2, n_layer=2)
+    keys = set(dict(enc.named_parameters()).keys())
+    expected = {
+        "conv1.weight", "conv1.bias",
+        "conv2.weight", "conv2.bias",
+        "blocks.0.attn.query.weight", "blocks.0.attn.query.bias",
+        "blocks.0.attn.key.weight",          # key has bias=False
+        "blocks.0.attn.value.weight", "blocks.0.attn.value.bias",
+        "blocks.0.attn.out.weight",   "blocks.0.attn.out.bias",
+        "blocks.0.attn_ln.weight",    "blocks.0.attn_ln.bias",
+        "blocks.0.mlp.0.weight",      "blocks.0.mlp.0.bias",
+        "blocks.0.mlp.2.weight",      "blocks.0.mlp.2.bias",
+        "blocks.0.mlp_ln.weight",     "blocks.0.mlp_ln.bias",
+        "ln_post.weight",             "ln_post.bias",
+    }
+    missing = expected - keys
+    assert not missing, f"Missing expected weight keys: {sorted(missing)}"
+    # `key.bias` should NOT exist (Whisper convention).
+    assert "blocks.0.attn.key.bias" not in keys
+
+
+def test_audio_encoder_forward_packed_shape_no_flash_attn() -> None:
+    """Run a tiny encoder on CPU without flash-attn.
+
+    Verifies the packed-attention fallback produces the right shapes:
+      input:  list of (n_mels, T_i) for i in {0..N-1}
+      output: (sum_i conv2(conv1(T_i)), n_state)
+    The conv1 stride=1 + conv2 stride=2 reduce each T_i to ``(T_i + 1) // 2``
+    when pad=1, kernel=3, stride=(1,2).
+    """
+    from mstar.model.ming_omni_flash.components.audio_encoder import MingAudioEncoder
+
+    torch.manual_seed(0)
+    enc = MingAudioEncoder(n_mels=8, n_ctx=64, n_state=16, n_head=2, n_layer=2, use_flash_attn=False)
+    enc = enc.float()  # default Whisper inits in fp32 on CPU
+    x_list = [torch.randn(8, 10), torch.randn(8, 16), torch.randn(8, 6)]
+    out, cu_seqlens = enc(x_list)
+    # Per-clip encoded length: conv1(stride=1, pad=1, kernel=3) preserves T,
+    # then conv2(stride=2, pad=1, kernel=3) halves T → T'_i = (T_i + 1) // 2.
+    expected_lens = [(t.shape[1] + 1) // 2 for t in x_list]
+    assert out.shape == (sum(expected_lens), 16)
+    assert cu_seqlens.tolist() == [0, *torch.tensor(expected_lens).cumsum(0).tolist()]
+    assert torch.isfinite(out).all()
+
+
+def test_audio_encoder_build_from_config() -> None:
+    """``build_audio_encoder`` reads dims off AudioEncoderConfig.
+
+    Doesn't need the snapshot — AudioEncoderConfig() default factory
+    populates ``whisper_encoder_config`` with the released ckpt's values.
+    """
+    from mstar.model.ming_omni_flash.components.audio_encoder import build_audio_encoder
+    from mstar.model.ming_omni_flash.config import AudioEncoderConfig
+
+    cfg = AudioEncoderConfig()
+    enc = build_audio_encoder(cfg, dtype=torch.float32, device="cpu", use_flash_attn=False)
+    assert enc.audio_emb_dim == cfg.d_model
+    assert len(enc.blocks) == cfg.encoder_layers
+
+
+# ---------------------------------------------------------------------------
+# Snapshot-gated weight loaders (step 4b)
+# ---------------------------------------------------------------------------
+#
+# These exercise the prefix-strip + state_dict path against the real
+# released checkpoint. They're skipped when no snapshot is available.
+
+
+def _require_snapshot() -> str:
+    snap = _find_local_snapshot()
+    if snap is None:
+        pytest.skip("Need a Ming-flash-omni-2.0 snapshot. Set MING_FLASH_OMNI_DIR.")
+    return snap
+
+
+def test_load_vision_projector_weights_from_snapshot() -> None:
+    """``linear_proj.*`` keys load cleanly into MingVisionProjector(mlp_depth=2)."""
+    from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+    from mstar.model.ming_omni_flash.loader import load_vision_projector_weights
+
+    snap = _require_snapshot()
+    cfg = MingFlashOmniModelConfig.from_pretrained(snap)
+    proj = MingVisionProjector(
+        vision_dim=cfg.vision.out_hidden_size,
+        llm_dim=cfg.thinker_llm.hidden_size,
+        mlp_depth=cfg.mlp_depth,
+    )
+    proj = proj.float()
+    loaded = load_vision_projector_weights(proj, snap, device="cpu", strict=True)
+    # Two Linear blocks × {weight, bias} = 4 keys total at mlp_depth=2.
+    assert loaded == {"proj.0.weight", "proj.0.bias", "proj.2.weight", "proj.2.bias"}
+    # Sanity-check that the loaded weight is non-zero (a fresh nn.Linear
+    # would be too, but we want to know the param actually got overwritten).
+    assert (proj.proj[0].weight.abs().sum() > 0).item()
+
+
+def test_load_audio_projector_weights_from_snapshot() -> None:
+    """``linear_proj_audio.*`` keys load cleanly into MingAudioProjector(mlp_depth=2)."""
+    from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+    from mstar.model.ming_omni_flash.loader import load_audio_projector_weights
+
+    snap = _require_snapshot()
+    cfg = MingFlashOmniModelConfig.from_pretrained(snap)
+    proj = MingAudioProjector(
+        audio_dim=cfg.audio_encoder.d_model,
+        llm_dim=cfg.thinker_llm.hidden_size,
+        ds_kernel_size=cfg.audio_encoder.ds_kernel_size,
+        ds_stride=cfg.audio_encoder.ds_stride,
+        mlp_depth=cfg.mlp_depth,
+    )
+    proj = proj.float()
+    loaded = load_audio_projector_weights(proj, snap, device="cpu", strict=True)
+    # Conv1d + Linear × {weight, bias} = 4 keys total at mlp_depth=2.
+    assert loaded == {"proj.0.weight", "proj.0.bias", "proj.3.weight", "proj.3.bias"}
+
+
+def test_load_audio_encoder_weights_from_snapshot() -> None:
+    """``audio.*`` keys load cleanly into MingAudioEncoder.
+
+    Snapshot is bf16 and the encoder is built in bf16 as well, so the
+    checkpoint tensors and the module parameters agree on dtype at load.
+    """
+    from mstar.model.ming_omni_flash.components.audio_encoder import build_audio_encoder
+    from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+    from mstar.model.ming_omni_flash.loader import load_audio_encoder_weights
+
+    snap = _require_snapshot()
+    cfg = MingFlashOmniModelConfig.from_pretrained(snap)
+    # Full 32-layer encoder is ~5 GB at fp32; bf16 keeps it under 3 GB
+    # and still loads cleanly because both ckpt + module agree on dtype.
+    enc = build_audio_encoder(
+        cfg.audio_encoder, dtype=torch.bfloat16, device="cpu", use_flash_attn=False,
+    )
+    loaded = load_audio_encoder_weights(enc, snap, device="cpu", strict=True)
+    # 32 layers × (4 attn linears: query/key/value/out, 1 with bias=False
+    # so 7 attn params; + 2 LN × 2 + 2 mlp Linear × 2) = lots; just spot-check
+    # representative keys made it in.
+    assert "blocks.0.attn.query.weight" in loaded
+    assert "blocks.0.attn.key.weight" in loaded
+    assert "blocks.31.mlp.2.bias" in loaded
+    assert "ln_post.weight" in loaded
+    # Released ckpt ships its own (trained) positional_embedding that
+    # overrides the sinusoidal init — confirm it's loaded as a buffer.
+    assert "positional_embedding" in loaded
+    assert enc.positional_embedding.shape == (15000, cfg.audio_encoder.d_model)
+
+
+@pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="needs CUDA + Ming source modules to instantiate vision encoder",
+)
+def test_load_vision_encoder_weights_from_snapshot(staged_snapshot: tuple[str, str]) -> None:
+    """``vision.*`` keys load cleanly into the Ming Qwen3MoeVisionTransformer.
+
+    Full vision encoder is 27 layers; instantiating it bf16 takes a couple
+    of seconds. CUDA-gated because Whisper's autograd-free Conv1d still
+    pulls in CUDA contexts in the upstream encoder module (constructor
+    calls .to()).
+    """
+    from mstar.model.ming_omni_flash.components.vision_encoder import build_vision_encoder
+    from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+    from mstar.model.ming_omni_flash.loader import load_vision_encoder_weights
+
+    snap, _ = staged_snapshot
+    cfg = MingFlashOmniModelConfig.from_pretrained(snap)
+    enc = build_vision_encoder(
+        config=cfg.vision,
+        dtype=torch.bfloat16,
+        device="cpu",
+        local_dir=snap,
+        attn_implementation="eager",
+    )
+    loaded = load_vision_encoder_weights(enc, snap, device="cpu", strict=True)
+    assert "blocks.0.attn.qkv.weight" in loaded
+    assert "blocks.0.mlp.linear_fc1.weight" in loaded
+    assert "merger.linear_fc1.weight" in loaded
+    assert f"blocks.{cfg.vision.depth - 1}.norm2.weight" in loaded
diff --git a/test/modular/test_ming_flash_omni_graph.py b/test/modular/test_ming_flash_omni_graph.py
new file mode 100644
index 00000000..6e044a08
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_graph.py
@@ -0,0 +1,532 @@
+"""Tests for the multimodal graph + scheduling wiring (step 5c).
+
+Covers ``get_graph_walk_graphs``, ``get_partitions``, the
+prefill-schedule helpers, ``get_initial_forward_pass_args`` and
+``get_partition_forward_pass_args`` — all routed by
+``input_modalities`` instead of the text-only ``prefill``/``decode``
+walks from step 3f.
+
+These tests build a bare ``MingFlashOmniModel`` via ``__new__`` so
+they exercise the routing/scheduling code paths without loading the
+~238 GB ckpt. Snapshot-gated end-to-end serve verification is a
+separate task (the 4-GPU dev box can't fit the full TP=8 model).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from mstar.conductor.request_info import CurrentForwardConductorMetadata
+from mstar.graph.base import GraphNode, Loop, Sequential
+from mstar.graph.special_destinations import EMIT_TO_CLIENT
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    MingFlashOmniModelConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+# ---------------------------------------------------------------------------
+# Tiny model instance (no weights, no tokenizer)
+# ---------------------------------------------------------------------------
+
+
+def _bare_model() -> MingFlashOmniModel:
+    """Build a weightless ``MingFlashOmniModel`` for routing/scheduling tests.
+
+    The instance is allocated with ``__new__`` so ``__init__`` never runs;
+    only ``config`` (default sub-configs, ``mlp_depth=2``, talker/image_gen
+    left at their defaults) and an empty ``_submodule_cache`` are attached.
+    """
+    config_kwargs = {
+        "local_dir": "",
+        "mlp_depth": 2,
+        "thinker_llm": ThinkerLLMConfig(),
+        "vision": VisionEncoderConfig(),
+        "audio_encoder": AudioEncoderConfig(),
+    }
+    model = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    model.config, model._submodule_cache = MingFlashOmniModelConfig(**config_kwargs), {}
+    return model
+
+
+# TensorPointerInfo stand-in. Per the tests' own rationale the scheduling code
+# only checks presence (lengths, per-step dicts), so a tagged object suffices.
+class _StubTI:
+    """Plain object identified by a debug ``tag``."""
+    def __init__(self, tag: str) -> None:
+        self.tag = tag
+
+    def __repr__(self) -> str:
+        return "<TI {}>".format(self.tag)
+
+
+# ---------------------------------------------------------------------------
+# get_graph_walk_graphs / get_partitions
+# ---------------------------------------------------------------------------
+
+
+def test_graph_walk_graphs_emits_five_walks() -> None:
+    """The bare model exposes exactly the five Thinker walks, nothing else."""
+    expected_walks = {
+        "prefill_text", "prefill_audio", "prefill_vision", "prefill_video",
+        "thinker_decode",
+    }
+    walk_graphs = _bare_model().get_graph_walk_graphs()
+    assert set(walk_graphs.keys()) == expected_walks
+
+
+def test_prefill_text_walk_is_single_thinker_node() -> None:
+    """``prefill_text`` is one Thinker GraphNode emitting a persisted text token."""
+    thinker = _bare_model().get_graph_walk_graphs()["prefill_text"]
+    assert isinstance(thinker, GraphNode)
+    assert thinker.name == "Thinker"
+    assert set(thinker.input_names) == {"text_inputs"}
+    # Exactly one outgoing edge; inspect it through a single local.
+    assert len(thinker.outputs) == 1
+    emit_edge = thinker.outputs[0]
+    assert emit_edge.next_node == EMIT_TO_CLIENT
+    assert emit_edge.name == "new_token"
+    assert emit_edge.output_modality == "text"
+    assert emit_edge.persist is True
+
+
+def test_prefill_audio_walk_routes_encoder_then_thinker() -> None:
+    """``prefill_audio`` is Sequential[audio_encoder, Thinker] linked by ``audio_embeds``."""
+    audio_walk = _bare_model().get_graph_walk_graphs()["prefill_audio"]
+    assert isinstance(audio_walk, Sequential)
+    assert len(audio_walk.sections) == 2
+    audio_node, thinker_node = audio_walk.sections
+    assert audio_node.name == "audio_encoder"
+    assert set(audio_node.input_names) == {"audio_features", "audio_seqlens"}
+    # The encoder's single output edge points at the Thinker.
+    assert len(audio_node.outputs) == 1
+    embeds_edge = audio_node.outputs[0]
+    assert (embeds_edge.next_node, embeds_edge.name) == ("Thinker", "audio_embeds")
+    # On the Thinker side that edge is the only declared input.
+    assert thinker_node.name == "Thinker"
+    assert set(thinker_node.input_names) == {"audio_embeds"}
+
+
+def test_prefill_vision_walk_threads_grid_to_thinker() -> None:
+    """``prefill_vision`` runs vision_encoder first; the Thinker also lists the grid."""
+    vision_walk = _bare_model().get_graph_walk_graphs()["prefill_vision"]
+    assert isinstance(vision_walk, Sequential)
+    # Unpacking doubles as a check that the walk has exactly two sections.
+    vision_node, thinker_node = vision_walk.sections
+    assert vision_node.name == "vision_encoder"
+    assert set(vision_node.input_names) == {"pixel_values", "image_grid_thw"}
+    assert thinker_node.name == "Thinker"
+    # Besides the embeddings, the Thinker takes image_grid_thw (the original
+    # note attributes this to the 3D MRoPE math).
+    for required_input in ("vision_embeds", "image_grid_thw"):
+        assert required_input in thinker_node.input_names
+
+
+def test_prefill_video_walk_adds_video_second_per_grid() -> None:
+    """Video prefill uses vision_encoder; Thinker lists ``video_second_per_grid`` as input."""
+    video_walk = _bare_model().get_graph_walk_graphs()["prefill_video"]
+    assert isinstance(video_walk, Sequential)
+    # Two-way unpacking fails loudly if the walk ever grows or shrinks.
+    vision_node, thinker_node = video_walk.sections
+    assert vision_node.name == "vision_encoder"
+    assert "video_second_per_grid" in thinker_node.input_names
+
+
+def test_thinker_decode_is_loop() -> None:
+    """``thinker_decode`` is a Loop over the Thinker with one ``text_inputs`` self-edge."""
+    decode_walk = _bare_model().get_graph_walk_graphs()["thinker_decode"]
+    assert isinstance(decode_walk, Loop)
+    body = decode_walk.section
+    assert body.name == "Thinker"
+    # Self-edges carry the previous token back into the next iteration.
+    self_edges = list(filter(lambda edge: edge.next_node == "Thinker", body.outputs))
+    assert len(self_edges) == 1
+    assert self_edges[0].name == "text_inputs"
+
+
+def test_get_partitions_lists_all_five_walks() -> None:
+    """The bare model has a single Thinker partition owning all five walks."""
+    partitions = _bare_model().get_partitions()
+    assert len(partitions) == 1
+    thinker_partition = partitions[0]
+    assert thinker_partition.name == "Thinker"
+    # The partition's initial walk is the text prefill.
+    assert thinker_partition.initial_walk == "prefill_text"
+    assert thinker_partition.graph_walks == {
+        "prefill_text", "prefill_audio", "prefill_vision", "prefill_video",
+        "thinker_decode",
+    }
+
+
+# ---------------------------------------------------------------------------
+# _build_thinker_prefill_schedule
+# ---------------------------------------------------------------------------
+
+
+def test_build_schedule_text_only() -> None:
+    """A lone text input yields one ``prefill_text`` step holding that tensor info."""
+    text_stub = _StubTI("text")
+    signals = {"text_inputs": [text_stub]}
+    schedule = _bare_model()._build_thinker_prefill_schedule(
+        input_modalities=["text"], input_signals=signals,
+    )
+    assert schedule == [("prefill_text", {"text_inputs": text_stub})]
+
+
+def test_build_schedule_text_then_audio_then_image() -> None:
+    """Steps come out in ``input_modalities`` order, each with its companion signals."""
+    stub_tags = {
+        "text_inputs": "t0",
+        "audio_features": "a0",
+        "audio_seqlens": "aseq0",
+        "pixel_values": "p0",
+        "image_grid_thw": "g0",
+    }
+    # One single-element list per signal name, same insertion order as above.
+    signals = {name: [_StubTI(tag)] for name, tag in stub_tags.items()}
+    schedule = _bare_model()._build_thinker_prefill_schedule(
+        input_modalities=["text", "audio", "image"],
+        input_signals=signals,
+    )
+    walk_names = [walk for walk, _inputs in schedule]
+    assert walk_names == ["prefill_text", "prefill_audio", "prefill_vision"]
+    # The audio step (index 1) holds the very same seqlens stub object.
+    assert schedule[1][1]["audio_seqlens"] is signals["audio_seqlens"][0]
+    # Likewise the image step (index 2) holds the grid stub.
+    assert schedule[2][1]["image_grid_thw"] is signals["image_grid_thw"][0]
+
+
+def test_build_schedule_video_carries_second_per_grid() -> None:
+    """Video signals land under the vision key names; second-per-grid rides along."""
+    frames, grid, spg = _StubTI("v0"), _StubTI("vg0"), _StubTI("vspg0")
+    schedule = _bare_model()._build_thinker_prefill_schedule(
+        input_modalities=["video"],
+        input_signals={
+            "pixel_values_videos": [frames], "video_grid_thw": [grid],
+            "video_second_per_grid": [spg],
+        },
+    )
+    assert schedule[0][0] == "prefill_video"
+    step_inputs = schedule[0][1]
+    assert step_inputs["pixel_values"] is frames
+    assert step_inputs["image_grid_thw"] is grid
+    assert step_inputs["video_second_per_grid"] is spg
+
+
+def test_build_schedule_skips_modalities_without_inputs() -> None:
+    """Declaring ``audio`` without any audio signals produces no prefill step."""
+    no_signals: dict[str, Any] = {}
+    schedule = _bare_model()._build_thinker_prefill_schedule(
+        input_modalities=["audio"], input_signals=no_signals,
+    )
+    assert schedule == []
+
+
+def test_build_schedule_unknown_modality_silently_ignored() -> None:
+    """An unrecognised modality name is dropped rather than raising."""
+    bogus_modalities = ["holographic"]
+    schedule = _bare_model()._build_thinker_prefill_schedule(
+        input_modalities=bogus_modalities, input_signals={},
+    )
+    assert schedule == []
+
+
+# ---------------------------------------------------------------------------
+# _get_thinker_prefill_inputs
+# ---------------------------------------------------------------------------
+
+
+def _make_metadata(schedule: list[tuple[str, dict[str, Any]]], step: int = 0):
+    """Return prefill metadata positioned at ``schedule[step]`` (text output only)."""
+    current_walk = schedule[step][0]
+    schedule_state = {"prefill_schedule": schedule, "prefill_step": step}
+    return CurrentForwardConductorMetadata(
+        input_modalities=[], output_modalities=["text"],
+        graph_walk=current_walk, is_prefill=True, kwargs=schedule_state,
+    )
+
+
+def test_prefill_inputs_text_routes_only_to_thinker() -> None:
+    """A text step yields a single Thinker edge whose tensor_info wraps the stub."""
+    text_stub = _StubTI("text")
+    metadata = _make_metadata([("prefill_text", {"text_inputs": text_stub})])
+    edges = _bare_model()._get_thinker_prefill_inputs(metadata, {"text_inputs": [text_stub]})
+    assert len(edges) == 1
+    only_edge = edges[0]
+    assert (only_edge.next_node, only_edge.name) == ("Thinker", "text_inputs")
+    assert only_edge.tensor_info == [text_stub]
+
+
+def test_prefill_inputs_audio_routes_to_audio_encoder() -> None:
+    """Both audio signals of a ``prefill_audio`` step are addressed to audio_encoder."""
+    step_inputs = {
+        "audio_features": _StubTI("af"),
+        "audio_seqlens": _StubTI("aseq"),
+    }
+    metadata = _make_metadata([("prefill_audio", step_inputs)])
+    # No raw signals are passed; the schedule step is the only source here.
+    edges = _bare_model()._get_thinker_prefill_inputs(metadata, {})
+    routed = sorted((edge.next_node, edge.name) for edge in edges)
+    # Each signal must show up as an (audio_encoder, <signal name>) pair.
+    for signal_name in ("audio_features", "audio_seqlens"):
+        assert ("audio_encoder", signal_name) in routed
+
+
+def test_prefill_inputs_vision_dual_edges_for_grid() -> None:
+    """``image_grid_thw`` is fanned out to vision_encoder and to the Thinker.
+
+    Rationale recorded by the original author: the encoder uses the grid for
+    spatial positions on the pixel patches, and the Thinker uses it for the
+    3D MRoPE math (sentinel position layout around the vision span).
+    """
+    step_inputs = {
+        "pixel_values": _StubTI("pv"),
+        "image_grid_thw": _StubTI("grid"),
+    }
+    metadata = _make_metadata([("prefill_vision", step_inputs)])
+    edges = _bare_model()._get_thinker_prefill_inputs(metadata, {})
+    routed = sorted((edge.next_node, edge.name) for edge in edges)
+    assert ("vision_encoder", "pixel_values") in routed
+    # The grid must appear once per consumer.
+    grid_consumers = ("vision_encoder", "Thinker")
+    for consumer in grid_consumers:
+        assert (consumer, "image_grid_thw") in routed
+
+
+def test_prefill_inputs_video_routes_second_per_grid_to_thinker() -> None:
+    """In a ``prefill_video`` step, ``video_second_per_grid`` is addressed to the Thinker."""
+    step_inputs = {
+        "pixel_values": _StubTI("pv"),
+        "image_grid_thw": _StubTI("grid"),
+        "video_second_per_grid": _StubTI("spg"),
+    }
+    video_schedule = [("prefill_video", step_inputs)]
+    metadata = _make_metadata(video_schedule)
+    edges = _bare_model()._get_thinker_prefill_inputs(metadata, {})
+    # Flatten the edges to sorted (destination, signal) pairs.
+    routed = sorted((edge.next_node, edge.name) for edge in edges)
+    assert ("Thinker", "video_second_per_grid") in routed
+
+
+# ---------------------------------------------------------------------------
+# get_initial_forward_pass_args
+# ---------------------------------------------------------------------------
+
+
+def test_initial_args_text_only_starts_in_prefill_text() -> None:
+    """A text-only request opens at step 0 of a one-step schedule, already flagged last."""
+    request = {
+        "partition_name": "Thinker",
+        "input_modalities": ["text"],
+        "output_modalities": ["text"],
+        "input_signals": {"text_inputs": [_StubTI("text")]},
+    }
+    args = _bare_model().get_initial_forward_pass_args(**request)
+    metadata = args.full_metadata
+    assert metadata.graph_walk == "prefill_text"
+    assert metadata.is_prefill is True
+    assert metadata.kwargs["prefill_step"] == 0
+    assert len(metadata.kwargs["prefill_schedule"]) == 1
+    assert args.step_metadata["is_last_prefill"] is True  # one step, so first is last
+
+
+def test_initial_args_text_plus_image_orders_walks() -> None:
+    """Text + image starts in ``prefill_text`` with ``prefill_vision`` queued second."""
+    signals = {
+        "text_inputs": [_StubTI("text")],
+        "pixel_values": [_StubTI("pv")],
+        "image_grid_thw": [_StubTI("grid")],
+    }
+    args = _bare_model().get_initial_forward_pass_args(
+        partition_name="Thinker",
+        input_modalities=["text", "image"],
+        output_modalities=["text"],
+        input_signals=signals,
+    )
+    assert args.full_metadata.graph_walk == "prefill_text"
+    queued_walks = [walk for walk, _inputs in args.full_metadata.kwargs["prefill_schedule"]]
+    assert queued_walks == ["prefill_text", "prefill_vision"]
+    assert args.step_metadata["is_last_prefill"] is False  # a second step remains
+
+
+def test_initial_args_no_modalities_returns_done() -> None:
+    """With nothing to prefill the request is reported done straight away."""
+    empty_request = {
+        "partition_name": "Thinker",
+        "input_modalities": [],
+        "output_modalities": ["text"],
+        "input_signals": {},
+    }
+    initial = _bare_model().get_initial_forward_pass_args(**empty_request)
+    assert initial.request_done is True
+
+
+def test_initial_args_rejects_unknown_partition() -> None:
+    """An unrecognised partition name raises ``ValueError`` quoting that name."""
+    # "Talker" and "ImageGen" are recognized partition names, so use a made-up one.
+    model = _bare_model()
+    request = {
+        "partition_name": "NotARealPartition",
+        "input_modalities": ["text"], "output_modalities": ["text"],
+        "input_signals": {"text_inputs": [_StubTI("text")]},
+    }
+    with pytest.raises(ValueError, match="Unknown partition: 'NotARealPartition'"):
+        model.get_initial_forward_pass_args(**request)
+
+
+# ---------------------------------------------------------------------------
+# get_partition_forward_pass_args state machine
+# ---------------------------------------------------------------------------
+
+
+def test_state_machine_advances_schedule_then_decodes_then_finishes() -> None:
+    """Walk the Thinker partition through text prefill, audio prefill, decode, and done."""
+    model = _bare_model()
+
+    def advance(metadata, persist_signals):
+        """Ask the model for the next Thinker forward pass after ``metadata``."""
+        return model.get_partition_forward_pass_args(
+            partition_name="Thinker",
+            partition_metadata=metadata,
+            persist_signals=persist_signals,
+        )
+
+    # Phase 1: a text + audio request opens on the text prefill walk.
+    started = model.get_initial_forward_pass_args(
+        partition_name="Thinker",
+        input_modalities=["text", "audio"],
+        output_modalities=["text"],
+        input_signals={
+            "text_inputs": [_StubTI("text")],
+            "audio_features": [_StubTI("af")],
+            "audio_seqlens": [_StubTI("aseq")],
+        },
+    )
+    text_meta = started.full_metadata
+    assert text_meta.graph_walk == "prefill_text"
+
+    # Phase 2: the next pass moves to the audio walk, which is also the
+    # last entry of the two-step prefill schedule.
+    audio_pass = advance(text_meta, {"new_token": [_StubTI("ntok")]})
+    assert audio_pass.full_metadata.graph_walk == "prefill_audio"
+    assert audio_pass.full_metadata.is_prefill is True
+    assert audio_pass.step_metadata["is_last_prefill"] is True
+
+    # Phase 3: with the schedule used up, the partition switches to the
+    # decode loop, whose inputs include the ``text_inputs`` feedback edge.
+    decode_pass = advance(audio_pass.full_metadata, {"new_token": [_StubTI("ntok")]})
+    assert decode_pass.full_metadata.graph_walk == "thinker_decode"
+    assert decode_pass.full_metadata.is_prefill is False
+    assert any(edge.name == "text_inputs" for edge in decode_pass.inputs)
+
+    # Phase 4: called again with empty persist_signals, the model reports
+    # request_done (original note: decode loop unwound).
+    finished = advance(decode_pass.full_metadata, {})
+    assert finished.request_done is True
+
+
+# ---------------------------------------------------------------------------
+# get_worker_graphs partial-deploy skipping (regression for the live
+# bring-up KeyError: 'audio_encoder' — see model/base.py fix c06c99a)
+# ---------------------------------------------------------------------------
+
+
+def _talker_enabled_model() -> MingFlashOmniModel:
+    """Weightless model whose ``config.talker`` is populated with default sub-configs."""
+    from mstar.model.ming_omni_flash.config import (
+        AudioVAEConfig,
+        DiTBlockConfig,
+        TalkerConfig,
+        TalkerLLMConfig,
+    )
+    model = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    talker_kwargs = {
+        "llm": TalkerLLMConfig(),
+        "flowmodel": DiTBlockConfig(),
+        "aggregator": DiTBlockConfig(),
+        "vae": AudioVAEConfig(),
+    }
+    model.config = MingFlashOmniModelConfig(
+        local_dir="", mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(), vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(), talker=TalkerConfig(**talker_kwargs),
+    )
+    model._submodule_cache = {}  # start with an empty submodule cache
+    return model
+
+
+def _write_yaml(tmp_path, node_groups: list[dict]) -> str:
+    """Write a minimal deploy config to ``tmp_path / "cfg.yaml"`` and return its path."""
+    import json
+    # json.dumps output is used as a YAML flow value (YAML being a JSON superset).
+    groups_line = f"node_groups: {json.dumps(node_groups)}\n"
+    cfg_path = tmp_path / "cfg.yaml"
+    cfg_path.write_text("model: ming_flash_omni\nmax_seq_len: 4096\n" + groups_line)
+    return str(cfg_path)
+
+
+def _worker_graph_node_names(worker_graphs) -> set[str]:
+    """Return the union of node names found in every worker graph's section.
+
+    Names come from ``section.get_nodes()`` rather than ``section.name``
+    because, per the original note, a section can be a Loop wrapper (e.g.
+    ``thinker_decode_loop``) instead of a plain GraphNode.
+    """
+    per_graph_keys = (
+        worker_graph.section.get_nodes().keys() for worker_graph in worker_graphs
+    )
+    return set().union(*per_graph_keys)
+
+
+def test_get_worker_graphs_thinker_only_skips_encoder_and_talker_walks(tmp_path) -> None:
+    """Thinker-only ``node_groups`` must divide without raising ``KeyError``.
+
+    Regression for the live bring-up crash recorded by the original author
+    (``KeyError: 'audio_encoder'`` in ``_divide_into_worker_graphs`` on the
+    ``prefill_audio`` walk): walks whose nodes are not deployed get skipped.
+    """
+    model = _talker_enabled_model()  # talker configured, so its walk is emitted too
+    thinker_group = {
+        "node_names": ["Thinker"],
+        "ranks": [0, 1, 2, 3], "tp_size": 4,
+    }
+    cfg_path = _write_yaml(tmp_path, [thinker_group])
+    # Getting past this call is itself the no-KeyError check.
+    worker_graphs = model.get_worker_graphs(cfg_path)
+    deployed = _worker_graph_node_names(worker_graphs)
+    assert deployed == {"Thinker"}
+    # Implied by the equality above; kept as explicit guards for the
+    # encoder and talker nodes.
+    for skipped_node in ("audio_encoder", "vision_encoder", "Talker"):
+        assert skipped_node not in deployed
+
+
+def test_get_worker_graphs_full_omni_includes_all_nodes(tmp_path) -> None:
+    """When encoders and Talker are declared too, every node shows up after division."""
+    model = _talker_enabled_model()
+    node_groups = [
+        {"node_names": ["vision_encoder", "audio_encoder", "Talker"], "ranks": [0]},
+        {"node_names": ["Thinker"], "ranks": [0, 1, 2, 3], "tp_size": 4},
+    ]
+    cfg_path = _write_yaml(tmp_path, node_groups)
+    worker_graphs = model.get_worker_graphs(cfg_path)
+    deployed = _worker_graph_node_names(worker_graphs)
+    # Membership checks only: each declared node must appear somewhere.
+    expected_nodes = ("Thinker", "vision_encoder", "audio_encoder", "Talker")
+    for node_name in expected_nodes:
+        assert node_name in deployed
+
+
+def test_get_worker_graphs_thinker_only_no_talker_config(tmp_path) -> None:
+    """Thinker-only deploy of a talker-less config still divides, yielding only Thinker."""
+    model = _bare_model()  # built without a talker section
+    thinker_group = {"node_names": ["Thinker"], "ranks": [0, 1, 2, 3], "tp_size": 4}
+    cfg_path = _write_yaml(tmp_path, [thinker_group])
+    worker_graphs = model.get_worker_graphs(cfg_path)
+    deployed = _worker_graph_node_names(worker_graphs)
+    # Exact equality: no other node name may be present.
+    assert deployed == {"Thinker"}
diff --git a/test/modular/test_ming_flash_omni_imagegen_graph.py b/test/modular/test_ming_flash_omni_imagegen_graph.py
new file mode 100644
index 00000000..e35ac0ca
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_imagegen_graph.py
@@ -0,0 +1,211 @@
+"""Tests for the ImageGen graph walk + partition wiring (step 9b).
+
+Covers the imagegen-enabled graph topology: the ``imagegen`` walk, the ImageGen
+partition, the Thinker->ImageGen streaming connection, the STATELESS engine
+type, and the unchanged paths when no image_gen config is present.
+
+All tests build a bare MingFlashOmniModel via __new__ + injected config — no
+checkpoint load, no diffusers.
+"""
+
+from __future__ import annotations
+
+from mstar.conductor.request_info import CurrentForwardConductorMetadata
+from mstar.engine.base import EngineType
+from mstar.graph.base import GraphNode
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    ImageGenConfig,
+    MingFlashOmniModelConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+
+def _model(with_imagegen: bool) -> MingFlashOmniModel:
+    """Bare model (no ``__init__``) whose ``image_gen`` config is set only on request."""
+    # ``None`` is passed for image_gen when the caller opts out.
+    image_gen_cfg = ImageGenConfig() if with_imagegen else None
+    model = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    model.config = MingFlashOmniModelConfig(
+        local_dir="", mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(), vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(), image_gen=image_gen_cfg,
+    )
+    model._submodule_cache = {}
+    return model
+
+
+# ---------------------------------------------------------------------------
+# ImageGen absent — unchanged
+# ---------------------------------------------------------------------------
+
+
+def test_no_imagegen_walk_when_config_absent() -> None:
+    """Without an image_gen config no ``imagegen`` walk is emitted."""
+    assert "imagegen" not in _model(with_imagegen=False).get_graph_walk_graphs()
+
+
+def test_no_imagegen_partition_when_config_absent() -> None:
+    """Without an image_gen config no partition is named ``ImageGen``."""
+    assert "ImageGen" not in [part.name for part in _model(with_imagegen=False).get_partitions()]
+
+
+def test_no_imagegen_engine_type_when_config_absent() -> None:
+    """Without an image_gen config no engine type is registered for ``ImageGen``."""
+    assert "ImageGen" not in _model(with_imagegen=False).get_node_engine_types()
+
+
+# ---------------------------------------------------------------------------
+# ImageGen enabled — graph structure
+# ---------------------------------------------------------------------------
+
+
+def test_imagegen_walk_present_and_emits_image() -> None:
+    """The ``imagegen`` walk is one ImageGen node: hidden states in, one image edge out."""
+    walks = _model(with_imagegen=True).get_graph_walk_graphs()
+    assert "imagegen" in walks
+    imagegen_node = walks["imagegen"]
+    assert isinstance(imagegen_node, GraphNode)
+    assert imagegen_node.name == "ImageGen"
+    assert set(imagegen_node.input_names) == {"thinker_hidden_states"}
+    assert len(imagegen_node.outputs) == 1
+    assert [(e.name, e.output_modality) for e in imagegen_node.outputs] == [("image", "image")]
+
+
+def test_imagegen_partition_listed_with_producer() -> None:
+    """ImageGen partition: only the ``imagegen`` walk, no initial walk, produced by Thinker."""
+    by_name = {part.name: part for part in _model(with_imagegen=True).get_partitions()}
+    assert "ImageGen" in by_name
+    assert by_name["ImageGen"].graph_walks == {"imagegen"}
+    assert by_name["ImageGen"].initial_walk is None
+    assert by_name["ImageGen"].producer_partitions == ["Thinker"]
+
+
+def test_imagegen_topology_connects_thinker_to_imagegen() -> None:
+    """Topology has one Thinker -> ImageGen connection on ``thinker_hidden_states``."""
+    topology = _model(with_imagegen=True).get_partition_topology()
+    assert set(topology.partitions) == {"Thinker", "ImageGen"}
+    assert len(topology.connections) == 1
+    link = topology.connections[0]
+    assert (link.from_partition, link.to_partition) == ("Thinker", "ImageGen")
+    assert link.edge_name == "thinker_hidden_states"
+    # A fresh policy from the factory must report that the consumer keeps
+    # running after the producer is done.
+    assert link.chunk_policy_factory().continue_after_producer_done() is True
+
+
+def test_node_engine_types_registers_imagegen_stateless() -> None:
+    """With image_gen enabled, the ``ImageGen`` node maps to the STATELESS engine type."""
+    assert _model(with_imagegen=True).get_node_engine_types()["ImageGen"] == EngineType.STATELESS
+
+
+def test_imagegen_does_not_disturb_thinker_walks() -> None:
+    """Enabling image_gen leaves all five Thinker walks in place."""
+    walks = _model(with_imagegen=True).get_graph_walk_graphs()
+    thinker_walks = "prefill_text prefill_audio prefill_vision prefill_video thinker_decode".split()
+    assert [name for name in thinker_walks if name not in walks] == []
+
+
+def test_imagegen_and_talker_coexist_when_both_absent_is_thinker_only() -> None:
+    """Baseline: no talker and no image_gen gives a lone Thinker partition, no connections."""
+    topology = _model(with_imagegen=False).get_partition_topology()
+    # Exact list equality: one partition, zero connections.
+    assert (topology.partitions, topology.connections) == (["Thinker"], [])
+
+
+# ---------------------------------------------------------------------------
+# get_submodule dispatch (no load — just the unknown-node error path)
+# ---------------------------------------------------------------------------
+
+
+def test_get_submodule_unknown_node_lists_imagegen() -> None:
+    """An unknown node name raises ``ValueError`` whose message mentions ``ImageGen``."""
+    import pytest  # local import: this module does not import pytest at file level
+    imagegen_model = _model(with_imagegen=True)
+    with pytest.raises(ValueError, match="ImageGen"):
+        imagegen_model.get_submodule("NotARealNode")
+
+
+# ---------------------------------------------------------------------------
+# ImageGen partition state machine (consumer side)
+# ---------------------------------------------------------------------------
+
+
+class _Conn:
+    """Minimal StreamingConnectionState stand-in: a done flag plus two zeroed counters."""
+
+    def __init__(self, producer_done: bool) -> None:
+        self.producer_done = producer_done
+        # Both counters start at zero; the visible tests never advance them.
+        self.token_count = self.consumed_count = 0
+
+
+def test_imagegen_initial_args_image_output_keeps_partition_alive() -> None:
+    """Requesting image output starts ImageGen on the ``imagegen`` walk, not yet done."""
+    request = {
+        "partition_name": "ImageGen", "input_modalities": ["text"],
+        "output_modalities": ["image"], "input_signals": {},
+    }
+    initial = _model(with_imagegen=True).get_initial_forward_pass_args(**request)
+    assert initial.full_metadata.graph_walk == "imagegen"
+    # The partition is not finished at this point.
+    assert initial.request_done is False
+
+
+def test_imagegen_initial_args_no_image_output_done_immediately() -> None:
+    """If ``image`` is not among the output modalities, ImageGen is done from the start."""
+    text_only_request = {
+        "partition_name": "ImageGen", "input_modalities": ["text"],
+        "output_modalities": ["text"],  # no image requested
+        "input_signals": {},
+    }
+    initial = _model(with_imagegen=True).get_initial_forward_pass_args(**text_only_request)
+    assert initial.request_done is True
+
+
+def test_imagegen_forward_waits_for_producer_done() -> None:
+    """While the producer is still running, ImageGen stays alive and schedules no inputs."""
+    pending_producer = _Conn(producer_done=False)
+    metadata = CurrentForwardConductorMetadata(
+        input_modalities=["text"], output_modalities=["image"],
+        graph_walk="imagegen", is_prefill=False,
+    )
+    result = _model(with_imagegen=True).get_partition_forward_pass_args(
+        partition_name="ImageGen",
+        partition_metadata=metadata,
+        persist_signals={},
+        incoming_connections=[pending_producer],
+    )
+    # Still alive, but nothing is scheduled for this pass.
+    assert result.request_done is False
+    assert result.inputs == []
+
+
+def test_imagegen_forward_fires_once_then_done() -> None:
+    """Once the producer is done ImageGen fires a single pass, then reports done."""
+    model = _model(with_imagegen=True)
+
+    def step(metadata):
+        """Run one ImageGen pass against a connection whose producer is done."""
+        return model.get_partition_forward_pass_args(
+            partition_name="ImageGen",
+            partition_metadata=metadata,
+            persist_signals={},
+            incoming_connections=[_Conn(producer_done=True)],
+        )
+
+    fresh_metadata = CurrentForwardConductorMetadata(
+        input_modalities=["text"], output_modalities=["image"],
+        graph_walk="imagegen", is_prefill=False,
+    )
+    fired = step(fresh_metadata)
+    assert fired.full_metadata.graph_walk == "imagegen"
+    assert len(fired.inputs) == 1
+    assert fired.inputs[0].name == "thinker_hidden_states"
+    assert fired.request_done is False
+    # Feeding the metadata returned by the first pass back in must end
+    # the request instead of firing a second time.
+    finished = step(fired.full_metadata)
+    assert finished.request_done is True
diff --git a/test/modular/test_ming_flash_omni_imagegen_pipeline.py b/test/modular/test_ming_flash_omni_imagegen_pipeline.py
new file mode 100644
index 00000000..ba37643f
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_imagegen_pipeline.py
@@ -0,0 +1,295 @@
+"""Tests for the image-gen condition encoder + diffusion pipeline (step 9b).
+
+All pure-Python on CPU with stub components — no diffusers, no checkpoint:
+
+  * ``MingConditionEncoder`` forward shape + L2-normalize×1000 behavior, driven
+    by a stub Qwen2-like connector;
+  * ``combine_cfg`` guidance math + renormalization;
+  * ``MingImageDenoiser`` loop wiring (CFG batch doubling, sign flip, scheduler
+    stepping) with a stub DiT + stub scheduler;
+  * ``MingImagePipeline`` end-to-end with stubs (condition → denoise → decode).
+"""
+
+from __future__ import annotations
+
+import torch
+
+from mstar.model.ming_omni_flash.components.condition_encoder import MingConditionEncoder
+from mstar.model.ming_omni_flash.components.imagegen_pipeline import (
+    MingImageDenoiser,
+    MingImageGenSamplingParams,
+    MingImagePipeline,
+    calculate_shift,
+    combine_cfg,
+)
+
+# ---------------------------------------------------------------------------
+# Stubs
+# ---------------------------------------------------------------------------
+
+
+class _StubConnectorOut:  # connector-output double exposing only ``hidden_states``
+    def __init__(self, last_hidden: torch.Tensor) -> None:
+        self.hidden_states = [last_hidden]  # one-element list, so [-1] is ``last_hidden``
+
+
+class _StubConnector(torch.nn.Module):
+    """Pass-through connector stub: echoes ``inputs_embeds`` and records the mask."""
+
+    def __init__(self, hidden_size: int) -> None:
+        super().__init__()
+        self.last_attention_mask = None  # overwritten by every forward() call
+        self.hidden_size = hidden_size
+
+    def forward(self, inputs_embeds, attention_mask=None, **kwargs):
+        echoed, self.last_attention_mask = _StubConnectorOut(inputs_embeds), attention_mask
+        return echoed
+
+
+class _ImageGenCfgStub:  # bare attribute bag handed to MingConditionEncoder / MingImagePipeline
+    connector_subfolder = "connector"
+    mlp_subfolder = "mlp"
+    diffusion_c_input_dim = 8  # re-set per instance by _cond_encoder(c_out=...)
+    text_encoder_norm = True  # re-set per instance by _cond_encoder(norm=...)
+    use_identity_mlp = True
+
+
+def _cond_encoder(thinker_hidden=6, conn_hidden=5, c_out=8, norm=True) -> MingConditionEncoder:
+    """Build an eval-mode MingConditionEncoder wired with a stub connector + fresh Linear projections."""
+    stub_cfg = _ImageGenCfgStub()
+    stub_cfg.diffusion_c_input_dim, stub_cfg.text_encoder_norm = c_out, norm
+    encoder = MingConditionEncoder(stub_cfg, thinker_hidden_size=thinker_hidden)
+    encoder.connector = _StubConnector(conn_hidden)
+    encoder.connector_hidden_size = conn_hidden
+    encoder.proj_in = torch.nn.Linear(thinker_hidden, conn_hidden, bias=True)
+    encoder.proj_out = torch.nn.Linear(conn_hidden, c_out, bias=True)
+    return encoder.eval()
+
+
+class _StubDiT(torch.nn.Module):
+    """DiT stand-in: each latent is echoed back scaled by 0.1 as its "velocity"."""
+
+    in_channels = 4
+
+    def forward(self, latents_list, timestep, embeds):
+        # ``timestep`` and ``embeds`` are accepted but ignored; the second return
+        # value is an empty dict.
+        return [latent * 0.1 for latent in latents_list], {}
+
+
+class _StubScheduler:
+    """Minimal flow-matching scheduler stub: ``step`` returns ``x + dt * v``."""
+
+    def __init__(self, n_steps: int) -> None:
+        self.config = {}
+        self.sigma_min = 0.0
+        self.set_timesteps(n_steps)  # single place that derives timesteps + dt
+
+    def set_timesteps(self, num, device=None, mu=None):
+        self.timesteps = torch.linspace(1000, 0, num)
+        self._dt = 1.0 / num  # keep the step size in sync with the new grid
+
+    def step(self, model_output, t, sample, return_dict=False):
+        return (sample + self._dt * model_output,)  # always a 1-tuple
+
+
+class _StubVAEConfig:  # class-level values read through ``_StubVAE.config``
+    block_out_channels = [128, 256]  # 2 entries; presumably drives vae_scale_factor — TODO confirm
+    scaling_factor = 0.5
+    shift_factor = 0.1
+
+
+class _StubVAE:
+    """VAE stand-in whose ``decode`` keeps the first 3 latent channels as the image."""
+    config = _StubVAEConfig()
+    dtype = torch.float32
+
+    def decode(self, latents, return_dict=False):
+        # [B, C, H, W] -> [B, 3, H, W]; not zeros, so outputs depend on the latents.
+        return (latents[:, :3].clone(),)
+
+
+# ---------------------------------------------------------------------------
+# Condition encoder
+# ---------------------------------------------------------------------------
+
+
+def test_condition_encoder_output_shape() -> None:
+    """A [2, 4, 6] input comes back as [2, 4, 8]: the last dim becomes c_out."""
+    encoder = _cond_encoder(thinker_hidden=6, conn_hidden=5, c_out=8)
+    projected = encoder(torch.randn(2, 4, 6))
+    assert projected.shape == (2, 4, 8)
+
+
+def test_condition_encoder_l2_normalize_times_1000() -> None:
+    """With norm=True every output row is L2-normalized then ×1000 -> norm ≈ 1000."""
+    encoder = _cond_encoder(c_out=8, norm=True)
+    row_norms = torch.linalg.vector_norm(encoder(torch.randn(1, 3, 6)), dim=-1)
+    target = torch.full_like(row_norms, 1000.0)
+    assert torch.allclose(row_norms, target, atol=1e-2)
+
+
+def test_condition_encoder_no_norm_scaling() -> None:
+    """With norm=False there is no ×1000: the rows come out with unit norm."""
+    encoder = _cond_encoder(c_out=8, norm=False)
+    row_norms = torch.linalg.vector_norm(encoder(torch.randn(1, 3, 6)), dim=-1)
+    unit = torch.ones_like(row_norms)
+    assert torch.allclose(row_norms, unit, atol=1e-4)
+
+
+def test_condition_encoder_builds_4d_mask() -> None:
+    """The connector receives an all-ones attention mask of shape (2, 1, 4, 4)."""
+    encoder = _cond_encoder()
+    encoder(torch.randn(2, 4, 6))  # run once so the stub connector records the mask
+    seen_mask = encoder.connector.last_attention_mask
+    assert seen_mask.shape == (2, 1, 4, 4) and torch.all(seen_mask == 1)
+
+
+def test_condition_encoder_requires_load() -> None:
+    """An encoder used straight after construction raises a RuntimeError."""
+    import pytest
+    unloaded = MingConditionEncoder(_ImageGenCfgStub(), thinker_hidden_size=6)
+    with pytest.raises(RuntimeError, match="load_from_checkpoint"):
+        unloaded(torch.randn(1, 3, 6))
+
+
+def test_condition_encoder_rejects_2d() -> None:
+    """A rank-2 input is rejected with a ValueError naming the [B, N, H] layout."""
+    import pytest
+    encoder = _cond_encoder()
+    with pytest.raises(ValueError, match=r"expected \[B, N, H\]"):
+        encoder(torch.randn(4, 6))
+
+
+def test_zero_negative_is_zeros_like() -> None:
+    """zero_negative returns an all-zero tensor with the input's shape."""
+    encoder, positive = _cond_encoder(), torch.randn(2, 3, 8)
+    negative = encoder.zero_negative(positive)
+    assert negative.shape == positive.shape and torch.all(negative == 0)
+
+
+# ---------------------------------------------------------------------------
+# CFG math
+# ---------------------------------------------------------------------------
+
+
+def test_combine_cfg_basic() -> None:
+    """pos + 3*(pos - neg) with neg == 0 collapses to 4*pos."""
+    cond = torch.tensor([[2.0, 0.0]])
+    uncond = torch.tensor([[0.0, 0.0]])
+    guided = combine_cfg(cond, uncond, guidance_scale=3.0)
+    assert torch.allclose(guided, torch.tensor([[8.0, 0.0]]))
+
+
+def test_combine_cfg_zero_scale_returns_pos() -> None:
+    """A guidance scale of 0 returns the positive branch unchanged."""
+    cond, uncond = torch.randn(1, 4), torch.randn(1, 4)
+    assert torch.allclose(combine_cfg(cond, uncond, 0.0), cond)
+
+
+def test_combine_cfg_renormalization_caps_norm() -> None:
+    """cfg_normalization=1.0 caps the guided norm at 1.0 × |pos| despite a large scale."""
+    cond = torch.tensor([[1.0, 0.0, 0.0]])
+    uncond = torch.tensor([[-5.0, 0.0, 0.0]])
+    guided = combine_cfg(cond, uncond, guidance_scale=10.0, cfg_normalization=1.0)
+    assert torch.linalg.vector_norm(guided).item() <= torch.linalg.vector_norm(cond).item() + 1e-4
+
+
+def test_calculate_shift_monotonic() -> None:
+    """A longer sequence gets a larger shift; at seq_len=256 the shift is 0.5."""
+    short_shift, long_shift = calculate_shift(256), calculate_shift(4096)
+    assert long_shift > short_shift
+    assert abs(short_shift - 0.5) < 1e-6  # base_shift at base_seq_len
+
+
+# ---------------------------------------------------------------------------
+# Denoiser loop
+# ---------------------------------------------------------------------------
+
+
+def test_denoiser_runs_without_cfg() -> None:
+    """With no negative embeds and guidance 0.0 the loop keeps the latent shape."""
+    scheduler = _StubScheduler(4)
+    denoiser = MingImageDenoiser(_StubDiT(), scheduler, dtype=torch.float32)
+    noise = torch.randn(1, 4, 8, 8)
+    pos_embeds = [torch.randn(16, 8)]
+    denoised = denoiser.denoise(noise, scheduler.timesteps, pos_embeds, None, guidance_scale=0.0)
+    assert denoised.shape == noise.shape
+
+
+def test_denoiser_runs_with_cfg() -> None:
+    """With negative embeds and guidance 3.0 the loop keeps the latent shape."""
+    scheduler = _StubScheduler(3)
+    denoiser = MingImageDenoiser(_StubDiT(), scheduler, dtype=torch.float32)
+    noise = torch.randn(1, 4, 8, 8)
+    pos_embeds = [torch.randn(16, 8)]
+    neg_embeds = [torch.zeros(16, 8)]
+    denoised = denoiser.denoise(noise, scheduler.timesteps, pos_embeds, neg_embeds, guidance_scale=3.0)
+    assert denoised.shape == noise.shape
+
+
+def test_denoiser_cfg_truncation_disables_guidance_late() -> None:
+    """cfg_truncation=0.0 is meant to keep CFG switched off for the whole run;
+    the loop must still finish and return latents of the input shape."""
+    scheduler = _StubScheduler(3)
+    denoiser = MingImageDenoiser(_StubDiT(), scheduler, dtype=torch.float32)
+    noise = torch.randn(1, 4, 8, 8)
+    pos_embeds, neg_embeds = [torch.randn(16, 8)], [torch.zeros(16, 8)]
+    denoised = denoiser.denoise(
+        noise, scheduler.timesteps, pos_embeds, neg_embeds, guidance_scale=3.0, cfg_truncation=0.0
+    )
+    assert denoised.shape == noise.shape
+
+
+# ---------------------------------------------------------------------------
+# Full pipeline (stubs)
+# ---------------------------------------------------------------------------
+
+
+def _stub_pipeline() -> MingImagePipeline:
+    """Assemble a CPU/float32 MingImagePipeline entirely from this module's stubs."""
+    stub_cfg = _ImageGenCfgStub()
+    stub_cfg.diffusion_c_input_dim = 8
+    # Every component is a stub (the encoder is built first); byte5 is None.
+    parts = dict(
+        condition_encoder=_cond_encoder(thinker_hidden=6, conn_hidden=5, c_out=8),
+        transformer=_StubDiT(),
+        scheduler=_StubScheduler(3),
+        vae=_StubVAE(),
+        image_gen_config=stub_cfg,
+        byte5=None,
+    )
+    return MingImagePipeline(device="cpu", dtype=torch.float32, **parts)
+
+
+def test_pipeline_prepare_latents_shape() -> None:
+    """64x64 request -> 16x16 latents: vae_scale_factor = 2^(2-1) = 2, vae_scale = 4."""
+    pipeline = _stub_pipeline()
+    noise = pipeline.prepare_latents(1, 64, 64)
+    assert tuple(noise.shape) == (1, 4, 16, 16)
+
+
+def test_pipeline_build_cap_feats_default_negative_is_zero() -> None:
+    """A single [N, H] item yields one positive and one all-zero negative feature."""
+    pipeline = _stub_pipeline()
+    pos_feats, neg_feats = pipeline.build_cap_feats(torch.randn(4, 6))
+    assert (len(pos_feats), len(neg_feats)) == (1, 1)
+    assert torch.all(neg_feats[0] == 0)
+
+
+def test_pipeline_generate_end_to_end_shape() -> None:
+    """generate() on stub components returns an image with batch 1 and 3 channels."""
+    pipeline = _stub_pipeline()
+    thinker_hidden = torch.randn(4, 6)
+    sampling = MingImageGenSamplingParams(height=64, width=64, num_inference_steps=3, guidance_scale=2.0)
+    image = pipeline.generate(thinker_hidden, sampling)
+    assert tuple(image.shape[:2]) == (1, 3)  # stub VAE decodes to [B, 3, H/vae, W/vae]
+
+
+def test_pipeline_generate_seed_is_deterministic() -> None:
+    """Two generate() calls with identical inputs and seed=123 must agree."""
+    pipeline = _stub_pipeline()
+    thinker_hidden = torch.randn(4, 6)
+    sampling = MingImageGenSamplingParams(height=32, width=32, num_inference_steps=2, guidance_scale=0.0, seed=123)
+    first, second = (pipeline.generate(thinker_hidden, sampling) for _ in range(2))
+    assert torch.allclose(first, second)
diff --git a/test/modular/test_ming_flash_omni_imagegen_producer.py b/test/modular/test_ming_flash_omni_imagegen_producer.py
new file mode 100644
index 00000000..e708f4b4
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_imagegen_producer.py
@@ -0,0 +1,249 @@
+"""Tests for the image-gen producer side (step 9b).
+
+Two pieces of the Thinker->ImageGen handoff:
+
+  * ``LingMoeModel.forward(return_hidden_states=True)`` returns the post-norm
+    hidden states alongside logits (CUDA-gated — the mstar RMSNorm kernel is
+    CUDA-only).
+  * ``BailingMoeV2ThinkerSubmodule.extract_image_gen_hidden_states`` slices
+    those hidden states at the ``<imagePatch>`` query-token positions
+    (pure-tensor, CPU-testable).
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.submodules import BailingMoeV2ThinkerSubmodule
+
+_extract = BailingMoeV2ThinkerSubmodule.extract_image_gen_hidden_states  # called as _extract(hidden, ids, PATCH)
+PATCH = 157157  # token id the tests pass as the <imagePatch> marker
+
+
+# ---------------------------------------------------------------------------
+# extract_image_gen_hidden_states (CPU)
+# ---------------------------------------------------------------------------
+
+
+def test_extract_picks_patch_positions_in_order() -> None:
+    """A contiguous 3-wide patch block at positions 3..5 yields hidden rows 3..5."""
+    seq_len, width = 8, 4
+    states = torch.arange(seq_len * width, dtype=torch.float32).reshape(seq_len, width)
+    ids = torch.tensor([10, 11, 12] + [PATCH] * 3 + [13, 14])
+    picked = _extract(states, ids, PATCH)
+    assert picked.shape == (3, width)
+    assert torch.equal(picked, states[3:6])
+
+
+def test_extract_non_contiguous_positions() -> None:
+    """Scattered patch positions (0, 2, 5) are gathered in sequence order."""
+    states = torch.randn(6, 5)
+    picked = _extract(states, torch.tensor([PATCH, 1, PATCH, 2, 3, PATCH]), PATCH)
+    assert picked.shape == (3, 5)
+    assert torch.equal(picked, states[[0, 2, 5]])
+
+
+def test_extract_flattens_2d_token_ids() -> None:
+    """Token ids shaped (1, T) are accepted; two patch tokens give two rows."""
+    states = torch.randn(4, 3)
+    picked = _extract(states, torch.tensor([[PATCH, PATCH, 1, 2]]), PATCH)
+    assert picked.shape == (2, 3)
+
+
+def test_extract_raises_when_no_patch_tokens() -> None:
+    """A prompt without any patch token raises ValueError."""
+    states, ids = torch.randn(4, 3), torch.tensor([1, 2, 3, 4])
+    with pytest.raises(ValueError, match="no <imagePatch> token"):
+        _extract(states, ids, PATCH)
+
+
+def test_extract_raises_on_length_mismatch() -> None:
+    """Three token ids against four hidden rows (T mismatch) raises ValueError."""
+    states, ids = torch.randn(4, 3), torch.tensor([PATCH, PATCH, PATCH])
+    with pytest.raises(ValueError, match="!= hidden_states T"):
+        _extract(states, ids, PATCH)
+
+
+def test_extract_raises_on_bad_hidden_rank() -> None:
+    """Rank-3 hidden states are rejected with a ValueError naming (T, H)."""
+    states, ids = torch.randn(2, 4, 3), torch.tensor([PATCH, PATCH])
+    with pytest.raises(ValueError, match=r"expected \(T, H\)"):
+        _extract(states, ids, PATCH)
+
+
+# ---------------------------------------------------------------------------
+# LingMoeModel.forward(return_hidden_states=True) (CUDA-gated)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="mstar RMSNorm kernel is CUDA-only")
+def test_model_returns_hidden_states_tuple() -> None:
+    """return_hidden_states=True yields (logits, hidden), with hidden being lm_head's input."""
+    import torch.nn.functional as F
+
+    from mstar.model.ming_omni_flash.components.model import LingMoeModel
+
+    class _SdpaCache:
+        def set_layer_idx(self, i):
+            pass
+
+        def run_attention(self, q, k, v):
+            # Swap the first two axes and add a leading batch axis for SDPA, then undo both.
+            q4, k4, v4 = (x.transpose(0, 1).unsqueeze(0) for x in (q, k, v))
+            attn = F.scaled_dot_product_attention(q4, k4, v4, is_causal=True, scale=q.shape[-1] ** -0.5)
+            return attn.squeeze(0).transpose(0, 1).contiguous()
+
+    tiny = LingMoeModel(
+        vocab_size=64,
+        hidden_size=16,
+        intermediate_size=32,
+        moe_intermediate_size=16,
+        num_hidden_layers=1,
+        num_attention_heads=2,
+        num_kv_heads=2,
+        head_dim=8,
+        rms_norm_eps=1e-6,
+        rope_theta=10000.0,
+        max_position_embeddings=128,
+        partial_rotary_factor=1.0,
+        mrope_section=[1, 2, 1],
+        num_experts=4,
+        num_experts_per_tok=2,
+        num_shared_experts=1,
+        n_group=1,
+        topk_group=1,
+        routed_scaling_factor=1.0,
+        first_k_dense_replace=0,
+        tie_word_embeddings=False,
+        use_qkv_bias=False,
+        use_bias=False,
+    ).to("cuda").eval()
+
+    token_ids = torch.tensor([1, 2, 3, 4], device="cuda")
+    with torch.no_grad():
+        logits, hidden = tiny(_SdpaCache(), input_ids=token_ids, return_hidden_states=True)
+    # The shapes are the contract under test (the return_hidden_states plumbing).
+    assert logits.shape == (4, 64)
+    assert hidden.shape == (4, 16)
+    # Random untrained weights on the CUDA MoE/RMSNorm path may yield NaNs on
+    # some machines (nothing to do with return_hidden_states); the numeric
+    # check below only makes sense when the forward stayed finite.
+    if not torch.isfinite(logits).all() or not torch.isfinite(hidden).all():
+        pytest.skip("untrained-weight CUDA forward produced non-finite values on this box")
+    # lm_head consumed exactly the returned hidden states, so pushing them
+    # through lm_head again has to reproduce the returned logits.
+    assert torch.allclose(tiny.lm_head(hidden), logits, atol=1e-3)
+
+
+# ---------------------------------------------------------------------------
+# Submodule forward emits thinker_hidden_states for image-gen prefill (CPU)
+# ---------------------------------------------------------------------------
+
+
+class _StubModel(torch.nn.Module):
+    """Deterministic LingMoeModel replacement for CPU-only submodule tests.
+
+    Accepts the (cache_handle, input_ids, position_ids, return_hidden_states)
+    call shape used by the Thinker submodule, which lets the image-gen emit
+    path run without the CUDA-only RMSNorm forward.
+    """
+
+    def __init__(self, vocab_size: int, hidden_size: int) -> None:
+        super().__init__()
+        self._vocab, self._hidden = vocab_size, hidden_size
+        # Real torch layers built with the requested dims.
+        self.embed_tokens = torch.nn.Embedding(vocab_size, hidden_size)
+        self.lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False)
+
+    def forward(self, cache_handle, input_ids=None, position_ids=None, return_hidden_states=False, **kw):
+        seq_len = input_ids.shape[0]
+        zero_logits = torch.zeros(seq_len, self._vocab)
+        if not return_hidden_states:
+            return zero_logits
+        # Row i is filled with the value i, so a slice reveals which rows it took.
+        row_index = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)
+        return zero_logits, row_index.repeat(1, self._hidden)
+
+
+class _StubCache:  # cache-manager double: the one method defined here does nothing
+    def advance_seq_lens(self):
+        pass
+
+
+class _StubReqInfo:  # request-info double carrying only ``position_info``
+    position_info: dict = {}  # NOTE(review): class-level dict, shared by every instance
+
+
+class _StubEngineInputs:  # engine-inputs double passed as ``engine_inputs=`` to sub.forward
+    cache_manager = _StubCache()  # class attribute: one shared _StubCache
+    single_request_info = _StubReqInfo()  # class attribute: one shared _StubReqInfo
+
+
+def _thinker_with_imagegen(hidden_size: int = 8, vocab_size: int = 157200):
+    """Wrap a _StubModel in a Thinker submodule whose config carries an ImageGenConfig."""
+    from mstar.model.ming_omni_flash.config import (
+        AudioEncoderConfig,
+        ImageGenConfig,
+        MingFlashOmniModelConfig,
+        ThinkerLLMConfig,
+        VisionEncoderConfig,
+    )
+
+    # head_dim stays at 128 (with the default mrope [8,12,12]) so that the
+    # MingFlashOmniModelConfig MRoPE/head_dim invariants hold. The stub model
+    # never reads the attention dims: only hidden_size and vocab_size matter.
+    llm_cfg = ThinkerLLMConfig(vocab_size=vocab_size, hidden_size=hidden_size, head_dim=128)
+    omni_cfg = MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=llm_cfg,
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+        image_gen=ImageGenConfig(),
+    )
+    return BailingMoeV2ThinkerSubmodule(model=_StubModel(vocab_size, hidden_size), config=omni_cfg)
+
+
+def test_forward_emits_hidden_states_when_patch_tokens_present() -> None:
+    """Prefill over a prompt ending in a 2-wide patch block emits hidden rows 3 and 4."""
+    thinker = _thinker_with_imagegen(hidden_size=8)
+    prompt = torch.tensor([10, 11, 12, PATCH, PATCH], dtype=torch.long)  # 3 text + 2 patch
+    emitted = thinker.forward(graph_walk="prefill_text", engine_inputs=_StubEngineInputs(), text_inputs=prompt)
+    assert "logits" in emitted
+    assert "thinker_hidden_states" in emitted
+    # _StubModel fills row i with the value i, so column 0 names the sliced rows.
+    sliced = emitted["thinker_hidden_states"][0]
+    assert sliced.shape == (2, 8)
+    assert torch.equal(sliced[:, 0], torch.tensor([3.0, 4.0]))
+
+
+def test_forward_no_hidden_states_without_patch_tokens() -> None:
+    """A prompt without patch tokens produces logits but no thinker_hidden_states."""
+    thinker = _thinker_with_imagegen(hidden_size=8)
+    prompt = torch.tensor([10, 11, 12, 13], dtype=torch.long)  # no patch tokens
+    emitted = thinker.forward(graph_walk="prefill_text", engine_inputs=_StubEngineInputs(), text_inputs=prompt)
+    assert "thinker_hidden_states" not in emitted and "logits" in emitted
+
+
+def test_forward_no_hidden_states_when_imagegen_config_absent() -> None:
+    """Even with patch tokens present, image_gen=None means nothing is emitted."""
+    from mstar.model.ming_omni_flash.config import (
+        AudioEncoderConfig,
+        MingFlashOmniModelConfig,
+        ThinkerLLMConfig,
+        VisionEncoderConfig,
+    )
+
+    no_imagegen_cfg = MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(vocab_size=157200, hidden_size=8, head_dim=128),
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+        image_gen=None,  # no imagegen deploy
+    )
+    thinker = BailingMoeV2ThinkerSubmodule(model=_StubModel(157200, 8), config=no_imagegen_cfg)
+    prompt = torch.tensor([10, 11, PATCH, PATCH], dtype=torch.long)
+    emitted = thinker.forward(graph_walk="prefill_text", engine_inputs=_StubEngineInputs(), text_inputs=prompt)
+    assert "thinker_hidden_states" not in emitted
diff --git a/test/modular/test_ming_flash_omni_imagegen_submodule.py b/test/modular/test_ming_flash_omni_imagegen_submodule.py
new file mode 100644
index 00000000..66a543b9
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_imagegen_submodule.py
@@ -0,0 +1,127 @@
+"""Tests for ImageGenSubmodule (step 9b).
+
+Pure-Python: a stub MingImagePipeline wrapped in ImageGenSubmodule — verifies
+input marshalling (prepare_inputs slicing the thinker hidden states), the
+stateless flavor, default sampling-param derivation from the ImageGenConfig,
+and that forward routes through pipeline.generate and emits an ``image`` edge.
+No diffusers, no checkpoint.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    ImageGenConfig,
+    MingFlashOmniModelConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.submodules import ImageGenSubmodule
+
+
+class _StubPipeline:
+    """Pipeline double: logs every generate() call and hands back a zero image."""
+
+    def __init__(self) -> None:
+        self.calls: list[dict] = []
+
+    def generate(self, thinker_hidden_states, params, *, negative_hidden=None, byte5_texts=None):
+        # ``byte5_texts`` is accepted but never recorded.
+        record = {"hidden": thinker_hidden_states, "params": params, "negative": negative_hidden}
+        self.calls.append(record)
+        # Batch size comes from dim 0 of a rank-3 input; anything else counts as 1.
+        if thinker_hidden_states.dim() == 3:
+            batch = thinker_hidden_states.shape[0]
+        else:
+            batch = 1
+        return torch.zeros(batch, 3, 64, 64)
+
+
+def _config(default_height=512, default_width=768, steps=7, cfg=3.5) -> MingFlashOmniModelConfig:
+    """Model config whose ImageGenConfig carries the given sampling defaults."""
+    image_gen = ImageGenConfig()
+    image_gen.default_height, image_gen.default_width = default_height, default_width
+    image_gen.num_inference_steps, image_gen.guidance_scale = steps, cfg
+    # The other sub-configs are built with no constructor arguments.
+    return MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(),
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+        image_gen=image_gen,
+    )
+
+
+def _submodule() -> tuple[ImageGenSubmodule, _StubPipeline]:
+    """Return (submodule, recording pipeline stub) built with the default _config()."""
+    recorder = _StubPipeline()
+    return ImageGenSubmodule(pipeline=recorder, config=_config()), recorder
+
+
+def test_stateless_flavor_is_audio_codec() -> None:
+    """ImageGenSubmodule reports the "audio_codec" stateless flavor."""
+    assert _submodule()[0].get_stateless_flavor() == "audio_codec"
+
+
+def test_default_params_from_image_gen_config() -> None:
+    """default_params mirrors the ImageGenConfig values set by _config()."""
+    defaults = _submodule()[0].default_params
+    assert (defaults.height, defaults.width) == (512, 768)
+    assert defaults.num_inference_steps == 7
+    assert defaults.guidance_scale == 3.5
+
+
+def test_prepare_inputs_pulls_hidden_states() -> None:
+    """The lone hidden-state tensor is unwrapped; the negative slot is None."""
+    module, states = _submodule()[0], torch.randn(1, 256, 4096)
+    prep = module.prepare_inputs(graph_walk="imagegen", fwd_info=None, inputs={"thinker_hidden_states": [states]})
+    assert torch.equal(prep.tensor_inputs["thinker_hidden_states"], states)
+    assert prep.tensor_inputs["negative_thinker_hidden_states"] is None
+
+
+def test_prepare_inputs_passes_negative_when_present() -> None:
+    """A supplied negative hidden-state edge shows up in tensor_inputs."""
+    module, _ = _submodule()
+    edges = {
+        "thinker_hidden_states": [torch.randn(1, 16, 4096)],
+        "negative_thinker_hidden_states": [torch.randn(1, 16, 4096)],
+    }
+    negative = edges["negative_thinker_hidden_states"][0]
+    prep = module.prepare_inputs(graph_walk="imagegen", fwd_info=None, inputs=edges)
+    assert torch.equal(prep.tensor_inputs["negative_thinker_hidden_states"], negative)
+
+
+def test_prepare_inputs_raises_on_missing_hidden() -> None:
+    module, no_edges = _submodule()[0], {}  # empty inputs: the hidden-state edge is absent
+    with pytest.raises(ValueError, match="missing 'thinker_hidden_states'"):
+        module.prepare_inputs(graph_walk="imagegen", fwd_info=None, inputs=no_edges)
+
+
+def test_forward_emits_image_via_pipeline() -> None:
+    """forward() calls pipeline.generate once (default params, no negative) and emits "image"."""
+    module, recorder = _submodule()
+    states = torch.randn(1, 16, 4096)
+    emitted = module.forward(graph_walk="imagegen", engine_inputs=None, thinker_hidden_states=states)
+    assert "image" in emitted
+    assert emitted["image"][0].shape == (1, 3, 64, 64)
+    assert len(recorder.calls) == 1
+    call = recorder.calls[0]
+    assert call["params"] is module.default_params
+    assert call["negative"] is None
+
+
+def test_forward_forwards_negative_hidden() -> None:
+    """A negative hidden-state tensor given to forward() reaches pipeline.generate."""
+    module, recorder = _submodule()
+    positive, negative = torch.randn(1, 16, 4096), torch.randn(1, 16, 4096)
+    module.forward(
+        graph_walk="imagegen",
+        engine_inputs=None,
+        thinker_hidden_states=positive,
+        negative_thinker_hidden_states=negative,
+    )
+    assert torch.equal(recorder.calls[0]["negative"], negative)
diff --git a/test/modular/test_ming_flash_omni_loader.py b/test/modular/test_ming_flash_omni_loader.py
new file mode 100644
index 00000000..e5d24cdb
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_loader.py
@@ -0,0 +1,303 @@
+"""Tests for the Ling-2.0 weight loader (TP-aware, step 3e).
+
+Pure-Python tests verify the new name remapper, the QKV split, and the
+per-expert StackedParamRules in isolation. Two CUDA/snapshot-gated
+tests load the real released checkpoint and verify forward + per-param
+shape — the strongest signal that the model code matches the upstream
+architecture byte-for-byte.
+
+Snapshot lookup mirrors the other ming tests: ``MING_FLASH_OMNI_DIR``
+env var, then the default HF Hub cache layout.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.model import LingMoeModel
+from mstar.model.ming_omni_flash.loader import (
+    _build_thinker_stacked_params,
+    _remap_thinker_keys,
+    _split_packed_qkv,
+    load_thinker_weights,
+)
+
+
+def _find_local_snapshot() -> str | None:
+    """Return a local Ming-flash-omni-2.0 snapshot dir (one holding config.json), or None.
+    ``MING_FLASH_OMNI_DIR`` is tried first, then the default HF Hub cache in sorted order."""
+    env_dir = os.environ.get("MING_FLASH_OMNI_DIR")
+    if env_dir and Path(env_dir, "config.json").exists():
+        return env_dir
+
+    hub_root = Path.home().joinpath(".cache", "huggingface", "hub")
+    snapshots = hub_root.joinpath("models--inclusionAI--Ming-flash-omni-2.0", "snapshots")
+    if not snapshots.exists():
+        return None
+    # First snapshot (by sorted path) that actually contains a config.json.
+    candidates = (str(snap) for snap in sorted(snapshots.iterdir()) if (snap / "config.json").exists())
+    return next(candidates, None)
+
+
+def _real_thinker_dims(num_hidden_layers: int = 1) -> dict:
+    """LingMoeModel kwargs with the released ckpt's real config values, so weight shapes line up."""
+    return {
+        "vocab_size": 157184,
+        "hidden_size": 4096,
+        "intermediate_size": 9216,
+        "moe_intermediate_size": 1024,
+        "num_hidden_layers": num_hidden_layers,
+        "num_attention_heads": 32,
+        "num_kv_heads": 4,
+        "head_dim": 128,
+        "rms_norm_eps": 1e-6,
+        "rope_theta": 2_400_000.0,
+        "max_position_embeddings": 32768,
+        "partial_rotary_factor": 0.5,
+        "mrope_section": [8, 12, 12],
+        "num_experts": 256,
+        "num_experts_per_tok": 8,
+        "num_shared_experts": 1,
+        "n_group": 8,
+        "topk_group": 4,
+        "routed_scaling_factor": 2.5,
+        "first_k_dense_replace": 1,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Pure-Python unit tests for the new loader helpers
+# ---------------------------------------------------------------------------
+
+
+def test_remap_thinker_keys_resolves_layer0_keys() -> None:
+    """Layer-0 (non-QKV) checkpoint keys must remap onto parameters that exist
+    in a 1-layer dense-only LingMoeModel; the packed-QKV split has its own
+    tests, so the synthetic q/k/v keys are deliberately left out here."""
+    tiny = LingMoeModel(**_real_thinker_dims(num_hidden_layers=1))
+    known_params = set(tiny.state_dict())
+
+    # Checkpoint key -> expected model key, for the directly loaded tensors.
+    expected_by_raw = {
+        "model.lm_head.weight": "lm_head.weight",
+        "model.model.word_embeddings.weight": "embed_tokens.weight",
+        "model.model.norm.weight": "norm.weight",
+        "model.model.layers.0.input_layernorm.weight":
+            "layers.0.input_layernorm.weight",
+        "model.model.layers.0.post_attention_layernorm.weight":
+            "layers.0.post_attention_layernorm.weight",
+        "model.model.layers.0.attention.dense.weight":
+            "layers.0.self_attn.dense.weight",
+        "model.model.layers.0.attention.q_norm.weight":
+            "layers.0.self_attn.q_norm.weight",
+        "model.model.layers.0.attention.k_norm.weight":
+            "layers.0.self_attn.k_norm.weight",
+    }
+    for raw_key, want in expected_by_raw.items():
+        got = _remap_thinker_keys(raw_key)
+        assert got == want, f"{raw_key} → {got!r}, expected {want!r}"
+        assert got in known_params, f"{got!r} not in model.state_dict()"
+
+
+def test_remap_thinker_keys_handles_moe_layer() -> None:
+    """Router, shared-expert and per-expert keys of an MoE layer are renamed.
+
+    Per-expert keys are rewritten with an ``__expertN__`` marker so that the
+    StackedParamRule suffix match works downstream.
+    """
+    renames = {
+        # Routers: ``*.weight`` gains an inner ``.gate`` segment, while
+        # ``expert_bias`` keeps its name.
+        "model.model.layers.5.mlp.gate.weight":
+            "layers.5.mlp.gate.gate.weight",
+        "model.model.layers.5.mlp.image_gate.weight":
+            "layers.5.mlp.image_gate.gate.weight",
+        "model.model.layers.5.mlp.audio_gate.expert_bias":
+            "layers.5.mlp.audio_gate.expert_bias",
+        # Shared expert: ``shared_experts`` becomes ``shared_expert``.
+        "model.model.layers.5.mlp.shared_experts.gate_proj.weight":
+            "layers.5.mlp.shared_expert.gate_proj.weight",
+        # Per-expert: the expert index moves behind the projection name,
+        # wrapped as ``__expertN__``.
+        "model.model.layers.5.mlp.experts.42.gate_proj.weight":
+            "layers.5.mlp.experts.gate_proj.__expert42__.weight",
+        "model.model.layers.5.mlp.experts.255.down_proj.weight":
+            "layers.5.mlp.experts.down_proj.__expert255__.weight",
+    }
+
+    for raw_key, want in renames.items():
+        got = _remap_thinker_keys(raw_key)
+        assert got == want
+
+
+def test_remap_thinker_keys_drops_non_thinker_prefixes() -> None:
+    """Keys under audio.* / vision.* are outside the thinker port and map to None."""
+    for foreign_key in ("audio.encoder.layers.0.weight", "vision.patch_embed.weight"):
+        assert _remap_thinker_keys(foreign_key) is None
+
+
+def test_build_stacked_params_covers_every_expert() -> None:
+    """Each of the 8 experts gets gate/up/down rules, on top of 2 dense-MLP rules."""
+    n_experts = 8
+    rules = _build_thinker_stacked_params(num_experts=n_experts)
+    assert len(rules) == 3 * n_experts + 2  # 24 expert rules + 2 dense-MLP rules = 26
+    # Expert rules are the ones whose shard_id is a "kind:index" string.
+    seen = set()
+    for rule in rules:
+        if isinstance(rule.shard_id, str) and ":" in rule.shard_id:
+            seen.add(rule.shard_id)
+    assert seen == {f"{kind}:{idx}" for idx in range(n_experts) for kind in ("gate", "up", "down")}
+
+
+def test_split_packed_qkv_emits_three_synthetic_keys() -> None:
+    """One packed ``attention.query_key_value.weight`` is split row-wise into
+    q/k/v synthetic keys; an unrelated key passes through untouched."""
+    # GQA dims: 4 heads, 2 kv heads, head_dim 8 ->
+    # q rows 32, k/v rows 16 each, 64 rows in total.
+    heads, kv_heads, dim = 4, 2, 8
+    packed = torch.arange(64 * 16, dtype=torch.float32).reshape(64, 16)
+    # The stream mixes one packed QKV tensor with one ordinary tensor.
+    entries = [
+        ("layers.0.attention.query_key_value.weight", packed),
+        ("layers.0.input_layernorm.weight", torch.ones(16)),
+    ]
+    split = list(
+        _split_packed_qkv(iter(entries), num_attention_heads=heads, num_kv_heads=kv_heads, head_dim=dim)
+    )
+    # 3 synthetic + 1 passthrough = 4
+    assert len(split) == 4
+    (q_name, q_w), (k_name, k_w), (v_name, v_w), (tail_name, _) = split
+    assert [q_name, k_name, v_name] == [
+        "layers.0.attention.q_proj.weight",
+        "layers.0.attention.k_proj.weight",
+        "layers.0.attention.v_proj.weight",
+    ]
+    # Row slicing: q=[0:32], k=[32:48], v=[48:64].
+    assert torch.equal(q_w, packed[0:32, :])
+    assert torch.equal(k_w, packed[32:48, :])
+    assert torch.equal(v_w, packed[48:64, :])
+    # Non-QKV key passes through unchanged.
+    assert tail_name == "layers.0.input_layernorm.weight"
+
+
+def test_split_packed_qkv_rejects_bad_shape() -> None:
+    """A packed tensor with the wrong number of rows raises ValueError."""
+    # 50 rows supplied; the dims below require (4 + 2 * 2) * 8 = 64.
+    bad = torch.zeros(50, 16)
+    stream = iter([("layers.0.attention.query_key_value.weight", bad)])
+    dims = dict(num_attention_heads=4, num_kv_heads=2, head_dim=8)
+    with pytest.raises(ValueError, match="expected first dim 64"):
+        for _ in _split_packed_qkv(stream, **dims):
+            pass
+
+
+# ---------------------------------------------------------------------------
+# Real-checkpoint smoke (CUDA + snapshot required)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def snapshot_dir() -> str:
+    """Module-scoped snapshot path; skips dependent tests when none is found."""
+    if (found := _find_local_snapshot()) is not None:
+        return found
+    pytest.skip(
+        "Ming-flash-omni-2.0 snapshot not found. Set MING_FLASH_OMNI_DIR "
+        "or download via `huggingface-cli download`."
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="real-ckpt smoke needs CUDA")
+def test_load_layer0_real_weights_runs_forward(snapshot_dir: str) -> None:
+    """Load embed + dense layer 0 + norm + lm_head from the real ckpt
+    into a 1-layer LingMoeModel (TP=1, comm_group=None default), run one
+    forward on four token ids, and verify logits shape, dtype and finiteness."""
+    dims = _real_thinker_dims(num_hidden_layers=1)
+    # Construct on meta + materialise on CUDA to avoid double allocation.
+    with torch.device("meta"):
+        model = LingMoeModel(**dims)
+    model.to_empty(device="cuda")  # allocates storage without initialising it
+    model.to(torch.bfloat16)
+
+    load_thinker_weights(model, snapshot_dir, device="cuda", strict=True)
+    model.eval()
+
+    # Minimal mock cache handle — passthrough SDPA, same as step 3d tests.
+    import torch.nn.functional as F
+
+    class _Cache:
+        def set_layer_idx(self, i):
+            pass  # the mock keeps no per-layer state
+
+        def run_attention(self, q, k, v):
+            num_heads = q.shape[1]
+            num_kv = k.shape[1]
+            if num_heads // num_kv > 1:  # GQA: repeat KV heads to match query heads
+                k = k.repeat_interleave(num_heads // num_kv, dim=1)
+                v = v.repeat_interleave(num_heads // num_kv, dim=1)
+            q4 = q.transpose(0, 1).unsqueeze(0)  # swap axes 0/1, add a leading batch axis
+            k4 = k.transpose(0, 1).unsqueeze(0)
+            v4 = v.transpose(0, 1).unsqueeze(0)
+            out = F.scaled_dot_product_attention(
+                q4, k4, v4, is_causal=True, scale=q.shape[-1] ** -0.5,
+            )
+            return out.squeeze(0).transpose(0, 1).contiguous()  # undo the reshaping above
+
+    input_ids = torch.tensor([100, 200, 300, 400], device="cuda")
+    with torch.no_grad():
+        out = model(_Cache(), input_ids=input_ids)
+
+    assert out.shape == (4, dims["vocab_size"])  # one logits row per input token
+    assert torch.isfinite(out).all(), \
+        f"Non-finite logits after 1-layer forward; max={out.abs().max().item()}"
+    assert out.dtype == torch.bfloat16
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="real-ckpt smoke needs CUDA")
+def test_layer0_attention_weights_match_expected_shapes(snapshot_dir: str) -> None:
+    """After load, each listed layer-0 / embedding / lm_head tensor has the
+    expected shape and contains only finite values.
+
+    With TP=1 the per-rank dims equal the full dims; the same test under
+    TP>1 would expect num_heads / num_kv_heads divided by tp_size.
+    """
+    dims = _real_thinker_dims(num_hidden_layers=1)
+    with torch.device("meta"):
+        model = LingMoeModel(**dims)
+    model.to_empty(device="cuda")
+    model.to(torch.bfloat16)
+    load_thinker_weights(model, snapshot_dir, device="cuda", strict=True)
+
+    head_dim = dims["head_dim"]
+    hidden = dims["hidden_size"]
+    n_heads = dims["num_attention_heads"]
+    n_kv = dims["num_kv_heads"]
+
+    expected = {
+        # QKVParallelLinear packs (q + 2*kv) * head_dim along dim 0.
+        "layers.0.self_attn.qkv_proj.weight":
+            ((n_heads + 2 * n_kv) * head_dim, hidden),
+        # RowParallelLinear holds (output, input_per_partition); TP=1 →
+        # input_per_partition = full.
+        "layers.0.self_attn.dense.weight": (hidden, n_heads * head_dim),
+        "layers.0.self_attn.q_norm.weight": (head_dim,),
+        "layers.0.self_attn.k_norm.weight": (head_dim,),
+        "layers.0.input_layernorm.weight": (hidden,),
+        "layers.0.post_attention_layernorm.weight": (hidden,),
+        "embed_tokens.weight": (dims["vocab_size"], hidden),
+        "lm_head.weight": (dims["vocab_size"], hidden),
+    }
+    state = dict(model.state_dict())
+    for name, shape in expected.items():
+        assert name in state, f"{name} missing from loaded state_dict"
+        assert tuple(state[name].shape) == shape, (
+            f"{name}: expected {shape}, got {tuple(state[name].shape)}"
+        )
+        assert torch.isfinite(state[name]).all(), \
+            f"{name} contains non-finite values after load"
diff --git a/test/modular/test_ming_flash_omni_model.py b/test/modular/test_ming_flash_omni_model.py
new file mode 100644
index 00000000..c4f51dfa
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_model.py
@@ -0,0 +1,333 @@
+"""Unit tests for Ling-2.0 MoE block + decoder layer + full thinker model.
+
+Tiny-config tests (vocab=64, hidden=32, layers=2, num_experts=8) that
+exercise the routing-mask paths, the dense-vs-MoE layer branch, and the
+end-to-end forward shape.
+
+Step-3b scope: no KV cache, no real weights, no batching. The model
+takes ``(T,)`` token ids or ``(T, hidden)`` embeds and returns
+``(T, vocab_size)`` logits.
+
+CUDA-only tests are gated on ``torch.cuda.is_available()`` because
+LingAttention's RMSNorm goes through flashinfer's CUDA kernel — same
+constraint as step 3a's attention tests.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from mstar.model.ming_omni_flash.components.decoder_layer import (
+    LingDecoderLayer,
+)
+from mstar.model.ming_omni_flash.components.model import LingMoeModel
+from mstar.model.ming_omni_flash.components.moe import LingMoeBlock
+from mstar.model.ming_omni_flash.components.rope import (
+    LingPartialMRotaryEmbedding,
+)
+
+torch.manual_seed(2026)
+
+
+class _MockCacheHandle:
+    """Test double for BatchedCacheManager: plain causal SDPA, no KV
+    cache. Duplicated from test_ming_flash_omni_components.py because
+    test/ isn't a package."""
+
+    def __init__(self) -> None:
+        self.layer_idx = 0
+
+    def set_layer_idx(self, layer_idx: int) -> None:
+        self.layer_idx = layer_idx
+
+    def run_attention(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+    ) -> torch.Tensor:
+        repeats = q.shape[1] // k.shape[1]
+        if repeats > 1:
+            # GQA: repeat each KV head so K/V line up with the query heads.
+            k, v = (t.repeat_interleave(repeats, dim=1) for t in (k, v))
+        # Swap the first two axes and add a leading batch axis of 1 for SDPA.
+        q4, k4, v4 = (t.transpose(0, 1).unsqueeze(0) for t in (q, k, v))
+        scale = q.shape[-1] ** -0.5
+        attn = F.scaled_dot_product_attention(
+            q4, k4, v4, is_causal=True, scale=scale,
+        )
+        return attn.squeeze(0).transpose(0, 1).contiguous()
+
+
+# ---------------------------------------------------------------------------
+# LingMoeBlock
+# ---------------------------------------------------------------------------
+
+
+def _make_moe(hidden_size: int = 16) -> LingMoeBlock:
+    """Build the small 8-expert LingMoeBlock shared by the tests below."""
+    config = dict(
+        num_experts=8,
+        num_experts_per_tok=2,
+        moe_intermediate_size=16,
+        num_shared_experts=1,
+        n_group=2, topk_group=1,
+        routed_scaling_factor=1.0,
+    )
+    return LingMoeBlock(hidden_size=hidden_size, **config)
+
+
+def test_ling_moe_block_text_only_forward_shape() -> None:
+    """Text-only routing (no modality masks) preserves the input shape.
+
+    The fused expert and shared expert weights get small random values
+    first so the output isn't trivially zero.
+    """
+    block = _make_moe()
+    with torch.no_grad():
+        for weight in (block.experts.gate_up_proj, block.experts.down_proj):
+            weight.normal_(std=0.05)
+        for param in block.shared_expert.parameters():
+            param.normal_(std=0.05)
+    tokens = torch.randn(6, 16)
+    result = block(tokens)
+    assert result.shape == tokens.shape
+    assert torch.isfinite(result).all()
+
+
+def test_ling_moe_block_image_mask_routes_through_image_gate() -> None:
+    """When ``image_mask`` is True for some positions, those positions
+    receive the chosen expert set from ``image_gate`` instead of ``gate``.
+
+    Bias the text gate towards expert 0 and the image gate towards expert 5
+    by spiking one input dim and one weight entry per gate; verify expert 5
+    is selected at masked rows, expert 0 at unmasked rows, and expert 0 is
+    absent from masked rows.
+    """
+    moe = _make_moe()
+    # Make the text gate strongly prefer expert 0 across all inputs;
+    # make the image gate strongly prefer expert 5.
+    with torch.no_grad():
+        moe.gate.gate.weight.zero_()
+        moe.gate.gate.weight[0, 0] = 10.0
+        moe.image_gate.gate.weight.zero_()
+        moe.image_gate.gate.weight[5, 0] = 10.0
+        moe.audio_gate.gate.weight.zero_()
+        moe.experts.gate_up_proj.normal_(std=0.05)
+        moe.experts.down_proj.normal_(std=0.05)
+        # ParallelGatedMLP shared expert uses torch.empty for init;
+        # initialise so forward doesn't produce NaN.
+        for p in moe.shared_expert.parameters():
+            p.normal_(std=0.05)
+
+    N = 6
+    x = torch.zeros(N, 16)
+    x[:, 0] = 1.0  # light up the boosted input dim
+    image_mask = torch.tensor([True, True, True, False, False, False])
+
+    # Recompute the mask-based selection by hand from the two gates' outputs,
+    # since the forward returns post-dispatch tensors only.
+    _, _, text_idx = moe.gate(x)
+    _, _, image_idx = moe.image_gate(x)
+    image_mask_n = image_mask.reshape(N, 1).bool()
+    selected_idx = torch.where(image_mask_n, image_idx, text_idx)
+
+    # Masked rows: expert 5 (image gate's pick) appears.
+    assert (selected_idx[:3] == 5).any(dim=-1).all(), selected_idx[:3]
+    # Unmasked rows: expert 0 (text gate's pick) appears.
+    assert (selected_idx[3:] == 0).any(dim=-1).all(), selected_idx[3:]
+    # Masked rows do NOT contain expert 0 (the text gate's boosted expert).
+    assert not (selected_idx[:3] == 0).any(), selected_idx[:3]
+
+    # And the forward itself runs through end-to-end with the mask:
+    out = moe(x, image_mask=image_mask)
+    assert out.shape == x.shape
+    assert torch.isfinite(out).all()
+
+
+def test_ling_moe_block_shared_expert_contributes() -> None:
+    """Output differs when the shared expert has non-zero weights vs
+    zeroed weights — proves the shared expert isn't dead code."""
+    moe = _make_moe()
+    with torch.no_grad():
+        moe.experts.gate_up_proj.normal_(std=0.05)
+        moe.experts.down_proj.normal_(std=0.05)
+        # Start with shared expert zeroed.
+        for p in moe.shared_expert.parameters():
+            p.zero_()
+    x = torch.randn(4, 16)
+    out_zero_shared = moe(x).clone()
+    with torch.no_grad():
+        for p in moe.shared_expert.parameters():
+            p.normal_(std=0.1)
+    out_with_shared = moe(x)
+    # NaN never compares close, so require finite outputs before comparing.
+    assert torch.isfinite(out_zero_shared).all() and torch.isfinite(out_with_shared).all()
+    assert not torch.allclose(out_zero_shared, out_with_shared), \
+        "shared expert weights had no effect — possibly skipped in forward"
+
+
+def test_ling_moe_block_rejects_bad_mask_shape() -> None:
+    """A mask whose element count differs from num_tokens raises ValueError.
+
+    The weights are initialised even though the shape check is expected
+    to fire before any heavy forward work: staying consistent with the
+    other tests means a future "rejects after partial forward" failure
+    also surfaces cleanly.
+    """
+    block = _make_moe()
+    with torch.no_grad():
+        for weight in (block.experts.gate_up_proj, block.experts.down_proj):
+            weight.normal_(std=0.05)
+        for param in block.shared_expert.parameters():
+            param.normal_(std=0.05)
+    tokens = torch.randn(5, 16)
+    short_mask = torch.zeros(3, dtype=torch.bool)   # 3 entries for 5 tokens
+    with pytest.raises(ValueError, match="image_mask"):
+        block(tokens, image_mask=short_mask)
+
+
+# ---------------------------------------------------------------------------
+# LingMoeModel — input_ids / input_embeds / shape contracts
+# ---------------------------------------------------------------------------
+
+
+def _tiny_model_kwargs() -> dict:
+    """Constructor kwargs for a deliberately tiny 2-layer LingMoeModel.
+
+    head_dim=8 with partial_rotary_factor=0.5 gives rotary_dim=4, so
+    mrope_section must sum to rotary_dim//2 = 2; [1, 1, 0] is the simplest valid split.
+    """
+    return {
+        "vocab_size": 64, "hidden_size": 32, "intermediate_size": 64,
+        "moe_intermediate_size": 16,
+        "num_hidden_layers": 2,
+        "num_attention_heads": 4, "num_kv_heads": 2, "head_dim": 8,
+        "rms_norm_eps": 1e-6,
+        "rope_theta": 10000.0, "max_position_embeddings": 128,
+        "partial_rotary_factor": 0.5, "mrope_section": [1, 1, 0],
+        "num_experts": 8, "num_experts_per_tok": 2,
+        "num_shared_experts": 1,
+        "n_group": 2, "topk_group": 1,
+        "routed_scaling_factor": 1.0,
+        "first_k_dense_replace": 1,
+    }
+
+
+def _init_dispatch_weights(model: LingMoeModel) -> None:
+    """Overwrite every parameter in place so no ``torch.empty`` storage
+    (the Parallel* modules + the fused MoE experts) reaches a forward.
+    Real weight loading does this in production; tests need it to avoid
+    NaN logits."""
+    with torch.no_grad():
+        for name, param in model.named_parameters():
+            if "norm" in name:
+                # Norm weights become 1.0 so the norm applies unit gain.
+                param.fill_(1.0)
+            elif "embed" in name:
+                # Embeddings get a narrow normal (std 0.02).
+                param.normal_(std=0.02)
+            else:
+                # Everything else gets a slightly wider normal (std 0.05).
+                param.normal_(std=0.05)
+
+
+def test_ling_moe_model_input_ids_xor_embeds_required() -> None:
+    """Supplying both or neither of input_ids / input_embeds raises ValueError."""
+    model = LingMoeModel(**_tiny_model_kwargs())
+    cache = _MockCacheHandle()
+    ids, embeds = torch.zeros(3, dtype=torch.long), torch.zeros(3, 32)
+    for kwargs in ({"input_ids": None, "input_embeds": None},
+                   {"input_ids": ids, "input_embeds": embeds}):
+        with pytest.raises(ValueError, match="Exactly one"):
+            model(cache, **kwargs)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="LingAttention uses mstar RMSNorm (CUDA-only via flashinfer)")
+def test_ling_moe_model_forward_with_input_ids_shape() -> None:
+    """A (T,) tensor of token ids yields finite logits of shape (T, vocab_size)."""
+    # bf16 — required by mstar's fused MoE kernel (asserts dtype in
+    # {bf16, fp16}). The real model loads bf16 weights, so this matches.
+    model = LingMoeModel(**_tiny_model_kwargs()).cuda().to(torch.bfloat16)
+    _init_dispatch_weights(model)
+    num_tokens, vocab = 5, 64
+    token_ids = torch.randint(0, vocab, (num_tokens,), device="cuda")
+    logits = model(_MockCacheHandle(), input_ids=token_ids)
+    assert logits.shape == (num_tokens, vocab)
+    assert torch.isfinite(logits).all()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="LingAttention uses mstar RMSNorm (CUDA-only via flashinfer)")
+def test_ling_moe_model_forward_with_input_embeds_shape() -> None:
+    """(T, hidden) input_embeds bypass embed_tokens and still yield (T, vocab) logits."""
+    model = LingMoeModel(**_tiny_model_kwargs()).cuda().to(torch.bfloat16)
+    _init_dispatch_weights(model)
+    num_tokens, hidden, vocab = 4, 32, 64
+    embeds = torch.randn(num_tokens, hidden, device="cuda", dtype=torch.bfloat16)
+    logits = model(_MockCacheHandle(), input_embeds=embeds)
+    assert logits.shape == (num_tokens, vocab)
+    assert torch.isfinite(logits).all()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="LingAttention uses mstar RMSNorm (CUDA-only via flashinfer)")
+def test_ling_decoder_layer_dense_vs_moe_paths_differ() -> None:
+    """Layer 0 (dense GatedMLP) and layer 1 (MoE) on the same input must
+    produce different outputs — verifies the layer-index branch is wired."""
+    rotary = LingPartialMRotaryEmbedding(
+        head_dim=8, partial_rotary_factor=0.5,
+        mrope_section=[1, 1, 0], rope_theta=10000.0,
+        max_position_embeddings=64,
+    ).cuda()
+    common = dict(
+        first_k_dense_replace=1,
+        hidden_size=32, intermediate_size=64, moe_intermediate_size=16,
+        num_attention_heads=4, num_kv_heads=2, head_dim=8,
+        rms_norm_eps=1e-6, rotary=rotary,
+        num_experts=8, num_experts_per_tok=2,
+        num_shared_experts=1, n_group=2, topk_group=1,
+        routed_scaling_factor=1.0,
+    )
+    dense = LingDecoderLayer(layer_idx=0, **common).cuda().to(torch.bfloat16)
+    moe = LingDecoderLayer(layer_idx=1, **common).cuda().to(torch.bfloat16)
+    # Initialise every param: torch.empty garbage/NaN would make `not allclose` vacuous.
+    _init_dispatch_weights(dense)
+    _init_dispatch_weights(moe)
+    # Copy attention + norms so any output diff comes from the FFN branch only.
+    moe.input_layernorm.load_state_dict(dense.input_layernorm.state_dict())
+    moe.post_attention_layernorm.load_state_dict(
+        dense.post_attention_layernorm.state_dict()
+    )
+    moe.self_attn.load_state_dict(dense.self_attn.state_dict())
+
+    assert dense.is_moe is False and moe.is_moe is True
+    x = torch.randn(3, 32, device="cuda", dtype=torch.bfloat16)
+    pos = torch.arange(3, device="cuda")
+    out_dense = dense(x, _MockCacheHandle(), pos)
+    out_moe = moe(x, _MockCacheHandle(), pos)
+    assert torch.isfinite(out_dense).all() and torch.isfinite(out_moe).all()
+    assert not torch.allclose(out_dense, out_moe), (
+        "dense and MoE layer paths produced identical output"
+    )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(),
+                    reason="LingAttention uses mstar RMSNorm (CUDA-only via flashinfer)")
+def test_ling_moe_model_causal() -> None:
+    """Logits at earlier positions are unaffected by an appended token.
+
+    Strongest end-to-end guard that nothing in the MoE / mask / rope
+    plumbing accidentally lets future tokens influence past ones.
+    """
+    model = LingMoeModel(**_tiny_model_kwargs()).cuda().to(torch.bfloat16).eval()
+    _init_dispatch_weights(model)
+    prefix = torch.randint(0, 64, (4,), device="cuda")
+    logits_prefix = model(_MockCacheHandle(), input_ids=prefix)
+    extra = torch.randint(0, 64, (1,), device="cuda")
+    logits_full = model(_MockCacheHandle(), input_ids=torch.cat([prefix, extra]))
+    # bf16 tolerance — 2 layers' worth of bf16 ops drift more than fp32.
+    matches = torch.allclose(logits_prefix, logits_full[:4], atol=0.05)
+    assert matches, (
+        "causal mask leaked: appending a token changed earlier-position logits"
+    )
diff --git a/test/modular/test_ming_flash_omni_positions.py b/test/modular/test_ming_flash_omni_positions.py
new file mode 100644
index 00000000..7b833bc1
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_positions.py
@@ -0,0 +1,159 @@
+"""Tests for Ming's 3D MRoPE position-id helpers (step 5b).
+
+These mirror the math in
+``modeling_bailing_moe_v2.get_rope_index:625-647`` (vision span) and
+the pure-text branch (``658-675``). Audio is treated as text positions
+upstream, so the audio helper is just a thin alias.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.positions import (
+    get_rope_index_audio,
+    get_rope_index_text,
+    get_rope_index_vision,
+    vision_span_max_position,
+)
+
+# ---------------------------------------------------------------------------
+# get_rope_index_text
+# ---------------------------------------------------------------------------
+
+
+def test_text_positions_shape_and_offset() -> None:
+    """Result is ``(3, T)``: three identical rows counting up from start_pos."""
+    pos = get_rope_index_text(seq_len=5, start_pos=10)
+    assert pos.shape == (3, 5)
+    row = torch.arange(10, 15)
+    torch.testing.assert_close(pos, row.repeat(3, 1))
+
+
+def test_text_positions_start_at_zero() -> None:
+    """With start_pos=0 every row is simply 0..seq_len-1."""
+    assert get_rope_index_text(seq_len=3, start_pos=0).tolist() == [[0, 1, 2]] * 3
+
+
+def test_text_positions_long_dtype_default() -> None:
+    """Positions default to ``torch.long``."""
+    assert get_rope_index_text(seq_len=2, start_pos=0).dtype == torch.long
+
+
+# ---------------------------------------------------------------------------
+# get_rope_index_audio
+# ---------------------------------------------------------------------------
+
+
+def test_audio_positions_match_text_positions() -> None:
+    """Audio is text-positioned upstream, so both helpers must agree."""
+    audio = get_rope_index_audio(num_audio_tokens=7, start_pos=4)
+    text = get_rope_index_text(seq_len=7, start_pos=4)
+    torch.testing.assert_close(audio, text)
+
+
+# ---------------------------------------------------------------------------
+# get_rope_index_vision
+# ---------------------------------------------------------------------------
+
+
+def test_vision_positions_single_image_no_temporal_scale() -> None:
+    """grid_thw=(1, 4, 4) with spatial_merge=2 gives 1 * 2 * 2 = 4 tokens.
+
+    With a single frame the temporal row is constant; the H row reads
+    [0,0,1,1] and the W row [0,1,0,1], all shifted by start_pos=10.
+    """
+    start = 10
+    grid = torch.tensor([1, 4, 4], dtype=torch.long)
+    pos = get_rope_index_vision(
+        grid_thw=grid, start_pos=start, spatial_merge_size=2,
+    )
+    assert pos.shape == (3, 4)
+    offsets = torch.tensor([
+        [0, 0, 0, 0],  # T
+        [0, 0, 1, 1],  # H
+        [0, 1, 0, 1],  # W
+    ])
+    torch.testing.assert_close(pos, offsets + start)
+
+
+def test_vision_positions_multi_frame_indexes_t_per_frame() -> None:
+    """grid_thw=(3, 2, 2) with spatial_merge=2 gives 3 frames × 1 × 1 = 3 tokens.
+
+    The temporal row steps by one per frame while the H/W rows stay zero
+    (one merged token per frame). No abs-time scaling is applied here.
+    """
+    grid = torch.tensor([3, 2, 2], dtype=torch.long)
+    pos = get_rope_index_vision(
+        grid_thw=grid, start_pos=0, spatial_merge_size=2,
+    )
+    assert pos.shape == (3, 3)
+    t_row, h_row, w_row = [0, 1, 2], [0, 0, 0], [0, 0, 0]
+    expected = torch.tensor([t_row, h_row, w_row])
+    torch.testing.assert_close(pos, expected)
+
+
+def test_vision_positions_absolute_time_scales_temporal() -> None:
+    """The temporal row is scaled by ``second_per_grid_t * tokens_per_second``.
+
+    Mirrors the video branch of get_rope_index where
+    ``time_tensor = expanded * second_per_grid_t * tokens_per_second``.
+    """
+    kwargs = dict(
+        grid_thw=torch.tensor([4, 2, 2], dtype=torch.long),
+        start_pos=0,
+        spatial_merge_size=2,
+        second_per_grid_t=0.5,    # half a second per frame
+        tokens_per_second=2,
+    )
+    rows = get_rope_index_vision(**kwargs).tolist()
+    # T row: (frame_index * 0.5 * 2).long() → [0, 1, 2, 3] across frames,
+    # each repeated H*W=1 times.
+    assert rows[0] == [0, 1, 2, 3]
+    assert rows[1] == rows[2] == [0, 0, 0, 0]
+
+
+def test_vision_positions_rejects_bad_grid_thw_shape() -> None:
+    """A 2-D ``grid_thw`` of shape (1, 3) is rejected with ValueError."""
+    batched = torch.tensor([[1, 4, 4]], dtype=torch.long)
+    with pytest.raises(ValueError, match="grid_thw must be a 1-D tensor of length 3"):
+        get_rope_index_vision(
+            grid_thw=batched, start_pos=0, spatial_merge_size=2,
+        )
+
+
+def test_vision_positions_rejects_non_divisible_grid() -> None:
+    """A grid dimension of 3 is not a multiple of spatial_merge_size=2, so ValueError is raised."""
+    odd_grid = torch.tensor([1, 3, 4], dtype=torch.long)
+    with pytest.raises(ValueError, match="not divisible by spatial_merge_size"):
+        get_rope_index_vision(
+            grid_thw=odd_grid, start_pos=0, spatial_merge_size=2,
+        )
+
+
+# ---------------------------------------------------------------------------
+# vision_span_max_position
+# ---------------------------------------------------------------------------
+
+
+def test_vision_span_max_position_no_time_scale() -> None:
+    """(1, 4, 4) at start=10: max offset is max(0, 1, 1) = 1; result = 10 + 1 + 1 = 12."""
+    nxt = vision_span_max_position(
+        grid_thw=torch.tensor([1, 4, 4]),
+        start_pos=10,
+        spatial_merge_size=2,
+    )
+    assert nxt == 10 + 1 + 1   # start + max(T, H, W offset) + 1
+
+
+def test_vision_span_max_position_with_time_scale() -> None:
+    """(4, 2, 2) at 0.5 s/frame and 2 tps: T = [0, 1, 2, 3], max 3, result = 0 + 3 + 1 = 4."""
+    kwargs = dict(
+        grid_thw=torch.tensor([4, 2, 2]),
+        start_pos=0,
+        spatial_merge_size=2,
+        second_per_grid_t=0.5,
+        tokens_per_second=2,
+    )
+    assert vision_span_max_position(**kwargs) == 4
diff --git a/test/modular/test_ming_flash_omni_postprocess.py b/test/modular/test_ming_flash_omni_postprocess.py
new file mode 100644
index 00000000..a3b67425
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_postprocess.py
@@ -0,0 +1,115 @@
+"""Tests for MingFlashOmniModel.postprocess multi-modality encoding.
+
+The model emits three output modalities across its graph walks — text
+(thinker decode), audio (talker), image (imagegen) — so postprocess must encode
+all three. Pure CPU: build a bare model via __new__ + a stub tokenizer.
+"""
+
+from __future__ import annotations
+
+import io
+
+import pytest
+import torch
+from PIL import Image
+
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+
+class _StubTokenizer:
+    def decode(self, ids, skip_special_tokens=True):  # id i -> letter number (i mod 26) from "A"
+        return "".join([chr(ord("A") + i % 26) for i in ids])
+
+
+def _model() -> MingFlashOmniModel:
+    bare = MingFlashOmniModel.__new__(MingFlashOmniModel)  # bypasses __init__
+    bare.tokenizer = _StubTokenizer()
+    return bare
+
+
+# ---------------------------------------------------------------------------
+# text
+# ---------------------------------------------------------------------------
+
+
+def test_postprocess_text_returns_utf8() -> None:
+    token_ids = torch.tensor([0, 1, 2])  # the stub tokenizer decodes these to "ABC"
+    assert _model().postprocess(token_ids, "text") == b"ABC"
+
+
+def test_postprocess_empty_returns_empty_bytes() -> None:
+    assert _model().postprocess(torch.empty(0, dtype=torch.long), "text") == b""
+
+
+# ---------------------------------------------------------------------------
+# audio
+# ---------------------------------------------------------------------------
+
+
+def test_postprocess_audio_returns_raw_float32_pcm() -> None:
+    import numpy as np
+
+    samples = [0.0, 0.5, -0.5, 1.0]
+    out = _model().postprocess(torch.tensor(samples, dtype=torch.float32), "audio")
+    # Four float32 samples occupy 16 bytes and round-trip exactly.
+    assert len(out) == 4 * 4
+    assert np.frombuffer(out, dtype=np.float32).tolist() == samples
+
+
+def test_postprocess_audio_empty() -> None:
+    assert _model().postprocess(torch.zeros(0), "audio") == b""
+
+
+# ---------------------------------------------------------------------------
+# image
+# ---------------------------------------------------------------------------
+
+
+def test_postprocess_image_chw_returns_png() -> None:
+    # An all-zero CHW tensor is mid-gray: 0.0 in [-1,1] maps to
+    # (0+1)*127.5 = 127.5 -> round 128.
+    png = _model().postprocess(torch.zeros(3, 8, 8), "image")
+    decoded = Image.open(io.BytesIO(png))
+    assert decoded.format == "PNG"
+    assert decoded.mode == "RGB"
+    assert decoded.size == (8, 8)  # (W, H)
+    mid_gray = (128,) * 3
+    assert decoded.getpixel((0, 0)) == mid_gray
+
+
+def test_postprocess_image_bchw_takes_first() -> None:
+    batch = torch.ones(2, 3, 4, 4)  # 1.0 -> 255
+    decoded = Image.open(io.BytesIO(_model().postprocess(batch, "image")))
+    assert decoded.size == (4, 4)
+    white = (255,) * 3
+    assert decoded.getpixel((0, 0)) == white
+
+
+def test_postprocess_image_clamps_out_of_range() -> None:
+    too_bright = torch.full((3, 2, 2), 5.0)  # clamps to 1.0 -> 255
+    png = _model().postprocess(too_bright, "image")
+    pixel = Image.open(io.BytesIO(png)).getpixel((0, 0))
+    assert pixel == (255, 255, 255)
+
+
+def test_postprocess_image_single_channel_expands_to_rgb() -> None:
+    gray = torch.zeros(1, 4, 4)  # 1-channel -> repeated to RGB
+    png = _model().postprocess(gray, "image")
+    decoded = Image.open(io.BytesIO(png))
+    assert decoded.mode == "RGB"
+    assert decoded.getpixel((0, 0)) == (128,) * 3
+
+
+def test_postprocess_image_bad_shape_raises() -> None:
+    with pytest.raises(ValueError, match="expected"):  # unsupported channel count is rejected
+        _model().postprocess(torch.zeros(5, 8, 8), "image")  # 5 channels
+
+
+# ---------------------------------------------------------------------------
+# unknown
+# ---------------------------------------------------------------------------
+
+
+def test_postprocess_unknown_modality_raises() -> None:
+    with pytest.raises(ValueError, match="Unsupported modality"):  # "video" is rejected
+        _model().postprocess(torch.zeros(3), "video")
diff --git a/test/modular/test_ming_flash_omni_process_prompt.py b/test/modular/test_ming_flash_omni_process_prompt.py
new file mode 100644
index 00000000..89b8f3b2
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_process_prompt.py
@@ -0,0 +1,508 @@
+"""Tests for MingFlashOmniModel.process_prompt (step 7).
+
+Two layers:
+
+  * Pure-Python tests using stub tokenizer + processor — verify the
+    dispatch (image/audio/video routing), tensor conversion (CHW
+    float [0,1] → HWC uint8), and result-key shape. Run on CPU,
+    no snapshot.
+
+  * Snapshot-gated tests with the real BailingMM2Processor — confirm
+    the chat template path, image processor, and audio processor
+    produce the expected result keys + shapes when called against
+    the actual checkpoint.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import numpy as np
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    MingFlashOmniModelConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+# ---------------------------------------------------------------------------
+# Snapshot discovery (mirrors test_ming_flash_omni_encoders.py)
+# ---------------------------------------------------------------------------
+
+
+def _find_local_snapshot() -> str | None:
+    def _has_shards(path: Path) -> bool:
+        return (
+            (path / "config.json").exists()
+            and (path / "model.safetensors.index.json").exists()
+            and (path / "model-00001-of-00042.safetensors").exists()
+        )
+
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and _has_shards(Path(override)):
+        return override
+    hybrid = Path("/dev/shm/ming-hybrid")
+    if _has_shards(hybrid):
+        return str(hybrid)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Stub tokenizer + processor for pure-Python tests
+# ---------------------------------------------------------------------------
+
+
+class _StubTokenizer:
+    """Just enough tokenizer surface to drive process_prompt's text path."""
+
+    eos_token = "<eos>"
+    eos_token_id = 0
+
+    def __init__(self) -> None:
+        # Record the content passed to apply_chat_template so tests can
+        # assert on prompt expansion (image-gen query-token block).
+        self.last_content: str | None = None
+
+    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True):
+        # Emit a deterministic synthetic string; tokenize=False means
+        # process_prompt will re-tokenize via __call__.
+        assert tokenize is False
+        self.last_content = messages[0]["content"]
+        return "<|USER|>" + messages[0]["content"] + "<|ASSISTANT|>"
+
+    def __call__(self, text, return_tensors="pt"):
+        # Toy: emit one int per character.
+        ids = torch.tensor([[ord(c) % 256 for c in text]], dtype=torch.long)
+        return type("Out", (), {"input_ids": ids})()
+
+
+class _StubImageProcessor:
+    """Return fixed-shape zero tensors for image/video input (pixel content is ignored)."""
+
+    def __call__(self, images=None, videos=None, return_tensors="pt", **kwargs):
+        if images is not None:
+            # Each image collapses to a single "patch" of fixed size for testing.
+            n = len(images)
+            return {
+                "pixel_values": torch.zeros(n, 3, 16, 16),
+                "image_grid_thw": torch.tensor([[1, 4, 4]] * n, dtype=torch.long),
+            }
+        if videos is not None:
+            n = len(videos)
+            frames = len(videos[0]) if hasattr(videos[0], "__len__") else 1  # first clip only
+            return {
+                "pixel_values_videos": torch.zeros(n * frames, 3, 16, 16),
+                "video_grid_thw": torch.tensor([[frames, 4, 4]] * n, dtype=torch.long),
+            }
+        return {}
+
+
+class _StubAudioProcessor:
+    """Mel-spectrogram stub: produces fixed (n_mels=8, T=20) for any clip."""
+
+    sampling_rate = 16000
+
+    def __call__(self, audios, **kwargs):
+        n = len(audios)
+        # (B, T, n_mels) following the upstream layout.
+        return {
+            "audio_feats": np.zeros((n, 20, 8), dtype=np.float32),
+            "audio_feats_lengths": np.array([20] * n, dtype=np.int64),
+            "encoder_feats_lengths": np.array([10] * n, dtype=np.int64),
+        }
+
+
+class _StubProcessor:
+    """Combine the modality stubs in the shape BailingMM2Processor exposes."""
+
+    def __init__(self) -> None:
+        self.image_processor = _StubImageProcessor()
+        self.audio_processor = _StubAudioProcessor()
+
+
+def _bare_model_with_stubs() -> MingFlashOmniModel:
+    inst = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    inst.config = MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(),
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+    )
+    inst.tokenizer = _StubTokenizer()
+    inst._processor = _StubProcessor()
+    inst._submodule_cache = {}
+    return inst
+
+
+def _model_with_imagegen(scales: list[int] | None = None) -> MingFlashOmniModel:
+    """Bare model whose config carries an ImageGenConfig (image output path)."""
+    from mstar.model.ming_omni_flash.config import ImageGenConfig
+
+    inst = _bare_model_with_stubs()
+    image_gen = ImageGenConfig()
+    if scales is not None:
+        image_gen.img_gen_scales = scales
+    inst.config.image_gen = image_gen
+    return inst
+
+
+# ---------------------------------------------------------------------------
+# Text-only path
+# ---------------------------------------------------------------------------
+
+
+def test_text_only_returns_text_inputs_and_empty_modality_lists() -> None:
+    m = _bare_model_with_stubs()
+    out = m.process_prompt(
+        prompt="hello",
+        input_modalities=["text"],
+        output_modalities=["text"],
+        tensors=None,
+    )
+    assert "text_inputs" in out and len(out["text_inputs"]) == 1
+    assert out["text_inputs"][0].dim() == 1
+    # All modality buckets exist but are empty (so the scheduler in
+    # step 5c sees a clean shape).
+    for key in [
+        "pixel_values", "image_grid_thw",
+        "pixel_values_videos", "video_grid_thw", "video_second_per_grid",
+        "audio_features", "audio_seqlens",
+    ]:
+        assert key in out and out[key] == []
+
+
+def test_no_prompt_returns_no_text_inputs() -> None:
+    """prompt=None → text_inputs empty (audio-only / image-only request)."""
+    m = _bare_model_with_stubs()
+    out = m.process_prompt(
+        prompt=None,
+        input_modalities=["audio"],
+        output_modalities=["text"],
+        tensors=None,
+    )
+    assert out["text_inputs"] == []
+
+
+def test_missing_tokenizer_raises() -> None:
+    m = _bare_model_with_stubs()
+    m.tokenizer = None
+    with pytest.raises(RuntimeError, match="tokenizer is not loaded"):
+        m.process_prompt(
+            prompt="hi", input_modalities=["text"],
+            output_modalities=["text"], tensors=None,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Image path
+# ---------------------------------------------------------------------------
+
+
+def test_image_path_routes_through_image_processor() -> None:
+    """CHW float [0,1] → image_processor → pixel_values + grid_thw."""
+    m = _bare_model_with_stubs()
+    img = torch.rand(3, 32, 32)
+    out = m.process_prompt(
+        prompt="describe", input_modalities=["text", "image"],
+        output_modalities=["text"],
+        tensors={"image_inputs": [img]},
+    )
+    assert len(out["pixel_values"]) == 1
+    assert out["pixel_values"][0].shape == (1, 3, 16, 16)
+    assert len(out["image_grid_thw"]) == 1
+    assert out["image_grid_thw"][0].tolist() == [1, 4, 4]
+
+
+def test_image_conversion_clamps_float_to_uint8_hwc() -> None:
+    """Ensure the CHW-float → HWC-uint8 conversion is bit-correct for the
+    happy path (qwen3_omni had a double-rescale bug that turned the input
+    near-zero; this test guards against the same regression).
+    """
+    chw = torch.tensor([
+        [[0.0, 1.0], [0.5, 0.25]],
+        [[0.1, 0.9], [0.4, 0.7]],
+        [[0.2, 0.8], [0.6, 0.3]],
+    ])  # (3, 2, 2) — all values in [0.0, 1.0], endpoints included
+    arr = MingFlashOmniModel._image_to_processor_input(chw)
+    # Output is HWC uint8 in [0, 255].
+    assert arr.shape == (2, 2, 3)
+    assert arr.dtype == np.uint8
+    # Top-left R channel was 0.0 → 0; top-right R was 1.0 → 255.
+    assert arr[0, 0, 0] == 0
+    assert arr[0, 1, 0] == 255
+
+
+def test_image_conversion_handles_grayscale_single_channel() -> None:
+    """(1, H, W) input gets broadcast to 3 channels (HF processors
+    don't accept single-channel patches)."""
+    gray = torch.full((1, 4, 4), 0.5)
+    arr = MingFlashOmniModel._image_to_processor_input(gray)
+    assert arr.shape == (4, 4, 3)
+    # All three channels share the same value.
+    assert (arr[..., 0] == arr[..., 1]).all() and (arr[..., 0] == arr[..., 2]).all()
+
+
+def test_image_inputs_require_processor() -> None:
+    m = _bare_model_with_stubs()
+    m._processor = None
+    img = torch.rand(3, 8, 8)
+    with pytest.raises(RuntimeError, match="processor is None"):
+        m.process_prompt(
+            prompt=None, input_modalities=["image"],
+            output_modalities=["text"], tensors={"image_inputs": [img]},
+        )
+
+
+def test_image_inputs_already_uint8_pass_through() -> None:
+    """uint8 CHW input doesn't get rescaled a second time."""
+    chw = torch.full((3, 4, 4), 128, dtype=torch.uint8)
+    arr = MingFlashOmniModel._image_to_processor_input(chw)
+    assert arr.dtype == np.uint8
+    assert (arr == 128).all()
+
+
+# ---------------------------------------------------------------------------
+# Audio path
+# ---------------------------------------------------------------------------
+
+
+def test_audio_path_returns_mel_n_mels_first_and_seqlens() -> None:
+    """The processor yields (B, T, n_mels); process_prompt transposes
+    to (n_mels, T) per clip — that's what the AudioEncoderSubmodule
+    expects in its single-clip prepare_inputs."""
+    m = _bare_model_with_stubs()
+    waveform = torch.randn(16000)  # 1 s at 16 kHz
+    out = m.process_prompt(
+        prompt=None, input_modalities=["audio"],
+        output_modalities=["text"], tensors={"audio_inputs": [waveform]},
+    )
+    assert len(out["audio_features"]) == 1
+    assert out["audio_features"][0].shape == (8, 20)  # (n_mels, T)
+    assert len(out["audio_seqlens"]) == 1
+    assert out["audio_seqlens"][0].tolist() == [20]
+
+
+def test_audio_path_accepts_waveform_sr_tuples() -> None:
+    """``(waveform, sample_rate)`` tuples are accepted as well as raw waveforms."""
+    m = _bare_model_with_stubs()
+    out = m.process_prompt(
+        prompt=None, input_modalities=["audio"],
+        output_modalities=["text"],
+        tensors={"audio_inputs": [(torch.randn(8000), 16000)]},
+    )
+    assert len(out["audio_features"]) == 1
+
+
+def test_audio_inputs_require_processor() -> None:
+    m = _bare_model_with_stubs()
+    m._processor = None
+    with pytest.raises(RuntimeError, match="processor is None"):
+        m.process_prompt(
+            prompt=None, input_modalities=["audio"],
+            output_modalities=["text"],
+            tensors={"audio_inputs": [torch.randn(8000)]},
+        )
+
+
+# ---------------------------------------------------------------------------
+# Video path
+# ---------------------------------------------------------------------------
+
+
+def test_video_path_returns_pixel_values_grid_and_second_per_grid_default() -> None:
+    m = _bare_model_with_stubs()
+    # (T, C, H, W) — 3 frames.
+    video = torch.rand(3, 3, 32, 32)
+    out = m.process_prompt(
+        prompt="watch", input_modalities=["text", "video"],
+        output_modalities=["text"],
+        tensors={"video_inputs": [video]},
+    )
+    assert len(out["pixel_values_videos"]) == 1
+    assert len(out["video_grid_thw"]) == 1
+    assert out["video_grid_thw"][0].tolist() == [3, 4, 4]
+    # Default second_per_grid is 1.0 when no metadata override.
+    assert len(out["video_second_per_grid"]) == 1
+    assert float(out["video_second_per_grid"][0].item()) == 1.0
+
+
+def test_video_path_respects_metadata_second_per_grid_override() -> None:
+    """``input_metadata['video'][i]['second_per_grid']`` overrides the default."""
+    m = _bare_model_with_stubs()
+    video = torch.rand(2, 3, 16, 16)
+    out = m.process_prompt(
+        prompt=None, input_modalities=["video"], output_modalities=["text"],
+        tensors={"video_inputs": [video]},
+        input_metadata={"video": [{"second_per_grid": 0.5}]},
+    )
+    assert float(out["video_second_per_grid"][0].item()) == 0.5
+
+
+# ---------------------------------------------------------------------------
+# Mixed-modality plumbing
+# ---------------------------------------------------------------------------
+
+
+def test_mixed_text_image_audio_all_buckets_populated() -> None:
+    """A request with all three modalities populates all three buckets."""
+    m = _bare_model_with_stubs()
+    out = m.process_prompt(
+        prompt="hello", input_modalities=["text", "image", "audio"],
+        output_modalities=["text"],
+        tensors={
+            "image_inputs": [torch.rand(3, 16, 16)],
+            "audio_inputs": [torch.randn(8000)],
+        },
+    )
+    assert len(out["text_inputs"]) == 1
+    assert len(out["pixel_values"]) == 1
+    assert len(out["audio_features"]) == 1
+    # No video for this request.
+    assert out["pixel_values_videos"] == []
+
+
+def test_multiple_images_emit_multiple_entries() -> None:
+    """Two images → two pixel_values + two image_grid_thw entries."""
+    m = _bare_model_with_stubs()
+    imgs = [torch.rand(3, 16, 16), torch.rand(3, 24, 24)]
+    out = m.process_prompt(
+        prompt="describe", input_modalities=["text", "image", "image"],
+        output_modalities=["text"],
+        tensors={"image_inputs": imgs},
+    )
+    assert len(out["pixel_values"]) == 2
+    assert len(out["image_grid_thw"]) == 2
+
+
+# ---------------------------------------------------------------------------
+# Snapshot-gated end-to-end with the real processor
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(
+    _find_local_snapshot() is None,
+    reason="Need Ming-flash-omni-2.0 snapshot (set MING_FLASH_OMNI_DIR).",
+)
+def test_process_prompt_text_only_with_real_tokenizer() -> None:
+    """End-to-end: real tokenizer + chat template produces non-empty input_ids."""
+    snap = _find_local_snapshot()
+    code_dir = os.environ.get("MING_CODE_DIR", "/tmp/ming_repo")
+    model = MingFlashOmniModel(model_path_hf=snap, ming_code_dir=code_dir)
+    if model.tokenizer is None:
+        pytest.skip("Tokenizer didn't load on this box (env-only, not a code bug).")
+    out = model.process_prompt(
+        prompt="What is the capital of France?",
+        input_modalities=["text"], output_modalities=["text"], tensors=None,
+    )
+    assert "text_inputs" in out and len(out["text_inputs"]) == 1
+    input_ids = out["text_inputs"][0]
+    assert input_ids.dim() == 1
+    # Non-trivial prompt → at least a handful of tokens.
+    assert input_ids.numel() > 5
+
+
+@pytest.mark.skipif(
+    _find_local_snapshot() is None,
+    reason="Need Ming-flash-omni-2.0 snapshot.",
+)
+def test_process_prompt_image_path_with_real_image_processor() -> None:
+    """End-to-end: real image processor accepts a tiny synthetic image."""
+    snap = _find_local_snapshot()
+    code_dir = os.environ.get("MING_CODE_DIR", "/tmp/ming_repo")
+    model = MingFlashOmniModel(model_path_hf=snap, ming_code_dir=code_dir)
+    if model.tokenizer is None or model._processor is None:
+        pytest.skip("Tokenizer/processor didn't load on this box.")
+    # 64x64 RGB image — small but the real processor's spatial_merge=2
+    # + patch_size=16 needs a multiple-of-32 input on both sides.
+    img = torch.rand(3, 64, 64)
+    try:
+        out = model.process_prompt(
+            prompt="What is in this image?",
+            input_modalities=["text", "image"], output_modalities=["text"],
+            tensors={"image_inputs": [img]},
+        )
+    except Exception as e:
+        pytest.skip(f"Real image processor failed to run on this box: {e}")
+    assert len(out["pixel_values"]) == 1
+    assert len(out["image_grid_thw"]) == 1
+    # Grid should be (1, h, w) where h*16 >= image height (after resizing).
+    grid = out["image_grid_thw"][0]
+    assert grid.shape == (3,) and int(grid[0].item()) == 1
+
+
+# ---------------------------------------------------------------------------
+# Image-generation prompt path (step 9b-pre): when output_modalities asks for
+# an image AND the deploy ships an ImageGenConfig, process_prompt appends the
+# <image><imagePatch>*N</image> query-token block before tokenizing.
+# ---------------------------------------------------------------------------
+
+
+def test_image_output_appends_query_token_block() -> None:
+    m = _model_with_imagegen()  # default img_gen_scales=[16] -> 256 tokens
+    out = m.process_prompt(
+        prompt="draw a cat",
+        input_modalities=["text"],
+        output_modalities=["image"],
+        tensors=None,
+    )
+    content = m.tokenizer.last_content
+    assert content is not None
+    assert content.startswith("draw a cat")
+    assert "<image>" in content and "</image>" in content
+    assert content.count("<imagePatch>") == 256
+    assert len(out["text_inputs"]) == 1
+
+
+def test_image_output_respects_img_gen_scales() -> None:
+    m = _model_with_imagegen(scales=[8])  # 8*8 = 64 query tokens
+    m.process_prompt(
+        prompt="draw a dog",
+        input_modalities=["text"],
+        output_modalities=["image"],
+        tensors=None,
+    )
+    assert m.tokenizer.last_content.count("<imagePatch>") == 64
+
+
+def test_text_output_does_not_append_query_block_even_with_imagegen() -> None:
+    """ImageGenConfig present but caller wants text -> no expansion."""
+    m = _model_with_imagegen()
+    m.process_prompt(
+        prompt="describe a cat",
+        input_modalities=["text"],
+        output_modalities=["text"],
+        tensors=None,
+    )
+    assert "<imagePatch>" not in m.tokenizer.last_content
+
+
+def test_image_output_without_imagegen_config_is_noop() -> None:
+    """Thinker-only deploy (no ImageGenConfig) ignores image output_modalities."""
+    m = _bare_model_with_stubs()  # config.image_gen is None
+    m.process_prompt(
+        prompt="draw a cat",
+        input_modalities=["text"],
+        output_modalities=["image"],
+        tensors=None,
+    )
+    assert "<imagePatch>" not in m.tokenizer.last_content
+
+
+def test_image_output_no_double_expansion() -> None:
+    """A prompt that already carries a patch block is left unchanged."""
+    m = _model_with_imagegen()
+    pre = "draw a cat<image>" + ("<imagePatch>" * 256) + "</image>"
+    m.process_prompt(
+        prompt=pre,
+        input_modalities=["text"],
+        output_modalities=["image"],
+        tensors=None,
+    )
+    # maybe_expand_image_gen_prompt is a no-op when a block already exists.
+    assert m.tokenizer.last_content.count("<imagePatch>") == 256
diff --git a/test/modular/test_ming_flash_omni_prompt_utils.py b/test/modular/test_ming_flash_omni_prompt_utils.py
new file mode 100644
index 00000000..0f9d3521
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_prompt_utils.py
@@ -0,0 +1,92 @@
+"""Tests for Ming-flash-omni-2.0 prompt utilities (step 8)."""
+
+from __future__ import annotations
+
+import json
+
+from mstar.model.ming_omni_flash.components.prompt_utils import (
+    BASE_CAPTION_TEMPLATE,
+    DEFAULT_NUM_QUERY_TOKENS,
+    IMAGE_PATCH_TOKEN,
+    create_instruction,
+    maybe_expand_image_gen_prompt,
+)
+
+# ---------------------------------------------------------------------------
+# Image-gen query-token expansion
+# ---------------------------------------------------------------------------
+
+
+def test_expand_appends_default_256_patch_block() -> None:
+    out = maybe_expand_image_gen_prompt("draw a cat")
+    assert out.startswith("draw a cat<image>")
+    assert out.endswith("</image>")
+    assert out.count(IMAGE_PATCH_TOKEN) == DEFAULT_NUM_QUERY_TOKENS  # 256
+
+
+def test_expand_respects_custom_token_count() -> None:
+    out = maybe_expand_image_gen_prompt("x", num_query_tokens=4)
+    assert out == "x<image>" + IMAGE_PATCH_TOKEN * 4 + "</image>"
+
+
+def test_expand_is_noop_when_already_has_patch_block() -> None:
+    pre = "y<image>" + IMAGE_PATCH_TOKEN * 16 + "</image>"
+    assert maybe_expand_image_gen_prompt(pre) == pre
+
+
+def test_expand_is_noop_on_empty_or_non_string() -> None:
+    assert maybe_expand_image_gen_prompt("") == ""
+    assert maybe_expand_image_gen_prompt(None) is None  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# TTS caption builder
+# ---------------------------------------------------------------------------
+
+
+def test_create_instruction_returns_valid_json_with_defaults() -> None:
+    s = create_instruction({})
+    parsed = json.loads(s)
+    assert "audio_sequence" in parsed
+    item = parsed["audio_sequence"][0]
+    assert item["序号"] == 1
+    assert item["说话人"] == "speaker_1"
+    assert item["BGM"]["Genre"] is None
+
+
+def test_create_instruction_merges_known_keys() -> None:
+    s = create_instruction({"说话人": "speaker_2", "情感": "happy"})
+    item = json.loads(s)["audio_sequence"][0]
+    assert item["说话人"] == "speaker_2"
+    assert item["情感"] == "happy"
+
+
+def test_create_instruction_ignores_unknown_keys() -> None:
+    s = create_instruction({"unknown_field": "x", "情感": "sad"})
+    item = json.loads(s)["audio_sequence"][0]
+    assert "unknown_field" not in item
+    assert item["情感"] == "sad"
+
+
+def test_create_instruction_does_not_mutate_base_template() -> None:
+    """create_instruction must deep-copy — calls must not leak into each other."""
+    create_instruction({"说话人": "speaker_9"})
+    # The module-level template is untouched.
+    assert BASE_CAPTION_TEMPLATE["audio_sequence"][0]["说话人"] == "speaker_1"
+    # A fresh call still sees the default.
+    item = json.loads(create_instruction({}))["audio_sequence"][0]
+    assert item["说话人"] == "speaker_1"
+
+
+def test_create_instruction_emits_unescaped_unicode() -> None:
+    """ensure_ascii=False keeps the Chinese field names readable."""
+    s = create_instruction({})
+    assert "说话人" in s   # not \uXXXX-escaped
+
+
+def test_create_instruction_nested_bgm_key_not_merged_at_top_level() -> None:
+    """BGM is a nested dict on the template; a top-level 'BGM' dict passed by the caller
+    replaces that entry as a whole — verify the merge is shallow (matches upstream)."""
+    s = create_instruction({"BGM": {"Genre": "jazz"}})
+    item = json.loads(s)["audio_sequence"][0]
+    assert item["BGM"] == {"Genre": "jazz"}
diff --git a/test/modular/test_ming_flash_omni_submodules.py b/test/modular/test_ming_flash_omni_submodules.py
new file mode 100644
index 00000000..eb557843
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_submodules.py
@@ -0,0 +1,611 @@
+"""Smoke tests for the Ming-flash-omni-2.0 encoder submodules (step 5a).
+
+VisionEncoderSubmodule + AudioEncoderSubmodule wrap the components
+ported in step 4. Tests cover three properties:
+
+  * ``prepare_inputs`` raises a clear error on missing inputs and
+    extracts tensors from the engine's NameToTensorList bundle.
+  * ``forward`` produces the expected output edge name + tensor shape
+    on tiny CPU instances (no snapshot needed; weights random).
+  * The L2-norm post-projector matches Ming's source
+    (``modeling_bailingmm2.extract_image_feature`` /
+    ``extract_audio_feature``).
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.audio_encoder import MingAudioEncoder
+from mstar.model.ming_omni_flash.components.projectors import (
+    MingAudioProjector,
+    MingVisionProjector,
+)
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    MingFlashOmniModelConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.submodules import (
+    AudioEncoderSubmodule,
+    VisionEncoderSubmodule,
+)
+
+
+def _tiny_config() -> MingFlashOmniModelConfig:
+    """Tiny config with the released ckpt's modal token IDs preserved."""
+    return MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(),
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+    )
+
+
+# ---------------------------------------------------------------------------
+# AudioEncoderSubmodule — pure Python (random weights, CPU)
+# ---------------------------------------------------------------------------
+
+
+def _build_audio_submodule(hidden_size: int = 16) -> AudioEncoderSubmodule:
+    cfg = _tiny_config()
+    # Override LLM hidden_size so the projector output dim is small.
+    cfg.thinker_llm = ThinkerLLMConfig(
+        hidden_size=hidden_size, num_attention_heads=4, num_key_value_heads=2,
+        head_dim=hidden_size // 4,
+    )
+    enc = MingAudioEncoder(n_mels=8, n_ctx=128, n_state=16, n_head=2, n_layer=2, use_flash_attn=False)
+    enc = enc.float()
+    proj = MingAudioProjector(audio_dim=16, llm_dim=hidden_size, mlp_depth=2)
+    proj = proj.float()
+    return AudioEncoderSubmodule(audio_encoder=enc, audio_projector=proj, config=cfg)
+
+
+def test_audio_submodule_prepare_inputs_raises_on_missing_features() -> None:
+    sub = _build_audio_submodule()
+    with pytest.raises(ValueError, match="missing 'audio_features'"):
+        sub.prepare_inputs(graph_walk="prefill_audio", fwd_info=None, inputs={})
+
+
+def test_audio_submodule_prepare_inputs_passes_optional_seqlens() -> None:
+    """``audio_seqlens`` is optional — None when caller didn't provide it."""
+    sub = _build_audio_submodule()
+    features = torch.randn(8, 10)
+    out = sub.prepare_inputs(
+        graph_walk="prefill_audio", fwd_info=None,
+        inputs={"audio_features": [features]},
+    )
+    assert out.tensor_inputs["audio_features"] is features
+    assert out.tensor_inputs["audio_seqlens"] is None
+
+
+def test_audio_submodule_forward_single_clip_shape() -> None:
+    """One clip → ``audio_embeds`` shape (T', llm_dim), L2-normed."""
+    sub = _build_audio_submodule(hidden_size=16)
+    features = torch.randn(8, 10)  # (n_mels, T)
+    out = sub.forward(
+        graph_walk="prefill_audio", engine_inputs=None,
+        audio_features=features, audio_seqlens=None,
+    )
+    embeds = out["audio_embeds"][0]
+    # Two convs: T=10 → conv1 stride=1 → 10; conv2 stride=2 → 6.
+    # Projector conv kernel=3 stride=2 pad=1 → T'' = (6-3+2)//2+1 = 3.
+    assert embeds.shape == (3, 16)
+    # ``norm_query_embeds=True`` by default → each row has unit norm.
+    norms = embeds.norm(dim=-1)
+    assert torch.allclose(norms, torch.ones_like(norms), atol=1e-5)
+
+
+def test_audio_submodule_forward_batched_clips_concatenates_along_time() -> None:
+    """(B, n_mels, T) batched input concatenates per-clip output along time."""
+    sub = _build_audio_submodule(hidden_size=16)
+    features = torch.randn(2, 8, 10)  # 2 clips
+    out = sub.forward(
+        graph_walk="prefill_audio", engine_inputs=None,
+        audio_features=features, audio_seqlens=None,
+    )
+    embeds = out["audio_embeds"][0]
+    # Same per-clip T'' = 3, two clips → 6 rows.
+    assert embeds.shape == (6, 16)
+
+
+def test_audio_submodule_forward_respects_audio_seqlens() -> None:
+    """``audio_seqlens`` trims padded tail before encoding."""
+    sub = _build_audio_submodule(hidden_size=16)
+    # Pad clip[0]'s T from 6 to 10 (extra noise tail). audio_seqlens=[6]
+    # should make the encoder see only the first 6 frames.
+    features_padded = torch.randn(8, 10)
+    features_trimmed = features_padded[:, :6]
+    seqlens = torch.tensor([6])
+
+    out_padded = sub.forward(
+        graph_walk="prefill_audio", engine_inputs=None,
+        audio_features=features_padded, audio_seqlens=seqlens,
+    )
+    out_trimmed = sub.forward(
+        graph_walk="prefill_audio", engine_inputs=None,
+        audio_features=features_trimmed, audio_seqlens=None,
+    )
+    # Same output: padded version with seqlens=[6] equals raw 6-frame version.
+    torch.testing.assert_close(
+        out_padded["audio_embeds"][0], out_trimmed["audio_embeds"][0], rtol=1e-5, atol=1e-5,
+    )
+
+
+# ---------------------------------------------------------------------------
+# VisionEncoderSubmodule — pure Python (mock encoder, CPU)
+# ---------------------------------------------------------------------------
+
+
+class _MockVisionEncoder(torch.nn.Module):
+    """Stand-in for Qwen3MoeVisionTransformer that the submodule can drive.
+
+    The real encoder needs the staged Ming source + nvrtc kernels; for
+    a CPU unit test we mock the (pixel_values, grid_thw) → embeddings
+    contract so the rest of the wrapper is exercised end-to-end.
+    """
+
+    def __init__(self, out_dim: int):
+        super().__init__()
+        self.out_dim = out_dim
+        # Project pixel input into the encoder's "out_hidden_size" space.
+        # Use a small trainable projection so the param-detection in
+        # NodeSubmodule.get_device works (real encoder has params).
+        self.dummy = torch.nn.Linear(8, out_dim, bias=False)
+
+    def forward(self, pixel_values: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        # Pretend each grid_thw row yields T*H*W tokens of out_dim each (this
+        # mock applies no spatial-merge reduction); pixel_values is projected to out_dim.
+        n_tokens = int(grid_thw.prod(dim=-1).sum().item())
+        # Down/up-sample to n_tokens deterministically.
+        x = self.dummy(pixel_values)
+        if x.shape[0] >= n_tokens:
+            return x[:n_tokens]
+        # Tile if input is smaller than requested.
+        reps = (n_tokens + x.shape[0] - 1) // x.shape[0]
+        return x.repeat(reps, 1)[:n_tokens]
+
+
+def _build_vision_submodule(vision_dim: int = 32, llm_dim: int = 16) -> VisionEncoderSubmodule:
+    cfg = _tiny_config()
+    cfg.thinker_llm = ThinkerLLMConfig(
+        hidden_size=llm_dim, num_attention_heads=4, num_key_value_heads=2,
+        head_dim=llm_dim // 4,
+    )
+    cfg.vision = VisionEncoderConfig(out_hidden_size=vision_dim)
+    enc = _MockVisionEncoder(out_dim=vision_dim)
+    proj = MingVisionProjector(vision_dim=vision_dim, llm_dim=llm_dim, mlp_depth=2)
+    return VisionEncoderSubmodule(vision_encoder=enc, vision_projector=proj, config=cfg)
+
+
+def test_vision_submodule_prepare_inputs_raises_on_missing_pixel_values() -> None:
+    sub = _build_vision_submodule()
+    with pytest.raises(ValueError, match="missing 'pixel_values'"):
+        sub.prepare_inputs(graph_walk="prefill_vision", fwd_info=None, inputs={})
+
+
+def test_vision_submodule_prepare_inputs_raises_on_missing_grid_thw() -> None:
+    sub = _build_vision_submodule()
+    pixels = torch.randn(4, 8)
+    with pytest.raises(ValueError, match="image_grid_thw"):
+        sub.prepare_inputs(
+            graph_walk="prefill_vision", fwd_info=None,
+            inputs={"pixel_values": [pixels]},
+        )
+
+
+def test_vision_submodule_prepare_inputs_promotes_1d_grid_thw() -> None:
+    """1-D ``[T, H, W]`` grid_thw gets promoted to ``(1, 3)``."""
+    sub = _build_vision_submodule()
+    pixels = torch.randn(4, 8)
+    grid_1d = torch.tensor([1, 2, 2], dtype=torch.long)
+    out = sub.prepare_inputs(
+        graph_walk="prefill_vision", fwd_info=None,
+        inputs={"pixel_values": [pixels], "image_grid_thw": [grid_1d]},
+    )
+    assert out.tensor_inputs["grid_thw"].shape == (1, 3)
+
+
+def test_vision_submodule_forward_produces_l2_normed_embeds() -> None:
+    """Forward yields one ``llm_dim`` row per grid token, each with unit L2 norm."""
+    llm_dim = 16
+    vision_sub = _build_vision_submodule(vision_dim=32, llm_dim=llm_dim)
+    single_image_grid = torch.tensor([[1, 2, 2]], dtype=torch.long)  # T*H*W = 4 tokens
+    result = vision_sub.forward(
+        graph_walk="prefill_vision", engine_inputs=None,
+        pixel_values=torch.randn(16, 8), grid_thw=single_image_grid,
+    )
+    first_image = result["vision_embeds"][0]
+    assert first_image.shape == (4, llm_dim)
+    row_norms = first_image.norm(dim=-1)
+    assert torch.allclose(row_norms, torch.ones_like(row_norms), atol=1e-5)
+
+
+# ---------------------------------------------------------------------------
+# get_node_engine_types registration (step 5a)
+# ---------------------------------------------------------------------------
+
+
+def test_get_node_engine_types_registers_encoders() -> None:
+    """Thinker maps to KV_CACHE; the vision and audio encoders map to STATELESS."""
+    from mstar.engine.base import EngineType
+    from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+    # ``__new__`` skips ``__init__`` (so no snapshot is loaded); the config
+    # attribute is injected by hand before get_node_engine_types is called
+    # on the bare instance.
+    bare_model = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    bare_model.config = _tiny_config()
+    engine_types = bare_model.get_node_engine_types()
+    assert engine_types["Thinker"] == EngineType.KV_CACHE
+    assert engine_types["vision_encoder"] == EngineType.STATELESS
+    assert engine_types["audio_encoder"] == EngineType.STATELESS
+
+
+def test_get_submodule_rejects_unknown_node() -> None:
+    """get_submodule raises a ValueError that names the unregistered node.
+
+    Talker (step 6e-2) and ImageGen (step 9b) are both registered now, so
+    the test uses a deliberately made-up node name instead.
+    """
+    from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+    bare_model = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    bare_model._submodule_cache = {}
+    bare_model.config = _tiny_config()
+    with pytest.raises(ValueError, match="Unknown node: 'NotARealNode'"):
+        bare_model.get_submodule("NotARealNode", device="cpu")
+
+
+# ---------------------------------------------------------------------------
+# Snapshot discovery helper (used by the snapshot-gated test at the end of this file)
+# ---------------------------------------------------------------------------
+
+
+def _find_local_snapshot() -> str | None:
+    """Return a local snapshot dir (env override first, then /dev/shm), else None.
+
+    Mirrors the helper in test_ming_flash_omni_encoders.py.
+    """
+    required = ("config.json", "model.safetensors.index.json", "model-00001-of-00042.safetensors")
+
+    def _looks_complete(root: Path) -> bool:
+        # all() short-circuits on the first missing file.
+        return all((root / name).exists() for name in required)
+
+    env_dir = os.environ.get("MING_FLASH_OMNI_DIR")
+    if env_dir and _looks_complete(Path(env_dir)):
+        return env_dir
+    shm_dir = Path("/dev/shm/ming-hybrid")
+    return str(shm_dir) if _looks_complete(shm_dir) else None
+
+
+# ---------------------------------------------------------------------------
+# BailingMoeV2ThinkerSubmodule.prepare_inputs dispatch (step 5b)
+# ---------------------------------------------------------------------------
+#
+# These build a fake LingMoeModel-like stub so we can exercise the
+# prepare_inputs dispatch (sentinel embed splice, position-id math)
+# without a multi-GB MoE forward pass. The model.forward is never
+# called in these tests; only prepare_inputs.
+
+
+class _StubEmbedTokens(torch.nn.Module):
+    """Deterministic embedding table for sentinel-id lookups in CPU unit tests.
+
+    Row ``i`` is zero except for the value ``i + 1`` at column
+    ``i % hidden_size``, so a test can tell which token id produced a row.
+    """
+
+    def __init__(self, vocab_size: int, hidden_size: int) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        # One vectorised indexed assignment instead of a per-token Python
+        # loop: the vocab here exceeds 150k rows and is rebuilt per test.
+        ids = torch.arange(vocab_size)
+        table = torch.zeros(vocab_size, hidden_size, dtype=torch.float32)
+        table[ids, ids % hidden_size] = (ids + 1).to(torch.float32)
+        self.weight = torch.nn.Parameter(table, requires_grad=False)
+
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        return self.weight[ids]
+
+
+class _StubLingMoeModel(torch.nn.Module):
+    """Smallest LingMoeModel stand-in the Thinker submodule can be built on.
+
+    Exposes just ``embed_tokens`` and ``lm_head``; the prepare_inputs tests
+    never run a forward pass, so none is defined here.
+    """
+
+    def __init__(self, vocab_size: int, hidden_size: int) -> None:
+        super().__init__()
+        self.embed_tokens = _StubEmbedTokens(vocab_size=vocab_size, hidden_size=hidden_size)
+        self.lm_head = torch.nn.Linear(in_features=hidden_size, out_features=vocab_size, bias=False)
+
+
+def _build_thinker_submodule(
+    hidden_size: int = 32,
+    vocab_size: int | None = None,
+):
+    """Return a BailingMoeV2ThinkerSubmodule wrapping a tiny stub model.
+
+    When ``vocab_size`` is None it is set to ``video_patch_token + 100`` so
+    that sentinel-token embed lookups stay inside the stub's table.
+    """
+    from mstar.model.ming_omni_flash.submodules import (
+        BailingMoeV2ThinkerSubmodule,
+    )
+    config = _tiny_config()
+    llm_cfg = config.thinker_llm
+    if vocab_size is None:
+        # Per the released ckpt, video_patch_token (157175) is the largest modal sentinel id.
+        vocab_size = llm_cfg.video_patch_token + 100
+    llm_cfg.vocab_size = vocab_size
+    llm_cfg.hidden_size = hidden_size
+    llm_cfg.head_dim = max(hidden_size // 4, 1)
+    llm_cfg.num_attention_heads, llm_cfg.num_key_value_heads = 4, 2
+    stub = _StubLingMoeModel(vocab_size=vocab_size, hidden_size=hidden_size)
+    return BailingMoeV2ThinkerSubmodule(model=stub, config=config)
+
+
+def test_thinker_prepare_inputs_prefill_text_uses_input_ids() -> None:
+    """prefill_text passes token ids through: no embeds, no custom positions."""
+    thinker = _build_thinker_submodule(hidden_size=32)
+    prompt_ids = torch.tensor([1, 2, 3, 4, 5], dtype=torch.long)
+    request = {"text_inputs": [prompt_ids]}
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill_text", fwd_info=None, inputs=request,
+    )
+    assert prepared.input_seq_len == 5
+    assert prepared.input_embeds is None
+    assert prepared.custom_pos_ids is None
+    torch.testing.assert_close(prepared.input_ids, prompt_ids)
+
+
+def test_thinker_prepare_inputs_absorbs_engine_kwargs() -> None:
+    """Extra keyword arguments from the engine must not break prepare_inputs.
+
+    The KV-cache engine passes pos_info AND seen_token_mask (the sampler
+    token mask). Without a **kwargs catch-all this would TypeError at serve
+    time, so this test guards the engine→submodule calling contract.
+    """
+    thinker = _build_thinker_submodule(hidden_size=32)
+    prompt_ids = torch.tensor([1, 2, 3], dtype=torch.long)
+    engine_extras = {"pos_info": {}, "some_future_kwarg": "ignored"}
+    engine_extras["seen_token_mask"] = None  # extra kwarg the engine passes
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill_text", fwd_info=None,
+        inputs={"text_inputs": [prompt_ids]},
+        **engine_extras,
+    )
+    assert prepared.input_seq_len == 3
+    torch.testing.assert_close(prepared.input_ids, prompt_ids)
+
+
+def test_thinker_check_stop_returns_graph_loop_name_on_eos() -> None:
+    """check_stop's stop set must use the Loop name the model graph declares.
+
+    If the two drift apart (say check_stop returns "decode_loop" while the
+    graph declares "thinker_decode_loop"), the worker's dynamic-loop registry
+    raises KeyError(NodeAndGraphWalk(node='decode_loop', ...)) on the EOS step
+    and the rank crashes. Reading the name off the real graph keeps this
+    test and the graph definition in lock-step.
+    """
+    from mstar.graph.base import Loop
+    from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+
+    thinker = _build_thinker_submodule()
+
+    # Read the decode loop's name straight from the model's walk graphs.
+    bare_model = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    bare_model.config = thinker.config
+    decode_walk = bare_model.get_graph_walk_graphs()["thinker_decode"]
+    assert isinstance(decode_walk, Loop)
+    loop_name = decode_walk.name
+    eos_id = thinker.eos_token_id
+
+    # Emitting EOS must stop exactly that loop.
+    eos_batch = {"new_token": [torch.tensor([eos_id])]}
+    stops = thinker.check_stop("rid", None, eos_batch)
+    assert stops == {loop_name}, f"{stops} != {{{loop_name!r}}}"
+
+    # Any other token must leave the stop set empty.
+    assert thinker.check_stop("rid", None, {"new_token": [torch.tensor([eos_id + 1])]}) == set()
+
+
+def test_thinker_prepare_inputs_legacy_prefill_walk_still_works() -> None:
+    """The legacy ``prefill`` walk name (step 3f) also takes the input_ids path."""
+    thinker = _build_thinker_submodule()
+    prompt_ids = torch.tensor([10, 20, 30], dtype=torch.long)
+    request = {"text_inputs": [prompt_ids]}
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill", fwd_info=None, inputs=request,
+    )
+    assert prepared.input_embeds is None
+    torch.testing.assert_close(prepared.input_ids, prompt_ids)
+
+
+def test_thinker_prepare_inputs_decode_path() -> None:
+    """A one-token thinker_decode step yields seq_len 1 and input_ids ``[42]``."""
+    thinker = _build_thinker_submodule()
+    last_token = torch.tensor([42], dtype=torch.long)
+    prepared = thinker.prepare_inputs(
+        graph_walk="thinker_decode", fwd_info=None, inputs={"text_inputs": [last_token]},
+    )
+    assert prepared.input_seq_len == 1
+    assert prepared.input_ids.tolist() == [42]
+
+
+def test_thinker_prepare_inputs_prefill_audio_splices_bos_eos() -> None:
+    """prefill_audio brackets audio_embeds with audio_start / audio_end embeds."""
+    hidden = 32
+    thinker = _build_thinker_submodule(hidden_size=hidden)
+    clip = torch.randn(4, hidden)
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill_audio", fwd_info=None, inputs={"audio_embeds": [clip]},
+    )
+    # start sentinel (1) + audio rows (4) + end sentinel (1) = 6.
+    assert prepared.input_seq_len == 6
+    spliced = prepared.input_embeds
+    assert spliced.shape == (6, hidden)
+    # Row 0 and row -1 must equal the embedding-table rows of the two
+    # audio sentinel token ids.
+    llm_cfg = thinker.config.thinker_llm
+    table = thinker.embed_tokens.weight
+    torch.testing.assert_close(spliced[0].float(), table[llm_cfg.audio_start_token].float())
+    torch.testing.assert_close(spliced[-1].float(), table[llm_cfg.audio_end_token].float())
+    # The four rows in between are the supplied audio embeds.
+    torch.testing.assert_close(spliced[1:5], clip)
+    # Positions are 3-row; row 0 counts up by one per token (text-like).
+    assert prepared.custom_pos_ids.shape == (3, 6)
+    assert prepared.custom_pos_ids[0].tolist() == [0, 1, 2, 3, 4, 5]
+
+
+def test_thinker_prepare_inputs_prefill_audio_advances_with_start_pos() -> None:
+    """With position_id_start=10 the 4-token audio span gets positions 10..13."""
+    from mstar.engine.kv_store import PositionInfo
+    thinker = _build_thinker_submodule(hidden_size=32)
+    start = 10
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill_audio", fwd_info=None,
+        inputs={"audio_embeds": [torch.randn(2, 32)]},
+        pos_info={"main": PositionInfo(position_id_start=start)},
+    )
+    assert prepared.input_seq_len == 4   # bos + 2 + eos
+    assert prepared.custom_pos_ids[0].tolist() == list(range(start, start + 4))
+
+
+def test_thinker_prepare_inputs_prefill_audio_raises_on_missing_audio_embeds() -> None:
+    """prefill_audio with empty ``inputs`` raises a ValueError naming the key."""
+    thinker = _build_thinker_submodule()
+    empty_request: dict = {}
+    with pytest.raises(ValueError, match="missing 'audio_embeds'"):
+        thinker.prepare_inputs(graph_walk="prefill_audio", fwd_info=None, inputs=empty_request)
+
+
+def test_thinker_prepare_inputs_prefill_vision_splices_bos_eos() -> None:
+    """prefill_vision brackets vision_embeds with image_start / image_end embeds."""
+    thinker = _build_thinker_submodule(hidden_size=32)
+    # A (1, 4, 4) grid with spatial_merge=2 corresponds to 4 vision tokens.
+    patch_embeds = torch.randn(4, 32)
+    request = {
+        "vision_embeds": [patch_embeds],
+        "image_grid_thw": [torch.tensor([1, 4, 4], dtype=torch.long)],
+    }
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill_vision", fwd_info=None, inputs=request,
+    )
+    # image_start (1) + vision rows (4) + image_end (1) = 6 positions.
+    assert prepared.input_seq_len == 6
+    assert prepared.input_embeds.shape == (6, 32)
+    llm_cfg = thinker.config.thinker_llm
+    table = thinker.embed_tokens.weight
+    first_row, last_row = prepared.input_embeds[0], prepared.input_embeds[-1]
+    torch.testing.assert_close(first_row.float(), table[llm_cfg.image_start_token].float())
+    torch.testing.assert_close(last_row.float(), table[llm_cfg.image_end_token].float())
+    pos = prepared.custom_pos_ids
+    assert pos.shape == (3, 6)  # 3D positions, grid-aware
+    # Column 0 is the image_start sentinel at start_pos=0. The vision span
+    # begins at start_pos+1=1; grid (1, 4, 4) with spatial_merge=2 becomes
+    # llm_grid (1, 2, 2), i.e. 4 tokens: the T row stays at 1, the H row
+    # reads [1, 1, 2, 2] and the W row reads [1, 2, 1, 2]. The largest
+    # position over all three rows is 2, so image_end lands on 2 + 1 = 3 in
+    # every row (Ming uses ``llm_pos_ids_list[-1].max() + 1`` — a global max,
+    # not per-row; see modeling_bailing_moe_v2.get_rope_index:632).
+    assert pos[0].tolist() == [0, 1, 1, 1, 1, 3]   # T row
+    assert pos[1].tolist() == [0, 1, 1, 2, 2, 3]   # H row
+    assert pos[2].tolist() == [0, 1, 2, 1, 2, 3]   # W row
+
+
+def test_thinker_prepare_inputs_prefill_video_uses_video_sentinels() -> None:
+    """prefill_video brackets the span with video_start / video_end embeds."""
+    thinker = _build_thinker_submodule(hidden_size=32)
+    # Grid (2, 2, 2) gives 2 tokens for spatial_merge=2 (a (1, 2, 2) grid
+    # would give only 1), so two embed rows are supplied below.
+    frame_embeds = torch.randn(2, 32)
+    request = {
+        "vision_embeds": [frame_embeds],
+        "image_grid_thw": [torch.tensor([2, 2, 2], dtype=torch.long)],
+        "video_second_per_grid": [torch.tensor(1.0)],
+    }
+    prepared = thinker.prepare_inputs(
+        graph_walk="prefill_video", fwd_info=None, inputs=request,
+    )
+    assert prepared.input_seq_len == 4   # bos + 2 + eos
+    llm_cfg = thinker.config.thinker_llm
+    table = thinker.embed_tokens.weight
+    torch.testing.assert_close(prepared.input_embeds[0].float(), table[llm_cfg.video_start_token].float())
+    torch.testing.assert_close(prepared.input_embeds[-1].float(), table[llm_cfg.video_end_token].float())
+
+
+def test_thinker_prepare_inputs_prefill_vision_raises_on_missing_grid_thw() -> None:
+    """vision_embeds without ``image_grid_thw`` raises a ValueError naming the key."""
+    thinker = _build_thinker_submodule()
+    with pytest.raises(ValueError, match="missing 'image_grid_thw'"):
+        thinker.prepare_inputs(
+            graph_walk="prefill_vision", fwd_info=None, inputs={"vision_embeds": [torch.randn(4, 32)]},
+        )
+
+
+def test_thinker_prepare_inputs_prefill_vision_rejects_multi_image() -> None:
+    """A two-row ``image_grid_thw`` raises NotImplementedError ("multi-image")."""
+    thinker = _build_thinker_submodule()
+    with pytest.raises(NotImplementedError, match="multi-image"):
+        two_grids = torch.tensor([[1, 4, 4], [1, 4, 4]], dtype=torch.long)
+        thinker.prepare_inputs(
+            graph_walk="prefill_vision",
+            fwd_info=None,
+            inputs={"vision_embeds": [torch.randn(4, 32)], "image_grid_thw": [two_grids]},
+        )
+
+
+def test_thinker_prepare_inputs_unknown_walk_raises() -> None:
+    """An unrecognised graph_walk name raises a ValueError."""
+    thinker = _build_thinker_submodule()
+    bogus_walk = "prefill_unicorn"
+    with pytest.raises(ValueError, match="unknown graph_walk"):
+        thinker.prepare_inputs(graph_walk=bogus_walk, fwd_info=None, inputs={})
+
+
+# ---------------------------------------------------------------------------
+# Snapshot-gated: end-to-end submodule construction with real weights
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(
+    _find_local_snapshot() is None,
+    reason="Need Ming-flash-omni-2.0 snapshot (set MING_FLASH_OMNI_DIR).",
+)
+def test_create_audio_encoder_submodule_loads_real_weights() -> None:
+    """Build the audio_encoder submodule from a real snapshot and smoke-check it.
+
+    Constructs ``MingFlashOmniModel`` from the local snapshot, requests the
+    ``audio_encoder`` submodule on CPU, and checks the wrapper type plus
+    non-zero encoder / projector weights. Skipped when no snapshot is found.
+
+    No CUDA needed — the submodule is requested with ``device="cpu"``.
+    """
+    from mstar.model.ming_omni_flash.ming_omni_flash_model import (
+        MingFlashOmniModel,
+        _find_ming_code_dir,
+    )
+
+    snapshot_dir = _find_local_snapshot()
+    ming_repo = _find_ming_code_dir() or "/tmp/ming_repo"
+
+    full_model = MingFlashOmniModel(model_path_hf=snapshot_dir, ming_code_dir=ming_repo)
+    audio_sub = full_model.get_submodule("audio_encoder", device="cpu")
+    assert isinstance(audio_sub, AudioEncoderSubmodule)
+    # Smoke check: conv1 and the first projector layer are not all-zero.
+    # NOTE(review): a non-zero sum alone cannot tell loaded from random init.
+    conv1_weight = audio_sub.audio_encoder.conv1.weight
+    assert conv1_weight.abs().sum().item() > 0
+    first_proj_weight = audio_sub.audio_projector.proj[0].weight
+    assert first_proj_weight.abs().sum().item() > 0
diff --git a/test/modular/test_ming_flash_omni_talker_dit.py b/test/modular/test_ming_flash_omni_talker_dit.py
new file mode 100644
index 00000000..d5ca519f
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_talker_dit.py
@@ -0,0 +1,599 @@
+"""Tests for the Ming-flash-omni-2.0 Talker DiT + CFM (step 6b).
+
+Covers the building blocks ported in ``components/talker_dit.py``:
+
+  * RotaryEmbedding's interleaved-pair layout (rotate_half) matches
+    x_transformers' convention so the released ckpt's weights load
+    against the same RoPE shape they were trained with.
+  * DiTTimestepEmbedding outputs the right shape and is dtype-stable.
+  * RMSNorm / FeedForward / Attention / DiTBlock / FinalLayer /
+    CondEmbedder shapes round-trip correctly.
+  * DiT.forward / forward_with_cfg returns the expected dims given a
+    flowmodel-shaped config.
+  * CFM.sample integrates over the EPSS schedule and returns the
+    initial-noise shape unchanged.
+  * build_talker_cfm constructs a DiT + CFM from a real TalkerConfig
+    without needing the checkpoint.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.talker_dit import (
+    CFM,
+    DiT,
+    DiTTimestepEmbedding,
+    RotaryEmbedding,
+    _apply_rotary_pos_emb,
+    _Attention,
+    _CondEmbedder,
+    _DiTBlock,
+    _FeedForward,
+    _FinalLayer,
+    _RMSNorm,
+    _rotate_half_interleaved,
+    _SinusPositionEmbedding,
+    build_talker_cfm,
+    get_epss_timesteps,
+)
+from mstar.model.ming_omni_flash.config import (
+    AudioVAEConfig,
+    DiTBlockConfig,
+    TalkerConfig,
+    TalkerLLMConfig,
+)
+
+# ---------------------------------------------------------------------------
+# RotaryEmbedding — match x_transformers' interleaved-pair layout
+# ---------------------------------------------------------------------------
+
+
+def test_rotate_half_interleaved_matches_pair_negation() -> None:
+    """Each adjacent pair ``(a, b)`` maps to ``(-b, a)``, as in upstream rotate_half."""
+    pairs = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
+    rotated = torch.tensor([[-2.0, 1.0, -4.0, 3.0]])
+    torch.testing.assert_close(_rotate_half_interleaved(pairs), rotated)
+
+
+def test_rotary_embedding_forward_from_seq_len_shape_and_pair_repeat() -> None:
+    """freqs is ``(1, T, dim)`` and both members of each adjacent pair are equal."""
+    seq_len, dim = 3, 4
+    freqs, xpos = RotaryEmbedding(dim=dim).forward_from_seq_len(seq_len=seq_len)
+    assert freqs.shape == (1, seq_len, dim)
+    assert xpos is None
+    # Even-indexed channels must equal their odd-indexed neighbours.
+    torch.testing.assert_close(freqs[..., 0::2], freqs[..., 1::2])
+
+
+def test_apply_rotary_pos_emb_partial_rotation_preserves_passed_through() -> None:
+    """Channels past the rotary dim come out of _apply_rotary_pos_emb unchanged."""
+    rot_dim = 4
+    freqs, _ = RotaryEmbedding(dim=rot_dim).forward_from_seq_len(seq_len=2)
+    # head_dim is 6, so two trailing channels lie outside the rotary span.
+    heads = torch.randn(1, 2, 2, 6)  # (B, H, T, head_dim)
+    rotated = _apply_rotary_pos_emb(heads, freqs)
+    assert rotated.shape == heads.shape
+    # Those two trailing channels must pass through untouched.
+    torch.testing.assert_close(rotated[..., rot_dim:], heads[..., rot_dim:])
+
+
+# ---------------------------------------------------------------------------
+# DiTTimestepEmbedding
+# ---------------------------------------------------------------------------
+
+
+def test_sinus_position_embedding_concat_sin_cos_shape() -> None:
+    """Output is ``(N, dim)`` and each row's squared entries sum to ``dim / 2``."""
+    dim = 8
+    embedded = _SinusPositionEmbedding(dim=dim)(torch.tensor([0.0, 1.0, 2.0]))
+    assert embedded.shape == (3, dim)
+    # sin^2 + cos^2 = 1 per frequency, and there are dim / 2 = 4 frequencies.
+    torch.testing.assert_close(embedded.pow(2).sum(dim=-1), torch.full((3,), 4.0))
+
+
+def test_sinus_position_embedding_rejects_odd_dim() -> None:
+    with pytest.raises(ValueError, match="must be even"):
+        _SinusPositionEmbedding(dim=7)  # odd dim: construction itself must raise
+
+
+def test_dit_timestep_embedding_shape_and_dtype() -> None:
+    """float32 timesteps in, ``(N, dim)`` float32 embeddings out."""
+    hidden = 16
+    timesteps = torch.tensor([0.1, 0.5, 0.9], dtype=torch.float32)
+    embedded = DiTTimestepEmbedding(dim=hidden, freq_embed_dim=8)(timesteps)
+    assert embedded.shape == (timesteps.numel(), hidden)
+    assert embedded.dtype == torch.float32
+
+
+# ---------------------------------------------------------------------------
+# DiT building blocks
+# ---------------------------------------------------------------------------
+
+
+def test_rmsnorm_normalises_per_row_to_unit_var() -> None:
+    """A freshly built _RMSNorm must return the row divided by its RMS."""
+    row = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
+    # mean of squares = (1 + 4 + 9 + 16) / 4 = 7.5, so rms = sqrt(7.5).
+    rms = 7.5 ** 0.5
+    normed = _RMSNorm(dim=4, eps=1e-12)(row)
+    torch.testing.assert_close(normed, row / rms, atol=1e-6, rtol=1e-6)
+
+
+def test_feed_forward_layer_indices_match_checkpoint_keys() -> None:
+    """The inner Sequential's indices must line up with the released talker
+    ckpt's ``blocks.N.mlp.ff.0.0`` / ``ff.0.1`` / ``ff.2`` keys.
+
+    Layout: ff.0 is Sequential(Linear, GELU), ff.1 is Dropout, ff.2 is Linear.
+    """
+    block = _FeedForward(dim=8, mult=2, dropout=0.0)
+    # ``block.ff`` is the outer Sequential whose children are checked below.
+    proj_in, drop, proj_out = block.ff[0], block.ff[1], block.ff[2]
+    assert isinstance(proj_in, torch.nn.Sequential)
+    assert isinstance(proj_in[0], torch.nn.Linear)
+    assert isinstance(proj_in[1], torch.nn.GELU)
+    assert isinstance(drop, torch.nn.Dropout)
+    assert isinstance(proj_out, torch.nn.Linear)
+    # Input and output shapes must agree.
+    sample = torch.randn(2, 4, 8)
+    assert block(sample).shape == (2, 4, 8)
+
+
+def test_attention_to_q_to_k_to_v_to_out_param_names() -> None:
+    """Parameter names match upstream keys: to_q / to_k / to_v plus to_out.0."""
+    attn = _Attention(dim=8, heads=2, dim_head=4)
+    names = {name for name, _ in attn.named_parameters()}
+    required = [
+        "to_q.weight", "to_q.bias", "to_k.weight", "to_k.bias",
+        "to_v.weight", "to_v.bias", "to_out.0.weight", "to_out.0.bias",
+    ]
+    for must_have in required:
+        assert must_have in names, f"missing param {must_have!r}"
+    # qk_norm=None → no q_norm / k_norm params.
+    has_qk_norm = any(name.startswith(("q_norm", "k_norm")) for name in names)
+    assert not has_qk_norm
+
+
+def test_attention_forward_shape_no_rope() -> None:
+    """Without rope the output keeps the input shape and stays finite."""
+    attn = _Attention(dim=16, heads=2, dim_head=8)
+    hidden = attn(torch.randn(2, 5, 16), rope=None)
+    assert hidden.shape == (2, 5, 16)
+    assert torch.isfinite(hidden).all()
+
+
+def test_attention_forward_shape_with_rope() -> None:
+    """Passing forward_from_seq_len's result as ``rope`` preserves the shape."""
+    attn = _Attention(dim=16, heads=2, dim_head=8)
+    # The rotary dim (8) equals dim_head here.
+    rope_freqs = RotaryEmbedding(dim=8).forward_from_seq_len(5)
+    hidden = attn(torch.randn(2, 5, 16), rope=rope_freqs)
+    assert hidden.shape == (2, 5, 16)
+
+
+def test_attention_qk_norm_rms_adds_q_norm_k_norm_params() -> None:
+    """qk_norm="rms_norm" registers q_norm.weight and k_norm.weight."""
+    attn = _Attention(dim=16, heads=2, dim_head=8, qk_norm="rms_norm")
+    names = {name for name, _ in attn.named_parameters()}
+    assert {"q_norm.weight", "k_norm.weight"} <= names
+
+
+def test_attention_rejects_unknown_qk_norm() -> None:
+    with pytest.raises(ValueError, match="Unimplemented qk_norm"):
+        _Attention(dim=16, heads=2, dim_head=8, qk_norm="layer_norm")  # must be rejected
+
+
+def test_dit_block_forward_runs_with_rope() -> None:
+    """_DiTBlock called with a None mask and rope keeps shape and stays finite."""
+    block = _DiTBlock(hidden_size=16, num_heads=2, mlp_ratio=2)
+    rope_freqs = RotaryEmbedding(dim=8).forward_from_seq_len(5)
+    # Step 6c gave DiTBlock.forward a mask argument (the Aggregator needs
+    # it); the CFM/DiT call path passes None for it.
+    hidden = block(torch.randn(2, 5, 16), None, rope_freqs)
+    assert hidden.shape == (2, 5, 16)
+    assert torch.isfinite(hidden).all()
+
+
+def test_final_layer_projects_to_out_channels() -> None:
+    """_FinalLayer maps the last dim from hidden_size (16) to out_channels (64)."""
+    head = _FinalLayer(hidden_size=16, out_channels=64)
+    projected = head(torch.randn(2, 5, 16))
+    assert projected.shape == (2, 5, 64)
+
+
+def test_cond_embedder_projects_llm_to_dit_hidden() -> None:
+    """_CondEmbedder maps the last dim from 896 to 1024, leaving ``(B, 1)`` as is."""
+    embedder = _CondEmbedder(input_feature_size=896, hidden_size=1024)
+    conditioned = embedder(torch.randn(2, 1, 896))
+    assert conditioned.shape == (2, 1, 1024)
+
+
+# ---------------------------------------------------------------------------
+# DiT — full assembly
+# ---------------------------------------------------------------------------
+
+
+def _tiny_dit(spk_dim: int | None = None) -> DiT:
+    """Return a depth-2, hidden_size-16 DiT for CPU tests.
+
+    ``spk_dim`` is forwarded to ``DiT`` unchanged; every other size is fixed.
+    """
+    fixed_sizes = {
+        "in_channels": 8, "hidden_size": 16, "depth": 2, "num_heads": 2,
+        "mlp_ratio": 2, "llm_cond_dim": 8, "dropout": 0.0,
+    }
+    dit = DiT(spk_dim=spk_dim, **fixed_sizes)
+    return dit
+
+
+def test_dit_forward_output_shape_includes_prefix_tokens() -> None:
+    """Without a speaker embedder the output is ``(B, 1 + his + patch, C)``."""
+    dit = _tiny_dit()
+    batch, history_len, patch_len = 2, 4, 3
+    noisy = torch.randn(batch, patch_len, 8)
+    timesteps = torch.tensor([0.5, 0.5])
+    cond = torch.randn(batch, 1, 8)
+    history = torch.randn(batch, history_len, 8)
+    pred = dit(noisy, timesteps, cond, history)
+    # Sequence axis: y (1 token) + latent_history (his) + x (patch).
+    assert pred.shape == (batch, 1 + history_len + patch_len, 8)
+    assert torch.isfinite(pred).all()
+
+
+def test_dit_forward_with_cfg_returns_only_x_rows() -> None:
+    """forward_with_cfg returns ``(2 * B, patch, C)``: doubled batch, x rows only."""
+    dit = _tiny_dit()
+    batch, patch_len = 2, 3
+    noisy = torch.randn(batch, patch_len, 8)
+    timestep = torch.tensor(0.3)
+    cond = torch.randn(batch, 1, 8)
+    history = torch.randn(batch, 4, 8)
+    guided = dit.forward_with_cfg(noisy, timestep, cond, history)
+    # Batch is doubled for CFG and only the trailing patch_len rows are kept.
+    assert guided.shape == (2 * batch, patch_len, 8)
+
+
+def test_dit_spk_embedder_absent_raises_when_emb_supplied() -> None:
+    """Passing spk_emb to a DiT built with spk_dim=None trips an assertion."""
+    dit = _tiny_dit(spk_dim=None)
+    call_kwargs = {
+        "x": torch.randn(2, 3, 8), "t": torch.tensor([0.5, 0.5]),
+        "c": torch.randn(2, 1, 8),
+        "latent_history": torch.randn(2, 4, 8),
+        "spk_emb": torch.randn(2, 16),
+    }
+    with pytest.raises(AssertionError, match="spk_embedder"):
+        dit(**call_kwargs)
+
+
+def test_dit_with_spk_embedder_concats_spk_token() -> None:
+    """With spk_dim set, the speaker token adds one row to the sequence axis."""
+    dit = _tiny_dit(spk_dim=16)
+    noisy = torch.randn(2, 3, 8)
+    timesteps = torch.tensor([0.5, 0.5])
+    cond = torch.randn(2, 1, 8)
+    history = torch.randn(2, 4, 8)
+    # spk_emb is 3D like ``c``: (B, 1, spk_dim). The spk_embedder projects it
+    # to (B, 1, hidden_size) and it is concatenated alongside y on dim=1.
+    speaker = torch.randn(2, 1, 16)
+    pred = dit(
+        x=noisy, t=timesteps, c=cond, latent_history=history, spk_emb=speaker,
+    )
+    # spk (1) + y (1) + his (4) + patch (3) = 9.
+    assert pred.shape == (2, 9, 8)
+
+
+# ---------------------------------------------------------------------------
+# CFM + EPSS schedule
+# ---------------------------------------------------------------------------
+
+
+def test_get_epss_timesteps_predefined_n_10_matches_upstream_schedule() -> None:
+    """steps=10 (the released ckpt's setting) must reproduce upstream's schedule."""
+    schedule = get_epss_timesteps(10, device="cpu", dtype=torch.float32)
+    # The expected schedule, written as multiples of 1/32.
+    thirty_seconds = [0, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32]
+    expected = (1 / 32) * torch.tensor(thirty_seconds, dtype=torch.float32)
+    torch.testing.assert_close(schedule, expected)
+    assert schedule.numel() == 11   # steps + 1
+
+
+def test_get_epss_timesteps_falls_back_to_linspace_for_unknown_n() -> None:
+    """n=9 must yield ``linspace(0, 1, 10)``."""
+    schedule = get_epss_timesteps(9, device="cpu", dtype=torch.float32)
+    torch.testing.assert_close(schedule, torch.linspace(0, 1, 10, dtype=torch.float32))
+
+
+def test_cfm_sample_returns_same_shape_as_y0() -> None:
+    """Smoke: CFM.sample's result has y0's shape and contains only finite values."""
+    steps = 4
+    cfm = CFM(model=_tiny_dit(), steps=steps)
+    batch, patch_len, latent_dim = 2, 3, 8
+    llm_cond = torch.randn(batch, 1, 8)
+    lat_cond = torch.randn(batch, 5, latent_dim)
+    y0 = torch.randn(batch, patch_len, latent_dim)
+    schedule = get_epss_timesteps(steps, device="cpu", dtype=torch.float32)
+    sde_args = torch.tensor([2.0, 0.0, 0.0])   # cfg=2, no sde noise
+    sde_rnd = torch.zeros(steps, batch, patch_len, latent_dim)
+    sampled = cfm.sample(llm_cond, lat_cond, y0, schedule, sde_args, sde_rnd)
+    assert sampled.shape == y0.shape
+    assert torch.isfinite(sampled).all()
+
+
+def test_cfm_sample_rejects_mismatched_t_length() -> None:
+    """A 3-entry ``t`` with steps=4 is rejected; the error names steps+1 = 5."""
+    cfm = CFM(model=_tiny_dit(), steps=4)
+    too_short_t = torch.zeros(3)
+    sample_kwargs = {
+        "llm_cond": torch.randn(1, 1, 8),
+        "lat_cond": torch.randn(1, 5, 8),
+        "y0": torch.randn(1, 3, 8),
+        "sde_args": torch.tensor([2.0, 0.0, 0.0]),
+        "sde_rnd": torch.zeros(4, 1, 3, 8),
+    }
+    with pytest.raises(ValueError, match=r"length steps\+1 = 5"):
+        cfm.sample(t=too_short_t, **sample_kwargs)
+
+
+def test_cfm_sample_rejects_mismatched_sde_rnd_first_dim() -> None:
+    """An sde_rnd whose leading dim is 3 is rejected when steps=4."""
+    cfm = CFM(model=_tiny_dit(), steps=4)
+    schedule = get_epss_timesteps(4, device="cpu", dtype=torch.float32)
+    sample_kwargs = {
+        "llm_cond": torch.randn(1, 1, 8),
+        "lat_cond": torch.randn(1, 5, 8),
+        "y0": torch.randn(1, 3, 8),
+        "sde_args": torch.tensor([2.0, 0.0, 0.0]),
+        "sde_rnd": torch.zeros(3, 1, 3, 8),  # leading dim 3, steps is 4
+    }
+    with pytest.raises(ValueError, match=r"sde_rnd\[0\] = 4"):
+        cfm.sample(t=schedule, **sample_kwargs)
+
+
+def test_cfm_no_sway_skips_remap() -> None:
+    """Smoke: with sway_sampling_coef=None, sample still returns y0's shape."""
+    cfm = CFM(model=_tiny_dit(), steps=4, sway_sampling_coef=None)
+    schedule = get_epss_timesteps(4, device="cpu", dtype=torch.float32)
+    llm_cond = torch.randn(1, 1, 8)
+    lat_cond = torch.randn(1, 5, 8)
+    y0 = torch.randn(1, 3, 8)
+    sde_args = torch.tensor([0.0, 0.0, 0.0])
+    sde_rnd = torch.zeros(4, 1, 3, 8)
+    sampled = cfm.sample(
+        llm_cond=llm_cond, lat_cond=lat_cond, y0=y0,
+        t=schedule, sde_args=sde_args, sde_rnd=sde_rnd,
+    )
+    assert sampled.shape == (1, 3, 8)
+
+
+# ---------------------------------------------------------------------------
+# build_talker_cfm factory
+# ---------------------------------------------------------------------------
+
+
+def test_build_talker_cfm_from_real_config() -> None:
+    """An all-defaults TalkerConfig builds a CFM whose DiT mirrors the config."""
+    cfg = TalkerConfig(
+        llm=TalkerLLMConfig(), flowmodel=DiTBlockConfig(),
+        aggregator=DiTBlockConfig(), vae=AudioVAEConfig(),
+    )
+    flow = cfg.flowmodel
+    cfm = build_talker_cfm(cfg, dtype=torch.float32, device="cpu")
+    assert isinstance(cfm, CFM)
+    assert cfm.steps == cfg.steps   # 10
+    dit = cfm.model
+    assert isinstance(dit, DiT)
+    assert dit.hidden_size == flow.hidden_size   # 1024
+    assert dit.in_channels == flow.in_channels   # 64
+    assert dit.num_heads == flow.num_heads       # 16
+    assert len(dit.blocks) == flow.depth         # 8
+    # With no override, the cond embedder's input width is the talker LLM
+    # hidden_size (896 on released ckpt).
+    assert dit.c_embedder.cond_embedder.in_features == cfg.llm.hidden_size
+
+
+def test_build_talker_cfm_accepts_llm_cond_dim_override() -> None:
+    cfg = TalkerConfig()
+    cfm = build_talker_cfm(cfg, llm_cond_dim=4096, dtype=torch.float32, device="cpu")
+    assert cfm.model.c_embedder.cond_embedder.in_features == 4096
+
+
+# ---------------------------------------------------------------------------
+# Step 6c — Attention mask handling
+# ---------------------------------------------------------------------------
+
+
+def test_attention_mask_zeros_output_at_padded_positions() -> None:
+    """``mask=False`` rows in input get zeroed in the output regardless of
+    `attn_mask_enabled` (mirrors upstream's unconditional masked_fill).
+    """
+    attn = _Attention(dim=8, heads=2, dim_head=4, attn_mask_enabled=False)
+    x = torch.randn(1, 4, 8)
+    mask = torch.tensor([[True, True, False, False]])
+    out = attn(x, mask=mask)
+    # First 2 rows should be the live attention output; last 2 zero.
+    assert (out[:, 2:].abs().sum() == 0).item()
+    assert (out[:, :2].abs().sum() > 0).item()
+
+
+def test_attention_mask_enabled_uses_sdpa_attn_mask() -> None:
+    """With attn_mask_enabled=True, padded keys shouldn't contribute to softmax.
+
+    Smoke check: forward runs without error and the live output rows
+    are still finite (we don't assert numerical equivalence to the
+    unmasked case since SDPA's mask changes attention weights).
+    """
+    attn = _Attention(dim=8, heads=2, dim_head=4, attn_mask_enabled=True)
+    x = torch.randn(1, 4, 8)
+    mask = torch.tensor([[True, True, False, False]])
+    out = attn(x, mask=mask)
+    assert torch.isfinite(out[:, :2]).all()
+
+
+def test_attention_no_mask_no_zeroing() -> None:
+    """mask=None must NOT zero anything (regression guard against the
+    upstream branch that only runs masked_fill when mask is not None).
+    """
+    attn = _Attention(dim=8, heads=2, dim_head=4, attn_mask_enabled=False)
+    x = torch.randn(1, 4, 8)
+    out = attn(x, mask=None)
+    assert (out.abs().sum() > 0).item()
+
+
+# ---------------------------------------------------------------------------
+# Step 6c — Aggregator
+# ---------------------------------------------------------------------------
+
+
+def _tiny_aggregator(llm_input_dim: int = 16):
+    from mstar.model.ming_omni_flash.components.talker_dit import Aggregator
+    return Aggregator(
+        in_channels=8,
+        hidden_size=16,
+        depth=2,
+        num_heads=2,
+        mlp_ratio=2,
+        llm_input_dim=llm_input_dim,
+        dropout=0.0,
+    )
+
+
+def test_aggregator_outputs_cls_row_only() -> None:
+    """Aggregator returns ``(B, 1, llm_input_dim)`` — the [CLS] row."""
+    agg = _tiny_aggregator(llm_input_dim=16)
+    x = torch.randn(2, 5, 8)
+    out = agg(x)
+    assert out.shape == (2, 1, 16)
+    assert torch.isfinite(out).all()
+
+
+def test_aggregator_word_embedder_has_single_row() -> None:
+    """``nn.Embedding(1, hidden_size)`` — exactly one [CLS] token."""
+    agg = _tiny_aggregator()
+    assert agg.word_embedder.num_embeddings == 1
+    assert agg.word_embedder.embedding_dim == 16
+
+
+def test_aggregator_respects_mask_in_dit_blocks() -> None:
+    """Smoke test: forward with a key-padding mask runs and yields a finite
+    ``(1, 1, 16)`` [CLS] output.  Masked rows are meant not to contaminate
+    [CLS] (the DiT blocks zero them out), but that is not asserted here.
+    """
+    agg = _tiny_aggregator(llm_input_dim=16)
+    x = torch.randn(1, 5, 8)
+    # mark last 2 positions invalid
+    mask = torch.tensor([[True, True, True, False, False]])
+    out = agg(x, mask=mask)
+    assert out.shape == (1, 1, 16)
+    assert torch.isfinite(out).all()
+
+
+def test_aggregator_forward_matches_shape_for_various_T() -> None:
+    agg = _tiny_aggregator(llm_input_dim=16)
+    for T in (1, 4, 8):
+        out = agg(torch.randn(2, T, 8))
+        assert out.shape == (2, 1, 16)
+
+
+def test_build_aggregator_from_real_config() -> None:
+    """build_aggregator picks dims off TalkerConfig.aggregator."""
+    from mstar.model.ming_omni_flash.components.talker_dit import (
+        Aggregator,
+        build_aggregator,
+    )
+    cfg = TalkerConfig(
+        llm=TalkerLLMConfig(),
+        flowmodel=DiTBlockConfig(),
+        aggregator=DiTBlockConfig(),
+        vae=AudioVAEConfig(),
+    )
+    agg = build_aggregator(cfg, dtype=torch.float32, device="cpu")
+    assert isinstance(agg, Aggregator)
+    assert agg.hidden_size == cfg.aggregator.hidden_size   # 1024
+    assert len(agg.blocks) == cfg.aggregator.depth         # 8
+    # final_layer projects to llm_input_dim = talker.llm.hidden_size = 896.
+    assert agg.final_layer.linear.out_features == cfg.llm.hidden_size
+
+
+def test_build_aggregator_llm_input_dim_override() -> None:
+    from mstar.model.ming_omni_flash.components.talker_dit import build_aggregator
+    cfg = TalkerConfig()
+    agg = build_aggregator(cfg, llm_input_dim=2048, dtype=torch.float32, device="cpu")
+    assert agg.final_layer.linear.out_features == 2048
+
+
+# ---------------------------------------------------------------------------
+# Step 6c — Qwen2 talker LLM backbone
+# ---------------------------------------------------------------------------
+
+
+def test_build_talker_llm_returns_qwen2_model_with_correct_dims() -> None:
+    """Stock transformers.Qwen2Model with our TalkerLLMConfig dims."""
+    from mstar.model.ming_omni_flash.components.talker_dit import build_talker_llm
+    llm_cfg = TalkerLLMConfig(
+        vocab_size=128,            # tiny vocab for speed
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        max_position_embeddings=128,
+        sliding_window=32,
+        max_window_layers=0,
+    )
+    model = build_talker_llm(llm_cfg, dtype=torch.float32, device="cpu")
+    # The HF class is Qwen2Model.
+    from transformers import Qwen2Model
+    assert isinstance(model, Qwen2Model)
+    # Dims match what we passed in.
+    assert model.config.hidden_size == 64
+    assert model.config.num_hidden_layers == 2
+    assert model.config.num_attention_heads == 4
+    assert model.config.num_key_value_heads == 2
+    assert model.config.vocab_size == 128
+
+
+def test_build_talker_llm_forward_runs_on_tiny_input() -> None:
+    """Forward pass through the tiny Qwen2 backbone returns hidden states."""
+    from mstar.model.ming_omni_flash.components.talker_dit import build_talker_llm
+    llm_cfg = TalkerLLMConfig(
+        vocab_size=64, hidden_size=32, intermediate_size=64,
+        num_hidden_layers=1, num_attention_heads=4, num_key_value_heads=2,
+        max_position_embeddings=64, sliding_window=32, max_window_layers=0,
+    )
+    model = build_talker_llm(llm_cfg, dtype=torch.float32, device="cpu")
+    input_ids = torch.tensor([[1, 2, 3, 4]], dtype=torch.long)
+    with torch.no_grad():
+        out = model(input_ids=input_ids)
+    # Qwen2Model.forward returns a BaseModelOutputWithPast.
+    assert out.last_hidden_state.shape == (1, 4, 32)
+    assert torch.isfinite(out.last_hidden_state).all()
+
+
+# ---------------------------------------------------------------------------
+# Step 6c — Talker heads
+# ---------------------------------------------------------------------------
+
+
+def test_build_talker_heads_emits_stop_head_and_spk_head() -> None:
+    """stop_head: hidden → 2 (binary), spk_head: 192 → hidden."""
+    from mstar.model.ming_omni_flash.components.talker_dit import build_talker_heads
+    cfg = TalkerConfig(llm=TalkerLLMConfig())  # hidden_size=896
+    heads = build_talker_heads(cfg, dtype=torch.float32, device="cpu")
+    assert "stop_head" in heads and "spk_head" in heads
+    sh = heads["stop_head"]
+    assert sh.in_features == cfg.llm.hidden_size   # 896
+    assert sh.out_features == 2
+    assert sh.bias is not None
+    spk = heads["spk_head"]
+    assert spk.in_features == 192
+    assert spk.out_features == cfg.llm.hidden_size  # 896
+    assert spk.bias is not None
+
+
+def test_build_talker_heads_spk_dim_override() -> None:
+    from mstar.model.ming_omni_flash.components.talker_dit import build_talker_heads
+    cfg = TalkerConfig(llm=TalkerLLMConfig())
+    heads = build_talker_heads(cfg, spk_embed_dim=512, dtype=torch.float32, device="cpu")
+    assert heads["spk_head"].in_features == 512
diff --git a/test/modular/test_ming_flash_omni_talker_generator.py b/test/modular/test_ming_flash_omni_talker_generator.py
new file mode 100644
index 00000000..4594cbb2
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_talker_generator.py
@@ -0,0 +1,380 @@
+"""Tests for the TalkerGenerator orchestration helper (step 6e-1).
+
+Covers the standalone helper class that composes Qwen2 + CFM +
+Aggregator + stop_head + AudioVAE into the .generate_latents() /
+.decode_to_waveform() API. Pure-Python tests with tiny stub-like
+configs on CPU; integration with mstar's graph system lands in 6e-2.
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.audio_vae import build_audio_vae
+from mstar.model.ming_omni_flash.components.talker_dit import (
+    build_aggregator,
+    build_talker_cfm,
+    build_talker_heads,
+    build_talker_llm,
+)
+from mstar.model.ming_omni_flash.components.talker_generator import (
+    TalkerGenerator,
+    silence_holder,
+    trim_trailing_silence,
+)
+from mstar.model.ming_omni_flash.config import (
+    AudioVAEConfig,
+    DiTBlockConfig,
+    TalkerConfig,
+    TalkerLLMConfig,
+)
+
+# ---------------------------------------------------------------------------
+# Tiny TalkerConfig + Qwen2 backbone for fast CPU tests
+# ---------------------------------------------------------------------------
+
+
+def _tiny_qwen2_backbone(hidden_size: int = 32, num_layers: int = 1) -> dict:
+    return {
+        "hidden_size": hidden_size,
+        "intermediate_size": hidden_size * 2,
+        "num_hidden_layers": num_layers,
+        "num_attention_heads": 4,
+        "num_key_value_heads": 2,
+        "max_position_embeddings": 256,
+        "vocab_size": 1,
+        "use_sliding_window": True,
+        "sliding_window": 32,
+        "max_window_layers": 0,
+        "rope_theta": 1_000_000.0,
+        "rms_norm_eps": 1e-6,
+        "hidden_act": "silu",
+    }
+
+
+def _tiny_talker_config() -> TalkerConfig:
+    """Tiny end-to-end TalkerConfig: 32-hidden, 1-layer talker LLM,
+    4-channel CFM/aggregator, 4-dim VAE latent — sized so all CPU
+    forwards finish in <1s."""
+    return TalkerConfig(
+        steps=2,                # CFM substeps per generation step (use_predefined N/A)
+        patch_size=2,           # CFM patch length
+        history_patch_size=2,   # match patch_size for the simple update path
+        cfg_strength=2.0,
+        llm=TalkerLLMConfig(
+            vocab_size=32,
+            hidden_size=32,
+            intermediate_size=64,
+            num_hidden_layers=1,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            max_position_embeddings=128,
+            sliding_window=64,
+            max_window_layers=0,
+            use_sliding_window=False,
+        ),
+        flowmodel=DiTBlockConfig(
+            depth=1, hidden_size=32, num_heads=2,
+            mlp_ratio=2, in_channels=4, dropout=0.0,
+            attn_mask_enabled=False,
+        ),
+        aggregator=DiTBlockConfig(
+            depth=1, hidden_size=32, num_heads=2,
+            mlp_ratio=2, in_channels=4, dropout=0.0,
+            attn_mask_enabled=False,
+        ),
+        vae=AudioVAEConfig(
+            sample_rate=8000,
+            patch_size=-1,             # no patching inside the VAE
+            latent_dim=4,
+            encoder_input_dim=16,
+            encoder_hop_size=16,
+            decoder_output_dim=16,
+            enc_backbone=_tiny_qwen2_backbone(),
+            dec_backbone=_tiny_qwen2_backbone(),
+        ),
+    )
+
+
+def _build_generator(with_vae: bool = True) -> TalkerGenerator:  # tiny float32/CPU generator
+    cfg = _tiny_talker_config()
+    llm = build_talker_llm(cfg.llm, dtype=torch.float32, device="cpu")
+    cfm = build_talker_cfm(cfg, dtype=torch.float32, device="cpu")
+    agg = build_aggregator(cfg, dtype=torch.float32, device="cpu")
+    heads = build_talker_heads(cfg, dtype=torch.float32, device="cpu")
+    vae = (
+        build_audio_vae(cfg.vae, dtype=torch.float32, device="cpu", attn_implementation="sdpa")
+        if with_vae else None
+    )
+    return TalkerGenerator(
+        talker_config=cfg, llm=llm, cfm=cfm, aggregator=agg,
+        stop_head=heads["stop_head"], audio_vae=vae,  # only stop_head is passed on from heads
+    )
+
+
+# ---------------------------------------------------------------------------
+# trim_trailing_silence
+# ---------------------------------------------------------------------------
+
+
+def test_trim_trailing_silence_empty_waveform_passthrough() -> None:
+    assert trim_trailing_silence(torch.zeros(0), sample_rate=8000).numel() == 0
+
+
+def test_trim_trailing_silence_keeps_short_clip_intact() -> None:
+    """Sub-frame clips (well under the tail_silence budget) keep their full length."""
+    sr = 8000
+    short = torch.randn(1, 1, sr // 20)  # ~50 ms — shorter than 100 ms frame
+    out = trim_trailing_silence(short, sample_rate=sr, tail_silence_s=0.3)
+    assert out.shape[-1] == short.shape[-1]
+
+
+def test_trim_trailing_silence_trims_silent_tail() -> None:
+    """Long silent tail past a brief noisy region gets trimmed."""
+    sr = 8000
+    noisy = torch.randn(1, 1, sr) * 0.5      # 1.0s of noise
+    silent = torch.zeros(1, 1, 2 * sr)        # 2.0s of silence
+    waveform = torch.cat([noisy, silent], dim=-1)
+    out = trim_trailing_silence(waveform, sample_rate=sr, tail_silence_s=0.3)
+    # Should keep ~1.0s of noise + 0.3s of trailing silence.
+    assert out.shape[-1] < waveform.shape[-1]
+    assert out.shape[-1] >= int(0.9 * sr)  # at least ~0.9s
+
+
+def test_trim_trailing_silence_passes_through_weird_shape() -> None:
+    """4-D tensors aren't supported — return unchanged."""
+    weird = torch.zeros(1, 2, 3, 4)
+    out = trim_trailing_silence(weird, sample_rate=8000)
+    assert out.shape == weird.shape
+
+
+# ---------------------------------------------------------------------------
+# silence_holder
+# ---------------------------------------------------------------------------
+
+
+def test_silence_holder_initial_cache_shape() -> None:
+    """Empty input + no cache → returns empty tensor + empty cache."""
+    out, cache = silence_holder(
+        torch.zeros(1, 0), sample_rate=8000, sil_cache=None, last_chunk=True,
+    )
+    assert out.numel() == 0
+    assert cache == {"holder": [], "buffer": []}
+
+
+def test_silence_holder_short_chunk_buffers_until_last() -> None:
+    """Sub-frame chunks accumulate in buffer until last_chunk=True."""
+    sr = 8000
+    cache = None
+    out1, cache = silence_holder(
+        torch.zeros(1, sr // 20), sample_rate=sr, sil_cache=cache, last_chunk=False,
+    )
+    assert out1.shape[-1] == 0   # buffered, nothing emitted
+    out2, cache = silence_holder(
+        torch.zeros(1, sr // 20), sample_rate=sr, sil_cache=cache, last_chunk=True,
+    )
+    # On last_chunk=True, the buffered + holder regions are flushed,
+    # truncated to last_sil=0.3s.
+    assert out2.shape[-1] <= int(0.3 * sr)
+
+
+# ---------------------------------------------------------------------------
+# TalkerGenerator — construction + state-machine sanity
+# ---------------------------------------------------------------------------
+
+
+def test_generator_constructs_with_all_components_bound() -> None:
+    gen = _build_generator(with_vae=True)
+    assert gen.patch_size == 2
+    assert gen.his_patch_size == 2
+    assert gen.latent_dim == 4
+    assert gen.cfg_strength == 2.0
+    assert gen.audio_vae is not None
+
+
+def test_generator_constructs_without_audio_vae() -> None:
+    gen = _build_generator(with_vae=False)
+    assert gen.audio_vae is None
+
+
+def test_init_his_lat_zeros_when_no_prompt() -> None:
+    gen = _build_generator(with_vae=False)
+    his = gen._init_his_lat(None, torch.device("cpu"), torch.float32)
+    assert his.shape == (1, gen.his_patch_size, gen.latent_dim)
+    assert (his == 0).all()
+
+
+def test_init_his_lat_right_aligns_prompt() -> None:
+    """Voice-prompt latents land at the right edge of the his window."""
+    gen = _build_generator(with_vae=False)
+    prompt = torch.randn(1, 1, gen.latent_dim)
+    his = gen._init_his_lat(prompt, torch.device("cpu"), torch.float32)
+    assert his.shape == (1, gen.his_patch_size, gen.latent_dim)
+    # Right-most row should equal the prompt's single frame.
+    torch.testing.assert_close(his[0, -1, :], prompt[0, 0, :])
+
+
+def test_update_his_lat_equal_sizes_returns_gen() -> None:
+    """When his_patch_size == patch_size, the new lat replaces the buffer."""
+    gen = _build_generator(with_vae=False)
+    his = torch.zeros(1, 2, 4)
+    new = torch.ones(1, 2, 4)
+    out = gen._update_his_lat(his, new)
+    torch.testing.assert_close(out, new)
+
+
+def test_update_his_lat_rejects_unsupported_shape() -> None:
+    """his_patch_size < patch_size is not yet implemented."""
+    gen = _build_generator(with_vae=False)
+    gen.his_patch_size = 1
+    gen.patch_size = 2
+    with pytest.raises(NotImplementedError, match="his_patch_size"):
+        gen._update_his_lat(torch.zeros(1, 1, 4), torch.zeros(1, 2, 4))
+
+
+# ---------------------------------------------------------------------------
+# Single-step plumbing (CFM step + LLM step)
+# ---------------------------------------------------------------------------
+
+
+def test_cfm_sample_step_returns_three_tensors_with_right_shapes() -> None:
+    """gen_lat (B, patch, latent_dim); next_emb (B, 1, llm_hidden); stop (B, 2)."""
+    gen = _build_generator(with_vae=False)
+    last_hs = torch.randn(1, 1, gen.config.llm.hidden_size)
+    his_lat = torch.zeros(1, gen.his_patch_size, gen.latent_dim)
+    with torch.no_grad():
+        gen_lat, next_emb, stop_out = gen.cfm_sample_step(
+            last_hs, his_lat, cfg=2.0, sigma=0.0, temperature=0.0,
+        )
+    assert gen_lat.shape == (1, gen.patch_size, gen.latent_dim)
+    assert next_emb.shape == (1, 1, gen.config.llm.hidden_size)
+    assert stop_out.shape == (1, 2)
+    # Softmax across dim=-1 sums to 1.
+    torch.testing.assert_close(stop_out.sum(-1), torch.ones(1), atol=1e-5, rtol=0)
+
+
+def test_llm_step_step0_no_cache_position() -> None:
+    """On step 0 the LLM is called without cache_position; just verify it returns."""
+    gen = _build_generator(with_vae=False)
+    inputs_embeds = torch.randn(1, 3, gen.config.llm.hidden_size)
+    with torch.no_grad():
+        out = gen.llm_step(
+            inputs_embeds, step=0, past_key_values=None, use_static_cache=False,
+        )
+    # Returns last hidden state row only.
+    assert out.shape == (1, 1, gen.config.llm.hidden_size)
+
+
+# ---------------------------------------------------------------------------
+# generate_latents — full AR loop on tiny config
+# ---------------------------------------------------------------------------
+
+
+def test_generate_latents_collects_per_step_patches() -> None:
+    """Loop emits one latent per step; min_new_token=0, max_steps=3."""
+    gen = _build_generator(with_vae=False)
+    inputs_embeds = torch.randn(1, 4, gen.config.llm.hidden_size)
+    lats = gen.generate_latents(
+        inputs_embeds,
+        min_new_token=0,
+        max_steps=3,
+        use_static_cache=False,   # avoid StaticCache complexity in tests
+    )
+    # min_new_token=0 means we may stop early on any step with stop_prob > 0.5.
+    # On random init, stop_prob is roughly 0.5; just verify we got *some* output.
+    assert 1 <= len(lats) <= 3
+    for lat in lats:
+        assert lat.shape == (1, gen.patch_size, gen.latent_dim)
+
+
+def test_generate_latents_respects_max_steps_cap() -> None:
+    """When stop signal never fires (force it via min_new_token), max_steps caps the loop."""
+    gen = _build_generator(with_vae=False)
+    inputs_embeds = torch.randn(1, 2, gen.config.llm.hidden_size)
+    lats = gen.generate_latents(
+        inputs_embeds,
+        min_new_token=1000,        # never satisfies stop check
+        max_steps=4,
+        use_static_cache=False,
+    )
+    assert len(lats) == 4
+
+
+# ---------------------------------------------------------------------------
+# duration_capped_steps
+# ---------------------------------------------------------------------------
+
+
+def test_duration_capped_steps_no_audio_vae_pass_through() -> None:
+    gen = _build_generator(with_vae=False)
+    assert gen.duration_capped_steps(text_len=100, requested_max_steps=1000) == 1000
+
+
+def test_duration_capped_steps_uses_text_len_heuristic() -> None:
+    """Longer text never gets a lower step cap (short text is floored at a 2.0s minimum)."""
+    gen = _build_generator(with_vae=True)
+    short = gen.duration_capped_steps(text_len=1, requested_max_steps=10_000)
+    long_ = gen.duration_capped_steps(text_len=100, requested_max_steps=10_000)
+    assert short <= long_
+
+
+# ---------------------------------------------------------------------------
+# decode_to_waveform
+# ---------------------------------------------------------------------------
+
+
+def test_decode_to_waveform_empty_returns_zero_length() -> None:
+    """Empty latent list → (1, 1, 0) zero-length waveform."""
+    gen = _build_generator(with_vae=True)
+    wf = gen.decode_to_waveform([], stream_decode=False)
+    assert wf.shape == (1, 1, 0)
+
+
+def test_decode_to_waveform_oneshot_runs_end_to_end() -> None:
+    """Non-streaming path concatenates latents and runs one VAE decode."""
+    gen = _build_generator(with_vae=True)
+    latents = [torch.randn(1, gen.patch_size, gen.latent_dim) for _ in range(3)]
+    with torch.no_grad():
+        wf = gen.decode_to_waveform(latents, stream_decode=False)
+    assert wf.dim() == 3
+    assert wf.shape[0] == 1 and wf.shape[1] == 1
+    assert wf.shape[-1] > 0
+    assert torch.isfinite(wf).all()
+
+
+def test_decode_to_waveform_streaming_runs_end_to_end() -> None:
+    """Streaming path threads silence_holder + decode_pad through chunks."""
+    gen = _build_generator(with_vae=True)
+    latents = [torch.randn(1, gen.patch_size, gen.latent_dim) for _ in range(3)]
+    with torch.no_grad():
+        wf = gen.decode_to_waveform(latents, stream_decode=True)
+    assert wf.dim() == 3
+    assert torch.isfinite(wf).all()
+
+
+def test_decode_to_waveform_raises_without_audio_vae() -> None:
+    gen = _build_generator(with_vae=False)
+    with pytest.raises(RuntimeError, match="audio_vae is None"):
+        gen.decode_to_waveform([torch.zeros(1, 2, 4)])
+
+
+# ---------------------------------------------------------------------------
+# trim_trailing_silence (instance method)
+# ---------------------------------------------------------------------------
+
+
+def test_generator_trim_trailing_silence_uses_vae_sample_rate() -> None:
+    gen = _build_generator(with_vae=True)
+    sr = gen.audio_vae.config.sample_rate
+    # Pure silence → at most the trailing-silence budget (0.3s default) remains, +1 sample slack.
+    silent = torch.zeros(1, 1, 4 * sr)
+    out = gen.trim_trailing_silence(silent)
+    assert out.shape[-1] <= int(0.3 * sr) + 1
+
+
+def test_generator_trim_trailing_silence_without_vae_is_passthrough() -> None:
+    gen = _build_generator(with_vae=False)
+    x = torch.randn(1, 1, 1000)
+    out = gen.trim_trailing_silence(x)
+    torch.testing.assert_close(out, x)
diff --git a/test/modular/test_ming_flash_omni_talker_graph.py b/test/modular/test_ming_flash_omni_talker_graph.py
new file mode 100644
index 00000000..0d0b5322
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_talker_graph.py
@@ -0,0 +1,280 @@
+"""Tests for the talker graph walk + Thinker->Talker bridge (step 6e-3).
+
+Covers the talker-enabled graph topology: the `talker` walk, the
+Talker partition, the Thinker->Talker streaming connection, the
+audio sample rate, and the Talker partition state machine. Also checks
+that the thinker-only path stays unchanged (talker config absent → no
+Talker partition / walk).
+
+All tests build a bare MingFlashOmniModel via __new__ + injected
+config — no checkpoint load. The detokenize/re-tokenize text bridge
+is exercised with stub tokenizers.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from mstar.conductor.request_info import CurrentForwardConductorMetadata
+from mstar.engine.base import EngineType
+from mstar.graph.base import GraphNode, Loop
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    AudioVAEConfig,
+    DiTBlockConfig,
+    MingFlashOmniModelConfig,
+    TalkerConfig,
+    TalkerLLMConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+from mstar.streaming.topology import StreamingGraphEdge
+
+
+def _talker_config() -> TalkerConfig:
+    return TalkerConfig(
+        steps=2, patch_size=2, history_patch_size=2, cfg_strength=2.0,
+        llm=TalkerLLMConfig(hidden_size=32, num_hidden_layers=1),
+        flowmodel=DiTBlockConfig(depth=1, hidden_size=32, num_heads=2, in_channels=4),
+        aggregator=DiTBlockConfig(depth=1, hidden_size=32, num_heads=2, in_channels=4),
+        vae=AudioVAEConfig(sample_rate=44100, patch_size=-1, latent_dim=4),
+    )
+
+
+def _model(with_talker: bool) -> MingFlashOmniModel:
+    inst = MingFlashOmniModel.__new__(MingFlashOmniModel)  # skip __init__ (no checkpoint load)
+    inst.config = MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(),
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+        talker=_talker_config() if with_talker else None,  # None => thinker-only model
+    )
+    inst._submodule_cache = {}
+    return inst
+
+
+# ---------------------------------------------------------------------------
+# Thinker-only (talker absent) — unchanged from step 5c
+# ---------------------------------------------------------------------------
+
+
+def test_thinker_only_no_talker_walk() -> None:
+    walks = _model(with_talker=False).get_graph_walk_graphs()
+    assert "talker" not in walks
+    assert set(walks) == {
+        "prefill_text", "prefill_audio", "prefill_vision",
+        "prefill_video", "thinker_decode",
+    }
+
+
+def test_thinker_only_decode_has_no_streaming_edge() -> None:
+    """Without a talker, the decode loop emits only text edges (no Talker stream)."""
+    walks = _model(with_talker=False).get_graph_walk_graphs()
+    loop = walks["thinker_decode"]
+    assert isinstance(loop, Loop)
+    assert not any(
+        isinstance(e, StreamingGraphEdge) for e in loop.section.outputs
+    )
+
+
+def test_thinker_only_single_partition() -> None:
+    parts = _model(with_talker=False).get_partitions()
+    assert [p.name for p in parts] == ["Thinker"]
+
+
+def test_thinker_only_topology_no_connections() -> None:
+    topo = _model(with_talker=False).get_partition_topology()
+    assert topo.partitions == ["Thinker"]
+    assert topo.connections == []
+
+
+# ---------------------------------------------------------------------------
+# Talker enabled — graph structure
+# ---------------------------------------------------------------------------
+
+
+def test_talker_walk_present_and_emits_audio() -> None:
+    walks = _model(with_talker=True).get_graph_walk_graphs()
+    assert "talker" in walks
+    node = walks["talker"]
+    assert isinstance(node, GraphNode)
+    assert node.name == "Talker"
+    assert set(node.input_names) == {"thinker_tokens"}
+    assert len(node.outputs) == 1
+    assert node.outputs[0].name == "audio_chunk"
+    assert node.outputs[0].output_modality == "audio"
+
+
+def test_decode_loop_streams_thinker_tokens_to_talker() -> None:
+    walks = _model(with_talker=True).get_graph_walk_graphs()
+    loop = walks["thinker_decode"]
+    stream_edges = [
+        e for e in loop.section.outputs if isinstance(e, StreamingGraphEdge)
+    ]
+    assert len(stream_edges) == 1
+    assert stream_edges[0].name == "thinker_tokens"
+    assert stream_edges[0].target_partition == "Talker"
+    # Text edges still present (client text + decode loopback).
+    text_edges = [e.name for e in loop.section.outputs if not isinstance(e, StreamingGraphEdge)]
+    assert "new_token" in text_edges
+    assert "text_inputs" in text_edges
+
+
+def test_talker_partition_listed_with_producer() -> None:
+    parts = {p.name: p for p in _model(with_talker=True).get_partitions()}
+    assert set(parts) == {"Thinker", "Talker"}
+    talker = parts["Talker"]
+    assert talker.graph_walks == {"talker"}
+    assert talker.initial_walk is None
+    assert talker.producer_partitions == ["Thinker"]
+
+
+def test_talker_topology_connects_thinker_to_talker() -> None:
+    topo = _model(with_talker=True).get_partition_topology()
+    assert set(topo.partitions) == {"Thinker", "Talker"}
+    assert len(topo.connections) == 1
+    conn = topo.connections[0]
+    assert conn.from_partition == "Thinker"
+    assert conn.to_partition == "Talker"
+    assert conn.edge_name == "thinker_tokens"
+    # The chunk policy must keep the consumer alive past producer-done.
+    policy = conn.chunk_policy_factory()
+    assert policy.continue_after_producer_done() is True
+
+
+def test_node_engine_types_registers_talker_stateless() -> None:
+    types = _model(with_talker=True).get_node_engine_types()
+    assert types["Talker"] == EngineType.STATELESS
+
+
+# ---------------------------------------------------------------------------
+# Output sample rate
+# ---------------------------------------------------------------------------
+
+
+def test_output_sample_rate_uses_talker_vae() -> None:
+    assert _model(with_talker=True).get_output_sample_rate("audio") == 44100
+
+
+def test_output_sample_rate_falls_back_without_talker() -> None:
+    # Base class default (no talker) — just assert it doesn't raise and
+    # returns a positive int.
+    sr = _model(with_talker=False).get_output_sample_rate("audio")
+    assert isinstance(sr, int) and sr > 0
+
+
+# ---------------------------------------------------------------------------
+# Talker partition state machine
+# ---------------------------------------------------------------------------
+
+
+class _Conn:
+    """Stub StreamingConnectionState."""
+    def __init__(self, producer_done: bool) -> None:
+        self.producer_done = producer_done
+        self.token_count = 0
+        self.consumed_count = 0
+
+
+def test_talker_initial_args_audio_output_keeps_partition_alive() -> None:
+    model = _model(with_talker=True)
+    args = model.get_initial_forward_pass_args(
+        partition_name="Talker",
+        input_modalities=["text"],
+        output_modalities=["audio"],
+        input_signals={},
+    )
+    assert args.full_metadata.graph_walk == "talker"
+    assert args.request_done is False
+
+
+def test_talker_initial_args_no_audio_output_done_immediately() -> None:
+    model = _model(with_talker=True)
+    args = model.get_initial_forward_pass_args(
+        partition_name="Talker",
+        input_modalities=["text"],
+        output_modalities=["text"],   # no audio requested
+        input_signals={},
+    )
+    assert args.request_done is True
+
+
+def test_talker_forward_waits_for_producer_done() -> None:
+    model = _model(with_talker=True)
+    meta = CurrentForwardConductorMetadata(
+        input_modalities=["text"], output_modalities=["audio"],
+        graph_walk="talker", is_prefill=False,
+    )
+    # Producer still running → no-op step (no fire, not done).
+    args = model.get_partition_forward_pass_args(
+        partition_name="Talker", partition_metadata=meta,
+        persist_signals={},
+        incoming_connections=[_Conn(producer_done=False)],
+    )
+    assert args.request_done is False
+    assert args.inputs == []
+
+
+def test_talker_forward_fires_once_then_done() -> None:
+    model = _model(with_talker=True)
+    meta = CurrentForwardConductorMetadata(
+        input_modalities=["text"], output_modalities=["audio"],
+        graph_walk="talker", is_prefill=False,
+    )
+    # Producer done → fire the talker walk.
+    args1 = model.get_partition_forward_pass_args(
+        partition_name="Talker", partition_metadata=meta,
+        persist_signals={},
+        incoming_connections=[_Conn(producer_done=True)],
+    )
+    assert args1.full_metadata.graph_walk == "talker"
+    assert len(args1.inputs) == 1
+    assert args1.inputs[0].name == "thinker_tokens"
+    assert args1.request_done is False
+    # Next invocation → already fired → done.
+    args2 = model.get_partition_forward_pass_args(
+        partition_name="Talker", partition_metadata=args1.full_metadata,
+        persist_signals={},
+        incoming_connections=[_Conn(producer_done=True)],
+    )
+    assert args2.request_done is True
+
+
+# ---------------------------------------------------------------------------
+# Thinker->Talker text bridge
+# ---------------------------------------------------------------------------
+
+
+class _StubThinkerTok:
+    def decode(self, ids, skip_special_tokens=True):
+        # Toy: id -> uppercase letter (0 -> "A", wraps mod 26); skip_special_tokens is ignored.
+        return "".join(chr(65 + (i % 26)) for i in ids)
+
+
+class _StubTalkerTok:
+    def __call__(self, text, return_tensors="pt"):
+        import torch
+        ids = torch.tensor([[ord(c) for c in text]], dtype=torch.long)  # (1, len(text)) code points
+        return type("O", (), {"input_ids": ids})()  # throwaway object exposing only .input_ids
+
+
+def test_text_bridge_decodes_then_reencodes() -> None:
+    import torch
+    model = _model(with_talker=True)
+    model.tokenizer = _StubThinkerTok()
+    model._talker_tokenizer = _StubTalkerTok()
+    thinker_ids = torch.tensor([0, 1, 2])   # -> "ABC"
+    out = model.thinker_text_to_talker_inputs(thinker_ids)
+    assert out.tolist() == [ord("A"), ord("B"), ord("C")]
+
+
+def test_text_bridge_raises_without_thinker_tokenizer() -> None:
+    import torch
+    model = _model(with_talker=True)
+    model.tokenizer = None
+    model._talker_tokenizer = _StubTalkerTok()
+    with pytest.raises(RuntimeError, match="thinker tokenizer not loaded"):
+        model.thinker_text_to_talker_inputs(torch.tensor([1, 2]))
diff --git a/test/modular/test_ming_flash_omni_talker_loader.py b/test/modular/test_ming_flash_omni_talker_loader.py
new file mode 100644
index 00000000..b69fd904
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_talker_loader.py
@@ -0,0 +1,201 @@
+"""Snapshot-gated tests for the Talker + AudioVAE weight loaders (step 6f).
+
+The talker checkpoint lives in two safetensors files:
+
+  talker/model.safetensors        — model.* / cfm.* / aggregator.*
+                                    / stop_head.* / spk_head.*
+  talker/vae/model.safetensors    — encoder.* + decoder.* (AudioVAE)
+
+Each loader is non-TP and just does prefix-strip + load_state_dict
+via the shared `_load_prefixed_state_dict` helper. These tests skip
+cleanly when no snapshot is available, so CI machines without the
+~5GB talker download still pass.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+
+from mstar.model.ming_omni_flash.components.audio_vae import build_audio_vae
+from mstar.model.ming_omni_flash.components.talker_dit import (
+    build_aggregator,
+    build_talker_cfm,
+    build_talker_heads,
+    build_talker_llm,
+)
+from mstar.model.ming_omni_flash.config import MingFlashOmniModelConfig
+from mstar.model.ming_omni_flash.loader import (
+    load_talker_aggregator_weights,
+    load_talker_audio_vae_weights,
+    load_talker_cfm_weights,
+    load_talker_heads_weights,
+    load_talker_llm_weights,
+)
+
+# ---------------------------------------------------------------------------
+# Snapshot discovery
+# ---------------------------------------------------------------------------
+
+
+def _find_local_snapshot() -> str | None:
+    """Return a snapshot dir holding talker config + both weight files, or None."""
+    def _has(p: Path) -> bool:
+        # The AudioVAE tests read talker/vae/model.safetensors as well.
+        names = ("config.json", "model.safetensors", "vae/model.safetensors")
+        return all((p / "talker" / name).exists() for name in names)
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and _has(Path(override)):
+        return override
+    hybrid = Path("/dev/shm/ming-hybrid")
+    if _has(hybrid):
+        return str(hybrid)
+    return None
+
+
+# Gate on the snapshot inside the fixture (not a module-level pytestmark)
+# so snapshot-free tests such as the missing-key guard still run in CI.
+
+
+@pytest.fixture(scope="module")
+def snap_and_config() -> tuple[str, MingFlashOmniModelConfig]:
+    snap = _find_local_snapshot()
+    if snap is None:
+        pytest.skip("Need Ming-flash-omni-2.0 snapshot with talker/.")
+    cfg = MingFlashOmniModelConfig.from_pretrained(snap)
+    if cfg.talker is None:
+        pytest.skip("Snapshot has no talker/ subdir.")
+    return snap, cfg
+
+
+# ---------------------------------------------------------------------------
+# Talker LLM (Qwen2)
+# ---------------------------------------------------------------------------
+
+
+def test_load_talker_llm_weights_strict(snap_and_config) -> None:
+    """``model.*`` from talker/model.safetensors loads cleanly into Qwen2Model."""
+    snap, cfg = snap_and_config
+    llm = build_talker_llm(cfg.talker.llm, dtype=torch.float32, device="cpu")
+    loaded = load_talker_llm_weights(llm, snap, device="cpu", strict=True)
+    # 24 layers × ~12 params each + embed + final norm = many keys; just
+    # spot-check representative entries.
+    assert "embed_tokens.weight" in loaded
+    assert "layers.0.self_attn.q_proj.weight" in loaded
+    assert "layers.0.mlp.gate_proj.weight" in loaded
+    assert f"layers.{cfg.talker.llm.num_hidden_layers - 1}.input_layernorm.weight" in loaded
+    assert "norm.weight" in loaded
+    # Sanity-check that the embed table actually got overwritten.
+    assert (llm.embed_tokens.weight.abs().sum() > 0).item()
+
+
+# ---------------------------------------------------------------------------
+# CFM
+# ---------------------------------------------------------------------------
+
+
+def test_load_talker_cfm_weights_strict(snap_and_config) -> None:
+    """``cfm.*`` loads into `CFM(DiT)` by state-dict equality."""
+    snap, cfg = snap_and_config
+    cfm = build_talker_cfm(cfg.talker, dtype=torch.float32, device="cpu")
+    loaded = load_talker_cfm_weights(cfm, snap, device="cpu", strict=True)
+    # CFM module wraps a DiT under `.model`, so the loaded keys are
+    # ``model.<...>`` after stripping the ``cfm.`` prefix.
+    assert "model.x_embedder.weight" in loaded
+    assert "model.c_embedder.cond_embedder.weight" in loaded
+    assert "model.t_embedder.time_embed.dim" not in loaded   # buffer-free
+    assert "model.blocks.0.attn.to_q.weight" in loaded
+    assert "model.blocks.0.mlp.ff.0.0.weight" in loaded
+    assert "model.final_layer.linear.weight" in loaded
+
+
+# ---------------------------------------------------------------------------
+# Aggregator
+# ---------------------------------------------------------------------------
+
+
+def test_load_talker_aggregator_weights_strict(snap_and_config) -> None:
+    snap, cfg = snap_and_config
+    agg = build_aggregator(cfg.talker, dtype=torch.float32, device="cpu")
+    loaded = load_talker_aggregator_weights(agg, snap, device="cpu", strict=True)
+    assert "x_embedder.weight" in loaded
+    assert "word_embedder.weight" in loaded
+    assert "blocks.0.attn.to_q.weight" in loaded
+    assert "final_layer.linear.weight" in loaded
+
+
+# ---------------------------------------------------------------------------
+# Heads
+# ---------------------------------------------------------------------------
+
+
+def test_load_talker_heads_weights_strict(snap_and_config) -> None:
+    """stop_head and spk_head both load by leaf prefix."""
+    snap, cfg = snap_and_config
+    heads = build_talker_heads(cfg.talker, dtype=torch.float32, device="cpu")
+    loaded = load_talker_heads_weights(heads, snap, device="cpu", strict=True)
+    assert loaded["stop_head"] == {"weight", "bias"}
+    assert loaded["spk_head"] == {"weight", "bias"}
+    # Sanity: head weights are not the init values.
+    assert (heads["stop_head"].weight.abs().sum() > 0).item()
+    assert (heads["spk_head"].weight.abs().sum() > 0).item()
+
+
+def test_load_talker_heads_weights_rejects_missing_key() -> None:
+    """KeyError if the heads dict is missing one of the required entries.
+
+    Use a dict missing ``stop_head`` (the first entry the loader checks)
+    so the missing-key guard fires before we attempt any disk I/O.
+    """
+    with pytest.raises(KeyError, match="missing required key 'stop_head'"):
+        load_talker_heads_weights({"spk_head": torch.nn.Linear(1, 1)}, "/tmp/x")
+
+
+# ---------------------------------------------------------------------------
+# AudioVAE
+# ---------------------------------------------------------------------------
+
+
+def test_load_talker_audio_vae_weights_strict(snap_and_config) -> None:
+    """Full AudioVAE state_dict round-trips from talker/vae/model.safetensors."""
+    snap, cfg = snap_and_config
+    vae = build_audio_vae(
+        cfg.talker.vae, dtype=torch.float32, device="cpu",
+        attn_implementation="sdpa",
+    )
+    loaded = load_talker_audio_vae_weights(vae, snap, device="cpu", strict=True)
+    # Encoder + decoder subtrees both present.
+    assert "encoder.fc1.weight" in loaded
+    assert "encoder.encoder.embed_tokens.weight" in loaded
+    assert "encoder.aggregator.embed_tokens.weight" in loaded
+    assert "encoder.cls_embed" in loaded
+    assert "decoder.fc1.weight" in loaded
+    assert "decoder.head.out.weight" in loaded
+    assert "decoder.head.istft.window" in loaded
+    assert "decoder.decoder.embed_tokens.weight" in loaded
+
+
+def test_audio_vae_decode_runs_with_loaded_weights(snap_and_config) -> None:
+    """End-to-end CPU smoke after a real-weights load.
+
+    Constructs a small latent and decodes; checks the output is finite.
+    Catches catastrophic dtype / weight-layout misloads that wouldn't
+    surface from key-name parity alone.
+    """
+    snap, cfg = snap_and_config
+    vae = build_audio_vae(
+        cfg.talker.vae, dtype=torch.float32, device="cpu",
+        attn_implementation="sdpa",
+    )
+    load_talker_audio_vae_weights(vae, snap, device="cpu", strict=True)
+
+    # One latent frame at latent_dim=64.
+    latent = torch.randn(1, 1, cfg.talker.vae.latent_dim) * 0.1
+    with torch.no_grad():
+        waveform, state, pkv = vae.decode(latent, use_cache=False)
+    assert waveform.dim() == 3
+    assert waveform.shape[0] == 1 and waveform.shape[1] == 1
+    assert torch.isfinite(waveform).all()
diff --git a/test/modular/test_ming_flash_omni_talker_submodule.py b/test/modular/test_ming_flash_omni_talker_submodule.py
new file mode 100644
index 00000000..c82397fd
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_talker_submodule.py
@@ -0,0 +1,279 @@
+"""Tests for TalkerSubmodule + node registration + construction (step 6e-2).
+
+Two layers:
+
+  * Pure-Python: a tiny TalkerGenerator wrapped in TalkerSubmodule —
+    prepare_inputs embeds talker text ids, forward runs the full
+    AR-decode + VAE-decode and returns an audio_chunk. Plus the
+    model's get_node_engine_types / get_submodule wiring.
+
+  * Snapshot-gated: MingFlashOmniModel._create_talker_submodule builds
+    the full talker stack and loads real weights, then runs a tiny
+    end-to-end generation. Heavy (~5 GB CPU); skipped without a snapshot.
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+import torch
+
+from mstar.engine.base import EngineType
+from mstar.model.ming_omni_flash.components.audio_vae import build_audio_vae
+from mstar.model.ming_omni_flash.components.talker_dit import (
+    build_aggregator,
+    build_talker_cfm,
+    build_talker_heads,
+    build_talker_llm,
+)
+from mstar.model.ming_omni_flash.components.talker_generator import TalkerGenerator
+from mstar.model.ming_omni_flash.config import (
+    AudioEncoderConfig,
+    AudioVAEConfig,
+    DiTBlockConfig,
+    MingFlashOmniModelConfig,
+    TalkerConfig,
+    TalkerLLMConfig,
+    ThinkerLLMConfig,
+    VisionEncoderConfig,
+)
+from mstar.model.ming_omni_flash.ming_omni_flash_model import MingFlashOmniModel
+from mstar.model.ming_omni_flash.submodules import TalkerSubmodule
+
+# ---------------------------------------------------------------------------
+# Snapshot discovery
+# ---------------------------------------------------------------------------
+
+
+def _find_local_snapshot() -> str | None:
+    def _has(p: Path) -> bool:
+        return (
+            (p / "talker" / "config.json").exists()
+            and (p / "talker" / "model.safetensors").exists()
+            and (p / "talker" / "vae" / "model.safetensors").exists()
+        )
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and _has(Path(override)):
+        return override
+    hybrid = Path("/dev/shm/ming-hybrid")
+    if _has(hybrid):
+        return str(hybrid)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Tiny config + generator (CPU, fast)
+# ---------------------------------------------------------------------------
+
+
+def _tiny_qwen2_backbone(hidden_size: int = 32, num_layers: int = 1) -> dict:
+    # Minimal Qwen2-style backbone kwargs; the MLP width is twice the hidden size.
+    return dict(
+        hidden_size=hidden_size,
+        intermediate_size=2 * hidden_size,
+        num_hidden_layers=num_layers,
+        num_attention_heads=4, num_key_value_heads=2,
+        max_position_embeddings=256,
+        vocab_size=1,
+        use_sliding_window=True,
+        sliding_window=32,
+        max_window_layers=0,
+        rope_theta=1_000_000.0,
+        rms_norm_eps=1e-6,
+        hidden_act="silu",
+    )
+
+
+def _tiny_talker_config() -> TalkerConfig:
+    return TalkerConfig(
+        steps=2,
+        patch_size=2,
+        history_patch_size=2,
+        cfg_strength=2.0,
+        llm=TalkerLLMConfig(
+            vocab_size=32, hidden_size=32, intermediate_size=64,
+            num_hidden_layers=1, num_attention_heads=4, num_key_value_heads=2,
+            max_position_embeddings=128, sliding_window=64, max_window_layers=0,
+            use_sliding_window=False,
+        ),
+        flowmodel=DiTBlockConfig(
+            depth=1, hidden_size=32, num_heads=2, mlp_ratio=2,
+            in_channels=4, dropout=0.0, attn_mask_enabled=False,
+        ),
+        aggregator=DiTBlockConfig(
+            depth=1, hidden_size=32, num_heads=2, mlp_ratio=2,
+            in_channels=4, dropout=0.0, attn_mask_enabled=False,
+        ),
+        vae=AudioVAEConfig(
+            sample_rate=8000, patch_size=-1, latent_dim=4,
+            encoder_input_dim=16, encoder_hop_size=16, decoder_output_dim=16,
+            enc_backbone=_tiny_qwen2_backbone(), dec_backbone=_tiny_qwen2_backbone(),
+        ),
+    )
+
+
+def _tiny_model_config() -> MingFlashOmniModelConfig:
+    return MingFlashOmniModelConfig(
+        local_dir="",
+        mlp_depth=2,
+        thinker_llm=ThinkerLLMConfig(),
+        vision=VisionEncoderConfig(),
+        audio_encoder=AudioEncoderConfig(),
+        talker=_tiny_talker_config(),
+    )
+
+
+def _build_tiny_submodule() -> TalkerSubmodule:
+    cfg = _tiny_talker_config()
+    llm = build_talker_llm(cfg.llm, dtype=torch.float32, device="cpu")
+    cfm = build_talker_cfm(cfg, dtype=torch.float32, device="cpu")
+    agg = build_aggregator(cfg, dtype=torch.float32, device="cpu")
+    heads = build_talker_heads(cfg, dtype=torch.float32, device="cpu")
+    vae = build_audio_vae(cfg.vae, dtype=torch.float32, device="cpu", attn_implementation="sdpa")
+    gen = TalkerGenerator(
+        talker_config=cfg, llm=llm, cfm=cfm, aggregator=agg,
+        stop_head=heads["stop_head"], audio_vae=vae,
+    )
+    model_cfg = _tiny_model_config()
+    return TalkerSubmodule(generator=gen, config=model_cfg, max_steps=3, min_new_token=1000)
+
+
+# ---------------------------------------------------------------------------
+# TalkerSubmodule — prepare_inputs / forward
+# ---------------------------------------------------------------------------
+
+
+def test_talker_submodule_stateless_flavor_is_audio_codec() -> None:
+    sub = _build_tiny_submodule()
+    assert sub.get_stateless_flavor() == "audio_codec"
+
+
+def test_talker_submodule_prepare_inputs_embeds_text_ids() -> None:
+    sub = _build_tiny_submodule()
+    token_ids = torch.tensor([1, 2, 3, 4], dtype=torch.long)
+    out = sub.prepare_inputs(
+        graph_walk="talker", fwd_info=None,
+        inputs={"talker_text_inputs": [token_ids]},
+    )
+    embeds = out.tensor_inputs["inputs_embeds"]
+    # (1, T, hidden) after embedding.
+    assert embeds.shape == (1, 4, sub.config.talker.llm.hidden_size)
+    assert out.tensor_inputs["prompt_wav_lat"] is None
+
+
+def test_talker_submodule_prepare_inputs_raises_on_missing_text() -> None:
+    sub = _build_tiny_submodule()
+    with pytest.raises(ValueError, match="missing 'talker_text_inputs'"):
+        sub.prepare_inputs(graph_walk="talker", fwd_info=None, inputs={})
+
+
+def test_talker_submodule_forward_returns_audio_chunk() -> None:
+    """End-to-end tiny generation: text ids -> waveform."""
+    sub = _build_tiny_submodule()
+    token_ids = torch.tensor([1, 2, 3], dtype=torch.long)
+    prep = sub.prepare_inputs(
+        graph_walk="talker", fwd_info=None,
+        inputs={"talker_text_inputs": [token_ids]},
+    )
+    out = sub.forward(
+        graph_walk="talker", engine_inputs=None,
+        inputs_embeds=prep.tensor_inputs["inputs_embeds"],
+        prompt_wav_lat=prep.tensor_inputs["prompt_wav_lat"],
+    )
+    assert "audio_chunk" in out
+    wf = out["audio_chunk"][0]
+    assert wf.dim() == 3
+    assert wf.shape[0] == 1 and wf.shape[1] == 1
+    assert torch.isfinite(wf).all()
+
+
+def test_talker_submodule_prepare_inputs_accepts_2d_token_ids() -> None:
+    """Already-batched (1, T) token ids work too (no double-unsqueeze)."""
+    sub = _build_tiny_submodule()
+    token_ids = torch.tensor([[5, 6, 7]], dtype=torch.long)
+    out = sub.prepare_inputs(
+        graph_walk="talker", fwd_info=None,
+        inputs={"talker_text_inputs": [token_ids]},
+    )
+    assert out.tensor_inputs["inputs_embeds"].shape == (1, 3, sub.config.talker.llm.hidden_size)
+
+
+# ---------------------------------------------------------------------------
+# Model node registration
+# ---------------------------------------------------------------------------
+
+
+def test_get_node_engine_types_registers_talker_when_config_present() -> None:
+    inst = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    inst.config = _tiny_model_config()
+    types = inst.get_node_engine_types()
+    assert types["Talker"] == EngineType.STATELESS
+    assert types["Thinker"] == EngineType.KV_CACHE
+
+
+def test_get_node_engine_types_omits_talker_for_thinker_only() -> None:
+    inst = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    cfg = _tiny_model_config()
+    cfg.talker = None
+    inst.config = cfg
+    types = inst.get_node_engine_types()
+    assert "Talker" not in types
+
+
+def test_get_submodule_talker_raises_without_talker_config() -> None:
+    inst = MingFlashOmniModel.__new__(MingFlashOmniModel)
+    cfg = _tiny_model_config()
+    cfg.talker = None
+    inst.config = cfg
+    inst._submodule_cache = {}
+    with pytest.raises(RuntimeError, match="no talker/ subdir"):
+        inst._create_talker_submodule(device="cpu")
+
+
+# ---------------------------------------------------------------------------
+# Snapshot-gated end-to-end construction + generation
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(
+    _find_local_snapshot() is None,
+    reason="Need Ming-flash-omni-2.0 snapshot with talker/.",
+)
+def test_create_talker_submodule_loads_real_weights_and_generates() -> None:
+    """Full talker construction (real weights) + tiny TTS generation.
+
+    Heavy (~5 GB on CPU). Builds the LLM + CFM + Aggregator + heads +
+    AudioVAE, loads every subtree via the step-6f loaders, and runs a
+    short generation to confirm the wiring produces a finite waveform.
+    """
+    snap = _find_local_snapshot()
+    code_dir = os.environ.get("MING_CODE_DIR", "/tmp/ming_repo")
+    model = MingFlashOmniModel(model_path_hf=snap, ming_code_dir=code_dir)
+
+    # bf16 on CPU is slow for matmuls; override autocast dtype to fp32
+    # for the test by monkeypatching get_autocast_dtype.
+    model.get_autocast_dtype = lambda: torch.float32  # type: ignore
+
+    sub = model.get_submodule("Talker", device="cpu")
+    assert isinstance(sub, TalkerSubmodule)
+
+    # Cap generation hard so the test is fast.
+    sub.max_steps = 2
+    sub.min_new_token = 1000   # force max_steps cap (no early stop)
+
+    # A short token sequence in the talker LLM's vocab.
+    token_ids = torch.tensor([1, 2, 3, 4, 5], dtype=torch.long)
+    prep = sub.prepare_inputs(
+        graph_walk="talker", fwd_info=None,
+        inputs={"talker_text_inputs": [token_ids]},
+    )
+    out = sub.forward(
+        graph_walk="talker", engine_inputs=None,
+        inputs_embeds=prep.tensor_inputs["inputs_embeds"],
+        prompt_wav_lat=None,
+    )
+    wf = out["audio_chunk"][0]
+    assert wf.dim() == 3 and wf.shape[1] == 1
+    assert torch.isfinite(wf).all()
diff --git a/test/modular/test_ming_flash_omni_tokenizer.py b/test/modular/test_ming_flash_omni_tokenizer.py
new file mode 100644
index 00000000..9fc2ebb9
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_tokenizer.py
@@ -0,0 +1,312 @@
+"""Tokenizer + processor wiring tests for Ming-flash-omni-2.0.
+
+These tests require ALL of:
+  1. The released HF snapshot under ``~/.cache/huggingface/hub/`` (or
+     ``MING_FLASH_OMNI_DIR`` env override)
+  2. A clone of https://github.com/inclusionAI/Ming locatable via the
+     ``MING_CODE_DIR`` env var (or under ``./Ming`` / ``/tmp/ming_repo``)
+  3. Python deps from Ming's requirements (``opencv-python-headless``,
+     ``openai-whisper``)
+
+Tests skip cleanly when any of these is missing, so CI / dev environments
+without the full Ming setup still pass.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+from mstar.model.ming_omni_flash.ming_omni_flash_model import (
+    _find_ming_code_dir,
+    _prepare_tokenizer_dir,
+    _resolve_local_hf_snapshot,
+)
+
+
+def _find_local_snapshot() -> str | None:
+    """Locate the Ming-flash-omni-2.0 snapshot on disk, or None."""
+    override = os.environ.get("MING_FLASH_OMNI_DIR")
+    if override and (Path(override) / "config.json").exists():
+        return override
+
+    # Honour HF_HUB_CACHE / HF_HOME before the default ~/.cache location.
+    hf_home = os.environ.get("HF_HOME") or Path.home() / ".cache" / "huggingface"
+    hub_root = Path(os.environ.get("HF_HUB_CACHE") or Path(hf_home) / "hub")
+    repo_dir = hub_root / "models--inclusionAI--Ming-flash-omni-2.0" / "snapshots"
+    if not repo_dir.is_dir():
+        return None
+    snaps = (s for s in sorted(repo_dir.iterdir()) if (s / "config.json").exists())
+    return next(map(str, snaps), None)
+
+
+@pytest.fixture(scope="module")
+def snapshot_dir() -> str:
+    snap = _find_local_snapshot()
+    if snap is None:
+        pytest.skip(
+            "Ming-flash-omni-2.0 snapshot not found. Set MING_FLASH_OMNI_DIR "
+            "or download with `huggingface-cli download "
+            "inclusionAI/Ming-flash-omni-2.0`."
+        )
+    return snap
+
+
+@pytest.fixture(scope="module")
+def ming_code_dir() -> str:
+    code = _find_ming_code_dir()
+    if code is None:
+        pytest.skip(
+            "Ming source repo not found. Set MING_CODE_DIR=<path/to/Ming> or "
+            "git clone https://github.com/inclusionAI/Ming to ./Ming or "
+            "/tmp/ming_repo. The HF checkpoint ships only weights — the "
+            "tokenizer/processor Python modules live in the source repo."
+        )
+    return code
+
+
+@pytest.fixture(scope="module")
+def staged_snapshot(snapshot_dir: str, ming_code_dir: str) -> str:
+    """Stage Ming source files alongside the snapshot, add snapshot to sys.path."""
+    _prepare_tokenizer_dir(snapshot_dir, ming_code_dir)
+    if snapshot_dir not in sys.path:
+        sys.path.insert(0, snapshot_dir)
+    return snapshot_dir
+
+
+@pytest.fixture(scope="module")
+def tokenizer(staged_snapshot: str):
+    try:
+        from transformers import AutoTokenizer
+    except ImportError as e:
+        pytest.skip(f"transformers not importable: {e}")
+    try:
+        return AutoTokenizer.from_pretrained(staged_snapshot, trust_remote_code=True)
+    except ImportError as e:
+        pytest.skip(
+            f"Ming tokenizer requires extra Python deps that are missing: {e}. "
+            f"Run `pip install opencv-python-headless openai-whisper`."
+        )
+
+
+@pytest.fixture(scope="module")
+def processor(staged_snapshot: str):
+    try:
+        from transformers import AutoProcessor
+    except ImportError as e:
+        pytest.skip(f"transformers not importable: {e}")
+    try:
+        return AutoProcessor.from_pretrained(staged_snapshot, trust_remote_code=True)
+    except ImportError as e:
+        pytest.skip(
+            f"Ming processor requires extra Python deps that are missing: {e}. "
+            f"Run `pip install opencv-python-headless openai-whisper`."
+        )
+
+
+# ---------------------------------------------------------------------------
+# Tokenizer
+# ---------------------------------------------------------------------------
+
+
+def test_tokenizer_loads_with_expected_class_and_vocab(tokenizer) -> None:
+    """BailingTokenizer loads with vocab_size matching the released ckpt
+    (157179, slightly below config.llm_config.vocab_size=157184; the 5-token
+    gap is multimodal sentinels added at model-init time)."""
+    assert type(tokenizer).__name__ == "BailingTokenizer"
+    assert tokenizer.vocab_size == 157179
+    # EOS = pad = <|role_end|> on this ckpt; the chat template uses it as
+    # the role-block terminator.
+    assert tokenizer.eos_token_id == 156895
+    assert tokenizer.pad_token_id == 156895
+
+
+def test_multimodal_special_tokens_decode_to_expected_strings(tokenizer) -> None:
+    """The multimodal token IDs we hard-code in ThinkerLLMConfig must decode
+    to the expected sentinel strings — regression guard against vocab drift
+    or wrong ID assumptions in the prefill processor (step 5)."""
+    expected = {
+        157157: "<imagePatch>",
+        157158: "<image>",
+        157159: "</image>",
+        157175: "<framePatch>",
+    }
+    for tid, expected_str in expected.items():
+        decoded = tokenizer.decode([tid])
+        assert decoded == expected_str, (
+            f"token {tid}: expected {expected_str!r}, got {decoded!r}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Processor + chat template
+# ---------------------------------------------------------------------------
+
+
+def test_processor_loads_with_chat_template_and_gen_terminator(processor) -> None:
+    """BailingMM2Processor exposes the methods step-7 (process_prompt) needs."""
+    assert type(processor).__name__ == "BailingMM2Processor"
+    assert hasattr(processor, "apply_chat_template")
+    assert hasattr(processor, "process_vision_info")
+    # gen_terminator drives generate()'s stop condition; must equal the
+    # tokenizer's eos_token_id.
+    assert processor.gen_terminator == [156895]
+
+
+def test_chat_template_emits_role_blocks(processor) -> None:
+    """The Ming chat template renders explicit ``<role>...</role>`` blocks
+    terminated by ``<|role_end|>``. Required for the benchmark and the
+    eventual process_prompt port to construct prompts the model recognises.
+    """
+    text = processor.apply_chat_template(
+        [{"role": "HUMAN", "content": [{"type": "text", "text": "Hello."}]}],
+        sys_prompt_exp=None,
+        use_cot_system_prompt=False,
+    )
+    # Default sys prompt is auto-inserted when sys_prompt_exp is None.
+    assert "<role>SYSTEM</role>" in text
+    assert "<role>HUMAN</role>Hello." in text
+    # Trailing ASSISTANT block primes the model to generate.
+    assert text.endswith("<role>ASSISTANT</role>")
+    assert "<|role_end|>" in text
+
+
+def test_processor_apply_chat_template_rejects_openai_lowercase_roles(processor) -> None:
+    """Ming's Python-side ``BailingMM2Processor.apply_chat_template``
+    asserts ``role in [HUMAN, ASSISTANT]``; the native mstar
+    ``process_prompt`` (step 7) uses this path for multimodal preprocessing
+    and must remap roles explicitly. (The benchmark side instead uses
+    ``tokenizer.apply_chat_template``, which accepts OpenAI roles via jinja.)
+    """
+    # Same content shape as the passing HUMAN tests: only the role is invalid.
+    with pytest.raises((AssertionError, ValueError, KeyError)):
+        processor.apply_chat_template(
+            [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}],
+            sys_prompt_exp=None,
+            use_cot_system_prompt=False,
+        )
+
+
+def test_tokenizer_apply_chat_template_accepts_openai_roles(tokenizer) -> None:
+    """The jinja chat_template in ``tokenizer_config.json`` DOES handle
+    OpenAI standard ``user`` / ``assistant`` / ``system`` roles, remapping
+    them to ``HUMAN`` / ``ASSISTANT`` / ``SYSTEM`` inside the template.
+    vllm-omni's serving path renders prompts via
+    ``tokenizer.apply_chat_template``, so the benchmark adapter can send
+    standard OpenAI message shapes unchanged. Regression guard against the
+    chat_template field being stripped or replaced upstream.
+    """
+    text = tokenizer.apply_chat_template(
+        [{"role": "system", "content": "Be brief."},
+         {"role": "user", "content": "Hi"}],
+        tokenize=False, add_generation_prompt=True,
+    )
+    # Even though the input role was lowercase, the rendered prompt uses
+    # Ming's uppercase role blocks.
+    assert "<role>SYSTEM</role>" in text
+    assert "Be brief." in text
+    assert "<role>HUMAN</role>Hi" in text
+    assert text.endswith("<role>ASSISTANT</role>")
+
+
+def test_chat_template_cot_system_prompt_differs(processor) -> None:
+    """``use_cot_system_prompt=True`` swaps the default system block from
+    ``detailed thinking off`` to ``detailed thinking on`` — used by the
+    talker for chain-of-thought prompts and (later) by the reasoning path."""
+    off = processor.apply_chat_template(
+        [{"role": "HUMAN", "content": [{"type": "text", "text": "Hi"}]}],
+        sys_prompt_exp=None,
+        use_cot_system_prompt=False,
+    )
+    on = processor.apply_chat_template(
+        [{"role": "HUMAN", "content": [{"type": "text", "text": "Hi"}]}],
+        sys_prompt_exp=None,
+        use_cot_system_prompt=True,
+    )
+    assert "detailed thinking off" in off
+    assert "detailed thinking on" in on
+    assert off != on
+
+
+# ---------------------------------------------------------------------------
+# Staging helpers
+# ---------------------------------------------------------------------------
+
+
+def test_find_ming_code_dir_picks_up_env_override(monkeypatch, tmp_path) -> None:
+    """MING_CODE_DIR env override beats any other discovery path, as long
+    as it points at a directory containing configuration_bailingmm2.py."""
+    fake = tmp_path / "ming_fake"
+    fake.mkdir()
+    (fake / "configuration_bailingmm2.py").write_text("# fake\n")
+    monkeypatch.setenv("MING_CODE_DIR", str(fake))
+    found = _find_ming_code_dir()
+    assert found == str(fake.resolve())
+
+
+def test_find_ming_code_dir_returns_none_when_nothing_set(monkeypatch, tmp_path) -> None:
+    """No env override + no Ming/ in cwd + no /tmp/ming_repo + no sys.path
+    candidates → None. (We chdir to an empty tmp dir to neutralise ./Ming
+    discovery, and clear PYTHONPATH-flavored sys.path entries.)"""
+    monkeypatch.delenv("MING_CODE_DIR", raising=False)
+    monkeypatch.chdir(tmp_path)
+    # Snapshot a clean sys.path without any Ming-bearing entries.
+    monkeypatch.setattr(
+        sys, "path",
+        [p for p in sys.path
+         if not (p and (Path(p) / "configuration_bailingmm2.py").exists())],
+    )
+    # /tmp/ming_repo may really exist on a dev box, and masking it by
+    # monkeypatching Path.exists isn't trivial. So a non-None result is
+    # tolerated, provided it is one of the fixed fallback dirs (see below).
+    found = _find_ming_code_dir()
+    if found is not None:
+        # Confirm it came from one of the fixed fallback dirs we explicitly
+        # checked, not from a polluted sys.path entry — that's the property
+        # we actually care about.
+        assert found in {
+            str(Path("./Ming").resolve()),
+            str(Path("/tmp/ming_repo").resolve()),
+        }
+
+
+def test_resolve_local_hf_snapshot_returns_string() -> None:
+    """The snapshot resolver should produce a string path; if the HF download
+    fails it falls back to the repo id verbatim, which is still a str."""
+    out = _resolve_local_hf_snapshot("inclusionAI/Ming-flash-omni-2.0")
+    assert isinstance(out, str)
+    assert len(out) > 0
+
+
+# ---------------------------------------------------------------------------
+# Documents the discovered constraints — failure here means the upstream
+# released ckpt changed shape and the rest of the port needs revisiting.
+# ---------------------------------------------------------------------------
+
+
+def test_snapshot_has_no_top_level_tokenizer_files(snapshot_dir: str) -> None:
+    """Sanity-snapshot the discovery that motivates the
+    ``_prepare_tokenizer_dir`` helper: the released checkpoint ships NO
+    top-level tokenizer/processor Python or json files. If this ever stops
+    being true (HF releases a self-contained variant), simplify the loader.
+    """
+    snap = Path(snapshot_dir)
+    # If any of these are real (non-symlinked) files, the snapshot has
+    # changed and we can stop bothering with the symlink dance.
+    for name in (
+        "tokenizer.json", "tokenizer_config.json",
+        "processor_config.json", "tokenization_bailing.py",
+        "configuration_bailingmm2.py",
+    ):
+        p = snap / name
+        # Symlinks are OK (means a previous test staged), but a real file
+        # would indicate a new release shape.
+        if p.is_file() and not p.is_symlink():
+            pytest.fail(
+                f"Snapshot now contains real (non-symlinked) {name}; "
+                f"_MING_CODE_FILES staging may be redundant — re-validate "
+                f"the loader."
+            )
diff --git a/test/modular/test_ming_flash_omni_zimage_dit.py b/test/modular/test_ming_flash_omni_zimage_dit.py
new file mode 100644
index 00000000..92dd4a24
--- /dev/null
+++ b/test/modular/test_ming_flash_omni_zimage_dit.py
@@ -0,0 +1,249 @@
+"""Tests for the ZImage DiT transformer (step 9b).
+
+Pure-Python on CPU with a tiny config (the released model is dim=3840/30L —
+far too large to instantiate in CI). Covers:
+
+  * the interleaved (is_neox_style=False) RoPE matches vllm-omni's reference
+    ``apply_rotary_emb_torch`` exactly (the one numeric correctness anchor that
+    must agree with the validated serving path);
+  * RMSNorm / timestep_embedding match the upstream formulas;
+  * patchify pads image tokens to a SEQ_MULTI_OF multiple and reports (F, H, W);
+  * a full tiny forward runs and returns one latent per batch item with the
+    input image shape;
+  * the unfused checkpoint layout (to_q/to_k/to_v, w1/w3) loads via a direct
+    copy with no stacked-param remap;
+  * Ming's reference-latent subclass concatenates the ref frame and drops it
+    from the output.
+"""
+
+from __future__ import annotations
+
+import torch
+
+from mstar.model.ming_omni_flash.components.zimage_transformer import (
+    SEQ_MULTI_OF,
+    MingZImageTransformer2DModel,
+    RMSNorm,
+    RopeEmbedder,
+    ZImageTransformer2DModel,
+    apply_rotary_emb_interleaved,
+    timestep_embedding,
+)
+
+
+def _tiny_model(cls=ZImageTransformer2DModel) -> ZImageTransformer2DModel:
+    tiny_config = dict(
+        all_patch_size=(2,),
+        all_f_patch_size=(1,),
+        in_channels=4,
+        dim=16,
+        n_layers=2,
+        n_refiner_layers=1,
+        n_heads=2,
+        n_kv_heads=2,
+        norm_eps=1e-5,
+        cap_feat_dim=12,
+        rope_theta=256.0,
+        axes_dims=(2, 4, 2),  # sums to 8 (= head_dim) so RoPE covers the head
+        axes_lens=(256, 256, 256),
+    )
+    return cls(**tiny_config).eval()
+
+
+# ---------------------------------------------------------------------------
+# RoPE numeric parity with vllm-omni's reference
+# ---------------------------------------------------------------------------
+
+
+def test_interleaved_rope_matches_vllm_reference() -> None:
+    from einops import rearrange, repeat
+
+    def rotate_half(t):
+        # Interleaved layout: pair (even, odd) lanes and map them to (-odd, even).
+        evens, odds = t[..., ::2], t[..., 1::2]
+        return rearrange(torch.stack((-odds, evens), dim=-1), "... d two -> ... (d two)", two=2)
+
+    def ref(x, cos, sin):
+        ro_dim = cos.shape[-1] * 2
+        head, tail = x[..., :ro_dim], x[..., ro_dim:]
+        cos_r = repeat(cos, "... d -> ... 1 (d 2)")
+        sin_r = repeat(sin, "... d -> ... 1 (d 2)")
+        rotated = head * cos_r + rotate_half(head) * sin_r
+        return torch.cat([rotated, tail], dim=-1)
+
+    torch.manual_seed(0)
+    x = torch.randn(2, 5, 3, 8)
+    cos = torch.randn(2, 5, 4)
+    sin = torch.randn(2, 5, 4)
+    assert torch.allclose(ref(x, cos, sin), apply_rotary_emb_interleaved(x, cos, sin), atol=1e-6)
+
+
+def test_rope_partial_dim_leaves_tail_untouched() -> None:
+    x = torch.randn(1, 3, 2, 8)
+    # Two half-frequencies -> ro_dim = 4, so lanes 4..7 of head_dim 8 must pass through.
+    cos, sin = torch.randn(1, 3, 2), torch.randn(1, 3, 2)
+    rotated = apply_rotary_emb_interleaved(x, cos, sin)
+    assert torch.allclose(rotated[..., 4:], x[..., 4:])
+
+
+def test_rope_embedder_axes_concatenate_to_half_head() -> None:
+    embedder = RopeEmbedder(theta=256.0, axes_dims=(2, 4, 2), axes_lens=(8, 8, 8))
+    position_ids = torch.tensor([[0, 0, 0], [1, 2, 3]])
+    cos, sin = embedder(position_ids)
+    # Each axis contributes d/2 frequencies: 1 + 2 + 1 = 4 per position.
+    assert tuple(cos.shape) == tuple(sin.shape) == (2, 4)
+    # The all-zero position has no rotation: cos=1, sin=0.
+    assert torch.allclose(cos[0], torch.ones(4))
+    assert torch.allclose(sin[0], torch.zeros(4))
+
+
+# ---------------------------------------------------------------------------
+# Primitive parity
+# ---------------------------------------------------------------------------
+
+
+def test_rmsnorm_matches_manual_fp32() -> None:
+    norm = RMSNorm(8, eps=1e-6)
+    with torch.no_grad():
+        norm.weight.normal_()  # randomize the gain so the weight multiply is exercised
+    x = torch.randn(2, 3, 8)
+    inv_rms = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + 1e-6)
+    expected = (norm.weight.float() * (x.float() * inv_rms)).to(x.dtype)
+    assert torch.allclose(norm(x), expected, atol=1e-6)
+
+
+def test_timestep_embedding_shape_and_formula() -> None:
+    timesteps = torch.tensor([0.0, 1.0, 5.0])
+    emb = timestep_embedding(timesteps, dim=16)
+    assert emb.shape == (3, 16)
+    # Row 0 is t=0: first half (cos block) all ones, second half (sin block) all zeros.
+    expected_row0 = torch.cat([torch.ones(8), torch.zeros(8)])
+    assert torch.allclose(emb[0], expected_row0)
+
+
+def test_timestep_embedding_odd_dim_zero_pad() -> None:
+    odd_dim = 7  # odd width: the trailing column is expected to be zero padding
+    emb = timestep_embedding(torch.tensor([1.0]), dim=odd_dim)
+    assert emb.shape == (1, odd_dim) and emb[0, -1].item() == 0.0
+
+
+# ---------------------------------------------------------------------------
+# Patchify / unpatchify roundtrip
+# ---------------------------------------------------------------------------
+
+
+def test_patchify_unpatchify_roundtrip_shape() -> None:
+    model = _tiny_model()
+    # One [C=4, F=1, H=4, W=4] latent; patch_size=2 gives 2x2 = 4 image tokens.
+    latents = [torch.randn(4, 1, 4, 4)]
+    captions = [torch.randn(SEQ_MULTI_OF, 12)]
+    image_tokens, _cap, sizes, *_rest = model.patchify_and_embed(latents, captions, patch_size=2, f_patch_size=1)
+    assert sizes == [(1, 4, 4)]
+    # Token count is padded up to a multiple of SEQ_MULTI_OF.
+    n_tokens, token_width = image_tokens[0].shape[:2]
+    assert n_tokens % SEQ_MULTI_OF == 0 and token_width == 2 * 2 * 1 * 4  # pf*ph*pw*C
+
+
+# ---------------------------------------------------------------------------
+# Full tiny forward
+# ---------------------------------------------------------------------------
+
+
+def test_forward_returns_latent_per_item() -> None:
+    model = _tiny_model()
+    with torch.no_grad():
+        for param in model.parameters():
+            param.normal_(std=0.02)
+        latent = torch.randn(4, 1, 4, 4)
+        caption = torch.randn(20, 12)
+        timestep = torch.tensor([0.5])
+        outputs, _aux = model([latent], timestep, [caption], patch_size=2, f_patch_size=1)
+    # Single-item batch in -> a list holding exactly one latent out.
+    assert isinstance(outputs, list) and len(outputs) == 1
+    # The latent carries the model's out_channels and the input image's F, H, W.
+    assert outputs[0].shape == (model.out_channels, 1, 4, 4)
+
+
+def test_forward_batch_of_two() -> None:
+    model = _tiny_model()
+    with torch.no_grad():
+        for p in model.parameters():
+            p.normal_(std=0.02)
+    imgs = [torch.randn(4, 1, 4, 4), torch.randn(4, 1, 4, 4)]
+    caps = [torch.randn(18, 12), torch.randn(40, 12)]  # deliberately unequal caption lengths
+    t = torch.tensor([0.3, 0.7])
+    with torch.no_grad():
+        out, _ = model(imgs, t, caps, patch_size=2, f_patch_size=1)
+    # Check every item, not just the first: item 1 is the one with the longer caption.
+    assert [o.shape for o in out] == [(model.out_channels, 1, 4, 4)] * 2
+
+
+# ---------------------------------------------------------------------------
+# Weight load: unfused layout copies directly
+# ---------------------------------------------------------------------------
+
+
+def test_load_weights_unfused_roundtrip() -> None:
+    src, dst = _tiny_model(), _tiny_model()  # src gets random weights; dst must receive them
+    with torch.no_grad():
+        for p in src.parameters():
+            p.normal_()
+    loaded = dst.load_weights(src.state_dict().items())
+    src_params, dst_params = dict(src.named_parameters()), dict(dst.named_parameters())
+    assert loaded == set(dst_params)  # every parameter name must be reported as loaded
+    for name, p in dst_params.items():
+        assert torch.allclose(p, src_params[name]), name  # name pinpoints the bad tensor
+
+
+def test_param_names_are_unfused() -> None:
+    param_names = {name for name, _ in _tiny_model().named_parameters()}
+    # Unfused attention (q/k/v/out) and feed-forward (w1/w3/w2) projections.
+    required = {
+        "layers.0.attention.to_q.weight", "layers.0.attention.to_k.weight",
+        "layers.0.attention.to_v.weight", "layers.0.attention.to_out.0.weight",
+        "layers.0.feed_forward.w1.weight", "layers.0.feed_forward.w3.weight",
+        "layers.0.feed_forward.w2.weight",
+    }
+    assert required <= param_names
+    # Fused projection names must not appear anywhere.
+    assert all("to_qkv" not in name and "w13" not in name for name in param_names)
+
+
+def test_load_weights_shape_mismatch_raises() -> None:
+    """``load_weights`` must raise ``ValueError`` for a tensor of mismatched shape."""
+    import pytest
+    model, bad_weights = _tiny_model(), {"x_pad_token": torch.zeros(1, 999)}
+    with pytest.raises(ValueError, match="Shape mismatch"):
+        model.load_weights(bad_weights.items())
+
+
+# ---------------------------------------------------------------------------
+# Ming reference-latent subclass
+# ---------------------------------------------------------------------------
+
+
+def test_ming_ref_latent_concats_and_drops_frame() -> None:
+    model = _tiny_model(cls=MingZImageTransformer2DModel)
+    with torch.no_grad():
+        for param in model.parameters():
+            param.normal_(std=0.02)
+        latent = torch.randn(4, 1, 4, 4)
+        reference = torch.randn(4, 4, 4)  # [C, H, W] -> becomes one extra frame
+        caption = torch.randn(20, 12)
+        forward_kwargs = dict(patch_size=2, f_patch_size=1, ref_latent=[reference])
+        outputs, _ = model([latent], torch.tensor([0.5]), [caption], **forward_kwargs)
+    # Only the first (non-reference) frame is expected back.
+    expected_shape = (model.out_channels, 1, 4, 4)
+    assert outputs[0].shape == expected_shape
+
+
+def test_ming_without_ref_latent_is_plain_t2i() -> None:
+    """Without ``ref_latent`` the Ming subclass still returns one (out_channels, 1, 4, 4) latent."""
+    model = _tiny_model(cls=MingZImageTransformer2DModel)
+    with torch.no_grad():
+        for param in model.parameters():
+            param.normal_(std=0.02)
+        latents, captions = [torch.randn(4, 1, 4, 4)], [torch.randn(20, 12)]
+        timestep = torch.tensor([0.5])
+        outputs, _ = model(latents, timestep, captions, patch_size=2, f_patch_size=1)
+    assert outputs[0].shape == (model.out_channels, 1, 4, 4)