mstar-project · zhudianGG · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/benchmark/base.py b/benchmark/base.py
@@ -138,7 +138,7 @@
        return "canopylabs/orpheus-3b-0.1-ft"

    def get_supported_modalities(self):
        return {RequestType.T2S}


 class Qwen3Omni(Model):
@@ -214,6 +214,78 @@
         }
 
 
+class MingFlashOmni(Model):
+    """Ming-flash-omni-2.0 (inclusionAI), the Ling-2.0 sparse-MoE omni model
+    (100B total / 6B active params) released 2026-02-11.
+
+    Reachable today via the vllm-omni server using
+    ``vllm_omni/deploy/ming_flash_omni.yaml`` (thinker+talker) or
+    ``ming_flash_omni_thinker_only.yaml`` (text-only). The native ``ours`` /
+    ``ours_openai`` backends will work once the mminf-side port under
+    ``mminf/model/ming_omni_flash/`` is finished — until then, point the
+    benchmark at a vllm-omni instance with ``--inference-system vllm_omni``.
+
+    Wire shape mirrors :class:`Qwen3Omni`: standard OpenAI
+    ``/v1/chat/completions`` with multimodal content parts. The role remap
+    from OpenAI's ``user``/``assistant``/``system`` to Ming's internal
+    ``HUMAN``/``ASSISTANT``/``SYSTEM`` happens inside the jinja chat_template
+    shipped in ``tokenizer_config.json`` — vllm-omni renders prompts via
+    ``tokenizer.apply_chat_template``, which uses that jinja, so the benchmark
+    sends the standard OpenAI shape unchanged.
+
+    Caveat: Ming ALSO ships a Python-side ``BailingMM2Processor.apply_chat_template``
+    (in the Ming source repo) that is strict about uppercase roles and would
+    raise AssertionError on ``user``/``assistant``. mminf's native port uses
+    that processor for full multimodal preprocessing (vision/audio feature
+    extraction) and remaps roles in ``process_prompt`` accordingly — see
+    ``mminf/model/ming_omni_flash/`` and its tokenizer tests.
+    """
+
+    def get_hf_url(self):
+        return "inclusionAI/Ming-flash-omni-2.0"
+
+    def get_openai_system_message(self) -> Optional[dict]:
+        # Ming-flash-omni-2.0's cookbook uses ``sys_prompt_exp=None`` and
+        # ``use_cot_system_prompt=False`` by default — there's no required
+        # "You are Ming…"-style preamble equivalent to Qwen3-Omni's. The HF
+        # processor's chat_template fills in any internal system text on its
+        # own, and vllm-omni's serving layer goes through that template via
+        # ``trust_remote_code``. Sending an explicit system message here only
+        # risks overriding the model's own defaults, so default to None.
+        return None
+
+    def get_model_kwargs(self, request_type: RequestType):
+        # Cap thinker output at 256 tokens for cross-system fairness — same
+        # rationale as Qwen3Omni: comparable runs need a fixed decode budget.
+        # vllm-omni's released stage default is ``max_tokens: 2048`` (see
+        # ``vllm_omni/deploy/ming_flash_omni.yaml`` stage 0); we lower it for
+        # benchmark parity. Send both ``max_tokens`` (OpenAI convention) and
+        # ``max_output_tokens`` (mminf's native kwarg) so the cap survives
+        # whichever ``--inference-system`` is in use. ``request_type`` is
+        # unused: every modality gets the same cap.
+        #
+        # No sampling params are set here: greedy thinker decoding
+        # (``temperature=0.0`` at payload top-level) is forced in
+        # VLLMOmni.send_request; the talker's defaults live server-side in the
+        # deploy yaml (``stage_id: 1`` → ``temperature: 0.0``, released config).
+        return {
+            "max_tokens": 256,
+            "max_output_tokens": 256,
+        }
+
+    def get_supported_modalities(self):
+        return {
+            RequestType.T2T,
+            RequestType.T2S,
+            RequestType.I2T,
+            RequestType.I2S,
+            RequestType.A2T,
+            RequestType.A2S,
+            RequestType.V2T,
+            RequestType.V2S,
+        }
+
+
 class Pi05(Model):
     """Physical Intelligence Pi0.5 VLA model.
 
@@ -268,6 +340,7 @@
     BAGEL = "bagel"
     ORPHEUS = "orpheus"
     QWEN3OMNI = "qwen3omni"
+    MING_FLASH_OMNI = "ming_flash_omni"
     PI05 = "pi05"
     VJEPA2AC = "vjepa2ac"
 
@@ -278,6 +351,8 @@
             return Orpheus(**kwargs)
         if self == ModelType.QWEN3OMNI:
             return Qwen3Omni(**kwargs)
+        if self == ModelType.MING_FLASH_OMNI:
+            return MingFlashOmni(**kwargs)
         if self == ModelType.PI05:
             return Pi05(**kwargs)
         if self == ModelType.VJEPA2AC:

diff --git a/benchmark/vllm_omni_instructions.md b/benchmark/vllm_omni_instructions.md
@@ -21,4 +21,93 @@ CUDA_VISIBLE_DEVICES=3 vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8000
 ### for qwen3-omni:
 ```
 vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
-```
+```
+
+### for ming-flash-omni-2.0:
+
+The released `inclusionAI/Ming-flash-omni-2.0` ckpt (~238 GB / 42 shards)
+does NOT load cleanly into vllm-omni's `MingFlashOmniForConditionalGeneration`
+class as-is. Two patches are needed (one-time setup):
+
+1. **Replace metadata files.** vllm-omni's model class uses
+   `Qwen2VLImageProcessor` + `MingWhisperFeatureExtractor` (its own
+   registered classes), while the inclusionAI snapshot declares the
+   `BailingMM2*` processor variants via `auto_map` and `trust_remote_code`.
+   Use `Jonathan1909/Ming-flash-omni-2.0`'s `preprocessor_config.json`,
+   `config.json` (auto_map stripped), and `tokenizer*.json` instead.
+
+2. **Replace the talker weights.** vllm-omni's `MingFlashOmniTalker` expects
+   weights under `audio_vae.*`, but the inclusionAI talker safetensors use
+   the `audio.*` prefix. Jonathan1909 reshipped the talker with renamed
+   weights (~1.5 GB).
+
+Building a hybrid snapshot avoids re-downloading the 200+ GB thinker weights:
+
+```bash
+# 1. Make sure the inclusionAI thinker shards are cached
+huggingface-cli download inclusionAI/Ming-flash-omni-2.0 \
+    --include="model-*.safetensors" --include="model.safetensors.index.json"
+
+# 2. Pull only Jonathan1909's metadata + talker (no thinker weights)
+huggingface-cli download Jonathan1909/Ming-flash-omni-2.0 \
+    --include="*.json" --include="*.py" --include="*.txt" --include="*.mvn" \
+    --include="talker/**" \
+    --cache-dir /dev/shm/hf-cache    # or any path with ~3 GB free
+
+# 3. Stitch the two together
+INCL=$(huggingface-cli scan-cache | grep inclusionAI/Ming-flash-omni-2.0 \
+       | awk '{print $NF}')/snapshots/$(ls ~/.cache/huggingface/hub/models--inclusionAI--Ming-flash-omni-2.0/snapshots | head -1)
+JONA=/dev/shm/hf-cache/models--Jonathan1909--Ming-flash-omni-2.0/snapshots/*
+HYBRID=/dev/shm/ming-hybrid
+mkdir -p $HYBRID
+for f in $INCL/model-*.safetensors; do ln -s "$f" "$HYBRID/$(basename $f)"; done
+for f in $JONA/*; do
+    base=$(basename "$f")
+    [ -L "$HYBRID/$base" ] && rm "$HYBRID/$base"
+    ln -s "$f" "$HYBRID/$base"
+done
+```
+
+Then serve and benchmark:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve /dev/shm/ming-hybrid \
+  --omni --port 8091 --host 0.0.0.0 --trust-remote-code \
+  --stage-configs-path /tmp/vllm-omni/vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml
+
+# Wait for "Application startup complete" then:
+MODEL=ming_flash_omni INF_SYS=vllm_omni TASK=text_to_text \
+  URL=http://0.0.0.0:8091 ./benchmark/run_benchmark.sh
+```
+
+NOTE: vllm-omni's `/v1/chat/completions` rejects unknown model ids, so the
+client must send `"model": "/dev/shm/ming-hybrid"` (the served path), not
+`"inclusionAI/Ming-flash-omni-2.0"`. Easiest is to monkey-patch
+`MingFlashOmni.get_hf_url` before calling the benchmark runner:
+
+```python
+from benchmark.base import MingFlashOmni
+MingFlashOmni.get_hf_url = lambda self: "/dev/shm/ming-hybrid"
+```
+
+Or pass `--served-model-name inclusionAI/Ming-flash-omni-2.0` to `vllm serve`
+(untested; would also work in principle).
+
+#### Modalities exercised on a local 4×H100 run (2026-06-06)
+
+| Task | Status | Notes |
+|---|---|---|
+| T2T (text → text) | ✅ | offline B=1: 110 tok/s, closed-loop C=32: **1060 tok/s** (full scaling sweep in [`results/ming_t2t_sweep/SUMMARY.md`](../results/ming_t2t_sweep/SUMMARY.md)) |
+| I2T (image → text) | ✅ | TTFT 87 ms, ~100 tok/s on Food101 |
+| A2T (audio → text) | ✅ | English transcription + Chinese audio QA both work |
+| T2S (text → speech) | ✅ | RTF 0.14, 24 kHz mono PCM via harness; 44.1 kHz via direct OpenAI path |
+| V2T (video → text) | ✅ | Local Ming demo mp4s; coherent descriptions (`yoga.mp4` → yoga pose narration, `cup_change.mp4` → "shell game") |
+| V2S (video → speech) | ✅ | Local Ming demo mp4s; 2-3 MB WAV/clip @ 44.1 kHz |
+| I2S (image → speech) | ✅ | Food101 in, ~7 s/req for ~48 s of audio |
+| A2S (audio → speech) | ✅ | Ming sample wavs; 0.5-3 MB WAV/clip @ 44.1 kHz |
+| T2I / I2I (image gen) | not wired | requires `ming_flash_omni_image.yaml` + a benchmark wrapper similar to BAGEL's `/v1/images/generations` path |
+
+The V2T/V2S/A2S runs sidestep the bench harness's `UCF101Dataset` and
+`LibriSpeechDataset` (both want fresh HF-Hub downloads) by hitting
+`/v1/chat/completions` directly with base64-inlined media from local files
+(Ming repo's `figures/cases/*.mp4` and `data/wavs/*.wav`).
diff --git a/configs/ming_flash_omni.yaml b/configs/ming_flash_omni.yaml
@@ -0,0 +1,31 @@
+# Ming-flash-omni-2.0 — thinker + talker + audio VAE.
+#
+# WIP: the native mminf model port at mminf/model/ming_omni_flash/ is a
+# scaffold (every abstractmethod raises NotImplementedError), so
+# `mminf-serve --config configs/ming_flash_omni.yaml` will fail at startup
+# until that port lands. Until then, benchmark Ming-flash-omni-2.0 via the
+# vllm-omni server (see benchmark/vllm_omni_instructions.md).
+#
+# Target topology mirrors vllm_omni/deploy/ming_flash_omni.yaml:
+#   * Thinker (Ling-2.0 sparse MoE LLM, the multimodal understanding core)
+#     wants TP=4 across GPUs 0-3 (thinker-only config saw TP=4 OOM; verify).
+#   * Talker (CFM-based audio generator) colocates on GPU 3.
+#   * Audio VAE (codec -> waveform) and stateless encoders (vision / audio)
+#     can ride on rank 0.
+#
+# Node names below are the placeholders the scaffold will reference; rename
+# in lockstep with mminf/model/ming_omni_flash/ming_omni_flash_model.py once
+# the graph walks are implemented.
+
+model: "ming_flash_omni"
+max_seq_len: 32768
+node_groups:
+  - node_names: [audio_encoder, vision_encoder, AudioVAE]
+    ranks: [0]
+
+  - node_names: [Thinker]
+    ranks: [0, 1, 2, 3]
+    tp_size: 4
+
+  - node_names: [Talker]
+    ranks: [3]
diff --git a/configs/ming_flash_omni_thinker_only.yaml b/configs/ming_flash_omni_thinker_only.yaml
@@ -0,0 +1,21 @@
+# Ming-flash-omni-2.0 — thinker-only deploy (text out, no talker).
+#
+# TP=8 across 8 H100s. Per-rank shard_inter = 1024/8 = 128;
+# experts.gate_up_proj is (256, 2*128, 4096) per rank, ~33 GB across
+# 31 MoE layers. With embed + lm_head + attention + dense layer 0 +
+# KV cache, ~40 GB per rank fits the 80 GB H100s comfortably.
+#
+# TP=4 OOMs at ~78.5 / 80 GB per rank even with
+# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (re-verified
+# 2026-06-08; loader streaming overhead pushes past the 80 GB limit).
+# TP=8 halves the per-rank footprint vs. TP=4, leaving plenty of headroom.
+#
+# Audio / vision / talker / image-gen are step 4+; this config is for
+# text-only T2T benchmarking and the first mminf-served Ming forward.
+
+model: "ming_flash_omni"
+max_seq_len: 32768
+node_groups:
+  - node_names: [Thinker]
+    ranks: [0, 1, 2, 3, 4, 5, 6, 7]
+    tp_size: 8
diff --git a/mminf/model/base.py b/mminf/model/base.py
@@ -253,19 +253,29 @@ def get_worker_graphs(self, config_path: str) -> list[WorkerGraph]:
         if node_groups is None:
             raise KeyError("Config must define `node_groups`.")
 
+        # Nodes this deploy actually provides. A graph walk referencing a
+        # node absent from node_groups (e.g. the encoder / talker walks in
+        # a thinker-only deploy) is skipped rather than KeyError'ing during
+        # worker-graph division — that deploy simply can't serve the walk.
+        available_nodes: set[str] = set()
+        for group in node_groups:
+            available_nodes.update(group["node_names"])
+
         # TODO: merge identical worker graphs from different graph walks
-        return sum(
-            [
+        worker_graphs: list[WorkerGraph] = []
+        for graph_walk, graph in self.get_graph_walk_graphs().items():
+            required = set(graph.get_nodes().keys())
+            if not required <= available_nodes:
+                continue
+            worker_graphs.extend(
                 self._get_worker_graphs_for_graph_walk(graph_walk, graph, node_groups)
-                for graph_walk, graph in self.get_graph_walk_graphs().items()
-            ],
-            start=[],
-        )
-
+            )
+        return worker_graphs
+
     def get_sharding_config(self, config_path: str) -> ShardingConfig:
         with open(config_path, "r") as f:
             config = yaml.safe_load(f)
-        
+
         sharding_config = self.get_default_sharding_config()
 
         # Derive sharding groups from node_groups with tp_size > 1. The