Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
cc2c305
ming_flash_omni: add benchmark wiring + native mminf scaffold (WIP)
zhudianGG Jun 6, 2026
3c04ac9
ming_flash_omni: port config from HF checkpoint layout
zhudianGG Jun 6, 2026
90d6042
ming_flash_omni: wire tokenizer + processor (step 2)
zhudianGG Jun 6, 2026
a9a0ed8
benchmark/ming: document hybrid-snapshot recipe + measured results
zhudianGG Jun 6, 2026
c90762f
benchmark/ming: T2T scaling sweep + full modality coverage
zhudianGG Jun 6, 2026
eff2b58
benchmark/ming: task-accuracy spot checks (MMLU + VideoMME)
zhudianGG Jun 6, 2026
45b8f9e
ming_flash_omni: Ling-2.0 architecture-novel components (step 3a)
zhudianGG Jun 8, 2026
971fe05
ming_flash_omni: Ling-2.0 MoE block + decoder layer + model (step 3b)
zhudianGG Jun 8, 2026
942486c
ming_flash_omni: weight loader + real-ckpt smoke test (step 3c)
zhudianGG Jun 8, 2026
bf62f5d
ming_flash_omni: cache wiring + ThinkerSubmodule + engine integration…
zhudianGG Jun 8, 2026
b941c0d
ming_flash_omni: TP-aware variants + TP=8 mminf-serve load (step 3e)
zhudianGG Jun 8, 2026
4559b32
ming_flash_omni: video_rope parity test + audio/vision token IDs
zhudianGG Jun 9, 2026
3f8f7cd
ming_flash_omni: text-only /generate end-to-end (step 3f)
zhudianGG Jun 9, 2026
e8fdce7
ming_flash_omni: vision + audio encoders + projectors (step 4a)
zhudianGG Jun 9, 2026
d1628f0
ming_flash_omni: vision/audio/projector weight loaders (step 4b)
zhudianGG Jun 9, 2026
6a5cee3
ming_flash_omni: vision + audio encoder submodules + node registratio…
zhudianGG Jun 9, 2026
a2c470f
ming_flash_omni: Thinker prefill_audio/prefill_vision dispatch + 3D p…
zhudianGG Jun 9, 2026
db6ad28
ming_flash_omni: multimodal prefill graph walks + scheduler (step 5c)
zhudianGG Jun 9, 2026
3b13615
ming_flash_omni: multimodal process_prompt for text + image + audio +…
zhudianGG Jun 9, 2026
780f643
model/base: skip graph walks whose nodes aren't in the deploy's node_…
zhudianGG Jun 10, 2026
b693988
ming_flash_omni: ruff lint fixes (PR1 — import sort, unused imports, …
zhudianGG Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions benchmark/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@
return "canopylabs/orpheus-3b-0.1-ft"

def get_supported_modalities(self):
return {RequestType.T2S}

Check failure on line 141 in benchmark/base.py

View workflow job for this annotation

GitHub Actions / build

ruff (W293)

benchmark/base.py:141:1: W293 Blank line contains whitespace help: Remove whitespace from blank line


class Qwen3Omni(Model):
Expand Down Expand Up @@ -214,6 +214,78 @@
}


class MingFlashOmni(Model):
    """Benchmark descriptor for inclusionAI's Ming-flash-omni-2.0.

    Ming-flash-omni-2.0 is the Ling-2.0 sparse-MoE omni model (100B total /
    6B active params), released 2026-02-11.

    Serving paths
        * Available now through the vllm-omni server, using either
          ``vllm_omni/deploy/ming_flash_omni.yaml`` (thinker+talker) or
          ``ming_flash_omni_thinker_only.yaml`` (text-only). Select it with
          ``--inference-system vllm_omni``.
        * The native ``ours`` / ``ours_openai`` backends depend on the
          mminf-side port under ``mminf/model/ming_omni_flash/`` being
          finished.

    Request shape
        Same as :class:`Qwen3Omni`: plain OpenAI ``/v1/chat/completions``
        with multimodal content parts. OpenAI roles
        (``user``/``assistant``/``system``) are translated to Ming's
        ``HUMAN``/``ASSISTANT``/``SYSTEM`` by the jinja chat_template in
        ``tokenizer_config.json``; vllm-omni renders prompts through
        ``tokenizer.apply_chat_template``, which applies that jinja, so the
        benchmark payload needs no role rewriting.

    Caveat
        The Ming source repo additionally ships a Python-side
        ``BailingMM2Processor.apply_chat_template`` that insists on
        uppercase roles and raises AssertionError for
        ``user``/``assistant``. The native mminf port relies on that
        processor for full multimodal preprocessing (vision/audio feature
        extraction) and therefore remaps roles in ``process_prompt`` — see
        ``mminf/model/ming_omni_flash/`` and its tokenizer tests.
    """

    def get_hf_url(self):
        """Return the Hugging Face repo id of the released checkpoint."""
        return "inclusionAI/Ming-flash-omni-2.0"

    def get_openai_system_message(self) -> Optional[dict]:
        """Return the system message to prepend to requests (none for Ming).

        The Ming-flash-omni-2.0 cookbook defaults to ``sys_prompt_exp=None``
        and ``use_cot_system_prompt=False``, i.e. there is no mandatory
        "You are Ming…"-style preamble like Qwen3-Omni's. Any internal system
        text is inserted by the HF processor's chat_template, which
        vllm-omni's serving layer reaches via ``trust_remote_code``. An
        explicit system message would only risk overriding those defaults.
        """
        return None

    def get_model_kwargs(self, request_type: RequestType):
        """Return the per-request generation kwargs used for benchmarking.

        ``request_type`` is accepted for interface parity and is not
        consulted: every modality gets the same decode budget.
        """
        # A fixed 256-token thinker budget keeps runs comparable across
        # systems (same reasoning as Qwen3Omni). vllm-omni's released stage
        # default is ``max_tokens: 2048`` (stage 0 of
        # ``vllm_omni/deploy/ming_flash_omni.yaml``), so it is lowered here.
        decode_budget = 256

        # Thinker decoding is forced greedy elsewhere (``temperature=0.0`` at
        # the payload top level in VLLMOmni.send_request). Talker sampling
        # defaults stay server-side in the deploy yaml (``stage_id: 1`` →
        # ``temperature: 0.0`` per the released config) and are deliberately
        # not overridden.
        #
        # Both spellings are sent so the cap applies regardless of
        # ``--inference-system``: ``max_tokens`` is the OpenAI convention,
        # ``max_output_tokens`` is mminf's native kwarg.
        return {
            "max_tokens": decode_budget,
            "max_output_tokens": decode_budget,
        }

    def get_supported_modalities(self):
        """Return every request type this model is benchmarked on.

        Covers text, image, audio and video inputs, each with both text and
        speech output.
        """
        text_out = {
            RequestType.T2T,
            RequestType.I2T,
            RequestType.A2T,
            RequestType.V2T,
        }
        speech_out = {
            RequestType.T2S,
            RequestType.I2S,
            RequestType.A2S,
            RequestType.V2S,
        }
        return text_out | speech_out


class Pi05(Model):
"""Physical Intelligence Pi0.5 VLA model.

Expand Down Expand Up @@ -268,6 +340,7 @@
BAGEL = "bagel"
ORPHEUS = "orpheus"
QWEN3OMNI = "qwen3omni"
MING_FLASH_OMNI = "ming_flash_omni"
PI05 = "pi05"
VJEPA2AC = "vjepa2ac"

Expand All @@ -278,6 +351,8 @@
return Orpheus(**kwargs)
if self == ModelType.QWEN3OMNI:
return Qwen3Omni(**kwargs)
if self == ModelType.MING_FLASH_OMNI:
return MingFlashOmni(**kwargs)
if self == ModelType.PI05:
return Pi05(**kwargs)
if self == ModelType.VJEPA2AC:
Expand Down
91 changes: 90 additions & 1 deletion benchmark/vllm_omni_instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,93 @@ CUDA_VISIBLE_DEVICES=3 vllm serve ByteDance-Seed/BAGEL-7B-MoT --omni --port 8000
### for qwen3-omni:
```
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
```
```

### for ming-flash-omni-2.0:

The released `inclusionAI/Ming-flash-omni-2.0` ckpt (~238 GB / 42 shards)
does NOT load cleanly into vllm-omni's `MingFlashOmniForConditionalGeneration`
class as-is. Two patches are needed (one-time setup):

1. **Replace metadata files.** vllm-omni's model class uses
`Qwen2VLImageProcessor` + `MingWhisperFeatureExtractor` (its own
registered classes), while the inclusionAI snapshot declares the
`BailingMM2*` processor variants via `auto_map` and `trust_remote_code`.
Use `Jonathan1909/Ming-flash-omni-2.0`'s `preprocessor_config.json`,
`config.json` (auto_map stripped), and `tokenizer*.json` instead.

2. **Replace the talker weights.** vllm-omni's `MingFlashOmniTalker` expects
weights under `audio_vae.*`, but the inclusionAI talker safetensors use the
`audio.*` prefix. Jonathan1909 reshipped the talker with renamed weights
(~1.5 GB).

Building a hybrid snapshot avoids re-downloading the 200+ GB thinker weights:

```bash
# 1. Make sure the inclusionAI thinker shards are cached
#    (only the weight shards and their index file are requested).
huggingface-cli download inclusionAI/Ming-flash-omni-2.0 \
--include="model-*.safetensors" --include="model.safetensors.index.json"

# 2. Pull only Jonathan1909's metadata + talker (no thinker weights)
#    into a separate cache directory.
huggingface-cli download Jonathan1909/Ming-flash-omni-2.0 \
--include="*.json" --include="*.py" --include="*.txt" --include="*.mvn" \
--include="talker/**" \
--cache-dir /dev/shm/hf-cache # or any path with ~3 GB free

# 3. Stitch the two together
# INCL = last column of the matching `scan-cache` row, followed by
# /snapshots/<first entry `ls` lists under ~/.cache/huggingface/hub/...>.
# NOTE(review): `head -1` takes the first listed snapshot; confirm only one
# inclusionAI snapshot is cached, otherwise the wrong revision may be linked.
INCL=$(huggingface-cli scan-cache | grep inclusionAI/Ming-flash-omni-2.0 \
| awk '{print $NF}')/snapshots/$(ls ~/.cache/huggingface/hub/models--inclusionAI--Ming-flash-omni-2.0/snapshots | head -1)
# The trailing `*` is stored literally here; it is globbed later, where $JONA
# is used unquoted in the second loop.
JONA=/dev/shm/hf-cache/models--Jonathan1909--Ming-flash-omni-2.0/snapshots/*
HYBRID=/dev/shm/ming-hybrid
mkdir -p $HYBRID
# Symlink every inclusionAI weight shard into the hybrid directory.
for f in $INCL/model-*.safetensors; do ln -s "$f" "$HYBRID/$(basename $f)"; done
# Symlink every top-level entry of the Jonathan1909 snapshot, first removing
# any symlink of the same name that already exists in $HYBRID.
for f in $JONA/*; do
base=$(basename "$f")
[ -L "$HYBRID/$base" ] && rm "$HYBRID/$base"
ln -s "$f" "$HYBRID/$base"
done
```

Then serve and benchmark:

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 vllm serve /dev/shm/ming-hybrid \
--omni --port 8091 --host 0.0.0.0 --trust-remote-code \
--stage-configs-path /tmp/vllm-omni/vllm_omni/model_executor/stage_configs/ming_flash_omni.yaml

# Wait for "Application startup complete" then:
MODEL=ming_flash_omni INF_SYS=vllm_omni TASK=text_to_text \
URL=http://0.0.0.0:8091 ./benchmark/run_benchmark.sh
```

NOTE: vllm-omni's `/v1/chat/completions` rejects unknown model ids, so the
client must send `"model": "/dev/shm/ming-hybrid"` (the served path), not
`"inclusionAI/Ming-flash-omni-2.0"`. Easiest is to monkey-patch
`MingFlashOmni.get_hf_url` before calling the benchmark runner:

```python
from benchmark.base import MingFlashOmni
MingFlashOmni.get_hf_url = lambda self: "/dev/shm/ming-hybrid"
```

Or pass `--served-model-name inclusionAI/Ming-flash-omni-2.0` to `vllm serve`
(untested; would also work in principle).

#### Modalities exercised on a local 4×H100 run (2026-06-06)

| Task | Status | Notes |
|---|---|---|
| T2T (text → text) | ✅ | offline B=1: 110 tok/s, closed-loop C=32: **1060 tok/s** (full scaling sweep in [`results/ming_t2t_sweep/SUMMARY.md`](../results/ming_t2t_sweep/SUMMARY.md)) |
| I2T (image → text) | ✅ | TTFT 87 ms, ~100 tok/s on Food101 |
| A2T (audio → text) | ✅ | English transcription + Chinese audio QA both work |
| T2S (text → speech) | ✅ | RTF 0.14, 24 kHz mono PCM via harness; 44.1 kHz via direct OpenAI path |
| V2T (video → text) | ✅ | Local Ming demo mp4s; coherent descriptions (`yoga.mp4` → yoga pose narration, `cup_change.mp4` → "shell game") |
| V2S (video → speech) | ✅ | Local Ming demo mp4s; 2-3 MB WAV/clip @ 44.1 kHz |
| I2S (image → speech) | ✅ | Food101 in, ~7 s/req for ~48 s of audio |
| A2S (audio → speech) | ✅ | Ming sample wavs; 0.5-3 MB WAV/clip @ 44.1 kHz |
| T2I / I2I (image gen) | not wired | requires `ming_flash_omni_image.yaml` + a benchmark wrapper similar to BAGEL's `/v1/images/generations` path |

The V2T/V2S/A2S runs sidestep the bench harness's `UCF101Dataset` and
`LibriSpeechDataset` (both want fresh HF-Hub downloads) by hitting
`/v1/chat/completions` directly with base64-inlined media from local files
(Ming repo's `figures/cases/*.mp4` and `data/wavs/*.wav`).
31 changes: 31 additions & 0 deletions configs/ming_flash_omni.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Ming-flash-omni-2.0 — thinker + talker + audio VAE.
#
# WIP: the native mminf model port at mminf/model/ming_omni_flash/ is a
# scaffold (every abstractmethod raises NotImplementedError), so
# `mminf-serve --config configs/ming_flash_omni.yaml` will fail at startup
# until that port lands. Until then, benchmark Ming-flash-omni-2.0 via the
# vllm-omni server (see benchmark/vllm_omni_instructions.md).
#
# Target topology mirrors vllm_omni/deploy/ming_flash_omni.yaml:
# * Thinker (Ling-2.0 sparse MoE LLM, the multimodal understanding core)
# wants TP=4 across GPUs 0-3.
# * Talker (CFM-based audio generator) colocates on GPU 3.
# * Audio VAE (codec -> waveform) and stateless encoders (vision / audio)
# can ride on rank 0.
#
# Node names below are the placeholders the scaffold will reference; rename
# in lockstep with mminf/model/ming_omni_flash/ming_omni_flash_model.py once
# the graph walks are implemented.

model: "ming_flash_omni"
max_seq_len: 32768
node_groups:
- node_names: [audio_encoder, vision_encoder, AudioVAE]
ranks: [0]

- node_names: [Thinker]
ranks: [0, 1, 2, 3]
tp_size: 4

- node_names: [Talker]
ranks: [3]
21 changes: 21 additions & 0 deletions configs/ming_flash_omni_thinker_only.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Ming-flash-omni-2.0 — thinker-only deploy (text out, no talker).
#
# TP=8 across 8 H100s. Per-rank shard_inter = 1024/8 = 128;
# experts.gate_up_proj is (256, 2*128, 4096) per rank, ~33 GB across
# 31 MoE layers. With embed + lm_head + attention + dense layer 0 +
# KV cache, ~40 GB per rank fits the 80 GB H100s comfortably.
#
# TP=4 OOMs at ~78.5 / 80 GB per rank even with
# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True (re-verified
# 2026-06-08; loader streaming overhead pushes past the 80 GB limit).
# TP=8 halves the model footprint with plenty of headroom.
#
# Audio / vision / talker / image-gen are step 4+; this config is for
# text-only T2T benchmarking and the first mminf-served Ming forward.

model: "ming_flash_omni"
max_seq_len: 32768
node_groups:
- node_names: [Thinker]
ranks: [0, 1, 2, 3, 4, 5, 6, 7]
tp_size: 8
26 changes: 18 additions & 8 deletions mminf/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,19 +253,29 @@ def get_worker_graphs(self, config_path: str) -> list[WorkerGraph]:
if node_groups is None:
raise KeyError("Config must define `node_groups`.")

# Nodes this deploy actually provides. A graph walk referencing a
# node absent from node_groups (e.g. the encoder / talker walks in
# a thinker-only deploy) is skipped rather than KeyError'ing during
# worker-graph division — that deploy simply can't serve the walk.
available_nodes: set[str] = set()
for group in node_groups:
available_nodes.update(group["node_names"])

# TODO: merge identical worker graphs from different graph walks
return sum(
[
worker_graphs: list[WorkerGraph] = []
for graph_walk, graph in self.get_graph_walk_graphs().items():
required = set(graph.get_nodes().keys())
if not required <= available_nodes:
continue
worker_graphs.extend(
self._get_worker_graphs_for_graph_walk(graph_walk, graph, node_groups)
for graph_walk, graph in self.get_graph_walk_graphs().items()
],
start=[],
)

)
return worker_graphs

def get_sharding_config(self, config_path: str) -> ShardingConfig:
with open(config_path, "r") as f:
config = yaml.safe_load(f)

sharding_config = self.get_default_sharding_config()

# Derive sharding groups from node_groups with tp_size > 1. The
Expand Down
Loading
Loading