mstar-project · merceod · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/configs/cosmos3_nano.yaml b/configs/cosmos3_nano.yaml
@@ -0,0 +1,17 @@
+model: "cosmos3"
+# Sequence-length hint for the scheduler. The conductor only asserts its
+# presence; the real per-request capacity is the KV pool below.
+max_seq_len: 8192
+# KV pool sizing. The default (max_num_pages 2048 x page_size 128) pre-allocates
+# ~38 GB of paged K/V for the 36-layer DiT regardless of the workload, which
+# OOMs larger video on an 80 GB card. A bs=1 720p x 189-frame request needs only
+# ~692 pages across both CFG branches (images take a few dozen), so 1024 pages
+# (~19 GB) cover single-request video at every tier plus image batching and free
+# ~19 GB for activations.
+kv_cache:
+  max_num_pages: 1024
+node_groups:
+  - node_names: ["dit"]
+    ranks: [0]
+  - node_names: ["vae_decoder"]
+    ranks: [0]
diff --git a/configs/cosmos3_nano_tp2.yaml b/configs/cosmos3_nano_tp2.yaml
@@ -0,0 +1,18 @@
+model: "cosmos3"
+# Sequence-length hint for the scheduler (see cosmos3_nano.yaml).
+max_seq_len: 8192
+# Per-rank KV pool. Under tensor parallelism the KV heads shard across ranks, so
+# each rank's pages hold half the heads — 1024 pages leave ample headroom.
+kv_cache:
+  max_num_pages: 1024
+# The DiT runs tensor-parallel across two ranks (attention heads + MLP
+# intermediate shard; the residual stream stays full and the out/down
+# projections all-reduce). The VAE decoder is small and runs un-sharded on
+# rank 0; the DiT's final latents are replicated, so the decoder reads them
+# directly.
+node_groups:
+  - node_names: ["dit"]
+    ranks: [0, 1]
+    tp_size: 2
+  - node_names: ["vae_decoder"]
+    ranks: [0]
diff --git a/configs/cosmos3_super_tp4.yaml b/configs/cosmos3_super_tp4.yaml
@@ -0,0 +1,17 @@
+model: "cosmos3_super"
+# Sequence-length hint for the scheduler (see cosmos3_nano.yaml).
+max_seq_len: 8192
+# Per-rank KV pool. Super is 64 layers (vs Nano's 36) but the KV heads (8) shard
+# across the 4 TP ranks, so per-rank KV stays modest; 1024 pages is ample on the
+# 143 GB H200s.
+kv_cache:
+  max_num_pages: 1024
+# Super (64B) is unviable on one GPU (~128 GB in bf16), so the DiT runs
+# tensor-parallel across 4 ranks. The VAE decoder is small and runs un-sharded
+# on rank 0 (the DiT's final latents are replicated, so it reads them directly).
+node_groups:
+  - node_names: ["dit"]
+    ranks: [0, 1, 2, 3]
+    tp_size: 4
+  - node_names: ["vae_decoder"]
+    ranks: [0]
diff --git a/mstar/api_server/data_worker.py b/mstar/api_server/data_worker.py
@@ -113,6 +113,17 @@ def get_result_chunks(self)-> list[ResultChunk]:
         results = []
         while not self.output_queue.empty():
             result: ResultChunk = self.output_queue.get()
+            # A request can be cleaned up (its result already returned) while a
+            # late chunk is still in the queue -- common when several requests
+            # complete in the same step. Mirror new_result_tensors' guard and
+            # drop the straggler rather than KeyError, which would otherwise
+            # abort the whole drain and lose the other requests' chunks.
+            if result.request_id not in self.per_request_reading_tensors:
+                logger.debug(
+                    "Late result chunk for cleaned-up request %s, ignoring",
+                    result.request_id,
+                )
+                continue
             self.per_request_reading_tensors[result.request_id] -= 1
             logger.debug(
                 "Data worker reading queue for request %s decreased to length %d",

diff --git a/mstar/api_server/openai/adapters.py b/mstar/api_server/openai/adapters.py
@@ -35,6 +35,7 @@
         ChatCompletionRequest,
         ImageGenerationRequest,
         SpeechRequest,
+        VideoGenerationRequest,
     )
 
 
@@ -164,6 +165,7 @@ class OpenAIAdapter:
     supports_chat: bool = False     # POST /v1/chat/completions
     supports_speech: bool = False   # POST /v1/audio/speech
     supports_images: bool = False   # POST /v1/images/generations and /v1/images/edits
+    supports_videos: bool = False   # POST /v1/videos/generations
 
     def chat_to_request(self, req: ChatCompletionRequest, upload_dir: Path) -> SubmitArgs:  # noqa: ARG002
         # Output modalities vary by model: e.g. Qwen3-Omni speech output also
@@ -176,6 +178,9 @@ def speech_to_request(self, req: SpeechRequest, upload_dir: Path) -> SubmitArgs:
     def image_to_request(self, req: ImageGenerationRequest, upload_dir: Path) -> SubmitArgs:  # noqa: ARG002
         raise NotImplementedError("image generation is not supported by this model")
 
+    def video_to_request(self, req: VideoGenerationRequest, upload_dir: Path) -> SubmitArgs:  # noqa: ARG002
+        raise NotImplementedError("video generation is not supported by this model")
+
     def image_edit_to_request(self, prompt: str, image_path: str, extra_kwargs: dict) -> SubmitArgs:  # noqa: ARG002
         raise NotImplementedError("image editing is not supported by this model")
 
@@ -297,12 +302,69 @@ def speech_to_request(self, req: SpeechRequest, upload_dir: Path) -> SubmitArgs:
         )
 
 
+class Cosmos3Adapter(OpenAIAdapter):
+    """NVIDIA Cosmos3: text-to-image and text/image-to-video generation.
+
+    ``size`` ("WxH") maps to the generation resolution; ``seed`` and any
+    extra knobs (``guidance_scale``, ``num_inference_steps``, ``negative_prompt``,
+    and for video ``num_frames`` / ``fps``) pass through via ``extra_body``.
+    """
+
+    supports_images = True
+    supports_videos = True
+
+    def image_to_request(self, req: ImageGenerationRequest, upload_dir: Path) -> SubmitArgs:  # noqa: ARG002
+        mk = _passthrough(req)
+        if getattr(req, "size", None):
+            mk.setdefault("size", req.size)
+        if getattr(req, "seed", None) is not None:
+            mk.setdefault("seed", req.seed)
+        return SubmitArgs(
+            text=req.prompt,
+            input_modalities=["text"],
+            output_modalities=["image"],
+            model_kwargs=mk,
+        )
+
+    def video_to_request(self, req: VideoGenerationRequest, upload_dir: Path) -> SubmitArgs:
+        mk = _passthrough(req)
+        if getattr(req, "size", None):
+            mk.setdefault("size", req.size)
+        if getattr(req, "seed", None) is not None:
+            mk.setdefault("seed", req.seed)
+        # num_frames / fps are first-class video fields (not in extra_body).
+        if getattr(req, "num_frames", None) is not None:
+            mk.setdefault("num_frames", req.num_frames)
+        if getattr(req, "fps", None) is not None:
+            mk.setdefault("fps", req.fps)
+        # Image-to-video: the conditioning frame (URL / data URI) is persisted and
+        # loaded by the worker, which VAE-encodes it into the clean frame-0 anchor.
+        image = getattr(req, "image", None)
+        if image:
+            _, path = media_io.resolve_media_ref(image, upload_dir)
+            return SubmitArgs(
+                text=req.prompt,
+                file_paths={"image": [path]},
+                input_modalities=["image", "text"],
+                output_modalities=["video"],
+                model_kwargs=mk,
+            )
+        return SubmitArgs(
+            text=req.prompt,
+            input_modalities=["text"],
+            output_modalities=["video"],
+            model_kwargs=mk,
+        )
+
+
 # Only models with an OpenAI-standard surface are registered. Action/world-model
 # models (pi05, vjepa2) are deliberately absent → /v1/* 404s; use /generate.
 ADAPTER_REGISTRY: dict[str, OpenAIAdapter] = {
     "bagel": BagelAdapter(),
     "qwen3_omni": Qwen3OmniAdapter(),
     "orpheus": OrpheusAdapter(),
+    "cosmos3": Cosmos3Adapter(),
+    "cosmos3_super": Cosmos3Adapter(),
 }
 
 

diff --git a/mstar/api_server/openai/protocol.py b/mstar/api_server/openai/protocol.py
@@ -71,6 +71,28 @@ class ImageGenerationRequest(BaseModel):
     seed: int | None = None
 
 
+class VideoGenerationRequest(BaseModel):
+    """``/v1/videos/generations`` (text-to-video / image-to-video).
+
+    Not an OpenAI-standard surface; modeled on the image endpoint. ``image`` (a
+    URL or data URI) conditions image-to-video. Extra knobs
+    (``guidance_scale``, ``num_inference_steps``, ``negative_prompt`` …) flow
+    through via ``extra_body``.
+    """
+
+    model_config = _CFG
+
+    prompt: str
+    model: str | None = None
+    n: int | None = 1
+    size: str | None = None
+    response_format: str = "b64_json"
+    seed: int | None = None
+    num_frames: int | None = None
+    fps: float | None = None
+    image: str | None = None  # URL or data URI for image-to-video conditioning
+
+
 class ModelCard(BaseModel):
     id: str
     object: str = "model"

diff --git a/mstar/api_server/openai/router.py b/mstar/api_server/openai/router.py
@@ -12,7 +12,12 @@
 from fastapi import APIRouter, Request
 from fastapi.responses import JSONResponse, StreamingResponse
 
-from mstar.api_server.openai import serving_chat, serving_images, serving_speech
+from mstar.api_server.openai import (
+    serving_chat,
+    serving_images,
+    serving_speech,
+    serving_videos,
+)
 from mstar.api_server.openai._util import now
 from mstar.api_server.openai.adapters import get_adapter
 from mstar.api_server.openai.protocol import (
@@ -21,6 +26,7 @@
     ModelCard,
     ModelList,
     SpeechRequest,
+    VideoGenerationRequest,
 )
 
 router = APIRouter()
@@ -113,6 +119,18 @@ async def images_generations(request: ImageGenerationRequest):
     return JSONResponse(result)
 
 
+@router.post("/v1/videos/generations")
+async def videos_generations(request: VideoGenerationRequest):
+    api, model_name, adapter, err = _resolve("supports_videos")
+    if err is not None:
+        return err
+    try:
+        result = await serving_videos.create_videos(api, model_name, adapter, request)
+    except Exception as e:  # noqa: BLE001
+        return _error(getattr(e, "status_code", 500), str(getattr(e, "detail", e)), "server_error")
+    return JSONResponse(result)
+
+
 @router.post("/v1/images/edits")
 async def images_edits(request: Request):
     # Multipart (image file + prompt + passthrough fields), parsed manually so

diff --git a/mstar/api_server/openai/serving_videos.py b/mstar/api_server/openai/serving_videos.py
@@ -0,0 +1,34 @@
+"""/v1/videos/generations (text-to-video and image-to-video) handler."""
+
+from __future__ import annotations
+
+import base64
+
+from starlette.concurrency import run_in_threadpool
+
+from mstar.api_server.openai._util import now, rid
+
+
+async def create_videos(api, model_name, adapter, req):  # noqa: ARG001
+    args = adapter.video_to_request(req, api.upload_dir)
+    request_id = rid("vid")
+
+    api.submit_request(
+        text=args.text,
+        file_paths=args.file_paths,
+        input_modalities=args.input_modalities,
+        output_modalities=["video"],
+        model_kwargs=args.model_kwargs,
+        streaming=False,
+        request_id=request_id,
+    )
+
+    chunks = await run_in_threadpool(api.collect_results, request_id)
+    # Each video chunk is an mp4 (H.264); return it base64-encoded, mirroring the
+    # image endpoint's b64_json shape.
+    data = [
+        {"b64_json": base64.b64encode(c.data).decode("ascii"), "url": None}
+        for c in chunks
+        if c.modality == "video"
+    ]
+    return {"created": now(), "data": data}
diff --git a/mstar/benchmark/cosmos3/bench_t2i_oai.py b/mstar/benchmark/cosmos3/bench_t2i_oai.py
@@ -0,0 +1,69 @@
+"""Apples-to-apples t2i latency client — hits the OpenAI /v1/images/generations
+endpoint that BOTH our mstar server and vLLM-Omni (`vllm serve --omni`) expose, with
+an identical payload, and reports client-side wall latency (warmup + median of N).
+
+Same scope on both engines (client-side end-to-end incl. HTTP + b64 PNG), same config
+(tiers, steps, guidance, seed, prompt). Run once per server (different --port/--model).
+
+  python bench_t2i_oai.py --port 8000 --model nvidia/Cosmos3-Nano --tag vllm
+  python bench_t2i_oai.py --port 8100 --model cosmos3_nano --tag ours
+"""
+import argparse
+import base64
+import json
+import statistics
+import time
+import urllib.request
+
+ap = argparse.ArgumentParser()
+ap.add_argument("--port", type=int, required=True)
+ap.add_argument("--model", default="nvidia/Cosmos3-Nano")
+ap.add_argument("--sizes", default="320x192,832x480,1280x720")  # 256p/480p/720p tiers
+ap.add_argument("--steps", type=int, default=50)
+ap.add_argument("--gs", type=float, default=6.0)
+ap.add_argument("--seed", type=int, default=0)
+ap.add_argument("--rounds", type=int, default=5)
+ap.add_argument("--warmup", type=int, default=2)
+ap.add_argument("--tag", default="run")
+ap.add_argument("--save", default="")  # optional PNG path prefix
+args = ap.parse_args()
+
+PROMPT = "A red cube resting on a polished wooden table, soft daylight."
+NEG = "blurry, distorted, low quality"
+URL = f"http://localhost:{args.port}/v1/images/generations"
+
+
+def one(size):
+    body = json.dumps({
+        "model": args.model, "prompt": PROMPT, "negative_prompt": NEG,
+        "size": size, "n": 1, "response_format": "b64_json",
+        "num_inference_steps": args.steps, "guidance_scale": args.gs, "seed": args.seed,
+    }).encode()
+    req = urllib.request.Request(URL, data=body, headers={"Content-Type": "application/json"})
+    t0 = time.perf_counter()
+    with urllib.request.urlopen(req, timeout=1200) as r:
+        payload = json.load(r)
+    dt = time.perf_counter() - t0
+    b64 = payload["data"][0]["b64_json"]
+    return dt, b64
+
+
+print(f"=== {args.tag}  port={args.port} model={args.model}  steps={args.steps} gs={args.gs} seed={args.seed} ===", flush=True)
+for size in args.sizes.split(","):
+    try:
+        for _ in range(args.warmup):
+            one(size)
+        ts = []
+        last_b64 = None
+        for _ in range(args.rounds):
+            dt, last_b64 = one(size)
+            ts.append(dt)
+        ts.sort()
+        med = statistics.median(ts)
+        print(f"  {size:9s}  median {med:.3f}s  min {ts[0]:.3f}  max {ts[-1]:.3f}  (n={args.rounds})", flush=True)
+        if args.save and last_b64:
+            with open(f"{args.save}_{size}.png", "wb") as f:
+                f.write(base64.b64decode(last_b64))
+    except Exception as e:  # noqa: BLE001
+        print(f"  {size:9s}  ERROR {type(e).__name__}: {str(e)[:120]}", flush=True)
+print("DONE", flush=True)