-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.conf
More file actions
36 lines (31 loc) · 3.52 KB
/
models.conf
File metadata and controls
36 lines (31 loc) · 3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# sparkcli model registry
# Format: model_id | max_model_len | extra_flags [# notes]
#
# Flag notes:
# --reasoning-parser deepseek_r1   Use deepseek_r1, NOT qwen3. Qwen3-VL-Thinking places the
#                                  opening <think> tag in the prompt template, so that tag
#                                  never appears in the model output. The qwen3 parser requires
#                                  the opening <think> in the output, whereas deepseek_r1 only
#                                  needs the closing </think>. (vLLM upstream bugs #26239, #27118)
# --tool-call-parser hermes        Works for the Qwen3 variants in NGC 26.02; the exception in this file is Qwen3-Coder-Next, which is registered with qwen3_coder
# --kv-cache-dtype fp8 Reduces KV cache memory; safe for NVIDIA FP8 checkpoints
#
# To add a model: open a PR or GitHub issue using the "New Model" template.
# ── Qwen3 ─────────────────────────────────────────────────────────────────────
Qwen/Qwen3-VL-32B-Thinking-FP8 | 65536 | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3-32B-FP8 | 65536 | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3-30B-A3B | 65536 | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3.5-27B-FP8 | 65536 | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1 # requires cu130-compatible vLLM image
Qwen/QwQ-32B | 32768 | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3-Coder-Next-FP8 | 131072 | --enable-auto-tool-choice --tool-call-parser qwen3_coder # MoE — requires docker_env.conf entry
# ── Llama ─────────────────────────────────────────────────────────────────────
nvidia/Llama-3.3-70B-Instruct-FP8 | 65536 | --enable-auto-tool-choice --tool-call-parser llama3_json --kv-cache-dtype fp8
nvidia/Llama-3.1-8B-Instruct-FP8 | 131072 | --enable-auto-tool-choice --tool-call-parser llama3_json --kv-cache-dtype fp8
# ── Phi ───────────────────────────────────────────────────────────────────────
nvidia/Phi-4-reasoning-plus-FP8 | 32768 | --reasoning-parser deepseek_r1
# ── DeepSeek ──────────────────────────────────────────────────────────────────
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | 32768 | --reasoning-parser deepseek_r1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B | 16384 | --reasoning-parser deepseek_r1 # tight on 128GB, reduce context if OOM
# ── OpenAI (MoE — requires docker_env.conf entry) ────────────────────────────
openai/gpt-oss-20b | 131072 | --enable-auto-tool-choice --tool-call-parser openai --reasoning-parser openai_gptoss --load-format fastsafetensors # requires custom vLLM wheel (vllm==0.10.1+gptoss), won't work with default image
openai/gpt-oss-120b | 32768 | --enable-auto-tool-choice --tool-call-parser openai --reasoning-parser openai_gptoss --load-format fastsafetensors # requires custom vLLM wheel (vllm==0.10.1+gptoss); borderline single-node, reduce context if OOM