sparkcli/models.conf at master · demigodmode/sparkcli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# sparkcli model registry
# Format: model_id | max_model_len | extra_flags [# notes]
#
# Flag notes:
#   --reasoning-parser deepseek_r1   Use deepseek_r1, NOT qwen3. Qwen3-VL-Thinking puts the
#                                    opening <think> tag in the prompt template, so that tag
#                                    never appears in the model output. The qwen3 parser
#                                    requires an opening <think> in the output, whereas
#                                    deepseek_r1 only needs the closing </think>.
#                                    (vLLM upstream bugs #26239, #27118)
#   --tool-call-parser hermes        Works for Qwen3 variants in NGC 26.02 (exception: Qwen3-Coder uses qwen3_coder)
#   --kv-cache-dtype fp8             Reduces KV cache memory; safe for NVIDIA FP8 checkpoints
#
# To add a model: open a PR or GitHub issue using the "New Model" template.

# ── Qwen (Qwen3, Qwen3.5, QwQ) ────────────────────────────────────────────────
Qwen/Qwen3-VL-32B-Thinking-FP8 | 65536  | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3-32B-FP8              | 65536  | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3-30B-A3B              | 65536  | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3.5-27B-FP8            | 65536  | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1 # requires cu130-compatible vLLM image
Qwen/QwQ-32B                    | 32768  | --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1
Qwen/Qwen3-Coder-Next-FP8       | 131072 | --enable-auto-tool-choice --tool-call-parser qwen3_coder # MoE — requires docker_env.conf entry

# ── Llama ─────────────────────────────────────────────────────────────────────
nvidia/Llama-3.3-70B-Instruct-FP8 | 65536 | --enable-auto-tool-choice --tool-call-parser llama3_json --kv-cache-dtype fp8
nvidia/Llama-3.1-8B-Instruct-FP8  | 131072 | --enable-auto-tool-choice --tool-call-parser llama3_json --kv-cache-dtype fp8

# ── Phi ───────────────────────────────────────────────────────────────────────
nvidia/Phi-4-reasoning-plus-FP8 | 32768 | --reasoning-parser deepseek_r1

# ── DeepSeek ──────────────────────────────────────────────────────────────────
deepseek-ai/DeepSeek-R1-Distill-Qwen-32B  | 32768 | --reasoning-parser deepseek_r1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B | 16384 | --reasoning-parser deepseek_r1 # tight on 128GB, reduce context if OOM

# ── OpenAI (MoE — requires docker_env.conf entry) ────────────────────────────
openai/gpt-oss-20b  | 131072 | --enable-auto-tool-choice --tool-call-parser openai --reasoning-parser openai_gptoss --load-format fastsafetensors # requires custom vLLM wheel (vllm==0.10.1+gptoss), won't work with default image
openai/gpt-oss-120b | 32768  | --enable-auto-tool-choice --tool-call-parser openai --reasoning-parser openai_gptoss --load-format fastsafetensors # requires custom vLLM wheel (vllm==0.10.1+gptoss); borderline single-node, reduce context if OOM