# imp.conf — central runtime configuration for the imp inference engine.
#
# Loading precedence (first non-empty wins):
# 1. --config <path> CLI flag
# 2. $IMP_CONFIG environment variable
# 3. ./imp.conf working-dir relative
# 4. ~/.config/imp/imp.conf user config directory
# 5. embedded defaults (no file, all values below)
#
# Per-run overrides on top via:
# imp --set kv_cache.dtype=fp8 --set runtime.cuda_graphs=never ...
#
# Lines starting with '#' are comments; section headers in [brackets].
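#
# Example (paths are illustrative): selecting a config per the precedence above:
#   imp --config /srv/imp/prod.conf ...   # 1. explicit flag wins
#   IMP_CONFIG=./dev.conf imp ...         # 2. env var, when no flag is given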

[runtime]
# Force deterministic GEMM (cuBLASLt no_reduce_split). Slower but reproducible.
deterministic_gemm = false
# CUDA Graph capture: "auto" picks per-model based on architecture support;
# "always" forces capture (fails for MoE with D2H routing); "never" disables.
cuda_graphs = "auto"
# Run a warmup forward pass at engine init to prime cuBLAS handles + L2 cache.
# Off by default — opt in for prod rollouts where first-request TTFT
# matters. In dev / CI / one-shot runs the warmup is pure overhead and can
# also mask first-request calibration bugs.
warmup = false
# Override the model's max_seq_len (0 = use the model default).
max_seq_len = 0
# Disable Programmatic Dependent Launch (PDL). Diagnostic; small perf impact.
no_pdl = false
# Naked FP16 path with FP8/NVFP4/graphs/warmup all forced off.
debug_raw = false
# Disable SigLIP vision encoder CUDA-graph capture (debug).
no_vision_graph = false
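# Example: a reproducibility-first debug profile built only from keys in this
# section (an illustrative combination, not a recommendation):
# deterministic_gemm = true
# cuda_graphs = "never"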

[kv_cache]
# KV-cache element type: fp16 | fp8 | int8 | int4 | nvfp4
dtype = "fp16"
# Allow cuBLASLt non-deterministic algorithms with FP8 KV. Slightly faster
# but reproducibility is lost.
allow_nondeterministic_fp8 = false
# Restore the legacy auto-upgrade-to-FP8 behavior (off-by-default since 2026-04).
fp8_auto_legacy = false
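# Example: halve KV memory versus the fp16 default and accept non-deterministic
# algorithms (illustrative; trades reproducibility for speed, as noted above):
# dtype = "fp8"
# allow_nondeterministic_fp8 = true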

[attention]
# Each "auto" key picks the kernel automatically based on weight type +
# context length. Override only for benchmarking or compatibility tests.
fp8_prefill = "auto"
fp8_fmha = "auto"
fmha_sm120 = "auto"
mxfp4 = "auto"
mxfp4_fp16_fallback = false
fmha_blockscale = "auto"
naive = false
no_cublas = false
force_cublas_decode = false
no_qknorm_fused = false
no_naive_swa = false
splitk_pipe = true
gate_concat = false
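# Example: pin the unfused reference path for a compatibility test (assumes
# `naive` selects the reference attention kernel, as the name suggests):
# naive = true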

[moe]
# Expected per-expert overhead (% of expert size) used when deciding whether to
# upload all experts to the GPU or fall back to host memory. 10 = aggressive
# auto-pick, 30 = conservative.
expert_overhead_pct = 10
# Force the LAST N MoE layers off-GPU regardless of budget (debug path).
# 0 = disabled (auto-pick).
force_host_experts = 0
skip = false
force_fp16_sync = false
no_expert_cache = false
# Zero MoE workspace buffers each layer (memory-safety bisect).
zero_workspace = false
# Skip the always-active shared-MLP branch (Gemma-4, Qwen3-Next/3.6).
no_shared_mlp = false
# Skip the sigmoid gate on the shared-expert output (Qwen3-Next/3.6).
no_shexp_gate = false
# Force-disable CUTLASS 3.x grouped MoE GEMM path (legacy fallback).
no_cutlass3x = false
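# Example: a conservative offload decision plus forcing the last 4 MoE layers
# to host (the layer count is illustrative, not a tuned value):
# expert_overhead_pct = 30
# force_host_experts = 4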

[gdn]
# FP32 scan/output for Gated-DeltaNet (Qwen3.5/3.6). Slightly slower but
# eliminates FP16 precision drift at long context.
fp32_scan = false
fp32_out = false
# 0 = use the model's rms_norm_eps; otherwise override (diagnostic).
norm_eps_override = 0.0
ref_kernel = false
vhead_reorder = false
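# Example: full-FP32 GDN scan and output when chasing long-context precision
# drift (both keys are described above):
# fp32_scan = true
# fp32_out = true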

[gemm]
# dp4a = the CUDA DP4A instruction: a 4-way INT8 dot product with INT32
# accumulate. mmvq = quantized matrix-vector GEMM kernels. Disabling these
# forces the dequant->cuBLAS path (diagnostic).
no_dp4a = false
no_dp4a_gemv = false
no_dp4a_lm = false
no_mmvq = false
no_mmvq_q8_0 = false
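# Example: force the dequant->cuBLAS path for a kernel bisect by disabling both
# quantized kernel families (an illustrative diagnostic setting):
# no_dp4a = true
# no_mmvq = true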

[gemma4]
fp32_gemm_out = false
no_graphs = false
force_mmvq = false
# FP32 expert-down GEMM (Gemma-4 numerical-parity diagnostic).
fp32_expert_down = false
# Disable the decode fast-path (debug bisect).
no_decode_fast = false
# Disable post-FFW norm 1 in the parallel-branch path (debug).
no_post_ffw_1 = false
# Force ggml-style prefill matmul for Gemma-4 (debug A/B path).
ggml_prefill = false

[generation]
no_logit_softcap = false
lm_dequant_fp16 = false
# Max tokens spent in <think>...</think> blocks (0 = unlimited).
think_budget = 0
# Force-prepend BOS token even when the tokenizer says no.
force_bos = false
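# Example: cap <think>...</think> blocks at 1024 tokens (the budget value is
# illustrative):
# think_budget = 1024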

[server]
# OpenAI-compatible HTTP server: prefix caching breaks per-request
# determinism (cached blocks get different physical KV addresses); off
# by default.
prefix_cache = false

[bench]
# imp-cli --bench: also run Engine::generate() loop for accurate timing.
generate = false

[paths]
# Path to a vision projector .gguf file (Gemma-3, Mistral3 multimodal).
mmproj = ""

[diagnostics]
# Verbose per-layer forward dumps. Heavy I/O — use only for bisecting bugs.
debug_forward = false
debug_gemm_dispatch = false
# Print intermediate Jinja2 chat-template rendering steps.
debug_template = false
# If non-empty, dump per-layer hidden states as .npy files to this directory.
# Special values: "1" or "all" → /tmp.
dump_hidden_dir = ""
# If non-empty, dump MoE gate logits (pre-topk) and routing weights to file.
# Special value "all" dumps every layer (default: only last + selected layers).
dump_logits_dir = ""
dump_routing_dir = ""
# Print prefill token IDs to stderr (truncated to 20).
dump_tokens = false
# Stop forward pass after layer N (-1 = run full forward).
exit_layer = -1
profile = false
graph_diag = false
graph_dump_dir = ""
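# Example: bisect a per-layer numerical bug by dumping hidden states to /tmp
# and stopping early (the layer index is illustrative):
# dump_hidden_dir = "all"
# exit_layer = 12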