diff --git a/scripts/copy-models.cjs b/scripts/copy-models.cjs
index 7c92c5a..9dadd43 100644
--- a/scripts/copy-models.cjs
+++ b/scripts/copy-models.cjs
@@ -25,7 +25,7 @@ const MODEL_DIRS = [
 	// mode by default; opt into multi-head decision rule via
 	// `tier2Config.multihead`. Calibrated T = 2.41, highRiskThreshold = 0.64
 	// (encoded in classifier_config.json:calibration).
-	"minilm-multihead-v5",
+	"minilm-multihead-v6",
 ];
 
 let copied = 0;
diff --git a/src/classifiers/models/minilm-multihead-v5/classifier_config.json b/src/classifiers/models/minilm-multihead-v5/classifier_config.json
deleted file mode 100644
index e9624d4..0000000
--- a/src/classifiers/models/minilm-multihead-v5/classifier_config.json
+++ /dev/null
@@ -1,47 +0,0 @@
-{
-  "model_id": "sentence-transformers/all-MiniLM-L6-v2",
-  "model_type": "minilm",
-  "use_seq_clf": false,
-  "hidden_size": 384,
-  "freeze_layers": 4,
-  "pooling": "mean",
-  "optimal_threshold": 0.4,
-  "datasets": [
-    "qualifire",
-    "jayavibhav",
-    "agentdojo",
-    "jasperls",
-    "jailbreakbench",
-    "toxic-chat",
-    "chatgpt-jailbreaks",
-    "email-hardneg",
-    "email-hardneg-gen",
-    "multilingual-hardneg",
-    "jailbreakbench-neg",
-    "toxic-chat-neg",
-    "fujitsu-injecagent",
-    "fujitsu-rag",
-    "enron-ham",
-    "connector-hardneg-v2",
-    "dev-tooling-hardneg-curated",
-    "dev-tooling-attacks",
-    "agentshield-shape-attacks",
-    "system-prompt-extraction-attacks",
-    "emoji-ci-benign",
-    "benign-user-queries",
-    "code-docs-benign"
-  ],
-  "token_level": false,
-  "token_pool": "max",
-  "token_topk": 5,
-  "three_class": false,
-  "multi_head": true,
-  "aux_loss_alpha": 0.5,
-  "calibration": {
-    "temperatureT": 2.41,
-    "highRiskThreshold": 0.64,
-    "ece": 0.09,
-    "fitted_on": "labeled plugin events 2026-05-13",
-    "notes": "Raw highRiskThreshold 0.8 is math-equivalent to calibrated 0.64 at T=2.41. tier2Score is reported as calibrated probability (post-sigmoid(logit/T))."
-  }
-}
diff --git a/src/classifiers/models/minilm-multihead-v5/tokenizer_config.json b/src/classifiers/models/minilm-multihead-v5/tokenizer_config.json
deleted file mode 100644
index f0eb41f..0000000
--- a/src/classifiers/models/minilm-multihead-v5/tokenizer_config.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "backend": "tokenizers",
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "is_local": false,
-  "mask_token": "[MASK]",
-  "max_length": 128,
-  "model_max_length": 512,
-  "never_split": null,
-  "pad_to_multiple_of": null,
-  "pad_token": "[PAD]",
-  "pad_token_type_id": 0,
-  "padding_side": "right",
-  "sep_token": "[SEP]",
-  "stride": 0,
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "truncation_side": "right",
-  "truncation_strategy": "longest_first",
-  "unk_token": "[UNK]"
-}
diff --git a/src/classifiers/models/minilm-multihead-v6/classifier_config.json b/src/classifiers/models/minilm-multihead-v6/classifier_config.json
new file mode 100644
index 0000000..89b9e79
--- /dev/null
+++ b/src/classifiers/models/minilm-multihead-v6/classifier_config.json
@@ -0,0 +1,33 @@
+{
+  "model_id": "sentence-transformers/all-MiniLM-L6-v2",
+  "model_type": "minilm",
+  "use_seq_clf": false,
+  "hidden_size": 384,
+  "freeze_layers": 4,
+  "pooling": "mean",
+  "optimal_threshold": 0.35,
+  "datasets": [
+    "qualifire",
+    "jayavibhav",
+    "agentdojo",
+    "jasperls",
+    "jailbreakbench",
+    "toxic-chat",
+    "chatgpt-jailbreaks",
+    "email-hardneg",
+    "email-hardneg-gen",
+    "multilingual-hardneg",
+    "jailbreakbench-neg",
+    "toxic-chat-neg",
+    "fujitsu-injecagent",
+    "fujitsu-rag",
+    "enron-ham",
+    "connector-hardneg-v2"
+  ],
+  "token_level": false,
+  "token_pool": "max",
+  "token_topk": 5,
+  "three_class": false,
+  "multi_head": false,
+  "aux_loss_alpha": 0.5
+}
\ No newline at end of file
diff --git a/src/classifiers/models/minilm-multihead-v5/config.json b/src/classifiers/models/minilm-multihead-v6/config.json
similarity index 74%
rename from src/classifiers/models/minilm-multihead-v5/config.json
rename to src/classifiers/models/minilm-multihead-v6/config.json
index aa9b4e9..dbd6d23 100644
--- a/src/classifiers/models/minilm-multihead-v5/config.json
+++ b/src/classifiers/models/minilm-multihead-v6/config.json
@@ -1,20 +1,17 @@
 {
-  "add_cross_attention": false,
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
   "architectures": [
     "BertModel"
   ],
   "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": null,
   "classifier_dropout": null,
-  "dtype": "float32",
-  "eos_token_id": null,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 384,
   "initializer_range": 0.02,
   "intermediate_size": 1536,
-  "is_decoder": false,
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
@@ -22,8 +19,8 @@
   "num_hidden_layers": 6,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
-  "tie_word_embeddings": true,
-  "transformers_version": "5.5.4",
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 30522
diff --git a/src/classifiers/models/minilm-multihead-v5/model_quantized.onnx b/src/classifiers/models/minilm-multihead-v6/model_quantized.onnx
similarity index 89%
rename from src/classifiers/models/minilm-multihead-v5/model_quantized.onnx
rename to src/classifiers/models/minilm-multihead-v6/model_quantized.onnx
index 3209705..c86b251 100644
Binary files a/src/classifiers/models/minilm-multihead-v5/model_quantized.onnx and b/src/classifiers/models/minilm-multihead-v6/model_quantized.onnx differ
diff --git a/src/classifiers/models/minilm-multihead-v5/tokenizer.json b/src/classifiers/models/minilm-multihead-v6/tokenizer.json
similarity index 100%
rename from src/classifiers/models/minilm-multihead-v5/tokenizer.json
rename to src/classifiers/models/minilm-multihead-v6/tokenizer.json
diff --git a/src/classifiers/models/minilm-multihead-v6/tokenizer_config.json b/src/classifiers/models/minilm-multihead-v6/tokenizer_config.json
new file mode 100644
index 0000000..6c45fb5
--- /dev/null
+++ b/src/classifiers/models/minilm-multihead-v6/tokenizer_config.json
@@ -0,0 +1,65 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 128,
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}
diff --git a/src/classifiers/onnx-classifier.ts b/src/classifiers/onnx-classifier.ts
index f74bbc8..e69de29 100644
--- a/src/classifiers/onnx-classifier.ts
+++ b/src/classifiers/onnx-classifier.ts
@@ -1,501 +0,0 @@
-/**
- * ONNX classifier for fine-tuned MiniLM prompt injection detection.
- *
- * Pipeline: text -> AutoTokenizer -> ONNX Runtime (fine-tuned MiniLM + head) -> logit -> sigmoid -> score
- *
- * Uses @huggingface/transformers AutoTokenizer for tokenization and
- * onnxruntime-node for ONNX model inference. This avoids the pipeline()
- * API which assumes standard HuggingFace output format (our model outputs
- * a single logit, not class probabilities).
- */
-
-import { dirname, resolve } from "node:path";
-import { fileURLToPath } from "node:url";
-
-/**
- * Default path to the bundled ONNX model directory (relative to dist/).
- * Exported so `Tier2Classifier` can read model-specific calibration defaults
- * from the model's `classifier_config.json` at construction time.
- */
-export function getDefaultModelPath(): string {
-	// Works for both CJS (__dirname) and ESM (import.meta.url)
-	let baseDir: string;
-	try {
-		// ESM
-		baseDir = dirname(fileURLToPath(import.meta.url));
-	} catch {
-		// CJS fallback
-		baseDir = __dirname;
-	}
-	return resolve(baseDir, "models", "minilm-multihead-v5");
-}
-
-/**
- * Sigmoid activation function
- */
-function sigmoid(x: number): number {
-	return 1 / (1 + Math.exp(-x));
-}
-
-/**
- * Minimal tokenizer interface (subset of @huggingface/transformers PreTrainedTokenizer)
- */
-type Tokenizer = (
-	text: string | string[],
-	options?: {
-		padding?: boolean | string;
-		truncation?: boolean;
-		max_length?: number;
-		return_tensor?: boolean;
-	},
-) => {
-	input_ids: bigint[][] | { tolist: () => bigint[][] };
-	attention_mask: bigint[][] | { tolist: () => bigint[][] };
-};
-
-/**
- * Minimal ONNX Runtime InferenceSession interface
- */
-interface OrtInferenceSession {
-	run(
-		feeds: Record<string, unknown>,
-		options?: unknown,
-	): Promise<Record<string, { data: Float32Array | number[]; dims: number[] }>>;
-}
-
-/**
- * Minimal ONNX Runtime Tensor constructor interface
- */
-interface OrtTensorConstructor {
-	new (type: string, data: BigInt64Array, dims: number[]): unknown;
-}
-
-/**
- * Module-level session cache — shared across all OnnxClassifier instances in this process.
- *
- * Keyed by model path. Populated on first successful _loadModel() call and reused by every
- * subsequent instance. Sharing InferenceSession across concurrent run() calls is safe —
- * ONNX Runtime guarantees thread safety for concurrent Run() from v1.7.0. Sharing the
- * tokenizer is safe — tokenize() is synchronous and never mutates the tokenizer object.
- */
-const _sessionCache = new Map<
-	string,
-	{
-		session: OrtInferenceSession;
-		OrtTensor: OrtTensorConstructor;
-		tokenizer: Tokenizer;
-	}
->();
-
-/**
- * Module-level in-flight load promises — prevents duplicate concurrent loads when multiple
- * OnnxClassifier instances target the same modelPath simultaneously (e.g. warmup + first request).
- * Entries are removed after the load resolves or rejects.
- */
-const _loadingPromises = new Map<string, Promise<void>>();
-
-/**
- * ONNX Classifier for fine-tuned MiniLM models
- *
- * Usage:
- * ```typescript
- * const classifier = new OnnxClassifier();
- * await classifier.loadModel();  // loads from bundled path
- * await classifier.warmup();
- *
- * const score = await classifier.classify("Ignore previous instructions");
- * console.log(score); // 0.95 (high = likely injection)
- * ```
- */
-export class OnnxClassifier {
-	private session: OrtInferenceSession | null = null;
-	private tokenizer: Tokenizer | null = null;
-	private OrtTensor: OrtTensorConstructor | null = null;
-	private modelPath: string;
-	private loadingPromise: Promise<void> | null = null;
-	private maxLength = 256;
-	/**
-	 * Detected on first inference from the logits tensor `dims`:
-	 *  - `single` → `[batch]` or `[batch, 1]` — sigmoid path, one score per text
-	 *  - `multi`  → `[batch, 2]` — main+aux dual-head; `data` is row-major
-	 *               `[main_0, aux_0, main_1, aux_1, ...]`
-	 *  - `null`   → not yet known (no inference run)
-	 */
-	private outputMode: "single" | "multi" | null = null;
-	/**
-	 * Temperature for post-hoc calibration via temperature scaling. The raw
-	 * logit is divided by T before sigmoid: `sigmoid(logit / T)`. T > 1
-	 * softens overconfident output. T = 1 is a no-op (raw sigmoid).
-	 *
-	 * Fit T offline on a held-out labeled set by minimizing NLL. See
-	 * https://arxiv.org/abs/1706.04599 for the standard recipe.
-	 */
-	private temperatureT = 1.0;
-
-	constructor(modelPath?: string, temperatureT?: number) {
-		this.modelPath = modelPath ?? getDefaultModelPath();
-		if (temperatureT !== undefined) {
-			// T must be a positive finite number — calibration with T <= 0 is
-			// undefined behaviour (divide-by-zero or sign flip on logits) and
-			// almost certainly a programming error rather than a config the
-			// caller wants gracefully ignored.
-			if (!Number.isFinite(temperatureT) || temperatureT <= 0) {
-				throw new Error(`OnnxClassifier: temperatureT must be a positive finite number, got ${temperatureT}`);
-			}
-			this.temperatureT = temperatureT;
-		}
-	}
-
-	/** Current temperature scaling factor (1.0 = no calibration). */
-	getTemperature(): number {
-		return this.temperatureT;
-	}
-
-	/**
-	 * Output mode of the loaded model. `null` until the first inference runs.
-	 * `"multi"` indicates the model emits `[batch, 2]` (main + aux) logits.
-	 */
-	getOutputMode(): "single" | "multi" | null {
-		return this.outputMode;
-	}
-
-	/**
-	 * Load the ONNX model and tokenizer.
-	 *
-	 * @param modelPath - Optional override for the model directory path
-	 */
-	async loadModel(modelPath?: string): Promise<void> {
-		if (modelPath) {
-			this.modelPath = modelPath;
-		}
-
-		if (this.session && this.tokenizer) {
-			return;
-		}
-
-		if (this.loadingPromise) {
-			return this.loadingPromise;
-		}
-
-		this.loadingPromise = this._loadModel();
-		try {
-			await this.loadingPromise;
-		} catch (error) {
-			this.loadingPromise = null;
-			console.warn(
-				"[defender] ONNX model failed to load:",
-				error instanceof Error ? error.message : String(error),
-			);
-			throw error;
-		}
-	}
-
-	private async _loadModel(): Promise<void> {
-		const cached = _sessionCache.get(this.modelPath);
-		if (cached) {
-			this.session = cached.session;
-			this.OrtTensor = cached.OrtTensor;
-			this.tokenizer = cached.tokenizer;
-			return;
-		}
-
-		// Share a single in-flight load across concurrent instances targeting the same path
-		let inFlight = _loadingPromises.get(this.modelPath);
-		if (!inFlight) {
-			const modelPath = this.modelPath;
-			inFlight = (async () => {
-				// Dynamic imports — these are optional peer dependencies
-				// eslint-disable-next-line @typescript-eslint/no-require-imports
-				const transformers = (await import("@huggingface/transformers")) as unknown as {
-					AutoTokenizer: {
-						from_pretrained: (path: string, options?: { local_files_only: boolean }) => Promise<Tokenizer>;
-					};
-				};
-				const tokenizer = await transformers.AutoTokenizer.from_pretrained(modelPath, {
-					local_files_only: true,
-				});
-
-				// eslint-disable-next-line @typescript-eslint/no-require-imports
-				const ort = (await import("onnxruntime-node")) as unknown as {
-					InferenceSession: {
-						create: (path: string) => Promise<OrtInferenceSession>;
-					};
-					Tensor: OrtTensorConstructor;
-				};
-				const OrtTensor = ort.Tensor;
-				const onnxPath = resolve(modelPath, "model_quantized.onnx");
-				const session = await ort.InferenceSession.create(onnxPath);
-
-				_sessionCache.set(modelPath, { session, OrtTensor, tokenizer });
-			})();
-			_loadingPromises.set(this.modelPath, inFlight);
-			// Swallow .finally() rejection — the actual error propagates via `await inFlight` below.
-			// Without this, a rejected inFlight produces an unhandled rejection from the .finally() chain.
-			inFlight.finally(() => _loadingPromises.delete(this.modelPath)).catch(() => {});
-		}
-
-		await inFlight;
-
-		const loaded = _sessionCache.get(this.modelPath);
-		if (loaded) {
-			this.session = loaded.session;
-			this.OrtTensor = loaded.OrtTensor;
-			this.tokenizer = loaded.tokenizer;
-		}
-	}
-
-	/**
-	 * Classify a single text, returning the main-head sigmoid score in [0, 1].
-	 * Higher values indicate higher probability of prompt injection.
-	 *
-	 * For multi-head models, only the main score is returned. Callers that
-	 * need the aux score should use `classifyPair()`.
-	 *
-	 * @param text - Text to classify
-	 * @returns Sigmoid score in [0, 1]
-	 */
-	async classify(text: string): Promise<number> {
-		const { main } = await this.classifyPair(text);
-		return main;
-	}
-
-	/**
-	 * Classify a single text, returning both main and aux head scores.
-	 *
-	 * For single-head models, `aux` is `null`.
-	 * For multi-head `[batch, 2]` models, both scores are sigmoid-activated.
-	 *
-	 * @param text - Text to classify
-	 * @returns `{ main, aux }` — main in [0,1]; aux in [0,1] or null
-	 */
-	async classifyPair(text: string): Promise<{ main: number; aux: number | null }> {
-		await this.ensureLoaded();
-
-		const { inputIds, attentionMask } = this.tokenize(text);
-
-		if (!this.OrtTensor) {
-			throw new Error("OrtTensor not loaded");
-		}
-
-		const inputIdsTensor = new this.OrtTensor("int64", inputIds, [1, inputIds.length]);
-		const attentionMaskTensor = new this.OrtTensor("int64", attentionMask, [1, attentionMask.length]);
-
-		const results = await this.session?.run({
-			input_ids: inputIdsTensor,
-			attention_mask: attentionMaskTensor,
-		});
-
-		const logits = results?.logits;
-		if (!logits || logits.data[0] === undefined || logits.data[0] === null) {
-			throw new Error("ONNX model returned no logits");
-		}
-
-		this.detectOutputMode(logits.dims);
-
-		const T = this.temperatureT;
-		if (this.outputMode === "multi") {
-			const main = sigmoid(Number(logits.data[0]) / T);
-			const aux = sigmoid(Number(logits.data[1]) / T);
-			return { main, aux };
-		}
-		return { main: sigmoid(Number(logits.data[0]) / T), aux: null };
-	}
-
-	/**
-	 * Update `outputMode` from a logits-tensor shape on the first inference.
-	 * Idempotent — subsequent calls with the same shape are no-ops.
-	 */
-	private detectOutputMode(dims: number[] | undefined): void {
-		if (this.outputMode !== null) return;
-		// `dims` may be undefined on hand-rolled mocks; fall back to single-head.
-		if (!dims || dims.length < 2) {
-			this.outputMode = "single";
-			return;
-		}
-		this.outputMode = dims[1] === 2 ? "multi" : "single";
-	}
-
-	/**
-	 * Maximum number of texts per ONNX inference call.
-	 * Caps native memory from attention matrices: O(chunkSize × seqLen²).
-	 * For MiniLM (maxLength=256), chunk=32 keeps memory under ~50MB per call.
-	 */
-	private static readonly MAX_BATCH_CHUNK = 32;
-
-	/**
-	 * Classify multiple texts in batch, processing in chunks to bound memory.
-	 * Returns main-head scores only (back-compat). Use `classifyBatchPair()`
-	 * when aux scores are needed.
-	 *
-	 * @param texts - Array of texts to classify
-	 * @returns Array of main-head sigmoid scores in [0, 1]
-	 */
-	async classifyBatch(texts: string[]): Promise<number[]> {
-		const pairs = await this.classifyBatchPair(texts);
-		return pairs.map((p) => p.main);
-	}
-
-	/**
-	 * Classify multiple texts in batch, returning main+aux scores.
-	 * Aux is `null` per-row for single-head models.
-	 *
-	 * @param texts - Array of texts to classify
-	 * @returns Array of `{ main, aux }`
-	 */
-	async classifyBatchPair(texts: string[]): Promise<Array<{ main: number; aux: number | null }>> {
-		if (texts.length === 0) return [];
-
-		await this.ensureLoaded();
-
-		const allPairs: Array<{ main: number; aux: number | null }> = [];
-
-		for (let offset = 0; offset < texts.length; offset += OnnxClassifier.MAX_BATCH_CHUNK) {
-			const chunk = texts.slice(offset, offset + OnnxClassifier.MAX_BATCH_CHUNK);
-			const chunkPairs = await this.classifyBatchChunkPair(chunk);
-			allPairs.push(...chunkPairs);
-		}
-
-		return allPairs;
-	}
-
-	/**
-	 * Classify a single chunk of texts in one ONNX session.run() call.
-	 * Handles both single-head `[batch]`/`[batch, 1]` and multi-head `[batch, 2]`
-	 * outputs; the latter returns paired (main, aux) sigmoid scores.
-	 */
-	private async classifyBatchChunkPair(texts: string[]): Promise<Array<{ main: number; aux: number | null }>> {
-		const tokenized = texts.map((t) => this.tokenize(t));
-		const maxLen = Math.max(...tokenized.map((t) => t.inputIds.length));
-
-		const batchSize = texts.length;
-		const batchInputIds = new BigInt64Array(batchSize * maxLen);
-		const batchAttentionMask = new BigInt64Array(batchSize * maxLen);
-
-		for (let i = 0; i < batchSize; i++) {
-			const t = tokenized[i];
-			if (!t) continue;
-			for (let j = 0; j < t.inputIds.length; j++) {
-				batchInputIds[i * maxLen + j] = t.inputIds[j] ?? 0n;
-				batchAttentionMask[i * maxLen + j] = t.attentionMask[j] ?? 0n;
-			}
-		}
-
-		if (!this.OrtTensor) {
-			throw new Error("OrtTensor not loaded");
-		}
-
-		const inputIdsTensor = new this.OrtTensor("int64", batchInputIds, [batchSize, maxLen]);
-		const attentionMaskTensor = new this.OrtTensor("int64", batchAttentionMask, [batchSize, maxLen]);
-
-		const results = await this.session?.run({
-			input_ids: inputIdsTensor,
-			attention_mask: attentionMaskTensor,
-		});
-
-		const logits = results?.logits;
-		if (!logits) {
-			throw new Error("ONNX model returned no logits");
-		}
-
-		this.detectOutputMode(logits.dims);
-
-		const T = this.temperatureT;
-		const pairs: Array<{ main: number; aux: number | null }> = [];
-		if (this.outputMode === "multi") {
-			// Row-major [batch, 2]: [main_0, aux_0, main_1, aux_1, ...]
-			for (let i = 0; i < batchSize; i++) {
-				const main = sigmoid(Number(logits.data[i * 2]) / T);
-				const aux = sigmoid(Number(logits.data[i * 2 + 1]) / T);
-				pairs.push({ main, aux });
-			}
-		} else {
-			for (let i = 0; i < batchSize; i++) {
-				pairs.push({ main: sigmoid(Number(logits.data[i]) / T), aux: null });
-			}
-		}
-		return pairs;
-	}
-
-	/**
-	 * Pre-load the model. Call at startup to avoid first-call latency.
-	 */
-	async warmup(): Promise<void> {
-		await this.loadModel();
-	}
-
-	/**
-	 * Check if the model is loaded and ready for inference.
-	 */
-	isLoaded(): boolean {
-		return this.session !== null && this.tokenizer !== null;
-	}
-
-	/**
-	 * Count tokens in a text WITHOUT truncation, including special tokens
-	 * ([CLS] and [SEP] for BERT-family). Used by Tier 2 packing to decide
-	 * whether a string fits within the model's max_length and to size
-	 * sentence chunks.
-	 */
-	countTokens(text: string): number {
-		if (!this.tokenizer) {
-			throw new Error("Tokenizer not loaded. Call loadModel() first.");
-		}
-		const encoded = this.tokenizer(text, {
-			padding: false,
-			truncation: false,
-			return_tensor: false,
-		});
-		const rawIds: bigint[] = Array.isArray(encoded.input_ids)
-			? (encoded.input_ids as bigint[][]).flat()
-			: (encoded.input_ids as { tolist: () => bigint[][] }).tolist().flat();
-		return rawIds.length;
-	}
-
-	/** Model's maximum input length (in tokens), including special tokens. */
-	getMaxLength(): number {
-		return this.maxLength;
-	}
-
-	/**
-	 * Tokenize a single text into BigInt64Arrays for ONNX Runtime.
-	 */
-	private tokenize(text: string): {
-		inputIds: BigInt64Array;
-		attentionMask: BigInt64Array;
-	} {
-		if (!this.tokenizer) {
-			throw new Error("Tokenizer not loaded. Call loadModel() first.");
-		}
-
-		const encoded = this.tokenizer(text, {
-			padding: false,
-			truncation: true,
-			max_length: this.maxLength,
-			return_tensor: false,
-		});
-
-		// AutoTokenizer may return Tensor-like objects or plain arrays
-		const rawIds: bigint[] = Array.isArray(encoded.input_ids)
-			? (encoded.input_ids as bigint[][]).flat()
-			: (encoded.input_ids as { tolist: () => bigint[][] }).tolist().flat();
-
-		const rawMask: bigint[] = Array.isArray(encoded.attention_mask)
-			? (encoded.attention_mask as bigint[][]).flat()
-			: (encoded.attention_mask as { tolist: () => bigint[][] }).tolist().flat();
-
-		// Convert to BigInt64Array (onnxruntime-node expects int64)
-		const inputIds = new BigInt64Array(rawIds.map((v) => BigInt(v)));
-		const attentionMask = new BigInt64Array(rawMask.map((v) => BigInt(v)));
-
-		return { inputIds, attentionMask };
-	}
-
-	/**
-	 * Ensure the model is loaded, loading it if necessary.
-	 */
-	private async ensureLoaded(): Promise<void> {
-		if (!this.session || !this.tokenizer) {
-			await this.loadModel();
-		}
-	}
-}