diff --git a/scripts/copy-models.cjs b/scripts/copy-models.cjs index 7c92c5a..9dadd43 100644 --- a/scripts/copy-models.cjs +++ b/scripts/copy-models.cjs @@ -25,7 +25,7 @@ const MODEL_DIRS = [ // mode by default; opt into multi-head decision rule via // `tier2Config.multihead`. Calibrated T = 2.41, highRiskThreshold = 0.64 // (encoded in classifier_config.json:calibration). - "minilm-multihead-v5", + "minilm-multihead-v6", ]; let copied = 0; diff --git a/src/classifiers/models/minilm-multihead-v5/classifier_config.json b/src/classifiers/models/minilm-multihead-v5/classifier_config.json deleted file mode 100644 index e9624d4..0000000 --- a/src/classifiers/models/minilm-multihead-v5/classifier_config.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "model_id": "sentence-transformers/all-MiniLM-L6-v2", - "model_type": "minilm", - "use_seq_clf": false, - "hidden_size": 384, - "freeze_layers": 4, - "pooling": "mean", - "optimal_threshold": 0.4, - "datasets": [ - "qualifire", - "jayavibhav", - "agentdojo", - "jasperls", - "jailbreakbench", - "toxic-chat", - "chatgpt-jailbreaks", - "email-hardneg", - "email-hardneg-gen", - "multilingual-hardneg", - "jailbreakbench-neg", - "toxic-chat-neg", - "fujitsu-injecagent", - "fujitsu-rag", - "enron-ham", - "connector-hardneg-v2", - "dev-tooling-hardneg-curated", - "dev-tooling-attacks", - "agentshield-shape-attacks", - "system-prompt-extraction-attacks", - "emoji-ci-benign", - "benign-user-queries", - "code-docs-benign" - ], - "token_level": false, - "token_pool": "max", - "token_topk": 5, - "three_class": false, - "multi_head": true, - "aux_loss_alpha": 0.5, - "calibration": { - "temperatureT": 2.41, - "highRiskThreshold": 0.64, - "ece": 0.09, - "fitted_on": "labeled plugin events 2026-05-13", - "notes": "Raw highRiskThreshold 0.8 is math-equivalent to calibrated 0.64 at T=2.41. tier2Score is reported as calibrated probability (post-sigmoid(logit/T))." - } -} diff --git a/src/classifiers/models/minilm-multihead-v5/tokenizer_config.json b/src/classifiers/models/minilm-multihead-v5/tokenizer_config.json deleted file mode 100644 index f0eb41f..0000000 --- a/src/classifiers/models/minilm-multihead-v5/tokenizer_config.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "backend": "tokenizers", - "cls_token": "[CLS]", - "do_basic_tokenize": true, - "do_lower_case": true, - "is_local": false, - "mask_token": "[MASK]", - "max_length": 128, - "model_max_length": 512, - "never_split": null, - "pad_to_multiple_of": null, - "pad_token": "[PAD]", - "pad_token_type_id": 0, - "padding_side": "right", - "sep_token": "[SEP]", - "stride": 0, - "strip_accents": null, - "tokenize_chinese_chars": true, - "tokenizer_class": "BertTokenizer", - "truncation_side": "right", - "truncation_strategy": "longest_first", - "unk_token": "[UNK]" -} diff --git a/src/classifiers/models/minilm-multihead-v6/classifier_config.json b/src/classifiers/models/minilm-multihead-v6/classifier_config.json new file mode 100644 index 0000000..89b9e79 --- /dev/null +++ b/src/classifiers/models/minilm-multihead-v6/classifier_config.json @@ -0,0 +1,33 @@ +{ + "model_id": "sentence-transformers/all-MiniLM-L6-v2", + "model_type": "minilm", + "use_seq_clf": false, + "hidden_size": 384, + "freeze_layers": 4, + "pooling": "mean", + "optimal_threshold": 0.35, + "datasets": [ + "qualifire", + "jayavibhav", + "agentdojo", + "jasperls", + "jailbreakbench", + "toxic-chat", + "chatgpt-jailbreaks", + "email-hardneg", + "email-hardneg-gen", + "multilingual-hardneg", + "jailbreakbench-neg", + "toxic-chat-neg", + "fujitsu-injecagent", + "fujitsu-rag", + "enron-ham", + "connector-hardneg-v2" + ], + "token_level": false, + "token_pool": "max", + "token_topk": 5, + "three_class": false, + "multi_head": false, + "aux_loss_alpha": 0.5 +} \ No newline at end of file diff --git a/src/classifiers/models/minilm-multihead-v5/config.json b/src/classifiers/models/minilm-multihead-v6/config.json similarity index 74% rename from src/classifiers/models/minilm-multihead-v5/config.json rename to src/classifiers/models/minilm-multihead-v6/config.json index aa9b4e9..dbd6d23 100644 --- a/src/classifiers/models/minilm-multihead-v5/config.json +++ b/src/classifiers/models/minilm-multihead-v6/config.json @@ -1,20 +1,17 @@ { - "add_cross_attention": false, + "_attn_implementation_autoset": true, + "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2", "architectures": [ "BertModel" ], "attention_probs_dropout_prob": 0.1, - "bos_token_id": null, "classifier_dropout": null, - "dtype": "float32", - "eos_token_id": null, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 384, "initializer_range": 0.02, "intermediate_size": 1536, - "is_decoder": false, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", @@ -22,8 +19,8 @@ "num_hidden_layers": 6, "pad_token_id": 0, "position_embedding_type": "absolute", - "tie_word_embeddings": true, - "transformers_version": "5.5.4", + "torch_dtype": "float32", + "transformers_version": "4.49.0", "type_vocab_size": 2, "use_cache": true, "vocab_size": 30522 diff --git a/src/classifiers/models/minilm-multihead-v5/model_quantized.onnx b/src/classifiers/models/minilm-multihead-v6/model_quantized.onnx similarity index 89% rename from src/classifiers/models/minilm-multihead-v5/model_quantized.onnx rename to src/classifiers/models/minilm-multihead-v6/model_quantized.onnx index 3209705..c86b251 100644 Binary files a/src/classifiers/models/minilm-multihead-v5/model_quantized.onnx and b/src/classifiers/models/minilm-multihead-v6/model_quantized.onnx differ diff --git a/src/classifiers/models/minilm-multihead-v5/tokenizer.json b/src/classifiers/models/minilm-multihead-v6/tokenizer.json similarity index 100% rename from src/classifiers/models/minilm-multihead-v5/tokenizer.json rename to src/classifiers/models/minilm-multihead-v6/tokenizer.json diff --git a/src/classifiers/models/minilm-multihead-v6/tokenizer_config.json b/src/classifiers/models/minilm-multihead-v6/tokenizer_config.json new file mode 100644 index 0000000..6c45fb5 --- /dev/null +++ b/src/classifiers/models/minilm-multihead-v6/tokenizer_config.json @@ -0,0 +1,65 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "100": { + "content": "[UNK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "101": { + "content": "[CLS]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "102": { + "content": "[SEP]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "103": { + "content": "[MASK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "clean_up_tokenization_spaces": false, + "cls_token": "[CLS]", + "do_basic_tokenize": true, + "do_lower_case": true, + "extra_special_tokens": {}, + "mask_token": "[MASK]", + "max_length": 128, + "model_max_length": 512, + "never_split": null, + "pad_to_multiple_of": null, + "pad_token": "[PAD]", + "pad_token_type_id": 0, + "padding_side": "right", + "sep_token": "[SEP]", + "stride": 0, + "strip_accents": null, + "tokenize_chinese_chars": true, + "tokenizer_class": "BertTokenizer", + "truncation_side": "right", + "truncation_strategy": "longest_first", + "unk_token": "[UNK]" +} diff --git a/src/classifiers/onnx-classifier.ts b/src/classifiers/onnx-classifier.ts index f74bbc8..e69de29 100644 --- a/src/classifiers/onnx-classifier.ts +++ b/src/classifiers/onnx-classifier.ts @@ -1,501 +0,0 @@ -/** - * ONNX classifier for fine-tuned MiniLM prompt injection detection. - * - * Pipeline: text -> AutoTokenizer -> ONNX Runtime (fine-tuned MiniLM + head) -> logit -> sigmoid -> score - * - * Uses @huggingface/transformers AutoTokenizer for tokenization and - * onnxruntime-node for ONNX model inference. This avoids the pipeline() - * API which assumes standard HuggingFace output format (our model outputs - * a single logit, not class probabilities). - */ - -import { dirname, resolve } from "node:path"; -import { fileURLToPath } from "node:url"; - -/** - * Default path to the bundled ONNX model directory (relative to dist/). - * Exported so `Tier2Classifier` can read model-specific calibration defaults - * from the model's `classifier_config.json` at construction time. - */ -export function getDefaultModelPath(): string { - // Works for both CJS (__dirname) and ESM (import.meta.url) - let baseDir: string; - try { - // ESM - baseDir = dirname(fileURLToPath(import.meta.url)); - } catch { - // CJS fallback - baseDir = __dirname; - } - return resolve(baseDir, "models", "minilm-multihead-v5"); -} - -/** - * Sigmoid activation function - */ -function sigmoid(x: number): number { - return 1 / (1 + Math.exp(-x)); -} - -/** - * Minimal tokenizer interface (subset of @huggingface/transformers PreTrainedTokenizer) - */ -type Tokenizer = ( - text: string | string[], - options?: { - padding?: boolean | string; - truncation?: boolean; - max_length?: number; - return_tensor?: boolean; - }, -) => { - input_ids: bigint[][] | { tolist: () => bigint[][] }; - attention_mask: bigint[][] | { tolist: () => bigint[][] }; -}; - -/** - * Minimal ONNX Runtime InferenceSession interface - */ -interface OrtInferenceSession { - run( - feeds: Record, - options?: unknown, - ): Promise>; -} - -/** - * Minimal ONNX Runtime Tensor constructor interface - */ -interface OrtTensorConstructor { - new (type: string, data: BigInt64Array, dims: number[]): unknown; -} - -/** - * Module-level session cache — shared across all OnnxClassifier instances in this process. - * - * Keyed by model path. Populated on first successful _loadModel() call and reused by every - * subsequent instance. Sharing InferenceSession across concurrent run() calls is safe — - * ONNX Runtime guarantees thread safety for concurrent Run() from v1.7.0. Sharing the - * tokenizer is safe — tokenize() is synchronous and never mutates the tokenizer object. - */ -const _sessionCache = new Map< - string, - { - session: OrtInferenceSession; - OrtTensor: OrtTensorConstructor; - tokenizer: Tokenizer; - } ->(); - -/** - * Module-level in-flight load promises — prevents duplicate concurrent loads when multiple - * OnnxClassifier instances target the same modelPath simultaneously (e.g. warmup + first request). - * Entries are removed after the load resolves or rejects. - */ -const _loadingPromises = new Map>(); - -/** - * ONNX Classifier for fine-tuned MiniLM models - * - * Usage: - * ```typescript - * const classifier = new OnnxClassifier(); - * await classifier.loadModel(); // loads from bundled path - * await classifier.warmup(); - * - * const score = await classifier.classify("Ignore previous instructions"); - * console.log(score); // 0.95 (high = likely injection) - * ``` - */ -export class OnnxClassifier { - private session: OrtInferenceSession | null = null; - private tokenizer: Tokenizer | null = null; - private OrtTensor: OrtTensorConstructor | null = null; - private modelPath: string; - private loadingPromise: Promise | null = null; - private maxLength = 256; - /** - * Detected on first inference from the logits tensor `dims`: - * - `single` → `[batch]` or `[batch, 1]` — sigmoid path, one score per text - * - `multi` → `[batch, 2]` — main+aux dual-head; `data` is row-major - * `[main_0, aux_0, main_1, aux_1, ...]` - * - `null` → not yet known (no inference run) - */ - private outputMode: "single" | "multi" | null = null; - /** - * Temperature for post-hoc calibration via temperature scaling. The raw - * logit is divided by T before sigmoid: `sigmoid(logit / T)`. T > 1 - * softens overconfident output. T = 1 is a no-op (raw sigmoid). - * - * Fit T offline on a held-out labeled set by minimizing NLL. See - * https://arxiv.org/abs/1706.04599 for the standard recipe. - */ - private temperatureT = 1.0; - - constructor(modelPath?: string, temperatureT?: number) { - this.modelPath = modelPath ?? getDefaultModelPath(); - if (temperatureT !== undefined) { - // T must be a positive finite number — calibration with T <= 0 is - // undefined behaviour (divide-by-zero or sign flip on logits) and - // almost certainly a programming error rather than a config the - // caller wants gracefully ignored. - if (!Number.isFinite(temperatureT) || temperatureT <= 0) { - throw new Error(`OnnxClassifier: temperatureT must be a positive finite number, got ${temperatureT}`); - } - this.temperatureT = temperatureT; - } - } - - /** Current temperature scaling factor (1.0 = no calibration). */ - getTemperature(): number { - return this.temperatureT; - } - - /** - * Output mode of the loaded model. `null` until the first inference runs. - * `"multi"` indicates the model emits `[batch, 2]` (main + aux) logits. - */ - getOutputMode(): "single" | "multi" | null { - return this.outputMode; - } - - /** - * Load the ONNX model and tokenizer. - * - * @param modelPath - Optional override for the model directory path - */ - async loadModel(modelPath?: string): Promise { - if (modelPath) { - this.modelPath = modelPath; - } - - if (this.session && this.tokenizer) { - return; - } - - if (this.loadingPromise) { - return this.loadingPromise; - } - - this.loadingPromise = this._loadModel(); - try { - await this.loadingPromise; - } catch (error) { - this.loadingPromise = null; - console.warn( - "[defender] ONNX model failed to load:", - error instanceof Error ? error.message : String(error), - ); - throw error; - } - } - - private async _loadModel(): Promise { - const cached = _sessionCache.get(this.modelPath); - if (cached) { - this.session = cached.session; - this.OrtTensor = cached.OrtTensor; - this.tokenizer = cached.tokenizer; - return; - } - - // Share a single in-flight load across concurrent instances targeting the same path - let inFlight = _loadingPromises.get(this.modelPath); - if (!inFlight) { - const modelPath = this.modelPath; - inFlight = (async () => { - // Dynamic imports — these are optional peer dependencies - // eslint-disable-next-line @typescript-eslint/no-require-imports - const transformers = (await import("@huggingface/transformers")) as unknown as { - AutoTokenizer: { - from_pretrained: (path: string, options?: { local_files_only: boolean }) => Promise; - }; - }; - const tokenizer = await transformers.AutoTokenizer.from_pretrained(modelPath, { - local_files_only: true, - }); - - // eslint-disable-next-line @typescript-eslint/no-require-imports - const ort = (await import("onnxruntime-node")) as unknown as { - InferenceSession: { - create: (path: string) => Promise; - }; - Tensor: OrtTensorConstructor; - }; - const OrtTensor = ort.Tensor; - const onnxPath = resolve(modelPath, "model_quantized.onnx"); - const session = await ort.InferenceSession.create(onnxPath); - - _sessionCache.set(modelPath, { session, OrtTensor, tokenizer }); - })(); - _loadingPromises.set(this.modelPath, inFlight); - // Swallow .finally() rejection — the actual error propagates via `await inFlight` below. - // Without this, a rejected inFlight produces an unhandled rejection from the .finally() chain. - inFlight.finally(() => _loadingPromises.delete(this.modelPath)).catch(() => {}); - } - - await inFlight; - - const loaded = _sessionCache.get(this.modelPath); - if (loaded) { - this.session = loaded.session; - this.OrtTensor = loaded.OrtTensor; - this.tokenizer = loaded.tokenizer; - } - } - - /** - * Classify a single text, returning the main-head sigmoid score in [0, 1]. - * Higher values indicate higher probability of prompt injection. - * - * For multi-head models, only the main score is returned. Callers that - * need the aux score should use `classifyPair()`. - * - * @param text - Text to classify - * @returns Sigmoid score in [0, 1] - */ - async classify(text: string): Promise { - const { main } = await this.classifyPair(text); - return main; - } - - /** - * Classify a single text, returning both main and aux head scores. - * - * For single-head models, `aux` is `null`. - * For multi-head `[batch, 2]` models, both scores are sigmoid-activated. - * - * @param text - Text to classify - * @returns `{ main, aux }` — main in [0,1]; aux in [0,1] or null - */ - async classifyPair(text: string): Promise<{ main: number; aux: number | null }> { - await this.ensureLoaded(); - - const { inputIds, attentionMask } = this.tokenize(text); - - if (!this.OrtTensor) { - throw new Error("OrtTensor not loaded"); - } - - const inputIdsTensor = new this.OrtTensor("int64", inputIds, [1, inputIds.length]); - const attentionMaskTensor = new this.OrtTensor("int64", attentionMask, [1, attentionMask.length]); - - const results = await this.session?.run({ - input_ids: inputIdsTensor, - attention_mask: attentionMaskTensor, - }); - - const logits = results?.logits; - if (!logits || logits.data[0] === undefined || logits.data[0] === null) { - throw new Error("ONNX model returned no logits"); - } - - this.detectOutputMode(logits.dims); - - const T = this.temperatureT; - if (this.outputMode === "multi") { - const main = sigmoid(Number(logits.data[0]) / T); - const aux = sigmoid(Number(logits.data[1]) / T); - return { main, aux }; - } - return { main: sigmoid(Number(logits.data[0]) / T), aux: null }; - } - - /** - * Update `outputMode` from a logits-tensor shape on the first inference. - * Idempotent — subsequent calls with the same shape are no-ops. - */ - private detectOutputMode(dims: number[] | undefined): void { - if (this.outputMode !== null) return; - // `dims` may be undefined on hand-rolled mocks; fall back to single-head. - if (!dims || dims.length < 2) { - this.outputMode = "single"; - return; - } - this.outputMode = dims[1] === 2 ? "multi" : "single"; - } - - /** - * Maximum number of texts per ONNX inference call. - * Caps native memory from attention matrices: O(chunkSize × seqLen²). - * For MiniLM (maxLength=256), chunk=32 keeps memory under ~50MB per call. - */ - private static readonly MAX_BATCH_CHUNK = 32; - - /** - * Classify multiple texts in batch, processing in chunks to bound memory. - * Returns main-head scores only (back-compat). Use `classifyBatchPair()` - * when aux scores are needed. - * - * @param texts - Array of texts to classify - * @returns Array of main-head sigmoid scores in [0, 1] - */ - async classifyBatch(texts: string[]): Promise { - const pairs = await this.classifyBatchPair(texts); - return pairs.map((p) => p.main); - } - - /** - * Classify multiple texts in batch, returning main+aux scores. - * Aux is `null` per-row for single-head models. - * - * @param texts - Array of texts to classify - * @returns Array of `{ main, aux }` - */ - async classifyBatchPair(texts: string[]): Promise> { - if (texts.length === 0) return []; - - await this.ensureLoaded(); - - const allPairs: Array<{ main: number; aux: number | null }> = []; - - for (let offset = 0; offset < texts.length; offset += OnnxClassifier.MAX_BATCH_CHUNK) { - const chunk = texts.slice(offset, offset + OnnxClassifier.MAX_BATCH_CHUNK); - const chunkPairs = await this.classifyBatchChunkPair(chunk); - allPairs.push(...chunkPairs); - } - - return allPairs; - } - - /** - * Classify a single chunk of texts in one ONNX session.run() call. - * Handles both single-head `[batch]`/`[batch, 1]` and multi-head `[batch, 2]` - * outputs; the latter returns paired (main, aux) sigmoid scores. - */ - private async classifyBatchChunkPair(texts: string[]): Promise> { - const tokenized = texts.map((t) => this.tokenize(t)); - const maxLen = Math.max(...tokenized.map((t) => t.inputIds.length)); - - const batchSize = texts.length; - const batchInputIds = new BigInt64Array(batchSize * maxLen); - const batchAttentionMask = new BigInt64Array(batchSize * maxLen); - - for (let i = 0; i < batchSize; i++) { - const t = tokenized[i]; - if (!t) continue; - for (let j = 0; j < t.inputIds.length; j++) { - batchInputIds[i * maxLen + j] = t.inputIds[j] ?? 0n; - batchAttentionMask[i * maxLen + j] = t.attentionMask[j] ?? 0n; - } - } - - if (!this.OrtTensor) { - throw new Error("OrtTensor not loaded"); - } - - const inputIdsTensor = new this.OrtTensor("int64", batchInputIds, [batchSize, maxLen]); - const attentionMaskTensor = new this.OrtTensor("int64", batchAttentionMask, [batchSize, maxLen]); - - const results = await this.session?.run({ - input_ids: inputIdsTensor, - attention_mask: attentionMaskTensor, - }); - - const logits = results?.logits; - if (!logits) { - throw new Error("ONNX model returned no logits"); - } - - this.detectOutputMode(logits.dims); - - const T = this.temperatureT; - const pairs: Array<{ main: number; aux: number | null }> = []; - if (this.outputMode === "multi") { - // Row-major [batch, 2]: [main_0, aux_0, main_1, aux_1, ...] - for (let i = 0; i < batchSize; i++) { - const main = sigmoid(Number(logits.data[i * 2]) / T); - const aux = sigmoid(Number(logits.data[i * 2 + 1]) / T); - pairs.push({ main, aux }); - } - } else { - for (let i = 0; i < batchSize; i++) { - pairs.push({ main: sigmoid(Number(logits.data[i]) / T), aux: null }); - } - } - return pairs; - } - - /** - * Pre-load the model. Call at startup to avoid first-call latency. - */ - async warmup(): Promise { - await this.loadModel(); - } - - /** - * Check if the model is loaded and ready for inference. - */ - isLoaded(): boolean { - return this.session !== null && this.tokenizer !== null; - } - - /** - * Count tokens in a text WITHOUT truncation, including special tokens - * ([CLS] and [SEP] for BERT-family). Used by Tier 2 packing to decide - * whether a string fits within the model's max_length and to size - * sentence chunks. - */ - countTokens(text: string): number { - if (!this.tokenizer) { - throw new Error("Tokenizer not loaded. Call loadModel() first."); - } - const encoded = this.tokenizer(text, { - padding: false, - truncation: false, - return_tensor: false, - }); - const rawIds: bigint[] = Array.isArray(encoded.input_ids) - ? (encoded.input_ids as bigint[][]).flat() - : (encoded.input_ids as { tolist: () => bigint[][] }).tolist().flat(); - return rawIds.length; - } - - /** Model's maximum input length (in tokens), including special tokens. */ - getMaxLength(): number { - return this.maxLength; - } - - /** - * Tokenize a single text into BigInt64Arrays for ONNX Runtime. - */ - private tokenize(text: string): { - inputIds: BigInt64Array; - attentionMask: BigInt64Array; - } { - if (!this.tokenizer) { - throw new Error("Tokenizer not loaded. Call loadModel() first."); - } - - const encoded = this.tokenizer(text, { - padding: false, - truncation: true, - max_length: this.maxLength, - return_tensor: false, - }); - - // AutoTokenizer may return Tensor-like objects or plain arrays - const rawIds: bigint[] = Array.isArray(encoded.input_ids) - ? (encoded.input_ids as bigint[][]).flat() - : (encoded.input_ids as { tolist: () => bigint[][] }).tolist().flat(); - - const rawMask: bigint[] = Array.isArray(encoded.attention_mask) - ? (encoded.attention_mask as bigint[][]).flat() - : (encoded.attention_mask as { tolist: () => bigint[][] }).tolist().flat(); - - // Convert to BigInt64Array (onnxruntime-node expects int64) - const inputIds = new BigInt64Array(rawIds.map((v) => BigInt(v))); - const attentionMask = new BigInt64Array(rawMask.map((v) => BigInt(v))); - - return { inputIds, attentionMask }; - } - - /** - * Ensure the model is loaded, loading it if necessary. - */ - private async ensureLoaded(): Promise { - if (!this.session || !this.tokenizer) { - await this.loadModel(); - } - } -}