microsoft · apsonawane · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/Qwen-Qwen3.5-2B/baseline/Qwen-Qwen3.5-2B_baseline_mmlu.json b/Qwen-Qwen3.5-2B/baseline/Qwen-Qwen3.5-2B_baseline_mmlu.json
@@ -0,0 +1,32 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "load_kwargs": {
+            "torch_dtype": "float16"
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": ["CUDAExecutionProvider"]
+                }
+            ]
+        }
+    },
+    "evaluators": {
+        "mmlu": {
+            "type": "LMEvaluator",
+            "tasks": ["mmlu"],
+            "model_class": "hf",
+            "batch_size": 8
+        }
+    },
+    "evaluator": "mmlu",
+    "target": "local_system",
+    "log_severity_level": 0,
+    "evaluate_input_model": true
+}
diff --git a/Qwen-Qwen3.5-2B/baseline/requirements.txt b/Qwen-Qwen3.5-2B/baseline/requirements.txt
@@ -0,0 +1,5 @@
+accelerate
+datasets
+lm-eval
+torch
+transformers==4.52.4
diff --git a/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4.json b/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4.json
@@ -0,0 +1,42 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "load_kwargs": {
+            "torch_dtype": "float16"
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "cpu",
+                    "execution_providers": ["CPUExecutionProvider"]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "extra_options": {
+                "exclude_embeds": false
+            }
+        },
+        "q": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {"surgeon": "QuantizeEmbeddingInt8"},
+                {"surgeon": "ShareEmbeddingLmHead"}
+            ],
+            "save_as_external_data": true
+        }
+    },
+    "target": "local_system",
+    "log_severity_level": 0,
+    "output_dir": "model",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}
diff --git a/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4_with_eval.json b/Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4_with_eval.json
@@ -0,0 +1,52 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "load_kwargs": {
+            "torch_dtype": "float16"
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "cpu",
+                    "execution_providers": ["CPUExecutionProvider"]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "extra_options": {
+                "exclude_embeds": false
+            }
+        },
+        "q": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {"surgeon": "QuantizeEmbeddingInt8"},
+                {"surgeon": "ShareEmbeddingLmHead"}
+            ],
+            "save_as_external_data": true,
+            "all_tensors_to_one_file": true,
+            "external_data_name": "model.onnx.data"
+        }
+    },
+    "evaluators": {
+        "mmlu": {
+            "type": "LMEvaluator",
+            "tasks": ["mmlu"],
+            "batch_size": 8
+        }
+    },
+    "evaluator": "mmlu",
+    "target": "local_system",
+    "log_severity_level": 0,
+    "output_dir": "model",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}
diff --git a/Qwen-Qwen3.5-2B/cpu/README.md b/Qwen-Qwen3.5-2B/cpu/README.md
@@ -0,0 +1,28 @@
+# Qwen-Qwen3.5-2B — CPU optimization
+
+This folder contains Olive recipes for optimizing Qwen-Qwen3.5-2B targeting the CPU EP.
+
+## What this folder is for
+
+- Execution Provider: CPU EP
+- Default precision: INT4
+- Example recipe filename: Qwen-Qwen3.5-2B_cpu_int4.json
+
+## Setup
+
+1) Install the main branch of Olive:
+   - `pip install git+https://github.com/microsoft/olive.git`
+2) Install the appropriate runtime package for this backend:
+   - `onnxruntime-genai` (CPU build)
+3) Run Olive to build/optimize the model:
+   - `olive run --config Qwen-Qwen3.5-2B_cpu_int4.json`
+
+Additional notes:
+- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head)
+- Model size: ~1.4 GB (down from 4.3 GB FP16)
+- Keeps the embedding layer in the exported model (`exclude_embeds=false`) so it runs as a standalone text-only LLM without the multimodal pipeline.
+- Runs purely on CPU; no GPU required.
+
+---
+
+This README was auto-generated for the CPU EP of Qwen-Qwen3.5-2B.
diff --git a/Qwen-Qwen3.5-2B/cpu/info.yaml b/Qwen-Qwen3.5-2B/cpu/info.yaml
@@ -0,0 +1,6 @@
+arch: qwen3_5_text
+recipes:
+  - name: Qwen-Qwen3.5-2B_cpu_int4
+    file: Qwen-Qwen3.5-2B_cpu_int4.json
+    devices: cpu
+    eps: CPUExecutionProvider
diff --git a/Qwen-Qwen3.5-2B/cpu/requirements.txt b/Qwen-Qwen3.5-2B/cpu/requirements.txt
@@ -0,0 +1,4 @@
+accelerate
+datasets
+onnxruntime-genai
+transformers==4.52.4
diff --git a/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4.json b/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4.json
@@ -0,0 +1,42 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "load_kwargs": {
+            "torch_dtype": "float16"
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": ["CUDAExecutionProvider"]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "extra_options": {
+                "exclude_embeds": false,
+                "enable_cuda_graph": true
+            }
+        },
+        "q": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {"surgeon": "QuantizeEmbeddingInt8"},
+                {"surgeon": "ShareEmbeddingLmHead"}
+            ]
+        }
+    },
+    "target": "local_system",
+    "log_severity_level": 0,
+    "output_dir": "model",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}
diff --git a/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4_with_eval.json b/Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4_with_eval.json
@@ -0,0 +1,51 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "load_kwargs": {
+            "torch_dtype": "float16"
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": ["CUDAExecutionProvider"]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "extra_options": {
+                "exclude_embeds": false,
+                "enable_cuda_graph": true
+            }
+        },
+        "q": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {"surgeon": "QuantizeEmbeddingInt8"},
+                {"surgeon": "ShareEmbeddingLmHead"}
+            ],
+            "save_as_external_data": true
+        }
+    },
+    "evaluators": {
+        "mmlu": {
+            "type": "LMEvaluator",
+            "tasks": ["mmlu"],
+            "batch_size": 8
+        }
+    },
+    "evaluator": "mmlu",
+    "target": "local_system",
+    "log_severity_level": 0,
+    "output_dir": "model",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}
diff --git a/Qwen-Qwen3.5-2B/cuda/README.md b/Qwen-Qwen3.5-2B/cuda/README.md
@@ -0,0 +1,31 @@
+# Qwen-Qwen3.5-2B — CUDA optimization
+
+This folder contains Olive recipes for optimizing Qwen-Qwen3.5-2B targeting the CUDA EP.
+
+## What this folder is for
+
+- Execution Provider: CUDA EP
+- Default precision: INT4
+- Example recipe filename: Qwen-Qwen3.5-2B_cuda_int4.json
+
+## Setup
+
+1) Install the main branch of Olive:
+   - `pip install git+https://github.com/microsoft/olive.git`
+2) Install the appropriate runtime package for this backend:
+   - `onnxruntime-genai-cuda` (CUDA build)
+3) Run Olive to build/optimize the model:
+   - `olive run --config Qwen-Qwen3.5-2B_cuda_int4.json`
+
+Additional notes:
+- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head)
+- Model size: ~1.4 GB (down from 4.3 GB FP16)
+- MMLU accuracy: 57.11% (vs 59.27% FP16 baseline)
+- Keeps the embedding layer in the exported model (`exclude_embeds=false`) so it runs as a standalone text-only LLM without the multimodal pipeline.
+- CUDA graph enabled for optimized decode throughput.
+- Requires NVIDIA GPU with CUDA support.
+- Ensure CUDA toolkit and cuDNN are properly installed.
+
+---
+
+This README was auto-generated for the CUDA EP of Qwen-Qwen3.5-2B.
diff --git a/Qwen-Qwen3.5-2B/cuda/info.yaml b/Qwen-Qwen3.5-2B/cuda/info.yaml
@@ -0,0 +1,6 @@
+arch: qwen3_5_text
+recipes:
+  - name: Qwen-Qwen3.5-2B_cuda_int4
+    file: Qwen-Qwen3.5-2B_cuda_int4.json
+    devices: gpu
+    eps: CUDAExecutionProvider
diff --git a/Qwen-Qwen3.5-2B/cuda/requirements.txt b/Qwen-Qwen3.5-2B/cuda/requirements.txt
@@ -0,0 +1,4 @@
+accelerate
+datasets
+onnxruntime-genai-cuda
+transformers==4.52.4
diff --git a/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4.json b/Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4.json
@@ -0,0 +1,43 @@
+{
+    "input_model": {
+        "type": "HfModel",
+        "model_path": "Qwen/Qwen3.5-2B",
+        "load_kwargs": {
+            "torch_dtype": "float16"
+        }
+    },
+    "systems": {
+        "local_system": {
+            "type": "LocalSystem",
+            "accelerators": [
+                {
+                    "device": "gpu",
+                    "execution_providers": ["WebGpuExecutionProvider"]
+                }
+            ]
+        }
+    },
+    "passes": {
+        "m": {
+            "type": "ModelBuilder",
+            "precision": "int4",
+            "int4_block_size": 32,
+            "extra_options": {
+                "exclude_embeds": false
+            }
+        },
+        "q": {
+            "type": "GraphSurgeries",
+            "surgeries": [
+                {"surgeon": "QuantizeEmbeddingInt8"},
+                {"surgeon": "ShareEmbeddingLmHead"}
+            ],
+            "save_as_external_data": true
+        }
+    },
+    "target": "local_system",
+    "log_severity_level": 0,
+    "output_dir": "model",
+    "cache_dir": "cache",
+    "no_artifacts": true
+}