Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions Qwen-Qwen3.5-2B/baseline/Qwen-Qwen3.5-2B_baseline_mmlu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B",
"load_kwargs": {
"torch_dtype": "float16"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [
{
"device": "gpu",
"execution_providers": ["CUDAExecutionProvider"]
}
]
}
},
"evaluators": {
"mmlu": {
"type": "LMEvaluator",
"tasks": ["mmlu"],
"model_class": "hf",
"batch_size": 8
}
},
"evaluator": "mmlu",
"target": "local_system",
"log_severity_level": 0,
"evaluate_input_model": true
}
5 changes: 5 additions & 0 deletions Qwen-Qwen3.5-2B/baseline/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
accelerate
datasets
lm-eval
torch
transformers==4.52.4
42 changes: 42 additions & 0 deletions Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B",
"load_kwargs": {
"torch_dtype": "float16"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [
{
"device": "cpu",
"execution_providers": ["CPUExecutionProvider"]
}
]
}
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"extra_options": {
"exclude_embeds": false
}
},
"q": {
"type": "GraphSurgeries",
"surgeries": [
{"surgeon": "QuantizeEmbeddingInt8"},
{"surgeon": "ShareEmbeddingLmHead"}
],
"save_as_external_data": true
}
},
"target": "local_system",
"log_severity_level": 0,
"output_dir": "model",
"cache_dir": "cache",
"no_artifacts": true
}
52 changes: 52 additions & 0 deletions Qwen-Qwen3.5-2B/cpu/Qwen-Qwen3.5-2B_cpu_int4_with_eval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B",
"load_kwargs": {
"torch_dtype": "float16"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [
{
"device": "cpu",
"execution_providers": ["CPUExecutionProvider"]
}
]
}
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"extra_options": {
"exclude_embeds": false
}
},
"q": {
"type": "GraphSurgeries",
"surgeries": [
{"surgeon": "QuantizeEmbeddingInt8"},
{"surgeon": "ShareEmbeddingLmHead"}
],
"save_as_external_data": true,
"all_tensors_to_one_file": true,
"external_data_name": "model.onnx.data"
}
},
"evaluators": {
"mmlu": {
"type": "LMEvaluator",
"tasks": ["mmlu"],
"batch_size": 8
}
},
"evaluator": "mmlu",
"target": "local_system",
"log_severity_level": 0,
"output_dir": "model",
"cache_dir": "cache",
"no_artifacts": true
}
28 changes: 28 additions & 0 deletions Qwen-Qwen3.5-2B/cpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Qwen-Qwen3.5-2B — CPU optimization

This folder contains Olive recipes for optimizing Qwen-Qwen3.5-2B targeting the CPU EP.

## What this folder is for

- Execution Provider: CPU EP
- Typical precision: INT4 (default)
- Example recipe filename: Qwen-Qwen3.5-2B_cpu_int4.json

## Setup

1) Install the main branch of Olive:
   - `pip install git+https://github.com/microsoft/olive.git`
2) Install the appropriate runtime package for this backend:
   - `onnxruntime-genai` (CPU build)
3) Run Olive to build/optimize the model:
   - `olive run --config Qwen-Qwen3.5-2B_cpu_int4.json`

Additional notes:
- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head)
- Model size: ~1.4 GB (down from 4.3 GB FP16)
- Uses text-only mode (`exclude_embeds=false`) for standalone LLM inference without the multimodal pipeline.
- Runs purely on CPU; no GPU required.

---

This README was auto-generated for the CPU EP of Qwen-Qwen3.5-2B.
6 changes: 6 additions & 0 deletions Qwen-Qwen3.5-2B/cpu/info.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
arch: qwen3_5_text
recipes:
- name: Qwen-Qwen3.5-2B_cpu_int4
file: Qwen-Qwen3.5-2B_cpu_int4.json
devices: cpu
eps: CPUExecutionProvider
4 changes: 4 additions & 0 deletions Qwen-Qwen3.5-2B/cpu/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
accelerate
datasets
onnxruntime-genai
transformers==4.52.4
42 changes: 42 additions & 0 deletions Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B",
"load_kwargs": {
"torch_dtype": "float16"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [
{
"device": "gpu",
"execution_providers": ["CUDAExecutionProvider"]
}
]
}
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"extra_options": {
"exclude_embeds": false,
"enable_cuda_graph": true
}
},
"q": {
"type": "GraphSurgeries",
"surgeries": [
{"surgeon": "QuantizeEmbeddingInt8"},
{"surgeon": "ShareEmbeddingLmHead"}
],
"save_as_external_data": true
}
},
"target": "local_system",
"log_severity_level": 0,
"output_dir": "model",
"cache_dir": "cache",
"no_artifacts": true
}
51 changes: 51 additions & 0 deletions Qwen-Qwen3.5-2B/cuda/Qwen-Qwen3.5-2B_cuda_int4_with_eval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B",
"load_kwargs": {
"torch_dtype": "float16"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [
{
"device": "gpu",
"execution_providers": ["CUDAExecutionProvider"]
}
]
}
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"extra_options": {
"exclude_embeds": false,
"enable_cuda_graph": true
}
},
"q": {
"type": "GraphSurgeries",
"surgeries": [
{"surgeon": "QuantizeEmbeddingInt8"},
{"surgeon": "ShareEmbeddingLmHead"}
],
"save_as_external_data": true
}
},
"evaluators": {
"mmlu": {
"type": "LMEvaluator",
"tasks": ["mmlu"],
"batch_size": 8
}
},
"evaluator": "mmlu",
"target": "local_system",
"log_severity_level": 0,
"output_dir": "model",
"cache_dir": "cache",
"no_artifacts": true
}
31 changes: 31 additions & 0 deletions Qwen-Qwen3.5-2B/cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Qwen-Qwen3.5-2B — CUDA optimization

This folder contains Olive recipes for optimizing Qwen-Qwen3.5-2B targeting the CUDA EP.

## What this folder is for

- Execution Provider: CUDA EP
- Typical precision: INT4 (default)
- Example recipe filename: Qwen-Qwen3.5-2B_cuda_int4.json

## Setup

1) Install the main branch of Olive:
   - `pip install git+https://github.com/microsoft/olive.git`
2) Install the appropriate runtime package for this backend:
   - `onnxruntime-genai-cuda` (CUDA build)
3) Run Olive to build/optimize the model:
   - `olive run --config Qwen-Qwen3.5-2B_cuda_int4.json`

Additional notes:
- Pipeline: `ModelBuilder` (INT4 via Neural Compressor) → `QuantizeEmbeddingInt8` (post-hoc INT8 embedding) → `ShareEmbeddingLmHead` (share INT8 weight between embedding and lm_head)
- Model size: ~1.4 GB (down from 4.3 GB FP16)
- MMLU accuracy: 57.11% (vs 59.27% FP16 baseline)
- Uses text-only mode (`exclude_embeds=false`) for standalone LLM inference without the multimodal pipeline.
- CUDA graph enabled for optimized decode throughput.
- Requires NVIDIA GPU with CUDA support.
- Ensure CUDA toolkit and cuDNN are properly installed.

---

This README was auto-generated for the CUDA EP of Qwen-Qwen3.5-2B.
6 changes: 6 additions & 0 deletions Qwen-Qwen3.5-2B/cuda/info.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
arch: qwen3_5_text
recipes:
- name: Qwen-Qwen3.5-2B_cuda_int4
file: Qwen-Qwen3.5-2B_cuda_int4.json
devices: gpu
eps: CUDAExecutionProvider
4 changes: 4 additions & 0 deletions Qwen-Qwen3.5-2B/cuda/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
accelerate
datasets
onnxruntime-genai-cuda
transformers==4.52.4
43 changes: 43 additions & 0 deletions Qwen-Qwen3.5-2B/webgpu/Qwen-Qwen3.5-2B_webgpu_int4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"input_model": {
"type": "HfModel",
"model_path": "Qwen/Qwen3.5-2B",
"load_kwargs": {
"torch_dtype": "float16"
}
},
"systems": {
"local_system": {
"type": "LocalSystem",
"accelerators": [
{
"device": "gpu",
"execution_providers": ["WebGpuExecutionProvider"]
}
]
}
},
"passes": {
"m": {
"type": "ModelBuilder",
"precision": "int4",
"int4_block_size": 32,
"extra_options": {
"exclude_embeds": false
}
},
"q": {
"type": "GraphSurgeries",
"surgeries": [
{"surgeon": "QuantizeEmbeddingInt8"},
{"surgeon": "ShareEmbeddingLmHead"}
],
"save_as_external_data": true
}
},
"target": "local_system",
"log_severity_level": 0,
"output_dir": "model",
"cache_dir": "cache",
"no_artifacts": true
}
Loading
Loading