From 2a2510827fe2a5956b220774329df227a428bd06 Mon Sep 17 00:00:00 2001 From: hualxie Date: Wed, 6 May 2026 16:35:35 +0800 Subject: [PATCH 01/17] 1st add --- .aitk/configs/checks.json | 8 +- .aitk/configs/model_list.json | 3 +- .aitk/docs/guide/ModelList.md | 2 +- .../requirements/WebGPU/WebGPU_py3.12.13.txt | 0 .../aitk/deepseek_webgpu.json | 73 +++++++++++++++++++ .../aitk/deepseek_webgpu.json.config | 53 ++++++++++++++ .../aitk/info.yml | 3 + .../aitk/model_project.config | 4 + 8 files changed, 140 insertions(+), 6 deletions(-) create mode 100644 .aitk/requirements/WebGPU/WebGPU_py3.12.13.txt create mode 100644 deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json create mode 100644 deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 22f96301f..ee3a84873 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,5 +1,5 @@ { - "configCheck": 167, + "configCheck": 168, "copyCheck": 182, "extensionCheck": 2, "gitignoreCheck": 44, @@ -8,9 +8,9 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 167, - "pathCheck": 1423, + "oliveJsonCheck": 168, + "pathCheck": 1426, "requirementsCheck": 37, "templateCheck": 3, - "venvRequirementsCheck": 17 + "venvRequirementsCheck": 18 } diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 11abf6887..afa6e2615 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -64,7 +64,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index 8cc8e7801..034fc5429 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -5,7 +5,7 @@ | Model Name | Supported Runtimes | |------------|--------------------| | [Deepseek R1 Distill Llama 
8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Llama-8B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Llama-8B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Llama-8B/aitk/deepseek_ov_npu_config.json) | -| [Deepseek R1 Distill Qwen 1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | [Qualcomm NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json), [Qualcomm GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_gpu_config.json), [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_config.json), [DirectML](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json) | +| [Deepseek R1 Distill Qwen 1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | [Qualcomm NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json), [Qualcomm GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_gpu_config.json), [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel 
NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_config.json), [DirectML](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json), [WebGPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json) | | [Deepseek R1 Distill Qwen 14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_trtrtx.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_npu_config.json) | | [Deepseek R1 Distill Qwen 7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_trtrtx.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_npu_config.json) | | [Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [Qualcomm NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json), [AMD NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_gpu_config.json), [Intel GPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_config.json), 
[DirectML](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_dml_config.json) | diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt new file mode 100644 index 000000000..e69de29bb diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json new file mode 100644 index 000000000..a54f42c81 --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json @@ -0,0 +1,73 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "kld_gradient", + "bits": 4, + "high_bits": 8, + "ratio": 0.65, + "sym": false, + "group_size": 32 + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/deepseek", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config new file mode 100644 index 000000000..3810be3cd --- /dev/null +++ 
b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config @@ -0,0 +1,53 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "isGPUSuggested": true, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml index dc61befc4..d211dd33c 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml @@ -39,6 +39,9 @@ recipes: isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider + - file: "deepseek_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config index c997fb66b..495bf4127 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config +++ 
b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "deepseek_qnn_gpu_config.json", "templateName": "deepseek_qnn_gpu_config" + }, + { + "file": "deepseek_webgpu.json", + "templateName": "deepseek_webgpu" } ], "modelInfo": { From 8f509a6869bc2412d548a145a5ace3d1356572cb Mon Sep 17 00:00:00 2001 From: hualxie Date: Wed, 6 May 2026 16:56:27 +0800 Subject: [PATCH 02/17] init --- .aitk/requirements/WebGPU/WebGPU_py3.12.13.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index e69de29bb..aa1a81919 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -0,0 +1,5 @@ +olive-ai==0.12.1 +accelerate +datasets +onnxruntime-genai +transformers==4.52.4 From c29ce79274ef3204129505e44f223f70abe4aadd Mon Sep 17 00:00:00 2001 From: hualxie Date: Wed, 6 May 2026 17:07:13 +0800 Subject: [PATCH 03/17] add generator Co-authored-by: Copilot --- .aitk/configs/checks.json | 2 +- .aitk/scripts/project_processor.py | 3 ++ .aitk/scripts/sanitize/generator_webgpu.py | 35 +++++++++++++++ .../aitk/deepseek_webgpu.json.config | 44 ++++++++++++++++++- .../aitk/info.yml | 3 ++ 5 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 .aitk/scripts/sanitize/generator_webgpu.py diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index ee3a84873..9cacddb66 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -9,7 +9,7 @@ "modelProjectCheck": 46, "oliveCheck": 88, "oliveJsonCheck": 168, - "pathCheck": 1426, + "pathCheck": 1428, "requirementsCheck": 37, "templateCheck": 3, "venvRequirementsCheck": 18 diff --git a/.aitk/scripts/project_processor.py b/.aitk/scripts/project_processor.py index 74c2c66f8..85b8289fe 100644 --- a/.aitk/scripts/project_processor.py +++ b/.aitk/scripts/project_processor.py @@ -13,6 +13,7 @@ from sanitize.generator_intel 
import generator_intel from sanitize.generator_qnn import generator_qnn from sanitize.generator_trtrtx import generator_trtrtx +from sanitize.generator_webgpu import generator_webgpu from sanitize.model_info import ModelInfo, ModelList from sanitize.project_config import ModelInfoProject, ModelProjectConfig, WorkflowItem from sanitize.utils import GlobalVars, isLLM_by_id, open_ex @@ -183,6 +184,8 @@ def convert_yaml_to_project_config( generator_trtrtx(id, recipe, yml_file.parent, modelList) elif recipe.get("ep") == EPNames.DmlExecutionProvider.value: generator_dml(id, recipe, yml_file.parent, modelList) + elif recipe.get("ep") == EPNames.WebGpuExecutionProvider.value: + generator_webgpu(id, recipe, yml_file.parent, modelList) runtimes = get_runtime(recipe) for runtime in runtimes: modelSummary.recipes.setdefault(runtime, []).append(file) diff --git a/.aitk/scripts/sanitize/generator_webgpu.py b/.aitk/scripts/sanitize/generator_webgpu.py new file mode 100644 index 000000000..80557f841 --- /dev/null +++ b/.aitk/scripts/sanitize/generator_webgpu.py @@ -0,0 +1,35 @@ +from pathlib import Path + +from .generator_common import create_model_parameter, set_optimization_path +from .generator_dml import generate_quantization_config +from .model_info import ModelList +from .model_parameter import ModelParameter +from .utils import isLLM_by_id + +def generator_webgpu(id: str, recipe, folder: Path, modelList: ModelList): + aitk = recipe.get("aitk", {}) + auto = aitk.get("auto", True) + if not auto: + return + + isLLM = isLLM_by_id(id) + file = recipe.get("file") + configFile = folder / file + + if not isLLM: + modelParameter = ModelParameter.Read(str(configFile) + ".config") + set_optimization_path(modelParameter, str(configFile)) + modelParameter.writeIfChanged() + return + + name = "Convert to WebGPU" + + parameter = create_model_parameter(aitk, name, configFile) + parameter.isLLM = isLLM + + quantize = generate_quantization_config(configFile, parameter) + if quantize: + 
parameter.sections.append(quantize) + + parameter.writeIfChanged() + print(f"\tGenerated WebGPU configuration for {file}") diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config index 3810be3cd..8c3a740ee 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config @@ -7,7 +7,6 @@ "autoGenerated": true, "useModelBuilder": "m" }, - "isGPUSuggested": true, "runtimeOverwrite": { "autoGenerated": true, "executeRequirement": "WebGPU/WebGPU_py3.12.13" @@ -48,6 +47,49 @@ ], "readOnly": true } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } } ] } diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml index d211dd33c..4f025c44a 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml @@ -42,6 +42,9 @@ recipes: - file: "deepseek_webgpu.json" device: gpu ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" From 9a9d79ad386bb279fcb551a1f998273552b85997 Mon Sep 17 
00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 11:14:07 +0800 Subject: [PATCH 04/17] freeze --- .../requirements/WebGPU/WebGPU_py3.12.13.txt | 80 ++++++++++++++++++- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index aa1a81919..c074a2e6b 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -1,5 +1,79 @@ +--extra-index-url https://download.pytorch.org/whl/cu128 +accelerate==1.13.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.5 +aiosignal==1.4.0 +alembic==1.18.4 +annotated-types==0.7.0 +anyio==4.13.0 +attrs==26.1.0 +certifi==2026.4.22 +charset-normalizer==3.4.7 +colorama==0.4.6 +colorlog==6.10.1 +datasets==4.8.5 +dill==0.4.1 +filelock==3.29.0 +flatbuffers==25.12.19 +frozenlist==1.8.0 +fsspec==2026.2.0 +greenlet==3.5.0 +h11==0.16.0 +hf-xet==1.5.0 +httpcore==1.0.9 +httpx==0.28.1 +huggingface-hub==0.36.2 +idna==3.13 +importlib-metadata==8.7.1 +jinja2==3.1.6 +lightning-utilities==0.15.3 +mako==1.3.12 +markupsafe==3.0.3 +ml-dtypes==0.5.4 +mpmath==1.3.0 +multidict==6.7.1 +multiprocess==0.70.19 +networkx==3.6.1 +numpy==2.4.4 olive-ai==0.12.1 -accelerate -datasets -onnxruntime-genai +onnx==1.21.0 +onnx-ir==0.2.1 +onnxruntime-genai==0.13.2 +onnxruntime-webgpu==1.25.1 +onnxscript==0.7.0 +opentelemetry-api==1.41.1 +opentelemetry-sdk==1.41.1 +opentelemetry-semantic-conventions==0.62b1 +optuna==4.8.0 +packaging==26.2 +pandas==3.0.2 +prompt-toolkit==3.0.52 +propcache==0.4.1 +protobuf==7.34.1 +psutil==7.2.2 +pyarrow==24.0.0 +pydantic==2.13.3 +pydantic-core==2.46.3 +python-dateutil==2.9.0.post0 +pyyaml==6.0.3 +questionary==2.1.1 +regex==2026.4.4 +requests==2.33.1 +safetensors==0.7.0 +setuptools==81.0.0 +six==1.17.0 +sqlalchemy==2.0.49 +sympy==1.14.0 +tokenizers==0.21.4 +torch==2.8.0+cu128 +torchmetrics==1.7.1 +tqdm==4.67.3 transformers==4.52.4 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +tzdata==2026.2 
+urllib3==2.6.3 +wcwidth==0.7.0 +xxhash==3.7.0 +yarl==1.23.0 +zipp==3.23.1 From 628a24721a9e23dc22261a7c49662ba05789a03f Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 11:32:34 +0800 Subject: [PATCH 05/17] add intel bert Co-authored-by: Copilot --- .aitk/configs/checks.json | 6 +-- .aitk/configs/model_list.json | 3 +- .aitk/docs/guide/ModelList.md | 2 +- .../aitk/bert_webgpu.json | 35 ++++++++++++++ .../aitk/bert_webgpu.json.config | 47 +++++++++++++++++++ intel-bert-base-uncased-mrpc/aitk/info.yml | 3 ++ .../aitk/model_project.config | 4 ++ 7 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json create mode 100644 intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 9cacddb66..fc2de1dd9 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,5 +1,5 @@ { - "configCheck": 168, + "configCheck": 169, "copyCheck": 182, "extensionCheck": 2, "gitignoreCheck": 44, @@ -8,8 +8,8 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 168, - "pathCheck": 1428, + "oliveJsonCheck": 169, + "pathCheck": 1431, "requirementsCheck": 37, "templateCheck": 3, "venvRequirementsCheck": 18 diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index afa6e2615..1c7db6d3f 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -140,7 +140,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index 034fc5429..f05df07c1 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -35,7 +35,7 @@ | Model Name | Supported Runtimes | |------------|--------------------| | [Bert Base Multilingual Cased](https://huggingface.co/google-bert/bert-base-multilingual-cased) | [Qualcomm 
NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_qnn.json), [Qualcomm GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qnn_gpu.json), [AMD NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_amd.json), [AMD GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_trtrtx.json), [Intel CPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [DirectML](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_dml.json) | -| [Bert Base Uncased Mrpc](https://huggingface.co/Intel/bert-base-uncased-mrpc) | [Qualcomm NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_qnn.json), [Qualcomm GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qnn_gpu.json), [AMD NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_amd.json), [AMD GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_migraphx.json), [NVIDIA TensorRT for RTX](../../../intel-bert-base-uncased-mrpc/aitk/bert_trtrtx.json), [Intel CPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [DirectML](../../../intel-bert-base-uncased-mrpc/aitk/bert_dml.json) | +| [Bert Base Uncased Mrpc](https://huggingface.co/Intel/bert-base-uncased-mrpc) | [Qualcomm NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_qnn.json), [Qualcomm 
GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qnn_gpu.json), [AMD NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_amd.json), [AMD GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_migraphx.json), [NVIDIA TensorRT for RTX](../../../intel-bert-base-uncased-mrpc/aitk/bert_trtrtx.json), [Intel CPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [DirectML](../../../intel-bert-base-uncased-mrpc/aitk/bert_dml.json), [WebGPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json) | | [Chinese Clip Vit Base Patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) | [Intel CPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json) | | [Clip Vit B 32 Laion2B S34B B79K](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) | [Qualcomm NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn.json), [Qualcomm GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn_gpu.json), [AMD NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qdq_amd.json), [AMD GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx.json), [Intel CPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [DirectML](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml.json) | | [Clip Vit Base Patch16](https://huggingface.co/openai/clip-vit-base-patch16) | 
[Qualcomm NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch16/aitk/openai_clip_dml.json) | diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json new file mode 100644 index 000000000..96ede55b4 --- /dev/null +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json @@ -0,0 +1,35 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Intel/bert-base-uncased-mrpc", + "task": "text-classification" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/bert_webgpu", + "evaluate_input_model": false +} diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": 
"https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/intel-bert-base-uncased-mrpc/aitk/info.yml b/intel-bert-base-uncased-mrpc/aitk/info.yml index db957fe08..fb250bcac 100644 --- a/intel-bert-base-uncased-mrpc/aitk/info.yml +++ b/intel-bert-base-uncased-mrpc/aitk/info.yml @@ -29,6 +29,9 @@ recipes: - file: "bert_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "bert_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/Intel/bert-base-uncased-mrpc" diff --git a/intel-bert-base-uncased-mrpc/aitk/model_project.config b/intel-bert-base-uncased-mrpc/aitk/model_project.config index 5b21e504f..60a31234e 100644 --- a/intel-bert-base-uncased-mrpc/aitk/model_project.config +++ b/intel-bert-base-uncased-mrpc/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "bert_qnn_gpu.json", "templateName": "bert_qnn_gpu" + }, + { + "file": "bert_webgpu.json", + "templateName": "bert_webgpu" } ], "modelInfo": { From 87bd35e95f4a17a9d778e9981985ab2fc89914f2 Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 11:39:36 +0800 Subject: [PATCH 06/17] update --- 
.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index c074a2e6b..3768b67fa 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -38,7 +38,8 @@ numpy==2.4.4 olive-ai==0.12.1 onnx==1.21.0 onnx-ir==0.2.1 -onnxruntime-genai==0.13.2 +# install it separatly with no deps as it will install onnxruntime to overwrite onnxruntime-webgpu +# uvpip:install onnxruntime-genai==0.13.2 --no-deps;post onnxruntime-webgpu==1.25.1 onnxscript==0.7.0 opentelemetry-api==1.41.1 @@ -65,8 +66,8 @@ six==1.17.0 sqlalchemy==2.0.49 sympy==1.14.0 tokenizers==0.21.4 -torch==2.8.0+cu128 -torchmetrics==1.7.1 +torch==2.11.0+cu128 +torchmetrics==1.9.0 tqdm==4.67.3 transformers==4.52.4 typing-extensions==4.15.0 From 449bd6227cbcfc91fcc5c820826f3b3568b053c0 Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 11:54:50 +0800 Subject: [PATCH 07/17] add more Co-authored-by: Copilot --- .../requirements/WebGPU/WebGPU_py3.12.13.txt | 2 + .../aitk/bert_webgpu.json | 35 +++++++++++++ .../aitk/bert_webgpu.json.config | 47 +++++++++++++++++ .../aitk/info.yml | 3 ++ google-vit-base-patch16-224/aitk/info.yml | 3 ++ .../aitk/vit_webgpu.json | 51 +++++++++++++++++++ .../aitk/vit_webgpu.json.config | 47 +++++++++++++++++ .../aitk/bert_webgpu.json | 5 +- microsoft-resnet-50/aitk/info.yml | 3 ++ microsoft-resnet-50/aitk/resnet_webgpu.json | 51 +++++++++++++++++++ .../aitk/resnet_webgpu.json.config | 47 +++++++++++++++++ 11 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json create mode 100644 google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config create mode 100644 google-vit-base-patch16-224/aitk/vit_webgpu.json create mode 100644 google-vit-base-patch16-224/aitk/vit_webgpu.json.config create 
mode 100644 microsoft-resnet-50/aitk/resnet_webgpu.json create mode 100644 microsoft-resnet-50/aitk/resnet_webgpu.json.config diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index 3768b67fa..3b4b6a6d5 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -40,6 +40,7 @@ onnx==1.21.0 onnx-ir==0.2.1 # install it separatly with no deps as it will install onnxruntime to overwrite onnxruntime-webgpu # uvpip:install onnxruntime-genai==0.13.2 --no-deps;post +onnxoptimizer==0.4.2 onnxruntime-webgpu==1.25.1 onnxscript==0.7.0 opentelemetry-api==1.41.1 @@ -65,6 +66,7 @@ setuptools==81.0.0 six==1.17.0 sqlalchemy==2.0.49 sympy==1.14.0 +tabulate==0.10.0 tokenizers==0.21.4 torch==2.11.0+cu128 torchmetrics==1.9.0 diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json new file mode 100644 index 000000000..04815ae4b --- /dev/null +++ b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json @@ -0,0 +1,35 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google-bert/bert-base-multilingual-cased", + "task": "feature-extraction" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/bert_webgpu", + "evaluate_input_model": false +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config new file mode 100644 index 000000000..2a8bbc29b --- /dev/null +++ 
b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/info.yml b/google-bert-bert-base-multilingual-cased/aitk/info.yml index cdfa911f4..8f546a03a 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/info.yml +++ b/google-bert-bert-base-multilingual-cased/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "bert-base-multilingual-cased_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "bert_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/google-bert/bert-base-multilingual-cased" diff --git a/google-vit-base-patch16-224/aitk/info.yml b/google-vit-base-patch16-224/aitk/info.yml index 5a4186142..c1f21f260 100644 --- a/google-vit-base-patch16-224/aitk/info.yml +++ b/google-vit-base-patch16-224/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "vit-base-patch16-224_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "vit_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: 
"huggingface/google/vit-base-patch16-224" diff --git a/google-vit-base-patch16-224/aitk/vit_webgpu.json b/google-vit-base-patch16-224/aitk/vit_webgpu.json new file mode 100644 index 000000000..1b9f439e7 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit_webgpu.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/vit-base-patch16-224", + "task": "image-classification", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 224, + 224 + ] + ], + "output_names": [ + "output" + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/vit_webgpu", + "evaluate_input_model": false +} diff --git a/google-vit-base-patch16-224/aitk/vit_webgpu.json.config b/google-vit-base-patch16-224/aitk/vit_webgpu.json.config new file mode 100644 index 000000000..2a8bbc29b --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": 
[ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json index 96ede55b4..f67676762 100644 --- a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json @@ -2,7 +2,10 @@ "input_model": { "type": "HfModel", "model_path": "Intel/bert-base-uncased-mrpc", - "task": "text-classification" + "task": "text-classification", + "load_kwargs": { + "attn_implementation": "eager" + } }, "systems": { "local_system": { diff --git a/microsoft-resnet-50/aitk/info.yml b/microsoft-resnet-50/aitk/info.yml index fe9387cc5..f558f572e 100644 --- a/microsoft-resnet-50/aitk/info.yml +++ b/microsoft-resnet-50/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "resnet_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "resnet_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/microsoft/resnet-50" diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json b/microsoft-resnet-50/aitk/resnet_webgpu.json new file mode 100644 index 000000000..1c44d2f51 --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "microsoft/resnet-50", + "task": "image-classification", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 224, + 224 + ] + ], + "output_names": [ + "logits" + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + 
"save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/resnet_webgpu", + "evaluate_input_model": false +} diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json.config b/microsoft-resnet-50/aitk/resnet_webgpu.json.config new file mode 100644 index 000000000..2a8bbc29b --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} From c46cb4df8e5c0762c0ce1e77eda49e8df0b63c8e Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 12:00:17 +0800 Subject: [PATCH 08/17] clip Co-authored-by: Copilot --- .../aitk/_copy.json.config | 15 ++++ .../aitk/info.yml | 3 + openai-clip-vit-base-patch16/aitk/info.yml | 3 + .../aitk/openai_clip_webgpu.json | 90 +++++++++++++++++++ .../aitk/openai_clip_webgpu.json.config | 47 ++++++++++ .../aitk/_copy.json.config | 15 ++++ openai-clip-vit-base-patch32/aitk/info.yml | 3 + 7 files changed, 176 insertions(+) create mode 100644 
openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json create mode 100644 openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config index f6ce51a00..c3c112dba 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config @@ -117,6 +117,21 @@ "dst": "laion_clip_dml.json.config", "replacements": [] }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json", + "dst": "laion_clip_webgpu.json", + "replacements": [ + { + "find": "openai/clip-vit-base-patch16", + "replace": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" + } + ] + }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config", + "dst": "laion_clip_webgpu.json.config", + "replacements": [] + }, { "src": "laion_clip_dml.json", "dst": "laion_clip_migraphx.json", diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml index ffde088d1..44669b315 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "laion_clip_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "laion_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/laion/CLIP-ViT-B-32-laion2B-s34B-b79K" diff --git a/openai-clip-vit-base-patch16/aitk/info.yml b/openai-clip-vit-base-patch16/aitk/info.yml index 633a5133f..141dd7e9b 100644 --- a/openai-clip-vit-base-patch16/aitk/info.yml +++ b/openai-clip-vit-base-patch16/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "openai_clip_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "openai_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/openai/clip-vit-base-patch16" diff 
--git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json new file mode 100644 index 000000000..e0f5adc4e --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json @@ -0,0 +1,90 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "openai/clip-vit-base-patch16", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/clip_webgpu", + "evaluate_input_model": false +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config new file mode 100644 index 000000000..2a8bbc29b --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + 
"autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/openai-clip-vit-base-patch32/aitk/_copy.json.config b/openai-clip-vit-base-patch32/aitk/_copy.json.config index c6c72ccee..005a6cb5d 100644 --- a/openai-clip-vit-base-patch32/aitk/_copy.json.config +++ b/openai-clip-vit-base-patch32/aitk/_copy.json.config @@ -109,6 +109,21 @@ "dst": "openai_clip_dml.json.config", "replacements": [] }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json", + "dst": "openai_clip_webgpu.json", + "replacements": [ + { + "find": "openai/clip-vit-base-patch16", + "replace": "openai/clip-vit-base-patch32" + } + ] + }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config", + "dst": "openai_clip_webgpu.json.config", + "replacements": [] + }, { "src": "openai_clip_dml.json", "dst": "openai_clip_migraphx.json", diff --git a/openai-clip-vit-base-patch32/aitk/info.yml b/openai-clip-vit-base-patch32/aitk/info.yml index 1f9fa87dc..b7476660f 100644 --- a/openai-clip-vit-base-patch32/aitk/info.yml +++ b/openai-clip-vit-base-patch32/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "openai_clip_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "openai_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/openai/clip-vit-base-patch32" From 76f99a7ac96cde004eb6636db92b4fedc9930dfd 
Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 12:04:06 +0800 Subject: [PATCH 09/17] sanitize --- .aitk/configs/checks.json | 8 +- .aitk/configs/model_list.json | 18 ++-- .aitk/docs/guide/ModelList.md | 12 +-- .../aitk/bert_webgpu.json.config | 94 +++++++++---------- .../aitk/model_project.config | 4 + .../aitk/model_project.config | 4 + .../aitk/vit_webgpu.json.config | 94 +++++++++---------- .../aitk/laion_clip_webgpu.json | 90 ++++++++++++++++++ .../aitk/laion_clip_webgpu.json.config | 47 ++++++++++ .../aitk/model_project.config | 4 + microsoft-resnet-50/aitk/model_project.config | 4 + .../aitk/resnet_webgpu.json.config | 94 +++++++++---------- .../aitk/model_project.config | 4 + .../aitk/openai_clip_webgpu.json.config | 94 +++++++++---------- .../aitk/model_project.config | 4 + .../aitk/openai_clip_webgpu.json | 90 ++++++++++++++++++ .../aitk/openai_clip_webgpu.json.config | 47 ++++++++++ 17 files changed, 508 insertions(+), 204 deletions(-) create mode 100644 laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json create mode 100644 laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config create mode 100644 openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json create mode 100644 openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index fc2de1dd9..6d2666e0f 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,6 +1,6 @@ { - "configCheck": 169, - "copyCheck": 182, + "configCheck": 175, + "copyCheck": 186, "extensionCheck": 2, "gitignoreCheck": 44, "inferenceModelCheck": 25, @@ -8,8 +8,8 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 169, - "pathCheck": 1431, + "oliveJsonCheck": 175, + "pathCheck": 1449, "requirementsCheck": 37, "templateCheck": 3, "venvRequirementsCheck": 18 diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 1c7db6d3f..7768ed3d9 
100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -38,7 +38,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "CNN", "status": "Ready", @@ -90,7 +91,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -115,7 +117,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -166,7 +169,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -215,7 +219,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -240,7 +245,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index f05df07c1..53fdec086 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -34,14 +34,14 @@ | Model Name | Supported Runtimes | |------------|--------------------| -| [Bert Base Multilingual Cased](https://huggingface.co/google-bert/bert-base-multilingual-cased) | [Qualcomm NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_qnn.json), [Qualcomm GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qnn_gpu.json), [AMD NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_amd.json), [AMD GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_trtrtx.json), [Intel CPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel 
GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [DirectML](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_dml.json) | +| [Bert Base Multilingual Cased](https://huggingface.co/google-bert/bert-base-multilingual-cased) | [Qualcomm NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_qnn.json), [Qualcomm GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qnn_gpu.json), [AMD NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_amd.json), [AMD GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_trtrtx.json), [Intel CPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [DirectML](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_dml.json), [WebGPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json) | | [Bert Base Uncased Mrpc](https://huggingface.co/Intel/bert-base-uncased-mrpc) | [Qualcomm NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_qnn.json), [Qualcomm GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qnn_gpu.json), [AMD NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_amd.json), [AMD GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_migraphx.json), [NVIDIA TensorRT for 
RTX](../../../intel-bert-base-uncased-mrpc/aitk/bert_trtrtx.json), [Intel CPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [DirectML](../../../intel-bert-base-uncased-mrpc/aitk/bert_dml.json), [WebGPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json) | | [Chinese Clip Vit Base Patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) | [Intel CPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json) | -| [Clip Vit B 32 Laion2B S34B B79K](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) | [Qualcomm NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn.json), [Qualcomm GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn_gpu.json), [AMD NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qdq_amd.json), [AMD GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx.json), [Intel CPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [DirectML](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml.json) | -| [Clip Vit Base Patch16](https://huggingface.co/openai/clip-vit-base-patch16) | [Qualcomm NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn_gpu.json), [AMD 
NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch16/aitk/openai_clip_dml.json) | -| [Clip Vit Base Patch32](https://huggingface.co/openai/clip-vit-base-patch32) | [Qualcomm NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch32/aitk/openai_clip_dml.json) | +| [Clip Vit B 32 Laion2B S34B B79K](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) | [Qualcomm NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn.json), [Qualcomm GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn_gpu.json), [AMD NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qdq_amd.json), [AMD GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx.json), [Intel 
CPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [DirectML](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml.json), [WebGPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json) | +| [Clip Vit Base Patch16](https://huggingface.co/openai/clip-vit-base-patch16) | [Qualcomm NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch16/aitk/openai_clip_dml.json), [WebGPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json) | +| [Clip Vit Base Patch32](https://huggingface.co/openai/clip-vit-base-patch32) | [Qualcomm NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel 
NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch32/aitk/openai_clip_dml.json), [WebGPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json) | | [Clip Vit Large Patch14](https://huggingface.co/openai/clip-vit-large-patch14) | [Qualcomm NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qnn.json), [AMD NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-large-patch14/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-large-patch14/aitk/openai_clip_dml.json) | -| [Resnet 50](https://huggingface.co/microsoft/resnet-50) | [Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [Qualcomm GPU](../../../microsoft-resnet-50/aitk/resnet_qnn_gpu.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json) | +| [Resnet 50](https://huggingface.co/microsoft/resnet-50) | [Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [Qualcomm GPU](../../../microsoft-resnet-50/aitk/resnet_qnn_gpu.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD 
GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json), [WebGPU](../../../microsoft-resnet-50/aitk/resnet_webgpu.json) | | [Stable Diffusion V1 5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) | [Qualcomm NPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json), [Intel CPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json), [Intel GPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json) | -| [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [Qualcomm GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qnn_gpu.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json) | +| [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [Qualcomm 
GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qnn_gpu.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json), [WebGPU](../../../google-vit-base-patch16-224/aitk/vit_webgpu.json) | | [Whisper Large V3 Turbo](https://huggingface.co/openai/whisper-large-v3-turbo) | [Qualcomm NPU](../../../openai-whisper-large-v3-turbo/aitk/qnn_workflow.json) | diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config index 2a8bbc29b..4575e8895 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config +++ b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config @@ -1,47 +1,47 @@ -{ - "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", - "name": "Convert to WebGPU", - "evalRuntime": "WebGPU", - "runtimeOverwrite": { - "executeRequirement": "WebGPU/WebGPU_py3.12.13" - }, - "runtime": { - "autoGenerated": true, - "name": "Evaluate on", - "type": "enum", - "displayNames": [ - "WebGPU" - ], - "path": "systems.local_system.accelerators.0.execution_providers.0", - "values": [ - "WebGpuExecutionProvider" - ], - "readOnly": false - }, - "optimizationPaths": [ - { - "path": "passes.conversion", - "name": "fp32" - } - ], - "optimizationDefault": "fp32", - "sections": [ - { - 
"autoGenerated": true, - "name": "Convert", - "phase": "Conversion", - "parameters": [], - "toggle": { - "autoGenerated": true, - "name": "Convert to ONNX format", - "type": "bool", - "path": "passes.conversion", - "actions": [ - [], - [] - ], - "readOnly": true - } - } - ] -} +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/model_project.config b/google-bert-bert-base-multilingual-cased/aitk/model_project.config index 0c243b19c..647418708 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/model_project.config +++ b/google-bert-bert-base-multilingual-cased/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "bert-base-multilingual-cased_qnn_gpu.json", "templateName": "bert-base-multilingual-cased_qnn_gpu" + }, + { + "file": "bert_webgpu.json", + "templateName": "bert_webgpu" } ], "modelInfo": { diff --git a/google-vit-base-patch16-224/aitk/model_project.config b/google-vit-base-patch16-224/aitk/model_project.config index a06dd8750..149292581 100644 --- a/google-vit-base-patch16-224/aitk/model_project.config +++ 
b/google-vit-base-patch16-224/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "vit-base-patch16-224_qnn_gpu.json", "templateName": "vit-base-patch16-224_qnn_gpu" + }, + { + "file": "vit_webgpu.json", + "templateName": "vit_webgpu" } ], "modelInfo": { diff --git a/google-vit-base-patch16-224/aitk/vit_webgpu.json.config b/google-vit-base-patch16-224/aitk/vit_webgpu.json.config index 2a8bbc29b..4575e8895 100644 --- a/google-vit-base-patch16-224/aitk/vit_webgpu.json.config +++ b/google-vit-base-patch16-224/aitk/vit_webgpu.json.config @@ -1,47 +1,47 @@ -{ - "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", - "name": "Convert to WebGPU", - "evalRuntime": "WebGPU", - "runtimeOverwrite": { - "executeRequirement": "WebGPU/WebGPU_py3.12.13" - }, - "runtime": { - "autoGenerated": true, - "name": "Evaluate on", - "type": "enum", - "displayNames": [ - "WebGPU" - ], - "path": "systems.local_system.accelerators.0.execution_providers.0", - "values": [ - "WebGpuExecutionProvider" - ], - "readOnly": false - }, - "optimizationPaths": [ - { - "path": "passes.conversion", - "name": "fp32" - } - ], - "optimizationDefault": "fp32", - "sections": [ - { - "autoGenerated": true, - "name": "Convert", - "phase": "Conversion", - "parameters": [], - "toggle": { - "autoGenerated": true, - "name": "Convert to ONNX format", - "type": "bool", - "path": "passes.conversion", - "actions": [ - [], - [] - ], - "readOnly": true - } - } - ] -} +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": 
false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json new file mode 100644 index 000000000..94d4dbae2 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json @@ -0,0 +1,90 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/clip_webgpu", + "evaluate_input_model": false +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config 
b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config index a9a556885..59a48c533 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "laion_clip_qnn_gpu.json", "templateName": "laion_clip_qnn_gpu" + }, + { + "file": "laion_clip_webgpu.json", + "templateName": "laion_clip_webgpu" } ], "modelInfo": { diff --git a/microsoft-resnet-50/aitk/model_project.config b/microsoft-resnet-50/aitk/model_project.config index d11ed84be..90865571d 100644 --- a/microsoft-resnet-50/aitk/model_project.config +++ b/microsoft-resnet-50/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "resnet_qnn_gpu.json", "templateName": 
"resnet_qnn_gpu" + }, + { + "file": "resnet_webgpu.json", + "templateName": "resnet_webgpu" } ], "modelInfo": { diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json.config b/microsoft-resnet-50/aitk/resnet_webgpu.json.config index 2a8bbc29b..4575e8895 100644 --- a/microsoft-resnet-50/aitk/resnet_webgpu.json.config +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json.config @@ -1,47 +1,47 @@ -{ - "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", - "name": "Convert to WebGPU", - "evalRuntime": "WebGPU", - "runtimeOverwrite": { - "executeRequirement": "WebGPU/WebGPU_py3.12.13" - }, - "runtime": { - "autoGenerated": true, - "name": "Evaluate on", - "type": "enum", - "displayNames": [ - "WebGPU" - ], - "path": "systems.local_system.accelerators.0.execution_providers.0", - "values": [ - "WebGpuExecutionProvider" - ], - "readOnly": false - }, - "optimizationPaths": [ - { - "path": "passes.conversion", - "name": "fp32" - } - ], - "optimizationDefault": "fp32", - "sections": [ - { - "autoGenerated": true, - "name": "Convert", - "phase": "Conversion", - "parameters": [], - "toggle": { - "autoGenerated": true, - "name": "Convert to ONNX format", - "type": "bool", - "path": "passes.conversion", - "actions": [ - [], - [] - ], - "readOnly": true - } - } - ] -} +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": 
true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/openai-clip-vit-base-patch16/aitk/model_project.config b/openai-clip-vit-base-patch16/aitk/model_project.config index a15e48590..120402cf4 100644 --- a/openai-clip-vit-base-patch16/aitk/model_project.config +++ b/openai-clip-vit-base-patch16/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "openai_clip_qnn_gpu.json", "templateName": "openai_clip_qnn_gpu" + }, + { + "file": "openai_clip_webgpu.json", + "templateName": "openai_clip_webgpu" } ], "modelInfo": { diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config index 2a8bbc29b..4575e8895 100644 --- a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config @@ -1,47 +1,47 @@ -{ - "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", - "name": "Convert to WebGPU", - "evalRuntime": "WebGPU", - "runtimeOverwrite": { - "executeRequirement": "WebGPU/WebGPU_py3.12.13" - }, - "runtime": { - "autoGenerated": true, - "name": "Evaluate on", - "type": "enum", - "displayNames": [ - "WebGPU" - ], - "path": "systems.local_system.accelerators.0.execution_providers.0", - "values": [ - "WebGpuExecutionProvider" - ], - "readOnly": false - }, - "optimizationPaths": [ - { - "path": "passes.conversion", - "name": "fp32" - } - ], - "optimizationDefault": "fp32", - "sections": [ - { - "autoGenerated": true, - "name": "Convert", - "phase": "Conversion", - "parameters": [], - "toggle": { - "autoGenerated": true, - "name": "Convert to ONNX format", - "type": "bool", - "path": "passes.conversion", - "actions": [ - [], - [] - ], - "readOnly": true 
- } - } - ] -} +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/openai-clip-vit-base-patch32/aitk/model_project.config b/openai-clip-vit-base-patch32/aitk/model_project.config index 084fa3a91..6226974d8 100644 --- a/openai-clip-vit-base-patch32/aitk/model_project.config +++ b/openai-clip-vit-base-patch32/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "openai_clip_qnn_gpu.json", "templateName": "openai_clip_qnn_gpu" + }, + { + "file": "openai_clip_webgpu.json", + "templateName": "openai_clip_webgpu" } ], "modelInfo": { diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json new file mode 100644 index 000000000..7f8d0bd3f --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json @@ -0,0 +1,90 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "openai/clip-vit-base-patch32", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + 
"input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/clip_webgpu", + "evaluate_input_model": false +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + 
"type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} From 0b6de73c42bb06a83fcd3f5405baf130472ef121 Mon Sep 17 00:00:00 2001 From: hualxie Date: Thu, 7 May 2026 14:51:20 +0800 Subject: [PATCH 10/17] cuda 130 works --- .aitk/requirements/WebGPU/WebGPU_py3.12.13.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index 3b4b6a6d5..cf1e762a5 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/cu128 +--extra-index-url https://download.pytorch.org/whl/cu130 accelerate==1.13.0 aiohappyeyeballs==2.6.1 aiohttp==3.13.5 @@ -68,7 +68,7 @@ sqlalchemy==2.0.49 sympy==1.14.0 tabulate==0.10.0 tokenizers==0.21.4 -torch==2.11.0+cu128 +torch==2.11.0+cu130 torchmetrics==1.9.0 tqdm==4.67.3 transformers==4.52.4 From 5af1ceb91438b826e9b6e75c9792377138d31751 Mon Sep 17 00:00:00 2001 From: hualxie Date: Fri, 8 May 2026 10:11:28 +0800 Subject: [PATCH 11/17] for Qwen3VLForConditionalGeneration --- .aitk/requirements/WebGPU/WebGPU_py3.12.13.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index cf1e762a5..f12a05e18 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -67,11 +67,11 @@ six==1.17.0 sqlalchemy==2.0.49 sympy==1.14.0 tabulate==0.10.0 -tokenizers==0.21.4 +tokenizers==0.22.2 torch==2.11.0+cu130 torchmetrics==1.9.0 tqdm==4.67.3 -transformers==4.52.4 +transformers==4.57.6 typing-extensions==4.15.0 typing-inspection==0.4.2 tzdata==2026.2 From 9b7bc87459d7bef6e01388000cd9334e6b8bbe37 Mon Sep 17 00:00:00 2001 From: hualxie Date: Fri, 8 May 2026 10:26:33 +0800 Subject: [PATCH 12/17] must exact same version.. 
--- .aitk/requirements/WebGPU/WebGPU_py3.12.13.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt index f12a05e18..b0ab103ce 100644 --- a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -39,7 +39,7 @@ olive-ai==0.12.1 onnx==1.21.0 onnx-ir==0.2.1 # install it separatly with no deps as it will install onnxruntime to overwrite onnxruntime-webgpu -# uvpip:install onnxruntime-genai==0.13.2 --no-deps;post +# uvpip:install onnxruntime-genai==0.12.2 --no-deps;post onnxoptimizer==0.4.2 onnxruntime-webgpu==1.25.1 onnxscript==0.7.0 @@ -67,11 +67,11 @@ six==1.17.0 sqlalchemy==2.0.49 sympy==1.14.0 tabulate==0.10.0 -tokenizers==0.22.2 +tokenizers==0.21.4 torch==2.11.0+cu130 torchmetrics==1.9.0 tqdm==4.67.3 -transformers==4.57.6 +transformers==4.52.4 typing-extensions==4.15.0 typing-inspection==0.4.2 tzdata==2026.2 From beb508049d0cc140e19f22d53785b7ffb74c3e9b Mon Sep 17 00:00:00 2001 From: hualxie Date: Fri, 8 May 2026 11:20:43 +0800 Subject: [PATCH 13/17] add llama --- .aitk/configs/checks.json | 8 +- .aitk/configs/model_list.json | 3 +- .aitk/docs/guide/ModelList.md | 2 +- .../aitk/_copy.json.config | 14 +++ .../aitk/info.yml | 6 ++ .../aitk/llama3_2_webgpu.json | 73 ++++++++++++++ .../aitk/llama3_2_webgpu.json.config | 95 +++++++++++++++++++ .../aitk/model_project.config | 4 + 8 files changed, 199 insertions(+), 6 deletions(-) create mode 100644 meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json create mode 100644 meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 6d2666e0f..9f01f55f7 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,6 +1,6 @@ { - "configCheck": 175, - "copyCheck": 186, + "configCheck": 176, + "copyCheck": 187, "extensionCheck": 2, "gitignoreCheck": 44, 
"inferenceModelCheck": 25, @@ -8,8 +8,8 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 175, - "pathCheck": 1449, + "oliveJsonCheck": 176, + "pathCheck": 1454, "requirementsCheck": 37, "templateCheck": 3, "venvRequirementsCheck": 18 diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 7768ed3d9..06ca4ae3f 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -194,7 +194,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index 53fdec086..4bf60e316 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -9,7 +9,7 @@ | [Deepseek R1 Distill Qwen 14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_trtrtx.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_npu_config.json) | | [Deepseek R1 Distill Qwen 7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_trtrtx.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_npu_config.json) | | [Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [Qualcomm 
NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json), [AMD NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_gpu_config.json), [Intel GPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_config.json), [DirectML](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_dml_config.json) | -| [Llama 3.2 1B Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | [Qualcomm NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json), [Qualcomm GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_gpu_config.json), [AMD NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_config.json), [DirectML](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_dml_config.json) | +| [Llama 3.2 1B Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | [Qualcomm NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json), [Qualcomm GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_gpu_config.json), [AMD NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel 
GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_config.json), [DirectML](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_dml_config.json), [WebGPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json) | | [Mistral 7B Instruct V0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | [AMD NPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_trtrtx.json), [Intel CPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_gpu_context_ov_dy.json), [Intel GPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_gpu_context_ov_dy.json), [Intel NPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_npu_context_ov_dy.json) | | [Mistral 7B Instruct V0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | [Intel CPU](../../../mistralai-Mistral-7B-Instruct-v0.3/aitk/mistral-7b-instruct-v0.3-ov.json), [Intel GPU](../../../mistralai-Mistral-7B-Instruct-v0.3/aitk/mistral-7b-instruct-v0.3-ov.json) | | [Phi 3 Mini 128K Instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_qnn.json), [AMD NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_trtrtx.json), [Intel CPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_config.json), [Intel GPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_config.json), [Intel NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_npu_config.json) | diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config index 
d539528bc..74763f29e 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config @@ -42,6 +42,20 @@ } ] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json", + "dst": "llama3_2_webgpu.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "meta-llama/Llama-3.2-1B-Instruct" + }, + { + "find": "model/deepseek", + "replace": "model/llama3_2" + } + ] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml index cf6187b73..aabb016b0 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml @@ -39,6 +39,12 @@ recipes: isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider + - file: "llama3_2_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/meta-llama/Llama-3.2-1B-Instruct" diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json new file mode 100644 index 000000000..67572ad6d --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json @@ -0,0 +1,73 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "meta-llama/Llama-3.2-1B-Instruct", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "kld_gradient", + "bits": 4, + "high_bits": 8, + "ratio": 0.65, + "sym": false, + "group_size": 32 + }, + "g": { + "type": "gptq", + "bits": 4, 
+ "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/llama3_2", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config @@ -0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + 
"parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config index 3887e2f47..90d1e0ba0 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "llama3_2_qnn_gpu_config.json", "templateName": "llama3_2_qnn_gpu_config" + }, + { + "file": "llama3_2_webgpu.json", + "templateName": "llama3_2_webgpu" } ], "modelInfo": { From a44c252277e34f888197a0d9ba359deb00bba3f2 Mon Sep 17 00:00:00 2001 From: hualxie Date: Fri, 8 May 2026 12:01:32 +0800 Subject: [PATCH 14/17] add qwen, phi --- .aitk/configs/checks.json | 8 +- .aitk/configs/model_list.json | 6 +- .aitk/docs/guide/ModelList.md | 4 +- .../aitk/_copy.json.config | 14 +++ Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml | 6 ++ .../aitk/model_project.config | 4 + .../aitk/qwen2_5_webgpu.json | 73 ++++++++++++++ .../aitk/qwen2_5_webgpu.json.config | 95 +++++++++++++++++++ .../aitk/_copy.json.config | 14 +++ microsoft-Phi-3.5-mini-instruct/aitk/info.yml | 6 ++ .../aitk/model_project.config | 4 + .../aitk/phi3_5_webgpu.json | 73 ++++++++++++++ .../aitk/phi3_5_webgpu.json.config | 95 +++++++++++++++++++ 13 files changed, 394 insertions(+), 8 deletions(-) create mode 100644 Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json create mode 100644 
Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config create mode 100644 microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json create mode 100644 microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 9f01f55f7..0e8897cac 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,6 +1,6 @@ { - "configCheck": 176, - "copyCheck": 187, + "configCheck": 178, + "copyCheck": 189, "extensionCheck": 2, "gitignoreCheck": 44, "inferenceModelCheck": 25, @@ -8,8 +8,8 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 176, - "pathCheck": 1454, + "oliveJsonCheck": 178, + "pathCheck": 1464, "requirementsCheck": 37, "templateCheck": 3, "venvRequirementsCheck": 18 diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 06ca4ae3f..9088cc137 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -13,7 +13,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -273,7 +274,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index 4bf60e316..fd663b079 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -14,14 +14,14 @@ | [Mistral 7B Instruct V0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | [Intel CPU](../../../mistralai-Mistral-7B-Instruct-v0.3/aitk/mistral-7b-instruct-v0.3-ov.json), [Intel GPU](../../../mistralai-Mistral-7B-Instruct-v0.3/aitk/mistral-7b-instruct-v0.3-ov.json) | | [Phi 3 Mini 128K Instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_qnn.json), [AMD NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_vitis_ai_config.json), [NVIDIA 
TensorRT for RTX](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_trtrtx.json), [Intel CPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_config.json), [Intel GPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_config.json), [Intel NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_npu_config.json) | | [Phi 3 Mini 4K Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_qnn.json), [AMD NPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_trtrtx.json), [Intel CPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_ov_config.json), [Intel GPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_ov_config.json), [Intel NPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_ov_npu_config.json) | -| [Phi 3.5 Mini Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json), [Qualcomm GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_gpu_config.json), [AMD NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_trtrtx_config.json), [Intel CPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_config.json), [DirectML](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_dml_config.json) | +| [Phi 3.5 Mini Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json), [Qualcomm GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_gpu_config.json), [AMD 
NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_trtrtx_config.json), [Intel CPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_config.json), [DirectML](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_dml_config.json), [WebGPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json) | | [Phi 4](https://huggingface.co/microsoft/Phi-4) | [NVIDIA TensorRT for RTX](../../../microsoft-Phi-4/aitk/phi4_trtrtx.json), [Intel CPU](../../../microsoft-Phi-4/aitk/phi4_ov_config.json), [Intel GPU](../../../microsoft-Phi-4/aitk/phi4_ov_config.json) | | [Phi 4 Mini Instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct) | [Qualcomm NPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_qnn.json), [AMD NPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_vitis_ai_config.json), [Intel CPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_ov_config.json), [Intel GPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_ov_config.json), [Intel NPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_ov_npu_config.json) | | [Phi 4 Mini Reasoning](https://huggingface.co/microsoft/Phi-4-mini-reasoning) | [AMD NPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_vitis_ai_config.json), [Intel CPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_ov_gpu_config.json), [Intel GPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_ov_gpu_config.json), [Intel NPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_ov_config.json) | | [Phi 4 Reasoning](https://huggingface.co/microsoft/Phi-4-reasoning) | [Intel NPU](../../../microsoft-Phi-4-reasoning/aitk/phi4_ov_config.json) | | [Phi 4 Reasoning Plus](https://huggingface.co/microsoft/Phi-4-reasoning-plus) | [Intel 
NPU](../../../microsoft-Phi-4-reasoning-plus/aitk/phi4_ov_config.json) | | [Qwen2.5 0.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | [AMD NPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_trtrtx.json), [Intel CPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_ov_npu_config.json) | -| [Qwen2.5 1.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | [Qualcomm NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json), [Qualcomm GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_gpu_config.json), [AMD NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_trtrtx_config.json), [Intel CPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_config.json), [DirectML](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_dml_config.json) | +| [Qwen2.5 1.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | [Qualcomm NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json), [Qualcomm GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_gpu_config.json), [AMD NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_trtrtx_config.json), [Intel CPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_config.json), 
[DirectML](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_dml_config.json), [WebGPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json) | | [Qwen2.5 14B Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_trtrtx.json), [Intel CPU](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_ov_npu_config.json) | | [Qwen2.5 3B Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [Intel CPU](../../../Qwen-Qwen2.5-3B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-3B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-3B-Instruct/aitk/qwen2_5_ov_npu_config.json) | | [Qwen2.5 7B Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | [Qualcomm NPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_qnn_config.json), [AMD NPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_trtrtx.json), [Intel CPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_ov_npu_config.json) | diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config index 6a6e71a41..e38c1711a 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config @@ -28,6 +28,20 @@ } ] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json", + "dst": "qwen2_5_webgpu.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "Qwen/Qwen2.5-1.5B-Instruct" + }, + { + "find": "model/deepseek", + "replace": 
"model/qwen2_5" + } + ] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml index 09794cb70..f8f342e08 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml @@ -39,6 +39,12 @@ recipes: isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider + - file: "qwen2_5_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct" diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config index 8b192b6f5..214e98239 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "qwen2_5_qnn_gpu_config.json", "templateName": "qwen2_5_qnn_gpu_config" + }, + { + "file": "qwen2_5_webgpu.json", + "templateName": "qwen2_5_webgpu" } ], "modelInfo": { diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json new file mode 100644 index 000000000..f574e4b04 --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json @@ -0,0 +1,73 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-1.5B-Instruct", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "kld_gradient", + "bits": 4, + "high_bits": 8, + "ratio": 0.65, + "sym": false, + "group_size": 32 + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", 
+ "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/qwen2_5", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config @@ -0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision 
of model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config index cee9caa7b..36e8b8627 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config @@ -14,6 +14,20 @@ } ] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json", + "dst": "phi3_5_webgpu.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "microsoft/Phi-3.5-mini-instruct" + }, + { + "find": "model/deepseek", + "replace": "model/phi3_5" + } + ] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml index eba2844ca..c30fdee22 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml +++ b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml @@ -39,6 +39,12 @@ recipes: isGPURequired: true runtimeOverwrite: executeEp: NvTensorRTRTXExecutionProvider + - file: "phi3_5_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/microsoft/Phi-3.5-mini-instruct" diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config index d5d2fe50b..67ed43574 100644 --- 
a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "phi3_5_qnn_gpu_config.json", "templateName": "phi3_5_qnn_gpu_config" + }, + { + "file": "phi3_5_webgpu.json", + "templateName": "phi3_5_webgpu" } ], "modelInfo": { diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json new file mode 100644 index 000000000..f71026229 --- /dev/null +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json @@ -0,0 +1,73 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "microsoft/Phi-3.5-mini-instruct", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "kld_gradient", + "bits": 4, + "high_bits": 8, + "ratio": 0.65, + "sym": false, + "group_size": 32 + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/phi3_5", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config @@ 
-0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} From bbb7fbab69b96ca556a5f81da20c17dce201d429 Mon Sep 17 00:00:00 2001 From: hualxie Date: Sat, 9 May 2026 10:30:25 +0800 Subject: [PATCH 15/17] guide --- .aitk/docs/others/FIX_GUIDE.md | 330 ++++++++++++++++++++++ .aitk/docs/others/fix_onnx_model.py | 414 
++++++++++++++++++++++++++++ 2 files changed, 744 insertions(+) create mode 100644 .aitk/docs/others/FIX_GUIDE.md create mode 100644 .aitk/docs/others/fix_onnx_model.py diff --git a/.aitk/docs/others/FIX_GUIDE.md b/.aitk/docs/others/FIX_GUIDE.md new file mode 100644 index 000000000..f69f24b17 --- /dev/null +++ b/.aitk/docs/others/FIX_GUIDE.md @@ -0,0 +1,330 @@ +# Generic WebGPU ONNX Model QKV Fix Guide + +## Problem Description + +WebGPU-converted ONNX models (DeepSeek, Llama, and others) with combined qkv_proj structures fail in specific layers with a dimension mismatch error: + +``` +Node (/model/layers.X/attn/o_proj/MatMulNBits) Op (MatMulNBits) +[ShapeInferenceError] Incompatible dimensions for matrix multiplication +``` + +### Root Cause + +These layers have a **combined qkv_proj** structure (Q, K, V packed into one output), but the GroupQueryAttention operation is wired incorrectly (the dimensions below are those of DeepSeek-R1-Distill-Qwen-1.5B): + +| Issue | Problem | +|-------|---------| +| **Q input** | Receives the full 2048-dim qkv output instead of just Q (1536 dims) | +| **K input** | Uses K from a previous layer instead of the current layer (256 dims from the wrong source) | +| **V input** | Uses V from a previous layer instead of the current layer (256 dims from the wrong source) | +| **Result** | GroupQueryAttention produces mismatched output → o_proj fails | + +### Layer Structure + +The affected layers differ from model to model: + +| Model | Layers with combined qkv_proj | Total QKV | Q | K | V | +|-------|-------------------------------|-----------|---|---|---| +| DeepSeek-R1-Distill-Qwen-1.5B | 0, 6, 8, 12, 25, 26, 27 | 2048 | 1536 | 256 | 256 | +| Llama-3.2-1B | 2, 5, 6, 8, 10, 13 | 3072 | 2048 | 512 | 512 | + +The `fix_onnx_model.py` script detects this information automatically.
+ +## Solution + +For each affected layer, extract Q, K, V from the combined qkv_proj using Slice operations: + +``` +qkv_proj output (total_qkv dims): + [0:q_dim] → Q dimensions + [q_dim:q_dim+k_dim] → K dimensions + [q_dim+k_dim:total_qkv] → V dimensions + +GroupQueryAttention uses extracted Q, K, V → output matches o_proj expectations +``` + +**Example dimensions:** +- **DeepSeek:** [0:1536] Q, [1536:1792] K, [1792:2048] V +- **Llama:** [0:2048] Q, [2048:2560] K, [2560:3072] V + +## Implementation + +### Quick Start (Auto-Detect) + +The script automatically detects affected layers and dimensions: + +```bash +# From the model directory +cd ./model + +# Run the fix (auto-detects everything) +python ../fix_onnx_model.py model.onnx + +# Verify the fix +python ../fix_onnx_model.py model.onnx --verify +``` + +### Using a Configuration File + +For reproducibility or multiple models, create a `config.json` (when run from the command line the script still auto-detects first, and detected values take precedence over the file): + +```json +{ + "layers_to_fix": [0, 6, 8, 12, 25, 26, 27], + "q_dim": 1536, + "k_dim": 256, + "v_dim": 256 +} +``` + +Then run: +```bash +python fix_onnx_model.py model.onnx --config config.json +``` + +### Examples for Common Models + +**DeepSeek-R1-Distill-Qwen-1.5B config.json:** +```json +{ + "layers_to_fix": [0, 6, 8, 12, 25, 26, 27], + "q_dim": 1536, + "k_dim": 256, + "v_dim": 256 +} +``` + +**Llama-3.2-1B config.json:** +```json +{ + "layers_to_fix": [2, 5, 6, 8, 10, 13], + "q_dim": 2048, + "k_dim": 512, + "v_dim": 512 +} +``` + +### Manual Implementation (Advanced) + +If you need to integrate this into your own code: + +```python +from fix_onnx_model import fix_webgpu_qkv_model, verify_fix + +# Auto-detect (recommended) +fix_webgpu_qkv_model('model.onnx') + +# Or with explicit parameters +fix_webgpu_qkv_model( + 'model.onnx', + layers_to_fix=[2, 5, 6, 8, 10, 13], # Llama layers + q_dim=2048, + k_dim=512, + v_dim=512, + auto_detect=False # Use provided values only +) + +# Verify +verify_fix('model.onnx', verbose=True) +``` + +## Key Technical Details + +###
ONNX Slice Syntax + +The `Slice` operator (opset 21) takes inputs in this order: +``` +Slice(data, starts, ends, [axes], [steps]) +``` + +- **data:** Input tensor to slice +- **starts:** Tensor with starting indices +- **ends:** Tensor with ending indices +- **axes:** (optional) Tensor specifying which axes to slice (e.g., [2] for axis 2) +- **steps:** (optional) Step size for each axis + +**Important:** Pass `axes` as an input tensor, NOT as an attribute (`starts`, `ends` and `axes` were attributes only in Slice-1; since opset 10 they are inputs). + +### Data Type Consistency + +All new tensors must have the same element type as the tensors around them (**FLOAT16** in the models above; the script copies it from the qkv_proj output's `value_info`): +- Input: `qkv_proj/Add/output_0` (FLOAT16) +- Output: `GroupQueryAttention/output_0` (FLOAT16) +- Subsequent layers expect FLOAT16 inputs + +### Dimension Breakdown + +The exact dimensions depend on your model's architecture: + +**DeepSeek-R1-Distill-Qwen-1.5B:** +- num_heads=12, kv_num_heads=2, head_dim=128 +- Q: 12 × 128 = 1536 +- K: 2 × 128 = 256 +- V: 2 × 128 = 256 +- Total: 1536 + 256 + 256 = 2048 + +**Llama-3.2-1B:** +- num_heads=32, kv_num_heads=8, head_dim=64 +- Q: 32 × 64 = 2048 +- K: 8 × 64 = 512 +- V: 8 × 64 = 512 +- Total: 2048 + 512 + 512 = 3072 + +To find these for any model: +```python +import onnx + +model = onnx.load('model.onnx', load_external_data=False) +for vi in model.graph.value_info: + if 'layers.0/attn/qkv_proj' in vi.name and 'output' in vi.name: + qkv_dim = vi.type.tensor_type.shape.dim[-1].dim_value + print(f"Total QKV dimension: {qkv_dim}") + break + +for node in model.graph.node: + if 'layers.0/attn/o_proj' in node.name: + for attr in node.attribute: + if attr.name == 'K': + print(f"Q dimension (from o_proj K): {attr.i}") + break +``` + +## Verification + +After applying the fix, verify that each GroupQueryAttention node reads the extracted Q/K/V tensors: + +```python +import onnx + +model = onnx.load('model.onnx', load_external_data=False) +layers_to_check = [0, 6, 8, 12, 25, 26, 27] # Or your model's layers + +for layer_id in layers_to_check: + for node in model.graph.node: + if node.name ==
f'/model/layers.{layer_id}/attn/GroupQueryAttention': + print(f"Layer {layer_id}:") + print(f" Q: {node.input[0]}") # Should be q_proj_extracted + print(f" K: {node.input[1]}") # Should be k_proj_extracted + print(f" V: {node.input[2]}") # Should be v_proj_extracted + break +``` + +Expected output for a fixed model: +``` +Layer 0: + Q: /model/layers.0/attn/q_proj_extracted/output_0 + K: /model/layers.0/attn/k_proj_extracted/output_0 + V: /model/layers.0/attn/v_proj_extracted/output_0 +``` + +The script's `--verify` flag does this automatically: +```bash +python fix_onnx_model.py model.onnx --verify +``` + +## Usage Example + +### DeepSeek-R1-Distill-Qwen-1.5B + +```bash +cd C:\path\to\deepseek\model +python fix_onnx_model.py model/model.onnx +``` + +### Llama-3.2-1B + +```bash +cd C:\path\to\llama\model +python fix_onnx_model.py model/model.onnx +``` + +Both commands detect the layers and dimensions automatically and overwrite the model file in place, so keep a backup and run the fix only once (the script does not check for Slice nodes it has already added). After the fix, your inference notebooks should work without shape inference errors: + +```python +import onnxruntime_genai as og + +# Model now loads successfully +model = og.Model('./model') +tokenizer = og.Tokenizer(model) + +# Inference works correctly +generator = og.Generator(model, params) +``` + +## Detecting This Issue + +If your WebGPU-converted model fails with shape inference errors, you can check whether it has this issue: + +```python +import onnx + +model = onnx.load('model.onnx', load_external_data=False) + +print("=== Checking for QKV cross-layer references ===") +affected_layers = [] + +for i in range(64): + gqa_node = None + for node in model.graph.node: + if node.name == f'/model/layers.{i}/attn/GroupQueryAttention': + gqa_node = node + break + + if not gqa_node: + continue + + has_qkv = any(f'layers.{i}/attn' in n.name and 'qkv_proj' in n.name + for n in model.graph.node) + + if has_qkv: + # Check if K/V come from different layers + k_input = gqa_node.input[1] + v_input = gqa_node.input[2] + + if f'layers.{i}' not in k_input or
f'layers.{i}' not in v_input: + print(f" ✗ Layer {i}: Cross-layer reference detected") + affected_layers.append(i) + +if affected_layers: + print(f"\nFix required for layers: {affected_layers}") +else: + print("\nNo cross-layer references detected - model may not need fixing") +``` + +Typical output for affected models: +``` +✗ Layer 2: Cross-layer reference detected +✗ Layer 5: Cross-layer reference detected +✗ Layer 6: Cross-layer reference detected +... +Fix required for layers: [2, 5, 6, 8, 10, 13] +``` + +## Troubleshooting + +| Error | Solution | +|-------|----------| +| `Unrecognized attribute: axes for operator Slice` | Ensure `axes` is passed as an input tensor, not an attribute (automatic in script) | +| `Type (tensor(float)) does not match expected type (tensor(float16))` | Verify all new tensors use correct data type - script auto-detects this | +| `Incompatible dimensions for matrix multiplication` | Confirm Slice indices match your model's dimensions (script auto-detects) | +| Model still fails after fix | Run with `--verify` flag to check all layers were processed correctly | +| Auto-detection doesn't work | Provide explicit config with `--config` flag | + +## Supported Models + +This fix has been tested on: +- ✅ DeepSeek-R1-Distill-Qwen-1.5B +- ✅ Llama-3.2-1B-Instruct +- ✅ Other WebGPU-converted models with similar cross-layer QKV issues + +If you test this on other models, please note that auto-detection handles most cases. For models with non-standard structures, use the config file approach. 
+ +## References + +- ONNX Slice operator: https://onnx.ai/onnx/operators/onnx__Slice.html +- ONNX spec: https://onnx.ai/onnx/ +- DeepSeek-R1 Model: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- Llama-3.2 Model: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct +- WebGPU ONNX Runtime: https://onnxruntime.ai/docs/execution-providers/web-gpu-execution-provider.html +- ONNX Runtime GenAI: https://github.com/microsoft/onnxruntime-genai diff --git a/.aitk/docs/others/fix_onnx_model.py b/.aitk/docs/others/fix_onnx_model.py new file mode 100644 index 000000000..9559e6c9b --- /dev/null +++ b/.aitk/docs/others/fix_onnx_model.py @@ -0,0 +1,414 @@ +""" +Generic ONNX Model WebGPU Fix for Combined QKV Projection Issues + +PROBLEM SUMMARY: +================ +WebGPU-converted ONNX models with combined qkv_proj structures exhibit a critical +architecture mismatch: + +1. GroupQueryAttention nodes use K, V projections from PREVIOUS layers instead of + the same layer +2. GroupQueryAttention Q input receives the full combined qkv_proj output instead + of just the Q portion +3. This causes dimension mismatch: o_proj expects specific K dimension but receives + mismatched output from GroupQueryAttention + +EXAMPLES: +- DeepSeek-R1-Distill-Qwen-1.5B: qkv_proj=2048, Q=1536, K=256, V=256 +- Llama-3.2-1B: qkv_proj=3072, Q=2048, K=512, V=512 + +SOLUTION OVERVIEW: +================== +For each affected layer, we: +1. Extract Q from qkv_proj[0:Q_dim] +2. Extract K from qkv_proj[Q_dim:Q_dim+K_dim] +3. Extract V from qkv_proj[Q_dim+K_dim:total_dim] +4. Update GroupQueryAttention to use extracted tensors +5. Ensure all new tensors match model precision +6. Use proper ONNX Slice syntax (axes as input, not attribute) +""" + +import onnx +from onnx import helper +import sys +import json +from pathlib import Path + +def auto_detect_layers_and_dims(model_path): + """ + Auto-detect which layers have combined qkv_proj and their dimensions. 
+ + Returns: (layers_to_fix, q_dim, k_dim, v_dim) or (None, None, None, None) if not found + """ + try: + model = onnx.load(model_path, load_external_data=False) + graph = model.graph + + layers_to_fix = [] + qkv_dim = None + + # Find layers with qkv_proj + for i in range(64): + has_qkv = False + for node in graph.node: + if f'layers.{i}/attn' in node.name and 'qkv_proj' in node.name: + has_qkv = True + if qkv_dim is None: + # Get qkv_proj output dimension + for vi in graph.value_info: + if f'layers.{i}/attn/qkv_proj' in vi.name and 'output' in vi.name: + dims = vi.type.tensor_type.shape.dim + qkv_dim = dims[-1].dim_value + + if has_qkv: + layers_to_fix.append(i) + + if not layers_to_fix or qkv_dim is None: + return None, None, None, None + + # Get o_proj K dimension to infer Q_dim + o_proj_k = None + for i in layers_to_fix: + for node in graph.node: + if node.name == f'/model/layers.{i}/attn/o_proj/MatMulNBits': + for attr in node.attribute: + if attr.type == 2 and attr.name == 'K': + o_proj_k = attr.i + break + if o_proj_k: + break + + if qkv_dim and o_proj_k: + q_dim = o_proj_k + remaining = qkv_dim - q_dim + k_dim = remaining // 2 + v_dim = remaining - k_dim + return layers_to_fix, q_dim, k_dim, v_dim + + return None, None, None, None + except Exception: + return None, None, None, None + + +def fix_webgpu_qkv_model(model_path, layers_to_fix=None, q_dim=None, k_dim=None, v_dim=None, auto_detect=True): + """ + Generic fix for WebGPU ONNX models with combined qkv_proj dimension mismatch. 
+ + Parameters: + ----------- + model_path : str + Path to the ONNX model file + layers_to_fix : list + Layer IDs to fix (auto-detected if None) + q_dim : int + Query dimension (auto-detected if None) + k_dim : int + Key dimension (auto-detected if None) + v_dim : int + Value dimension (auto-detected if None) + auto_detect : bool + If True, auto-detect layers and dimensions (overrides manual params) + + Returns: + -------- + bool : True if successful, False otherwise + """ + + print("=" * 70) + print("Generic WebGPU ONNX QKV Model Fixer") + print("=" * 70) + + try: + # Load model + print(f"\n[1/4] Loading model from {model_path}...") + model = onnx.load(model_path, load_external_data=False) + graph = model.graph + print(f" ✓ Model loaded successfully") + print(f" - IR Version: {model.ir_version}") + print(f" - Opset: {model.opset_import[0].version if model.opset_import else 'unknown'}") + + # Auto-detect if enabled + if auto_detect: + print(f"\n[2/4] Auto-detecting layers and dimensions...") + det_layers, det_q, det_k, det_v = auto_detect_layers_and_dims(model_path) + if det_layers: + layers_to_fix = det_layers + q_dim = det_q + k_dim = det_k + v_dim = det_v + print(f" ✓ Detected layers: {layers_to_fix}") + print(f" ✓ Detected dimensions: Q={q_dim}, K={k_dim}, V={v_dim}") + + if not layers_to_fix or not q_dim or not k_dim or not v_dim: + print(f" ✗ Failed to detect or specify layers and dimensions") + return False + + total_dim = q_dim + k_dim + v_dim + print(f"\n[3/4] Setting up Slice operations...") + print(f" • Total QKV dim: {total_dim} = {q_dim} + {k_dim} + {v_dim}") + + # Create required constants for Slice operations + constants = { + 'const_0': 0, + f'const_{q_dim}': q_dim, + f'const_{q_dim + k_dim}': q_dim + k_dim, + f'const_{total_dim}': total_dim, + 'const_axes_2': [2] + } + + # Add constants to graph + for const_name, const_value in constants.items(): + if not any(init.name == const_name for init in graph.initializer): + if const_name == 'const_axes_2': 
+ tensor = helper.make_tensor(const_name, onnx.TensorProto.INT64, [1], const_value) + else: + tensor = helper.make_tensor(const_name, onnx.TensorProto.INT64, [1], [const_value]) + graph.initializer.append(tensor) + + # Fix each layer + slices_added = 0 + for layer_id in layers_to_fix: + # Auto-detect qkv_proj output node (could be Add or MatMulNBits) + qkv_output = None + for node in graph.node: + if node.name == f'/model/layers.{layer_id}/attn/qkv_proj/Add': + qkv_output = f'/model/layers.{layer_id}/attn/qkv_proj/Add/output_0' + break + + if not qkv_output: + # Fall back to MatMulNBits if no Add node + for node in graph.node: + if node.name == f'/model/layers.{layer_id}/attn/qkv_proj/MatMulNBits': + qkv_output = f'/model/layers.{layer_id}/attn/qkv_proj/MatMulNBits/output_0' + break + + if not qkv_output: + print(f" ✗ Could not find qkv_proj output for layer {layer_id}") + return False + + # Find data type from qkv_proj output + dtype = onnx.TensorProto.FLOAT16 + for vi in graph.value_info: + if f'layers.{layer_id}/attn/qkv_proj' in vi.name and 'output' in vi.name: + dtype = vi.type.tensor_type.elem_type + break + + # Q extraction: [0:q_dim] + slice_q = helper.make_node( + 'Slice', + inputs=[qkv_output, + 'const_0', f'const_{q_dim}', 'const_axes_2'], + outputs=[f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0'], + name=f'/model/layers.{layer_id}/attn/q_proj_extracted/Slice' + ) + + # K extraction: [q_dim:q_dim+k_dim] + slice_k = helper.make_node( + 'Slice', + inputs=[qkv_output, + f'const_{q_dim}', f'const_{q_dim + k_dim}', 'const_axes_2'], + outputs=[f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0'], + name=f'/model/layers.{layer_id}/attn/k_proj_extracted/Slice' + ) + + # V extraction: [q_dim+k_dim:total] + slice_v = helper.make_node( + 'Slice', + inputs=[qkv_output, + f'const_{q_dim + k_dim}', f'const_{total_dim}', 'const_axes_2'], + outputs=[f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0'], + 
name=f'/model/layers.{layer_id}/attn/v_proj_extracted/Slice' + ) + + graph.node.extend([slice_q, slice_k, slice_v]) + slices_added += 3 + + # Add value_info for extracted tensors + q_info = helper.make_tensor_value_info( + f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0', + dtype, + ['batch_size', 'sequence_length', q_dim] + ) + k_info = helper.make_tensor_value_info( + f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0', + dtype, + ['batch_size', 'sequence_length', k_dim] + ) + v_info = helper.make_tensor_value_info( + f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0', + dtype, + ['batch_size', 'sequence_length', v_dim] + ) + graph.value_info.extend([q_info, k_info, v_info]) + + # Update GroupQueryAttention inputs + for node in graph.node: + if node.name == f'/model/layers.{layer_id}/attn/GroupQueryAttention': + node.input[0] = f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0' + node.input[1] = f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0' + node.input[2] = f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0' + break + + print(f" ✓ Added {slices_added} Slice nodes across {len(layers_to_fix)} layers") + print(f" ✓ Updated {len(layers_to_fix)} GroupQueryAttention nodes") + + # Save fixed model + print(f"\n[4/4] Saving fixed model...") + onnx.save(model, model_path) + print(f" ✓ Model saved successfully") + + print("\n" + "=" * 70) + print("FIX COMPLETED SUCCESSFULLY!") + print("=" * 70) + print("\nSummary of Changes:") + print(f" • Fixed {len(layers_to_fix)} layers: {layers_to_fix}") + print(f" • QKV dimensions: Q={q_dim}, K={k_dim}, V={v_dim}") + print(f" • Added {slices_added} Slice nodes for Q/K/V extraction") + print(f" • Corrected GroupQueryAttention layer cross-references") + print(f" • Ensured precision consistency for all new tensors") + print(f" • Updated Slice syntax for ONNX opset 21 compatibility") + + return True + + except Exception as e: + print(f"\n❌ ERROR: {str(e)}") + import traceback + 
def _layer_is_fixed(nodes, layer_id, verbose=False):
    """Check one layer: three extraction Slice nodes exist and feed GroupQueryAttention."""
    attn_prefix = f'/model/layers.{layer_id}/attn'

    # Delimit the layer id with '/' on both sides. A bare 'layers.1' substring
    # also matches layers 10-19, which made the Slice count come out wrong for
    # any model with more than ten layers.
    layer_marker = f'/layers.{layer_id}/'
    slice_nodes = [
        n for n in nodes
        if layer_marker in n.name and 'Slice' in n.name and 'proj_extracted' in n.name
    ]
    if len(slice_nodes) != 3:
        print(f" ✗ Layer {layer_id}: Expected 3 Slice nodes, found {len(slice_nodes)}")
        return False

    gqa_node = next((n for n in nodes if n.name == f'{attn_prefix}/GroupQueryAttention'), None)
    if gqa_node is None:
        print(f" ✗ Layer {layer_id}: GroupQueryAttention node not found")
        return False

    # Compare a slice rather than indexing input[0..2]: a node with fewer than
    # three inputs is then reported as a per-layer failure instead of raising
    # IndexError and aborting verification of the remaining layers.
    expected_inputs = [
        f'{attn_prefix}/{proj}_proj_extracted/output_0' for proj in ('q', 'k', 'v')
    ]
    if list(gqa_node.input[:3]) != expected_inputs:
        print(f" ✗ Layer {layer_id}: GroupQueryAttention inputs incorrect")
        return False

    if verbose:
        print(f" ✓ Layer {layer_id}: All checks passed")
    return True


def verify_fix(model_path, verbose=False, layers_to_fix=None):
    """
    Verify that the fix was applied correctly.

    For every layer, checks that exactly three '*_proj_extracted' Slice nodes
    exist and that inputs 0-2 of the layer's GroupQueryAttention node are the
    q/k/v '*_proj_extracted/output_0' tensors.

    Parameters:
    -----------
    model_path : str
        Path to the fixed ONNX model
    verbose : bool
        Print a line for every layer that passes (failures are always printed)
    layers_to_fix : list
        Specific layers to verify (auto-detected if None)

    Returns:
    --------
    bool : True if every layer is verified, False otherwise (including when
           the model cannot be loaded or no layers can be determined)
    """

    print("\nVerifying model fix...")

    try:
        # Only the graph structure is inspected, so external weights are not loaded.
        model = onnx.load(model_path, load_external_data=False)
        graph = model.graph

        # Auto-detect layers if not provided
        if layers_to_fix is None:
            det_result = auto_detect_layers_and_dims(model_path)
            if det_result and det_result[0]:
                layers_to_fix = det_result[0]
            else:
                print(" ✗ No layers detected - model may not need fixing or has unknown structure")
                return False

        if not layers_to_fix or not isinstance(layers_to_fix, list):
            print(" ✗ Invalid layers list")
            return False

        # Check every layer (no short-circuit) so that all failures are reported.
        results = [_layer_is_fixed(graph.node, layer_id, verbose) for layer_id in layers_to_fix]
        all_correct = all(results)

        if all_correct:
            print(" ✓ All verifications passed!")

        return all_correct

    except Exception as e:
        print(f" ✗ Verification failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


def _parse_cli_args(argv):
    """Split argv into (model_path, verify_only, config_file).

    The model path is the first argument that is neither an option nor the
    value of --config; it defaults to "./model/model.onnx".
    """
    model_path = "./model/model.onnx"
    verify_only = False
    config_file = None
    positionals = []

    args = iter(argv[1:])
    for arg in args:
        if arg == "--verify":
            verify_only = True
        elif arg == "--config":
            # Consume the option's value so it is never mistaken for the model path.
            value = next(args, None)
            if config_file is None:
                config_file = value
        elif arg.startswith("--"):
            # Unknown options are ignored rather than being used as a file path.
            continue
        else:
            positionals.append(arg)

    if positionals:
        model_path = positionals[0]
    return model_path, verify_only, config_file


def _load_config(config_file):
    """Read (layers_to_fix, q_dim, k_dim, v_dim) from a JSON file; all None on failure."""
    try:
        with open(config_file, 'r') as f:
            config = json.load(f)
        values = (
            config.get('layers_to_fix'),
            config.get('q_dim'),
            config.get('k_dim'),
            config.get('v_dim'),
        )
        print(f"Loaded config from {config_file}")
        return values
    except Exception as e:
        # Best effort: a bad config falls back to auto-detection.
        print(f"Warning: Failed to load config: {e}")
        return None, None, None, None


def main(argv=None):
    """Command-line entry point; returns the process exit code (0 = success).

    Usage:
      python fix_onnx_model.py [model_path]                       (auto-detect all)
      python fix_onnx_model.py [model_path] --verify              (verify existing fix)
      python fix_onnx_model.py [model_path] --config config.json  (use config file)
    """
    if argv is None:
        argv = sys.argv

    model_path, verify_only, config_file = _parse_cli_args(argv)

    if verify_only:
        # The exit code reflects the verification result so scripts can rely on it.
        return 0 if verify_fix(model_path, verbose=True) else 1

    # Load config if provided
    q_dim = k_dim = v_dim = layers = None
    if config_file:
        layers, q_dim, k_dim, v_dim = _load_config(config_file)

    success = fix_webgpu_qkv_model(
        model_path,
        layers_to_fix=layers,
        q_dim=q_dim,
        k_dim=k_dim,
        v_dim=v_dim,
        auto_detect=True  # Always auto-detect if values not provided
    )
    if not success:
        return 1

    # Verify the layers that were requested (None means auto-detect, as before)
    # and report a failed verification through the exit code.
    return 0 if verify_fix(model_path, verbose=True, layers_to_fix=layers) else 1


if __name__ == "__main__":
    sys.exit(main())
.../aitk/info.yml | 2 -- meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml | 2 -- microsoft-Phi-3.5-mini-instruct/aitk/info.yml | 2 -- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 3bbc46767..b72e0388a 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,6 +1,7 @@ { - "configCheck": 178, - "copyCheck": 189, + "configCheck": 180, + "copyCheck": 190, + "executeRuntimeCheck": 115, "extensionCheck": 2, "gitignoreCheck": 44, "inferenceModelCheck": 25, @@ -8,9 +9,10 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 178, - "pathCheck": 1464, + "oliveJsonCheck": 180, + "pathCheck": 1480, "requirementsCheck": 37, "templateCheck": 3, - "venvRequirementsCheck": 18 + "venvRequirementsCheck": 22, + "winmlCopyCheck": 39 } diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index fd663b079..762fe6bf9 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -42,6 +42,6 @@ | [Clip Vit Base Patch32](https://huggingface.co/openai/clip-vit-base-patch32) | [Qualcomm NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch32/aitk/openai_clip_dml.json), [WebGPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json) | | [Clip Vit Large 
Patch14](https://huggingface.co/openai/clip-vit-large-patch14) | [Qualcomm NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qnn.json), [AMD NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-large-patch14/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-large-patch14/aitk/openai_clip_dml.json) | | [Resnet 50](https://huggingface.co/microsoft/resnet-50) | [Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [Qualcomm GPU](../../../microsoft-resnet-50/aitk/resnet_qnn_gpu.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json), [WebGPU](../../../microsoft-resnet-50/aitk/resnet_webgpu.json) | -| [Stable Diffusion V1 5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) | [Qualcomm NPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json), [Intel CPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json), [Intel GPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json) | +| [Stable Diffusion V1 5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) | [Qualcomm 
NPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json), [Intel CPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json), [Intel GPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json), [Intel NPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_npu_workflow.json) | | [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [Qualcomm GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qnn_gpu.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json), [WebGPU](../../../google-vit-base-patch16-224/aitk/vit_webgpu.json) | | [Whisper Large V3 Turbo](https://huggingface.co/openai/whisper-large-v3-turbo) | [Qualcomm NPU](../../../openai-whisper-large-v3-turbo/aitk/qnn_workflow.json) | diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml index ca5a0b4d0..65ae5875c 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml @@ -37,8 +37,6 @@ recipes: aitk: oliveFile: "QNN/config_gpu.json" isGPURequired: true - runtimeOverwrite: - executeEp: NvTensorRTRTXExecutionProvider requirements: General/CUDA_py3.12.9 - file: "qwen2_5_webgpu.json" device: gpu diff --git 
a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml index 770210df1..106acc988 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml @@ -38,8 +38,6 @@ recipes: oliveFile: "QNN/config_gpu.json" isGPURequired: true requirements: General/CUDA_py3.12.9 - runtimeOverwrite: - executeEp: NvTensorRTRTXExecutionProvider - file: "deepseek_webgpu.json" device: gpu ep: WebGpuExecutionProvider diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml index 0ecef6db7..3a1d18b2f 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml @@ -37,8 +37,6 @@ recipes: aitk: oliveFile: "QNN/config_gpu.json" isGPURequired: true - runtimeOverwrite: - executeEp: NvTensorRTRTXExecutionProvider requirements: General/CUDA_py3.12.9 - file: "llama3_2_webgpu.json" device: gpu diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml index 30a27e79b..29217e9ac 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml +++ b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml @@ -37,8 +37,6 @@ recipes: aitk: oliveFile: "QNN/config_gpu.json" isGPURequired: true - runtimeOverwrite: - executeEp: NvTensorRTRTXExecutionProvider requirements: General/CUDA_py3.12.9 - file: "phi3_5_webgpu.json" device: gpu From b3490208d5cf333b7ef74a219765564fde950461 Mon Sep 17 00:00:00 2001 From: hualxie Date: Fri, 15 May 2026 11:46:20 +0800 Subject: [PATCH 17/17] use k_quant_mixed --- Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json | 7 +------ .../aitk/deepseek_webgpu.json | 7 +------ meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json | 7 +------ microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json | 7 +------ 4 files changed, 4 insertions(+), 24 deletions(-) diff --git 
a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json index f574e4b04..a9d1937d0 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json @@ -22,12 +22,7 @@ "passes": { "s": { "type": "SelectiveMixedPrecision", - "algorithm": "kld_gradient", - "bits": 4, - "high_bits": 8, - "ratio": 0.65, - "sym": false, - "group_size": 32 + "algorithm": "k_quant_mixed" }, "g": { "type": "gptq", diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json index a54f42c81..314a606a0 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json @@ -22,12 +22,7 @@ "passes": { "s": { "type": "SelectiveMixedPrecision", - "algorithm": "kld_gradient", - "bits": 4, - "high_bits": 8, - "ratio": 0.65, - "sym": false, - "group_size": 32 + "algorithm": "k_quant_mixed" }, "g": { "type": "gptq", diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json index 67572ad6d..8ee4392e5 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json @@ -22,12 +22,7 @@ "passes": { "s": { "type": "SelectiveMixedPrecision", - "algorithm": "kld_gradient", - "bits": 4, - "high_bits": 8, - "ratio": 0.65, - "sym": false, - "group_size": 32 + "algorithm": "k_quant_mixed" }, "g": { "type": "gptq", diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json index f71026229..12c617ab4 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json @@ -22,12 +22,7 @@ "passes": { "s": { "type": "SelectiveMixedPrecision", - "algorithm": 
"kld_gradient", - "bits": 4, - "high_bits": 8, - "ratio": 0.65, - "sym": false, - "group_size": 32 + "algorithm": "k_quant_mixed" }, "g": { "type": "gptq",