diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 156c69154..b72e0388a 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,7 +1,7 @@ { - "configCheck": 169, - "copyCheck": 183, - "executeRuntimeCheck": 104, + "configCheck": 180, + "copyCheck": 190, + "executeRuntimeCheck": 115, "extensionCheck": 2, "gitignoreCheck": 44, "inferenceModelCheck": 25, @@ -9,10 +9,10 @@ "licenseCheck": 41, "modelProjectCheck": 46, "oliveCheck": 88, - "oliveJsonCheck": 169, - "pathCheck": 1439, + "oliveJsonCheck": 180, + "pathCheck": 1480, "requirementsCheck": 37, "templateCheck": 3, - "venvRequirementsCheck": 21, + "venvRequirementsCheck": 22, "winmlCopyCheck": 39 } diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 857bbfe9b..dfb098022 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -13,7 +13,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -38,7 +39,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "CNN", "status": "Ready", @@ -64,7 +66,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -89,7 +92,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -114,7 +118,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -139,7 +144,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -164,7 +170,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -188,7 +195,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -213,7 +221,8 @@ "IntelCPU", 
"IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -238,7 +247,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", @@ -264,7 +274,8 @@ "IntelCPU", "IntelGPU", "IntelNPU", - "DML" + "DML", + "WebGPU" ], "architecture": "Transformer", "status": "Ready", diff --git a/.aitk/docs/guide/ModelList.md b/.aitk/docs/guide/ModelList.md index 3fbd653da..762fe6bf9 100644 --- a/.aitk/docs/guide/ModelList.md +++ b/.aitk/docs/guide/ModelList.md @@ -5,23 +5,23 @@ | Model Name | Supported Runtimes | |------------|--------------------| | [Deepseek R1 Distill Llama 8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Llama-8B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Llama-8B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Llama-8B/aitk/deepseek_ov_npu_config.json) | -| [Deepseek R1 Distill Qwen 1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | [Qualcomm NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json), [Qualcomm GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_gpu_config.json), [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_config.json), [DirectML](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json) | +| [Deepseek R1 Distill Qwen 
1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | [Qualcomm NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_config.json), [Qualcomm GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_qnn_gpu_config.json), [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_trtrtx_config.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_gpu_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_ov_config.json), [DirectML](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_dml_config.json), [WebGPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json) | | [Deepseek R1 Distill Qwen 14B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_trtrtx.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_config.json), [Intel NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-14B/aitk/deepseek_ov_npu_config.json) | | [Deepseek R1 Distill Qwen 7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | [AMD NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_trtrtx.json), [Intel CPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_config.json), [Intel GPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_config.json), [Intel 
NPU](../../../deepseek-ai-DeepSeek-R1-Distill-Qwen-7B/aitk/deepseek_ov_npu_config.json) | | [Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [Qualcomm NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_qnn_config.json), [AMD NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_gpu_config.json), [Intel GPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_ov_config.json), [DirectML](../../../meta-llama-Llama-3.1-8B-Instruct/aitk/llama3_1_dml_config.json) | -| [Llama 3.2 1B Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | [Qualcomm NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json), [Qualcomm GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_gpu_config.json), [AMD NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_config.json), [DirectML](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_dml_config.json) | +| [Llama 3.2 1B Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | [Qualcomm NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_config.json), [Qualcomm GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_qnn_gpu_config.json), [AMD NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_vitis_ai_config.json), [NVIDIA TensorRT for 
RTX](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_trtrtx_config.json), [Intel CPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel GPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_gpu_config.json), [Intel NPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_ov_config.json), [DirectML](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_dml_config.json), [WebGPU](../../../meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json) | | [Mistral 7B Instruct V0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | [AMD NPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_trtrtx.json), [Intel CPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_gpu_context_ov_dy.json), [Intel GPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_gpu_context_ov_dy.json), [Intel NPU](../../../mistralai-Mistral-7B-Instruct-v0.2/aitk/Mistral_7B_Instruct_v0.2_npu_context_ov_dy.json) | | [Mistral 7B Instruct V0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | [Intel CPU](../../../mistralai-Mistral-7B-Instruct-v0.3/aitk/mistral-7b-instruct-v0.3-ov.json), [Intel GPU](../../../mistralai-Mistral-7B-Instruct-v0.3/aitk/mistral-7b-instruct-v0.3-ov.json) | | [Phi 3 Mini 128K Instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_qnn.json), [AMD NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_trtrtx.json), [Intel CPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_config.json), [Intel GPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_config.json), [Intel 
NPU](../../../microsoft-Phi-3-mini-128k-instruct/aitk/phi3_ov_npu_config.json) | | [Phi 3 Mini 4K Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_qnn.json), [AMD NPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_trtrtx.json), [Intel CPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_ov_config.json), [Intel GPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_ov_config.json), [Intel NPU](../../../microsoft-Phi-3-mini-4k-instruct/aitk/phi3_ov_npu_config.json) | -| [Phi 3.5 Mini Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json), [Qualcomm GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_gpu_config.json), [AMD NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_trtrtx_config.json), [Intel CPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_config.json), [DirectML](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_dml_config.json) | +| [Phi 3.5 Mini Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) | [Qualcomm NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_config.json), [Qualcomm GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_qnn_gpu_config.json), [AMD NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_trtrtx_config.json), [Intel CPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel 
GPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_gpu_config.json), [Intel NPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_ov_config.json), [DirectML](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_dml_config.json), [WebGPU](../../../microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json) | | [Phi 4](https://huggingface.co/microsoft/Phi-4) | [NVIDIA TensorRT for RTX](../../../microsoft-Phi-4/aitk/phi4_trtrtx.json), [Intel CPU](../../../microsoft-Phi-4/aitk/phi4_ov_config.json), [Intel GPU](../../../microsoft-Phi-4/aitk/phi4_ov_config.json) | | [Phi 4 Mini Instruct](https://huggingface.co/microsoft/Phi-4-mini-instruct) | [Qualcomm NPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_qnn.json), [AMD NPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_vitis_ai_config.json), [Intel CPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_ov_config.json), [Intel GPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_ov_config.json), [Intel NPU](../../../microsoft-Phi-4-mini-instruct/aitk/phi4_ov_npu_config.json) | | [Phi 4 Mini Reasoning](https://huggingface.co/microsoft/Phi-4-mini-reasoning) | [AMD NPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_vitis_ai_config.json), [Intel CPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_ov_gpu_config.json), [Intel GPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_ov_gpu_config.json), [Intel NPU](../../../microsoft-Phi-4-mini-reasoning/aitk/phi4_ov_config.json) | | [Phi 4 Reasoning](https://huggingface.co/microsoft/Phi-4-reasoning) | [Intel NPU](../../../microsoft-Phi-4-reasoning/aitk/phi4_ov_config.json) | | [Phi 4 Reasoning Plus](https://huggingface.co/microsoft/Phi-4-reasoning-plus) | [Intel NPU](../../../microsoft-Phi-4-reasoning-plus/aitk/phi4_ov_config.json) | | [Qwen2.5 0.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | [AMD NPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for 
RTX](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_trtrtx.json), [Intel CPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-0.5B-Instruct/aitk/qwen2_5_ov_npu_config.json) | -| [Qwen2.5 1.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | [Qualcomm NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json), [Qualcomm GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_gpu_config.json), [AMD NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_trtrtx_config.json), [Intel CPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_config.json), [DirectML](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_dml_config.json) | +| [Qwen2.5 1.5B Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | [Qualcomm NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_config.json), [Qualcomm GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_qnn_gpu_config.json), [AMD NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_trtrtx_config.json), [Intel CPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel GPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_gpu_config.json), [Intel NPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_ov_config.json), [DirectML](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_dml_config.json), [WebGPU](../../../Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json) | | [Qwen2.5 14B Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | [NVIDIA TensorRT for 
RTX](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_trtrtx.json), [Intel CPU](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-14B-Instruct/aitk/qwen2_5_ov_npu_config.json) | | [Qwen2.5 3B Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [Intel CPU](../../../Qwen-Qwen2.5-3B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-3B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-3B-Instruct/aitk/qwen2_5_ov_npu_config.json) | | [Qwen2.5 7B Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | [Qualcomm NPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_qnn_config.json), [AMD NPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_vitis_ai_config.json), [NVIDIA TensorRT for RTX](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_trtrtx.json), [Intel CPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_ov_config.json), [Intel GPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_ov_config.json), [Intel NPU](../../../Qwen-Qwen2.5-7B-Instruct/aitk/qwen2_5_ov_npu_config.json) | @@ -34,14 +34,14 @@ | Model Name | Supported Runtimes | |------------|--------------------| -| [Bert Base Multilingual Cased](https://huggingface.co/google-bert/bert-base-multilingual-cased) | [Qualcomm NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_qnn.json), [Qualcomm GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qnn_gpu.json), [AMD NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_amd.json), [AMD GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_trtrtx.json), [Intel 
CPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [DirectML](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_dml.json) | -| [Bert Base Uncased Mrpc](https://huggingface.co/Intel/bert-base-uncased-mrpc) | [Qualcomm NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_qnn.json), [Qualcomm GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qnn_gpu.json), [AMD NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_amd.json), [AMD GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_migraphx.json), [NVIDIA TensorRT for RTX](../../../intel-bert-base-uncased-mrpc/aitk/bert_trtrtx.json), [Intel CPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [DirectML](../../../intel-bert-base-uncased-mrpc/aitk/bert_dml.json) | +| [Bert Base Multilingual Cased](https://huggingface.co/google-bert/bert-base-multilingual-cased) | [Qualcomm NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_qnn.json), [Qualcomm GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qnn_gpu.json), [AMD NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_qdq_amd.json), [AMD GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_trtrtx.json), [Intel 
CPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel GPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [Intel NPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_context_ov_static.json), [DirectML](../../../google-bert-bert-base-multilingual-cased/aitk/bert-base-multilingual-cased_dml.json), [WebGPU](../../../google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json) | +| [Bert Base Uncased Mrpc](https://huggingface.co/Intel/bert-base-uncased-mrpc) | [Qualcomm NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_qnn.json), [Qualcomm GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qnn_gpu.json), [AMD NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_qdq_amd.json), [AMD GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_migraphx.json), [NVIDIA TensorRT for RTX](../../../intel-bert-base-uncased-mrpc/aitk/bert_trtrtx.json), [Intel CPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel GPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [Intel NPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_ov.json), [DirectML](../../../intel-bert-base-uncased-mrpc/aitk/bert_dml.json), [WebGPU](../../../intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json) | | [Chinese Clip Vit Base Patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) | [Intel CPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../OFA-Sys-chinese-clip-vit-base-patch16/aitk/openai_clip_ov.json) | -| [Clip Vit B 32 Laion2B S34B B79K](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) | [Qualcomm NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn.json), [Qualcomm 
GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn_gpu.json), [AMD NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qdq_amd.json), [AMD GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx.json), [Intel CPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [DirectML](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml.json) | -| [Clip Vit Base Patch16](https://huggingface.co/openai/clip-vit-base-patch16) | [Qualcomm NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch16/aitk/openai_clip_dml.json) | -| [Clip Vit Base Patch32](https://huggingface.co/openai/clip-vit-base-patch32) | [Qualcomm NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for 
RTX](../../../openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch32/aitk/openai_clip_dml.json) | +| [Clip Vit B 32 Laion2B S34B B79K](https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K) | [Qualcomm NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn.json), [Qualcomm GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qnn_gpu.json), [AMD NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_qdq_amd.json), [AMD GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_trtrtx.json), [Intel CPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel GPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [Intel NPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_ov.json), [DirectML](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_dml.json), [WebGPU](../../../laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json) | +| [Clip Vit Base Patch16](https://huggingface.co/openai/clip-vit-base-patch16) | [Qualcomm NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch16/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel 
GPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch16/aitk/openai_clip_dml.json), [WebGPU](../../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json) | +| [Clip Vit Base Patch32](https://huggingface.co/openai/clip-vit-base-patch32) | [Qualcomm NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn.json), [Qualcomm GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qnn_gpu.json), [AMD NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-base-patch32/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-base-patch32/aitk/openai_clip_dml.json), [WebGPU](../../../openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json) | | [Clip Vit Large Patch14](https://huggingface.co/openai/clip-vit-large-patch14) | [Qualcomm NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qnn.json), [AMD NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_qdq_amd.json), [AMD GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_migraphx.json), [NVIDIA TensorRT for RTX](../../../openai-clip-vit-large-patch14/aitk/openai_clip_trtrtx.json), [Intel CPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel GPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [Intel NPU](../../../openai-clip-vit-large-patch14/aitk/openai_clip_ov.json), [DirectML](../../../openai-clip-vit-large-patch14/aitk/openai_clip_dml.json) | -| [Resnet 50](https://huggingface.co/microsoft/resnet-50) | 
[Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [Qualcomm GPU](../../../microsoft-resnet-50/aitk/resnet_qnn_gpu.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json) | +| [Resnet 50](https://huggingface.co/microsoft/resnet-50) | [Qualcomm NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_qnn.json), [Qualcomm GPU](../../../microsoft-resnet-50/aitk/resnet_qnn_gpu.json), [AMD NPU](../../../microsoft-resnet-50/aitk/resnet_qdq_amd.json), [AMD GPU](../../../microsoft-resnet-50/aitk/resnet_migraphx.json), [NVIDIA TensorRT for RTX](../../../microsoft-resnet-50/aitk/resnet_trtrtx.json), [Intel CPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel GPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [Intel NPU](../../../microsoft-resnet-50/aitk/resnet_context_ov_static.json), [DirectML](../../../microsoft-resnet-50/aitk/resnet_dml.json), [WebGPU](../../../microsoft-resnet-50/aitk/resnet_webgpu.json) | | [Stable Diffusion V1 5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) | [Qualcomm NPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_qnn_workflow.json), [Intel CPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json), [Intel GPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_workflow.json), [Intel NPU](../../../sd-legacy-stable-diffusion-v1-5/aitk/sd_ov_npu_workflow.json) | -| [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm 
NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [Qualcomm GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qnn_gpu.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json) | +| [Vit Base Patch16 224](https://huggingface.co/google/vit-base-patch16-224) | [Qualcomm NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_qnn.json), [Qualcomm GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qnn_gpu.json), [AMD NPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_qdq_amd.json), [AMD GPU](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_migraphx.json), [NVIDIA TensorRT for RTX](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_trtrtx.json), [Intel CPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel GPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [Intel NPU](../../../google-vit-base-patch16-224/aitk/vit_base_patch16_224_context_ov_static.json), [DirectML](../../../google-vit-base-patch16-224/aitk/vit-base-patch16-224_dml.json), [WebGPU](../../../google-vit-base-patch16-224/aitk/vit_webgpu.json) | | [Whisper Large V3 Turbo](https://huggingface.co/openai/whisper-large-v3-turbo) | [Qualcomm 
NPU](../../../openai-whisper-large-v3-turbo/aitk/qnn_workflow.json) | diff --git a/.aitk/docs/others/FIX_GUIDE.md b/.aitk/docs/others/FIX_GUIDE.md new file mode 100644 index 000000000..f69f24b17 --- /dev/null +++ b/.aitk/docs/others/FIX_GUIDE.md @@ -0,0 +1,330 @@ +# Generic WebGPU ONNX Model QKV Fix Guide + +## Problem Description + +WebGPU-converted ONNX models (DeepSeek, Llama, and others) with combined qkv_proj structures fail shape inference with a dimension mismatch error in specific layers: + +``` +Node (/model/layers.X/attn/o_proj/MatMulNBits) Op (MatMulNBits) +[ShapeInferenceError] Incompatible dimensions for matrix multiplication +``` + +### Root Cause + +These layers have a **combined qkv_proj** structure (Q, K, V packed into one output), but the GroupQueryAttention operation was misconfigured (the dimensions below are for DeepSeek-R1-Distill-Qwen-1.5B): + +| Issue | Problem | +|-------|---------| +| **Q input** | Receiving full 2048-dim qkv output instead of just Q (1536 dims) | +| **K input** | Using K from previous layer instead of current layer (256 dims from wrong source) | +| **V input** | Using V from previous layer instead of current layer (256 dims from wrong source) | +| **Result** | GroupQueryAttention produces mismatched output → o_proj fails | + +### Layer Structure + +Different models have this issue in different layers: + +| Model | Layers with combined qkv_proj | Total QKV | Q | K | V | +|-------|-------------------------------|-----------|---|---|---| +| DeepSeek-R1-Distill-Qwen-1.5B | 0, 6, 8, 12, 25, 26, 27 | 2048 | 1536 | 256 | 256 | +| Llama-3.2-1B | 2, 5, 6, 8, 10, 13 | 3072 | 2048 | 512 | 512 | + +The `fix_onnx_model.py` script detects this information automatically. 
+ +## Solution + +For each affected layer, extract Q, K, V from the combined qkv_proj using Slice operations: + +``` +qkv_proj output (total_qkv dims): + [0:q_dim] → Q dimensions + [q_dim:q_dim+k_dim] → K dimensions + [q_dim+k_dim:total_qkv] → V dimensions + +GroupQueryAttention uses extracted Q, K, V → output matches o_proj expectations +``` + +**Example dimensions:** +- **DeepSeek:** [0:1536] Q, [1536:1792] K, [1792:2048] V +- **Llama:** [0:2048] Q, [2048:2560] K, [2560:3072] V + +## Implementation + +### Quick Start (Auto-Detect) + +The script automatically detects affected layers and dimensions: + +```bash +# From the model directory +cd ./model + +# Run the fix (auto-detects everything) +python ../fix_onnx_model.py model.onnx + +# Verify the fix +python ../fix_onnx_model.py model.onnx --verify +``` + +### Using Configuration File + +For reproducibility or multiple models, create a `config.json`: + +```json +{ + "layers_to_fix": [0, 6, 8, 12, 25, 26, 27], + "q_dim": 1536, + "k_dim": 256, + "v_dim": 256 +} +``` + +Then run: +```bash +python fix_onnx_model.py model.onnx --config config.json +``` + +### Examples for Common Models + +**DeepSeek-R1-Distill-Qwen-1.5B config.json:** +```json +{ + "layers_to_fix": [0, 6, 8, 12, 25, 26, 27], + "q_dim": 1536, + "k_dim": 256, + "v_dim": 256 +} +``` + +**Llama-3.2-1B config.json:** +```json +{ + "layers_to_fix": [2, 5, 6, 8, 10, 13], + "q_dim": 2048, + "k_dim": 512, + "v_dim": 512 +} +``` + +### Manual Implementation (Advanced) + +If you need to integrate this into your own code: + +```python +from fix_onnx_model import fix_webgpu_qkv_model, verify_fix + +# Auto-detect (recommended) +fix_webgpu_qkv_model('model.onnx') + +# Or with explicit parameters +fix_webgpu_qkv_model( + 'model.onnx', + layers_to_fix=[2, 5, 6, 8, 10, 13], # Llama layers + q_dim=2048, + k_dim=512, + v_dim=512, + auto_detect=False # Use provided values only +) + +# Verify +verify_fix('model.onnx', verbose=True) +``` + +## Key Technical Details + +### 
ONNX Slice Syntax + +The `Slice` operator (opset 21) takes inputs in this order: +``` +Slice(data, starts, ends, [axes], [steps]) +``` + +- **data:** Input tensor to slice +- **starts:** Tensor with starting indices +- **ends:** Tensor with ending indices +- **axes:** Tensor specifying which axes to slice (e.g., [2] for axis 2) +- **steps:** (optional) Step size for each axis + +**Important:** Pass `axes` as an input tensor, NOT as an attribute (common mistake with older ONNX versions). + +### Data Type Consistency + +All new tensors must be **FLOAT16** to match: +- Input: `qkv_proj/Add/output_0` (FLOAT16) +- Output: `GroupQueryAttention/output_0` (FLOAT16) +- Subsequent layers expect FLOAT16 inputs + +### Dimension Breakdown + +The exact dimensions depend on your model's architecture: + +**DeepSeek-R1-Distill-Qwen-1.5B:** +- num_heads=12, kv_num_heads=2, head_dim=128 +- Q: 12 × 128 = 1536 +- K: 2 × 128 = 256 +- V: 2 × 128 = 256 +- Total: 1536 + 256 + 256 = 2048 + +**Llama-3.2-1B:** +- num_heads=32, kv_num_heads=8, head_dim=64 +- Q: 32 × 64 = 2048 +- K: 8 × 64 = 512 +- V: 8 × 64 = 512 +- Total: 2048 + 512 + 512 = 3072 + +To find these for any model: +```python +import onnx + +model = onnx.load('model.onnx', load_external_data=False) +for vi in model.graph.value_info: + if 'layers.0/attn/qkv_proj' in vi.name and 'output' in vi.name: + qkv_dim = vi.type.tensor_type.shape.dim[-1].dim_value + print(f"Total QKV dimension: {qkv_dim}") + break + +for node in model.graph.node: + if 'layers.0/attn/o_proj' in node.name: + for attr in node.attribute: + if attr.name == 'K': + print(f"Q dimension (from o_proj K): {attr.i}") + break +``` + +## Verification + +After applying the fix, verify that: + +```python +import onnx + +model = onnx.load('model.onnx', load_external_data=False) +layers_to_check = [0, 6, 8, 12, 25, 26, 27] # Or your model's layers + +for layer_id in layers_to_check: + for node in model.graph.node: + if node.name == 
f'/model/layers.{layer_id}/attn/GroupQueryAttention': + print(f"Layer {layer_id}:") + print(f" Q: {node.input[0]}") # Should be q_proj_extracted + print(f" K: {node.input[1]}") # Should be k_proj_extracted + print(f" V: {node.input[2]}") # Should be v_proj_extracted + break +``` + +Expected pattern for fixed model: +``` +Layer 0: + Q: /model/layers.0/attn/q_proj_extracted/output_0 + K: /model/layers.0/attn/k_proj_extracted/output_0 + V: /model/layers.0/attn/v_proj_extracted/output_0 +``` + +The script's `--verify` flag does this automatically: +```bash +python fix_onnx_model.py model.onnx --verify +``` + +## Usage Example + +### DeepSeek-R1-Distill-Qwen-1.5B + +```bash +cd C:\path\to\deepseek\model +python fix_onnx_model.py model/model.onnx +``` + +### Llama-3.2-1B + +```bash +cd C:\path\to\llama\model +python fix_onnx_model.py model/model.onnx +``` + +Both commands detect layers and dimensions automatically. After the fix, your inference notebooks should work without shape inference errors: + +```python +import onnxruntime_genai as og + +# Model now loads successfully +model = og.Model('./model') +tokenizer = og.Tokenizer(model) + +# Inference works correctly +generator = og.Generator(model, params) +``` + +## Detecting This Issue + +If your WebGPU-converted model fails with shape inference errors, you can check if it has this issue: + +```python +import onnx + +model = onnx.load('model.onnx', load_external_data=False) + +print("=== Checking for QKV cross-layer references ===") +affected_layers = [] + +for i in range(64): + gqa_node = None + for node in model.graph.node: + if node.name == f'/model/layers.{i}/attn/GroupQueryAttention': + gqa_node = node + break + + if not gqa_node: + continue + + has_qkv = any(f'layers.{i}/attn' in n.name and 'qkv_proj' in n.name + for n in model.graph.node) + + if has_qkv: + # Check if K/V come from different layers + k_input = gqa_node.input[1] + v_input = gqa_node.input[2] + + if f'layers.{i}' not in k_input or
f'layers.{i}' not in v_input: + print(f" ✗ Layer {i}: Cross-layer reference detected") + affected_layers.append(i) + +if affected_layers: + print(f"\nFix required for layers: {affected_layers}") +else: + print("\nNo cross-layer references detected - model may not need fixing") +``` + +Typical output for affected models: +``` +✗ Layer 2: Cross-layer reference detected +✗ Layer 5: Cross-layer reference detected +✗ Layer 6: Cross-layer reference detected +... +Fix required for layers: [2, 5, 6, 8, 10, 13] +``` + +## Troubleshooting + +| Error | Solution | +|-------|----------| +| `Unrecognized attribute: axes for operator Slice` | Ensure `axes` is passed as an input tensor, not an attribute (automatic in script) | +| `Type (tensor(float)) does not match expected type (tensor(float16))` | Verify all new tensors use correct data type - script auto-detects this | +| `Incompatible dimensions for matrix multiplication` | Confirm Slice indices match your model's dimensions (script auto-detects) | +| Model still fails after fix | Run with `--verify` flag to check all layers were processed correctly | +| Auto-detection doesn't work | Provide explicit config with `--config` flag | + +## Supported Models + +This fix has been tested on: +- ✅ DeepSeek-R1-Distill-Qwen-1.5B +- ✅ Llama-3.2-1B-Instruct +- ✅ Other WebGPU-converted models with similar cross-layer QKV issues + +If you test this on other models, please note that auto-detection handles most cases. For models with non-standard structures, use the config file approach. 
+ +## References + +- ONNX Slice operator: https://onnx.ai/onnx/operators/onnx__Slice.html +- ONNX spec: https://onnx.ai/onnx/ +- DeepSeek-R1 Model: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- Llama-3.2 Model: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct +- WebGPU ONNX Runtime: https://onnxruntime.ai/docs/execution-providers/web-gpu-execution-provider.html +- ONNX Runtime GenAI: https://github.com/microsoft/onnxruntime-genai diff --git a/.aitk/docs/others/fix_onnx_model.py b/.aitk/docs/others/fix_onnx_model.py new file mode 100644 index 000000000..9559e6c9b --- /dev/null +++ b/.aitk/docs/others/fix_onnx_model.py @@ -0,0 +1,414 @@ +""" +Generic ONNX Model WebGPU Fix for Combined QKV Projection Issues + +PROBLEM SUMMARY: +================ +WebGPU-converted ONNX models with combined qkv_proj structures exhibit a critical +architecture mismatch: + +1. GroupQueryAttention nodes use K, V projections from PREVIOUS layers instead of + the same layer +2. GroupQueryAttention Q input receives the full combined qkv_proj output instead + of just the Q portion +3. This causes dimension mismatch: o_proj expects specific K dimension but receives + mismatched output from GroupQueryAttention + +EXAMPLES: +- DeepSeek-R1-Distill-Qwen-1.5B: qkv_proj=2048, Q=1536, K=256, V=256 +- Llama-3.2-1B: qkv_proj=3072, Q=2048, K=512, V=512 + +SOLUTION OVERVIEW: +================== +For each affected layer, we: +1. Extract Q from qkv_proj[0:Q_dim] +2. Extract K from qkv_proj[Q_dim:Q_dim+K_dim] +3. Extract V from qkv_proj[Q_dim+K_dim:total_dim] +4. Update GroupQueryAttention to use extracted tensors +5. Ensure all new tensors match model precision +6. Use proper ONNX Slice syntax (axes as input, not attribute) +""" + +import onnx +from onnx import helper +import sys +import json +from pathlib import Path + +def auto_detect_layers_and_dims(model_path): + """ + Auto-detect which layers have combined qkv_proj and their dimensions. 
+ + Returns: (layers_to_fix, q_dim, k_dim, v_dim) or (None, None, None, None) if not found + """ + try: + model = onnx.load(model_path, load_external_data=False) + graph = model.graph + + layers_to_fix = [] + qkv_dim = None + + # Find layers with qkv_proj + for i in range(64): + has_qkv = False + for node in graph.node: + if f'layers.{i}/attn' in node.name and 'qkv_proj' in node.name: + has_qkv = True + if qkv_dim is None: + # Get qkv_proj output dimension + for vi in graph.value_info: + if f'layers.{i}/attn/qkv_proj' in vi.name and 'output' in vi.name: + dims = vi.type.tensor_type.shape.dim + qkv_dim = dims[-1].dim_value + + if has_qkv: + layers_to_fix.append(i) + + if not layers_to_fix or qkv_dim is None: + return None, None, None, None + + # Get o_proj K dimension to infer Q_dim + o_proj_k = None + for i in layers_to_fix: + for node in graph.node: + if node.name == f'/model/layers.{i}/attn/o_proj/MatMulNBits': + for attr in node.attribute: + if attr.type == 2 and attr.name == 'K': + o_proj_k = attr.i + break + if o_proj_k: + break + + if qkv_dim and o_proj_k: + q_dim = o_proj_k + remaining = qkv_dim - q_dim + k_dim = remaining // 2 + v_dim = remaining - k_dim + return layers_to_fix, q_dim, k_dim, v_dim + + return None, None, None, None + except Exception: + return None, None, None, None + + +def fix_webgpu_qkv_model(model_path, layers_to_fix=None, q_dim=None, k_dim=None, v_dim=None, auto_detect=True): + """ + Generic fix for WebGPU ONNX models with combined qkv_proj dimension mismatch. 
+ + Parameters: + ----------- + model_path : str + Path to the ONNX model file + layers_to_fix : list + Layer IDs to fix (auto-detected if None) + q_dim : int + Query dimension (auto-detected if None) + k_dim : int + Key dimension (auto-detected if None) + v_dim : int + Value dimension (auto-detected if None) + auto_detect : bool + If True, auto-detect layers and dimensions (overrides manual params) + + Returns: + -------- + bool : True if successful, False otherwise + """ + + print("=" * 70) + print("Generic WebGPU ONNX QKV Model Fixer") + print("=" * 70) + + try: + # Load model + print(f"\n[1/4] Loading model from {model_path}...") + model = onnx.load(model_path, load_external_data=False) + graph = model.graph + print(f" ✓ Model loaded successfully") + print(f" - IR Version: {model.ir_version}") + print(f" - Opset: {model.opset_import[0].version if model.opset_import else 'unknown'}") + + # Auto-detect if enabled + if auto_detect: + print(f"\n[2/4] Auto-detecting layers and dimensions...") + det_layers, det_q, det_k, det_v = auto_detect_layers_and_dims(model_path) + if det_layers: + layers_to_fix = det_layers + q_dim = det_q + k_dim = det_k + v_dim = det_v + print(f" ✓ Detected layers: {layers_to_fix}") + print(f" ✓ Detected dimensions: Q={q_dim}, K={k_dim}, V={v_dim}") + + if not layers_to_fix or not q_dim or not k_dim or not v_dim: + print(f" ✗ Failed to detect or specify layers and dimensions") + return False + + total_dim = q_dim + k_dim + v_dim + print(f"\n[3/4] Setting up Slice operations...") + print(f" • Total QKV dim: {total_dim} = {q_dim} + {k_dim} + {v_dim}") + + # Create required constants for Slice operations + constants = { + 'const_0': 0, + f'const_{q_dim}': q_dim, + f'const_{q_dim + k_dim}': q_dim + k_dim, + f'const_{total_dim}': total_dim, + 'const_axes_2': [2] + } + + # Add constants to graph + for const_name, const_value in constants.items(): + if not any(init.name == const_name for init in graph.initializer): + if const_name == 'const_axes_2': 
+ tensor = helper.make_tensor(const_name, onnx.TensorProto.INT64, [1], const_value) + else: + tensor = helper.make_tensor(const_name, onnx.TensorProto.INT64, [1], [const_value]) + graph.initializer.append(tensor) + + # Fix each layer + slices_added = 0 + for layer_id in layers_to_fix: + # Auto-detect qkv_proj output node (could be Add or MatMulNBits) + qkv_output = None + for node in graph.node: + if node.name == f'/model/layers.{layer_id}/attn/qkv_proj/Add': + qkv_output = f'/model/layers.{layer_id}/attn/qkv_proj/Add/output_0' + break + + if not qkv_output: + # Fall back to MatMulNBits if no Add node + for node in graph.node: + if node.name == f'/model/layers.{layer_id}/attn/qkv_proj/MatMulNBits': + qkv_output = f'/model/layers.{layer_id}/attn/qkv_proj/MatMulNBits/output_0' + break + + if not qkv_output: + print(f" ✗ Could not find qkv_proj output for layer {layer_id}") + return False + + # Find data type from qkv_proj output + dtype = onnx.TensorProto.FLOAT16 + for vi in graph.value_info: + if f'layers.{layer_id}/attn/qkv_proj' in vi.name and 'output' in vi.name: + dtype = vi.type.tensor_type.elem_type + break + + # Q extraction: [0:q_dim] + slice_q = helper.make_node( + 'Slice', + inputs=[qkv_output, + 'const_0', f'const_{q_dim}', 'const_axes_2'], + outputs=[f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0'], + name=f'/model/layers.{layer_id}/attn/q_proj_extracted/Slice' + ) + + # K extraction: [q_dim:q_dim+k_dim] + slice_k = helper.make_node( + 'Slice', + inputs=[qkv_output, + f'const_{q_dim}', f'const_{q_dim + k_dim}', 'const_axes_2'], + outputs=[f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0'], + name=f'/model/layers.{layer_id}/attn/k_proj_extracted/Slice' + ) + + # V extraction: [q_dim+k_dim:total] + slice_v = helper.make_node( + 'Slice', + inputs=[qkv_output, + f'const_{q_dim + k_dim}', f'const_{total_dim}', 'const_axes_2'], + outputs=[f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0'], + 
name=f'/model/layers.{layer_id}/attn/v_proj_extracted/Slice' + ) + + graph.node.extend([slice_q, slice_k, slice_v]) + slices_added += 3 + + # Add value_info for extracted tensors + q_info = helper.make_tensor_value_info( + f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0', + dtype, + ['batch_size', 'sequence_length', q_dim] + ) + k_info = helper.make_tensor_value_info( + f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0', + dtype, + ['batch_size', 'sequence_length', k_dim] + ) + v_info = helper.make_tensor_value_info( + f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0', + dtype, + ['batch_size', 'sequence_length', v_dim] + ) + graph.value_info.extend([q_info, k_info, v_info]) + + # Update GroupQueryAttention inputs + for node in graph.node: + if node.name == f'/model/layers.{layer_id}/attn/GroupQueryAttention': + node.input[0] = f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0' + node.input[1] = f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0' + node.input[2] = f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0' + break + + print(f" ✓ Added {slices_added} Slice nodes across {len(layers_to_fix)} layers") + print(f" ✓ Updated {len(layers_to_fix)} GroupQueryAttention nodes") + + # Save fixed model + print(f"\n[4/4] Saving fixed model...") + onnx.save(model, model_path) + print(f" ✓ Model saved successfully") + + print("\n" + "=" * 70) + print("FIX COMPLETED SUCCESSFULLY!") + print("=" * 70) + print("\nSummary of Changes:") + print(f" • Fixed {len(layers_to_fix)} layers: {layers_to_fix}") + print(f" • QKV dimensions: Q={q_dim}, K={k_dim}, V={v_dim}") + print(f" • Added {slices_added} Slice nodes for Q/K/V extraction") + print(f" • Corrected GroupQueryAttention layer cross-references") + print(f" • Ensured precision consistency for all new tensors") + print(f" • Updated Slice syntax for ONNX opset 21 compatibility") + + return True + + except Exception as e: + print(f"\n❌ ERROR: {str(e)}") + import traceback + 
traceback.print_exc() + return False + + +def verify_fix(model_path, verbose=False, layers_to_fix=None): + """ + Verify that the fix was applied correctly. + + Parameters: + ----------- + model_path : str + Path to the fixed ONNX model + verbose : bool + Print detailed information + layers_to_fix : list + Specific layers to verify (auto-detected if None) + + Returns: + -------- + bool : True if fix is verified, False otherwise + """ + + print("\nVerifying model fix...") + + try: + model = onnx.load(model_path, load_external_data=False) + graph = model.graph + + # Auto-detect layers if not provided + if layers_to_fix is None: + det_result = auto_detect_layers_and_dims(model_path) + if det_result and det_result[0]: + layers_to_fix = det_result[0] + else: + print(" ✗ No layers detected - model may not need fixing or has unknown structure") + return False + + if not layers_to_fix or not isinstance(layers_to_fix, list): + print(" ✗ Invalid layers list") + return False + + all_correct = True + + for layer_id in layers_to_fix: + # Check Slice nodes exist + slice_nodes = [n for n in graph.node + if f'layers.{layer_id}' in n.name and 'Slice' in n.name and 'proj_extracted' in n.name] + + if len(slice_nodes) != 3: + print(f" ✗ Layer {layer_id}: Expected 3 Slice nodes, found {len(slice_nodes)}") + all_correct = False + continue + + # Check GroupQueryAttention inputs + gqa_node = next((n for n in graph.node + if n.name == f'/model/layers.{layer_id}/attn/GroupQueryAttention'), None) + + if not gqa_node: + print(f" ✗ Layer {layer_id}: GroupQueryAttention node not found") + all_correct = False + continue + + # Verify inputs point to extracted tensors + q_correct = gqa_node.input[0] == f'/model/layers.{layer_id}/attn/q_proj_extracted/output_0' + k_correct = gqa_node.input[1] == f'/model/layers.{layer_id}/attn/k_proj_extracted/output_0' + v_correct = gqa_node.input[2] == f'/model/layers.{layer_id}/attn/v_proj_extracted/output_0' + + if q_correct and k_correct and v_correct: + if 
verbose: + print(f" ✓ Layer {layer_id}: All checks passed") + else: + print(f" ✗ Layer {layer_id}: GroupQueryAttention inputs incorrect") + all_correct = False + + if all_correct: + print(" ✓ All verifications passed!") + + return all_correct + + except Exception as e: + print(f" ✗ Verification failed: {str(e)}") + import traceback + traceback.print_exc() + return False + + +if __name__ == "__main__": + # Usage: + # python fix_onnx_model.py [model_path] (auto-detect all) + # python fix_onnx_model.py [model_path] --verify (verify existing fix) + # python fix_onnx_model.py [model_path] --config config.json (use config file) + + model_path = "./model/model.onnx" + verify_only = False + config_file = None + + if len(sys.argv) > 1: + model_path = sys.argv[1] + + if "--verify" in sys.argv: + verify_only = True + + if "--config" in sys.argv: + idx = sys.argv.index("--config") + if idx + 1 < len(sys.argv): + config_file = sys.argv[idx + 1] + + if verify_only: + verify_fix(model_path, verbose=True) + sys.exit(0) + + # Load config if provided + q_dim = k_dim = v_dim = layers = None + if config_file: + try: + with open(config_file, 'r') as f: + config = json.load(f) + layers = config.get('layers_to_fix') + q_dim = config.get('q_dim') + k_dim = config.get('k_dim') + v_dim = config.get('v_dim') + print(f"Loaded config from {config_file}") + except Exception as e: + print(f"Warning: Failed to load config: {e}") + + success = fix_webgpu_qkv_model( + model_path, + layers_to_fix=layers, + q_dim=q_dim, + k_dim=k_dim, + v_dim=v_dim, + auto_detect=True # Always auto-detect if values not provided + ) + + if success: + verify_fix(model_path, verbose=True) + sys.exit(0) + else: + sys.exit(1) diff --git a/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt new file mode 100644 index 000000000..b0ab103ce --- /dev/null +++ b/.aitk/requirements/WebGPU/WebGPU_py3.12.13.txt @@ -0,0 +1,82 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 
+accelerate==1.13.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.5 +aiosignal==1.4.0 +alembic==1.18.4 +annotated-types==0.7.0 +anyio==4.13.0 +attrs==26.1.0 +certifi==2026.4.22 +charset-normalizer==3.4.7 +colorama==0.4.6 +colorlog==6.10.1 +datasets==4.8.5 +dill==0.4.1 +filelock==3.29.0 +flatbuffers==25.12.19 +frozenlist==1.8.0 +fsspec==2026.2.0 +greenlet==3.5.0 +h11==0.16.0 +hf-xet==1.5.0 +httpcore==1.0.9 +httpx==0.28.1 +huggingface-hub==0.36.2 +idna==3.13 +importlib-metadata==8.7.1 +jinja2==3.1.6 +lightning-utilities==0.15.3 +mako==1.3.12 +markupsafe==3.0.3 +ml-dtypes==0.5.4 +mpmath==1.3.0 +multidict==6.7.1 +multiprocess==0.70.19 +networkx==3.6.1 +numpy==2.4.4 +olive-ai==0.12.1 +onnx==1.21.0 +onnx-ir==0.2.1 +# install it separately with no deps, as its dependencies would install onnxruntime and overwrite onnxruntime-webgpu +# uvpip:install onnxruntime-genai==0.12.2 --no-deps;post +onnxoptimizer==0.4.2 +onnxruntime-webgpu==1.25.1 +onnxscript==0.7.0 +opentelemetry-api==1.41.1 +opentelemetry-sdk==1.41.1 +opentelemetry-semantic-conventions==0.62b1 +optuna==4.8.0 +packaging==26.2 +pandas==3.0.2 +prompt-toolkit==3.0.52 +propcache==0.4.1 +protobuf==7.34.1 +psutil==7.2.2 +pyarrow==24.0.0 +pydantic==2.13.3 +pydantic-core==2.46.3 +python-dateutil==2.9.0.post0 +pyyaml==6.0.3 +questionary==2.1.1 +regex==2026.4.4 +requests==2.33.1 +safetensors==0.7.0 +setuptools==81.0.0 +six==1.17.0 +sqlalchemy==2.0.49 +sympy==1.14.0 +tabulate==0.10.0 +tokenizers==0.21.4 +torch==2.11.0+cu130 +torchmetrics==1.9.0 +tqdm==4.67.3 +transformers==4.52.4 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +tzdata==2026.2 +urllib3==2.6.3 +wcwidth==0.7.0 +xxhash==3.7.0 +yarl==1.23.0 +zipp==3.23.1 diff --git a/.aitk/scripts/project_processor.py b/.aitk/scripts/project_processor.py index 4fb5a09c6..9c5366473 100644 --- a/.aitk/scripts/project_processor.py +++ b/.aitk/scripts/project_processor.py @@ -12,6 +12,7 @@ from sanitize.generator_intel import generator_intel from sanitize.generator_qnn import generator_qnn from
sanitize.generator_trtrtx import generator_trtrtx +from sanitize.generator_webgpu import generator_webgpu from sanitize.model_info import ModelInfo, ModelList from sanitize.project_config import ModelInfoProject, ModelProjectConfig, WorkflowItem from sanitize.utils import ( @@ -189,6 +190,8 @@ def convert_yaml_to_project_config( generator_trtrtx(id, recipe, yml_file.parent, modelList) elif recipe.get("ep") == EPNames.DmlExecutionProvider.value: generator_dml(id, recipe, yml_file.parent, modelList) + elif recipe.get("ep") == EPNames.WebGpuExecutionProvider.value: + generator_webgpu(id, recipe, yml_file.parent, modelList) runtimes = get_runtime(recipe) for runtime in runtimes: modelSummary.recipes.setdefault(runtime, []).append(file) diff --git a/.aitk/scripts/sanitize/generator_webgpu.py b/.aitk/scripts/sanitize/generator_webgpu.py new file mode 100644 index 000000000..80557f841 --- /dev/null +++ b/.aitk/scripts/sanitize/generator_webgpu.py @@ -0,0 +1,35 @@ +from pathlib import Path + +from .generator_common import create_model_parameter, set_optimization_path +from .generator_dml import generate_quantization_config +from .model_info import ModelList +from .model_parameter import ModelParameter +from .utils import isLLM_by_id + +def generator_webgpu(id: str, recipe, folder: Path, modelList: ModelList): + aitk = recipe.get("aitk", {}) + auto = aitk.get("auto", True) + if not auto: + return + + isLLM = isLLM_by_id(id) + file = recipe.get("file") + configFile = folder / file + + if not isLLM: + modelParameter = ModelParameter.Read(str(configFile) + ".config") + set_optimization_path(modelParameter, str(configFile)) + modelParameter.writeIfChanged() + return + + name = "Convert to WebGPU" + + parameter = create_model_parameter(aitk, name, configFile) + parameter.isLLM = isLLM + + quantize = generate_quantization_config(configFile, parameter) + if quantize: + parameter.sections.append(quantize) + + parameter.writeIfChanged() + print(f"\tGenerated WebGPU configuration for 
{file}") diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config index d5d6e0975..cc8121b4d 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/_copy.json.config @@ -28,6 +28,20 @@ } ] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json", + "dst": "qwen2_5_webgpu.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "Qwen/Qwen2.5-1.5B-Instruct" + }, + { + "find": "model/deepseek", + "replace": "model/qwen2_5" + } + ] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml index af6ec72c3..65ae5875c 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/info.yml @@ -38,6 +38,12 @@ recipes: oliveFile: "QNN/config_gpu.json" isGPURequired: true requirements: General/CUDA_py3.12.9 + - file: "qwen2_5_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/Qwen/Qwen2.5-1.5B-Instruct" diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config index fdd9f88f7..3bad9483e 100644 --- a/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "qwen2_5_qnn_gpu_config.json", "templateName": "qwen2_5_qnn_gpu_config" + }, + { + "file": "qwen2_5_webgpu.json", + "templateName": "qwen2_5_webgpu" } ], "modelInfo": { diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json new file mode 100644 index 000000000..a9d1937d0 --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json @@ -0,0 +1,68 @@ +{ + 
"input_model": { + "type": "HfModel", + "model_path": "Qwen/Qwen2.5-1.5B-Instruct", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "k_quant_mixed" + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/qwen2_5", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/Qwen-Qwen2.5-1.5B-Instruct/aitk/qwen2_5_webgpu.json.config @@ -0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": 
"passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json new file mode 100644 index 000000000..314a606a0 --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json @@ -0,0 +1,68 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "k_quant_mixed" + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + 
"lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/deepseek", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json.config @@ -0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of model", + "type": "enum", + "displayNames": [ + 
"Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml index 9486b13f9..106acc988 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/info.yml @@ -38,6 +38,12 @@ recipes: oliveFile: "QNN/config_gpu.json" isGPURequired: true requirements: General/CUDA_py3.12.9 + - file: "deepseek_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" diff --git a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config index 64ea7551e..52e53d2af 100644 --- a/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config +++ b/deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "deepseek_qnn_gpu_config.json", "templateName": "deepseek_qnn_gpu_config" + }, + { + "file": "deepseek_webgpu.json", + "templateName": "deepseek_webgpu" } ], "modelInfo": { diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json new file mode 100644 index 000000000..04815ae4b --- /dev/null +++ b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json @@ -0,0 +1,35 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": 
"google-bert/bert-base-multilingual-cased", + "task": "feature-extraction" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/bert_webgpu", + "evaluate_input_model": false +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/google-bert-bert-base-multilingual-cased/aitk/bert_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/google-bert-bert-base-multilingual-cased/aitk/info.yml b/google-bert-bert-base-multilingual-cased/aitk/info.yml index c3bf4db00..af086bc3c 100644 --- 
a/google-bert-bert-base-multilingual-cased/aitk/info.yml +++ b/google-bert-bert-base-multilingual-cased/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "bert-base-multilingual-cased_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "bert_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/google-bert/bert-base-multilingual-cased" diff --git a/google-bert-bert-base-multilingual-cased/aitk/model_project.config b/google-bert-bert-base-multilingual-cased/aitk/model_project.config index c7a3fa7da..b7020ec6f 100644 --- a/google-bert-bert-base-multilingual-cased/aitk/model_project.config +++ b/google-bert-bert-base-multilingual-cased/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "bert-base-multilingual-cased_qnn_gpu.json", "templateName": "bert-base-multilingual-cased_qnn_gpu" + }, + { + "file": "bert_webgpu.json", + "templateName": "bert_webgpu" } ], "modelInfo": { diff --git a/google-vit-base-patch16-224/aitk/info.yml b/google-vit-base-patch16-224/aitk/info.yml index 00835ef69..0014c79a4 100644 --- a/google-vit-base-patch16-224/aitk/info.yml +++ b/google-vit-base-patch16-224/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "vit-base-patch16-224_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "vit_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/google/vit-base-patch16-224" diff --git a/google-vit-base-patch16-224/aitk/model_project.config b/google-vit-base-patch16-224/aitk/model_project.config index a13675ac8..c16a34e96 100644 --- a/google-vit-base-patch16-224/aitk/model_project.config +++ b/google-vit-base-patch16-224/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "vit-base-patch16-224_qnn_gpu.json", "templateName": "vit-base-patch16-224_qnn_gpu" + }, + { + "file": "vit_webgpu.json", + "templateName": "vit_webgpu" } ], "modelInfo": { diff --git a/google-vit-base-patch16-224/aitk/vit_webgpu.json b/google-vit-base-patch16-224/aitk/vit_webgpu.json new 
file mode 100644 index 000000000..1b9f439e7 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit_webgpu.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "google/vit-base-patch16-224", + "task": "image-classification", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 224, + 224 + ] + ], + "output_names": [ + "output" + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/vit_webgpu", + "evaluate_input_model": false +} diff --git a/google-vit-base-patch16-224/aitk/vit_webgpu.json.config b/google-vit-base-patch16-224/aitk/vit_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/google-vit-base-patch16-224/aitk/vit_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to 
ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json new file mode 100644 index 000000000..f67676762 --- /dev/null +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json @@ -0,0 +1,38 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "Intel/bert-base-uncased-mrpc", + "task": "text-classification", + "load_kwargs": { + "attn_implementation": "eager" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/bert_webgpu", + "evaluate_input_model": false +} diff --git a/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/intel-bert-base-uncased-mrpc/aitk/bert_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + 
"sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/intel-bert-base-uncased-mrpc/aitk/info.yml b/intel-bert-base-uncased-mrpc/aitk/info.yml index f8781cde7..d052c8025 100644 --- a/intel-bert-base-uncased-mrpc/aitk/info.yml +++ b/intel-bert-base-uncased-mrpc/aitk/info.yml @@ -29,6 +29,9 @@ recipes: - file: "bert_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "bert_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/Intel/bert-base-uncased-mrpc" diff --git a/intel-bert-base-uncased-mrpc/aitk/model_project.config b/intel-bert-base-uncased-mrpc/aitk/model_project.config index 72a32db9d..b6b99b857 100644 --- a/intel-bert-base-uncased-mrpc/aitk/model_project.config +++ b/intel-bert-base-uncased-mrpc/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "bert_qnn_gpu.json", "templateName": "bert_qnn_gpu" + }, + { + "file": "bert_webgpu.json", + "templateName": "bert_webgpu" } ], "modelInfo": { diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config index f6ce51a00..c3c112dba 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/_copy.json.config @@ -117,6 +117,21 @@ "dst": "laion_clip_dml.json.config", "replacements": [] }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json", + "dst": "laion_clip_webgpu.json", + "replacements": [ + { + "find": "openai/clip-vit-base-patch16", + "replace": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" + } + ] + }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config", + "dst": "laion_clip_webgpu.json.config", + "replacements": [] + }, 
{ "src": "laion_clip_dml.json", "dst": "laion_clip_migraphx.json", diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml index cc0da630e..ad1eca92b 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "laion_clip_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "laion_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/laion/CLIP-ViT-B-32-laion2B-s34B-b79K" diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json new file mode 100644 index 000000000..94d4dbae2 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json @@ -0,0 +1,90 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + 
"output_dir": "model/clip_webgpu", + "evaluate_input_model": false +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/laion_clip_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config index 2ebfd7066..0188d4c64 100644 --- a/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config +++ b/laion-CLIP-ViT-B-32-laion2B-s34B-b79K/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "laion_clip_qnn_gpu.json", "templateName": "laion_clip_qnn_gpu" + }, + { + "file": "laion_clip_webgpu.json", + "templateName": "laion_clip_webgpu" } ], "modelInfo": { diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config index 607b92270..87a0d6a41 100644 --- 
a/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/_copy.json.config @@ -42,6 +42,20 @@ } ] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json", + "dst": "llama3_2_webgpu.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "meta-llama/Llama-3.2-1B-Instruct" + }, + { + "find": "model/deepseek", + "replace": "model/llama3_2" + } + ] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml index 803ac4a12..3a1d18b2f 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/info.yml @@ -38,6 +38,12 @@ recipes: oliveFile: "QNN/config_gpu.json" isGPURequired: true requirements: General/CUDA_py3.12.9 + - file: "llama3_2_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/meta-llama/Llama-3.2-1B-Instruct" diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json new file mode 100644 index 000000000..8ee4392e5 --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json @@ -0,0 +1,68 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "meta-llama/Llama-3.2-1B-Instruct", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "k_quant_mixed" + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + 
"group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + "log_severity_level": 0, + "output_dir": "model/llama3_2", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/llama3_2_webgpu.json.config @@ -0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of 
model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + "autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config index 3df076bbb..f26533b06 100644 --- a/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config +++ b/meta-llama-Llama-3.2-1B-Instruct/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "llama3_2_qnn_gpu_config.json", "templateName": "llama3_2_qnn_gpu_config" + }, + { + "file": "llama3_2_webgpu.json", + "templateName": "llama3_2_webgpu" } ], "modelInfo": { diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config index d260de070..6f9870282 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/_copy.json.config @@ -14,6 +14,20 @@ } ] }, + { + "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/deepseek_webgpu.json", + "dst": "phi3_5_webgpu.json", + "replacements": [ + { + "find": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "replace": "microsoft/Phi-3.5-mini-instruct" + }, + { + "find": "model/deepseek", + "replace": "model/phi3_5" + } + ] + }, { "src": "../../deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B/aitk/README.md", "dst": "README.md", diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml index 1f85b22d8..29217e9ac 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/info.yml +++ b/microsoft-Phi-3.5-mini-instruct/aitk/info.yml @@ -38,6 
+38,12 @@ recipes: oliveFile: "QNN/config_gpu.json" isGPURequired: true requirements: General/CUDA_py3.12.9 + - file: "phi3_5_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider + aitk: + requirements: WebGPU/WebGPU_py3.12.13 + evalRuntime: WebGPU aitk: modelInfo: id: "huggingface/microsoft/Phi-3.5-mini-instruct" diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config index 1162f8288..5bff7a1f1 100644 --- a/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config +++ b/microsoft-Phi-3.5-mini-instruct/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "phi3_5_qnn_gpu_config.json", "templateName": "phi3_5_qnn_gpu_config" + }, + { + "file": "phi3_5_webgpu.json", + "templateName": "phi3_5_webgpu" } ], "modelInfo": { diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json new file mode 100644 index 000000000..12c617ab4 --- /dev/null +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json @@ -0,0 +1,68 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "microsoft/Phi-3.5-mini-instruct", + "load_kwargs": { + "torch_dtype": "float16" + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "s": { + "type": "SelectiveMixedPrecision", + "algorithm": "k_quant_mixed" + }, + "g": { + "type": "gptq", + "bits": 4, + "sym": false, + "group_size": 32 + }, + "r": { + "type": "rtn", + "bits": 8, + "sym": false, + "group_size": 32, + "lm_head": true, + "embeds": true, + "overrides": { + "lm_head": { + "bits": 8 + }, + "model.embed_tokens": { + "bits": 8 + } + } + }, + "m": { + "type": "ModelBuilder", + "precision": "int4" + }, + "t": { + "type": "GraphSurgeries", + "surgeries": [ + { + "surgeon": "TieWordEmbeddings" + } + ] + } + }, + "target": "local_system", + 
"log_severity_level": 0, + "output_dir": "model/phi3_5", + "cache_dir": "cache", + "no_artifacts": true, + "evaluate_input_model": false +} diff --git a/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config new file mode 100644 index 000000000..8c3a740ee --- /dev/null +++ b/microsoft-Phi-3.5-mini-instruct/aitk/phi3_5_webgpu.json.config @@ -0,0 +1,95 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "isLLM": true, + "evalRuntime": "WebGPU", + "debugInfo": { + "autoGenerated": true, + "useModelBuilder": "m" + }, + "runtimeOverwrite": { + "autoGenerated": true, + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.m.precision" + } + ], + "optimizationDefault": "int4", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + }, + { + "autoGenerated": true, + "name": "Optimization", + "phase": "Quantization", + "parameters": [ + { + "autoGenerated": true, + "name": "Precision", + "description": "Precision of model", + "type": "enum", + "displayNames": [ + "Int4", + "Bf16", + "Fp16", + "Fp32" + ], + "displayType": "RadioGroup", + "path": "passes.m.precision", + "values": [ + "int4", + "bf16", + "fp16", + "fp32" + ], + "template": { + "path": "passes.m.precision", + "template": "ModelBuilderPrecision" + } + } + ], + "disableToggleGeneration": true, + "toggle": { + 
"autoGenerated": true, + "name": "Optimize model", + "type": "bool", + "path": "passes.m", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/microsoft-resnet-50/aitk/info.yml b/microsoft-resnet-50/aitk/info.yml index 990b55773..f9d53c81f 100644 --- a/microsoft-resnet-50/aitk/info.yml +++ b/microsoft-resnet-50/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "resnet_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "resnet_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/microsoft/resnet-50" diff --git a/microsoft-resnet-50/aitk/model_project.config b/microsoft-resnet-50/aitk/model_project.config index be2778a56..c4c8dfd39 100644 --- a/microsoft-resnet-50/aitk/model_project.config +++ b/microsoft-resnet-50/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "resnet_qnn_gpu.json", "templateName": "resnet_qnn_gpu" + }, + { + "file": "resnet_webgpu.json", + "templateName": "resnet_webgpu" } ], "modelInfo": { diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json b/microsoft-resnet-50/aitk/resnet_webgpu.json new file mode 100644 index 000000000..1c44d2f51 --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json @@ -0,0 +1,51 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "microsoft/resnet-50", + "task": "image-classification", + "io_config": { + "input_names": [ + "pixel_values" + ], + "input_shapes": [ + [ + 1, + 3, + 224, + 224 + ] + ], + "output_names": [ + "logits" + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/resnet_webgpu", + 
"evaluate_input_model": false +} diff --git a/microsoft-resnet-50/aitk/resnet_webgpu.json.config b/microsoft-resnet-50/aitk/resnet_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/microsoft-resnet-50/aitk/resnet_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/openai-clip-vit-base-patch16/aitk/info.yml b/openai-clip-vit-base-patch16/aitk/info.yml index 1686cc549..16d11fdbb 100644 --- a/openai-clip-vit-base-patch16/aitk/info.yml +++ b/openai-clip-vit-base-patch16/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: "openai_clip_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "openai_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/openai/clip-vit-base-patch16" diff --git a/openai-clip-vit-base-patch16/aitk/model_project.config b/openai-clip-vit-base-patch16/aitk/model_project.config index 0506a0fd7..76699ef5a 100644 --- a/openai-clip-vit-base-patch16/aitk/model_project.config +++ b/openai-clip-vit-base-patch16/aitk/model_project.config @@ -27,6 +27,10 @@ { 
"file": "openai_clip_qnn_gpu.json", "templateName": "openai_clip_qnn_gpu" + }, + { + "file": "openai_clip_webgpu.json", + "templateName": "openai_clip_webgpu" } ], "modelInfo": { diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json new file mode 100644 index 000000000..e0f5adc4e --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json @@ -0,0 +1,90 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "openai/clip-vit-base-patch16", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + } + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/clip_webgpu", + "evaluate_input_model": false +} diff --git a/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": 
"https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +} diff --git a/openai-clip-vit-base-patch32/aitk/_copy.json.config b/openai-clip-vit-base-patch32/aitk/_copy.json.config index c6c72ccee..005a6cb5d 100644 --- a/openai-clip-vit-base-patch32/aitk/_copy.json.config +++ b/openai-clip-vit-base-patch32/aitk/_copy.json.config @@ -109,6 +109,21 @@ "dst": "openai_clip_dml.json.config", "replacements": [] }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json", + "dst": "openai_clip_webgpu.json", + "replacements": [ + { + "find": "openai/clip-vit-base-patch16", + "replace": "openai/clip-vit-base-patch32" + } + ] + }, + { + "src": "../../openai-clip-vit-base-patch16/aitk/openai_clip_webgpu.json.config", + "dst": "openai_clip_webgpu.json.config", + "replacements": [] + }, { "src": "openai_clip_dml.json", "dst": "openai_clip_migraphx.json", diff --git a/openai-clip-vit-base-patch32/aitk/info.yml b/openai-clip-vit-base-patch32/aitk/info.yml index 545b1f463..46d92dcdd 100644 --- a/openai-clip-vit-base-patch32/aitk/info.yml +++ b/openai-clip-vit-base-patch32/aitk/info.yml @@ -26,6 +26,9 @@ recipes: - file: 
"openai_clip_qnn_gpu.json" device: gpu ep: QNNExecutionProvider + - file: "openai_clip_webgpu.json" + device: gpu + ep: WebGpuExecutionProvider aitk: modelInfo: id: "huggingface/openai/clip-vit-base-patch32" diff --git a/openai-clip-vit-base-patch32/aitk/model_project.config b/openai-clip-vit-base-patch32/aitk/model_project.config index 3932bafd5..95d9e3cc8 100644 --- a/openai-clip-vit-base-patch32/aitk/model_project.config +++ b/openai-clip-vit-base-patch32/aitk/model_project.config @@ -27,6 +27,10 @@ { "file": "openai_clip_qnn_gpu.json", "templateName": "openai_clip_qnn_gpu" + }, + { + "file": "openai_clip_webgpu.json", + "templateName": "openai_clip_webgpu" } ], "modelInfo": { diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json new file mode 100644 index 000000000..7f8d0bd3f --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json @@ -0,0 +1,90 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "openai/clip-vit-base-patch32", + "task": "zero-shot-image-classification", + "load_kwargs": { + "attn_implementation": "eager" + }, + "io_config": { + "input_names": [ + "input_ids", + "pixel_values", + "attention_mask" + ], + "input_shapes": [ + [ + 10, + 77 + ], + [ + 1, + 3, + 224, + 224 + ], + [ + 10, + 77 + ] + ], + "input_types": [ + "int64", + "float32", + "int64" + ], + "output_names": [ + "logits_per_image", + "logits_per_text", + "text_embeds", + "image_embeds" + ], + "output_shapes": [ + [ + 1, + 10 + ], + [ + 10, + 1 + ], + [ + 10, + 512 + ], + [ + 1, + 512 + ] + ] + } + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "passes": { + "conversion": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true + }, + "peephole": { + "type": "OnnxPeepholeOptimizer", + "save_as_external_data": true + 
} + }, + "target": "local_system", + "cache_dir": "cache", + "output_dir": "model/clip_webgpu", + "evaluate_input_model": false +} diff --git a/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config new file mode 100644 index 000000000..4575e8895 --- /dev/null +++ b/openai-clip-vit-base-patch32/aitk/openai_clip_webgpu.json.config @@ -0,0 +1,47 @@ +{ + "$schema": "https://github.com/microsoft/olive-recipes/raw/refs/heads/main/.aitk/configs/config_schema.json", + "name": "Convert to WebGPU", + "evalRuntime": "WebGPU", + "runtimeOverwrite": { + "executeRequirement": "WebGPU/WebGPU_py3.12.13" + }, + "runtime": { + "autoGenerated": true, + "name": "Evaluate on", + "type": "enum", + "displayNames": [ + "WebGPU" + ], + "path": "systems.local_system.accelerators.0.execution_providers.0", + "values": [ + "WebGpuExecutionProvider" + ], + "readOnly": false + }, + "optimizationPaths": [ + { + "path": "passes.conversion", + "name": "fp32" + } + ], + "optimizationDefault": "fp32", + "sections": [ + { + "autoGenerated": true, + "name": "Convert", + "phase": "Conversion", + "parameters": [], + "toggle": { + "autoGenerated": true, + "name": "Convert to ONNX format", + "type": "bool", + "path": "passes.conversion", + "actions": [ + [], + [] + ], + "readOnly": true + } + } + ] +}